Percona-Server-5.6.14-rel62.0 merge

support ha_innodb.so as a dynamic plugin.
* remove obsolete *,innodb_plugin.rdiff files
* s/--plugin-load=/--plugin-load-add=/
* MYSQL_PLUGIN_IMPORT glob_hostname[]
* use my_error instead of push_warning_printf(ER_DEFAULT)
* don't use tdc_size and tc_size in a module

update test cases (XtraDB is 5.6.14, InnoDB is 5.6.10)
* copy new tests over
* disable some tests for (old) InnoDB
* delete XtraDB tests that no longer apply

small compatibility changes:
* s/HTON_EXTENDED_KEYS/HTON_SUPPORTS_EXTENDED_KEYS/
* revert unnecessary InnoDB changes to make it a bit closer to the upstream

fix XtraDB to compile on Windows (both as a static and a dynamic plugin)

disable XtraDB on Windows (deadlocks) and where no atomic ops are available (e.g. CentOS 5)

storage/innobase/handler/ha_innodb.cc:
  revert a few unnecessary changes to make it a bit closer to the original InnoDB
storage/innobase/include/univ.i:
  correct the version to match what it was merged from
author: Sergei Golubchik <sergii@pisem.net> 2013-12-22 17:06:50 +0100
committer: Sergei Golubchik <sergii@pisem.net> 2013-12-22 17:06:50 +0100
commit: ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63 (patch)
tree: 728585c36f22a5db3cea796430883d0ebc5c05eb
parent: e27c34f9e4ca15c797fcd3191ee5679c2f237a09 (diff)
parent: 52c26f7a1f675185d2ef1a28aca7f9bcc67c6414 (diff)
download: mariadb-git-ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63.tar.gz
506 files changed, 137888 insertions, 53270 deletions
diff --git a/BUILD/compile-innodb b/BUILD/compile-innodb
index 6b0b2df66da..fa791282b28 100644..100755
--- a/BUILD/compile-innodb
+++ b/BUILD/compile-innodb
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved.
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 
-# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+# this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+# Fifth Floor, Boston, MA 02110-1301 USA
 #
 
 # we assume this script is in storage/innobase/
diff --git a/mysql-test/include/have_innodb.combinations b/mysql-test/include/have_innodb.combinations
index b76f783b928..f647f15ddb6 100644
--- a/mysql-test/include/have_innodb.combinations
+++ b/mysql-test/include/have_innodb.combinations
@@ -9,6 +9,7 @@ innodb-buffer-page
 innodb-buffer-page-lru
 innodb-sys-foreign
 innodb-sys-foreign-col
+innodb-metrics
 
 [xtradb_plugin]
 ignore-builtin-innodb
@@ -21,6 +22,7 @@ innodb-buffer-page
 innodb-buffer-page-lru
 innodb-sys-foreign
 innodb-sys-foreign-col
+innodb-metrics
 
 [xtradb]
 innodb
@@ -32,3 +34,4 @@ innodb-buffer-page
 innodb-buffer-page-lru
 innodb-sys-foreign
 innodb-sys-foreign-col
+innodb-metrics
diff --git a/mysql-test/include/have_semisync.opt b/mysql-test/include/have_semisync.opt
index 2888844cdcd..19e29c7e4de 100644
--- a/mysql-test/include/have_semisync.opt
+++ b/mysql-test/include/have_semisync.opt
@@ -1,4 +1,4 @@
---plugin-load=$SEMISYNC_MASTER_SO
---plugin-load=$SEMISYNC_SLAVE_SO
+--plugin-load-add=$SEMISYNC_MASTER_SO
+--plugin-load-add=$SEMISYNC_SLAVE_SO
 --loose-rpl-semi-sync-master
 --loose-rpl-semi-sync-slave
diff --git a/mysql-test/include/linux.inc b/mysql-test/include/linux.inc
new file mode 100644
index 00000000000..f24832ca476
--- /dev/null
+++ b/mysql-test/include/linux.inc
@@ -0,0 +1,5 @@
+if (`select convert(@@version_compile_os using latin1) LIKE 'Linux' = 0`)
+{
+  skip Need Linux;
+}
+
diff --git a/mysql-test/mysql-test-run.pl b/mysql-test/mysql-test-run.pl
index 124ede14dbb..427cff31547 100755
--- a/mysql-test/mysql-test-run.pl
+++ b/mysql-test/mysql-test-run.pl
@@ -4822,6 +4822,7 @@ sub extract_warning_lines ($$) {
      qr|Plugin 'FEEDBACK' init function returned error|,
      qr|Plugin 'FEEDBACK' registration as a INFORMATION SCHEMA failed|,
      qr|'log-bin-use-v1-row-events' is MySQL 5.6 compatible option|,
+     qr|InnoDB: Setting thread \d+ nice to \d+ failed, current nice \d+, errno 13|, # setpriority() fails under valgrind
     );
 
   my $matched_lines= [];
diff --git a/mysql-test/r/information_schema-big.result b/mysql-test/r/information_schema-big.result
index 55ed95f6452..8ce5c0dad68 100644
--- a/mysql-test/r/information_schema-big.result
+++ b/mysql-test/r/information_schema-big.result
@@ -57,7 +57,6 @@ TRIGGERS	TRIGGER_SCHEMA
 USER_PRIVILEGES	GRANTEE
 USER_STATISTICS	USER
 VIEWS	TABLE_SCHEMA
-XTRADB_ADMIN_COMMAND	result_message
 SELECT t.table_name, c1.column_name
 FROM information_schema.tables t
 INNER JOIN
@@ -112,4 +111,3 @@ TRIGGERS	TRIGGER_SCHEMA
 USER_PRIVILEGES	GRANTEE
 USER_STATISTICS	USER
 VIEWS	TABLE_SCHEMA
-XTRADB_ADMIN_COMMAND	result_message
diff --git a/mysql-test/r/information_schema.result b/mysql-test/r/information_schema.result
index 4ccf3ba1ca4..d27b03e85d3 100644
--- a/mysql-test/r/information_schema.result
+++ b/mysql-test/r/information_schema.result
@@ -477,7 +477,11 @@ drop table t1;
 create table t1 (a int null, primary key(a));
 alter table t1 add constraint constraint_1 unique (a);
 alter table t1 add constraint unique key_1(a);
+Warnings:
+Note	1831	Duplicate index 'key_1' defined on the table 'test.t1'. This is deprecated and will be disallowed in a future release.
 alter table t1 add constraint constraint_2 unique key_2(a);
+Warnings:
+Note	1831	Duplicate index 'key_2' defined on the table 'test.t1'. This is deprecated and will be disallowed in a future release.
 show create table t1;
 Table	Create Table
 t1	CREATE TABLE `t1` (
@@ -709,6 +713,7 @@ max_questions	select,insert,update,references
 max_connections	select,insert,update,references
 max_user_connections	select,insert,update,references
 authentication_string	select,insert,update,references
+password_expired	select,insert,update,references
 is_role	select,insert,update,references
 use test;
 create function sub1(i int) returns int
@@ -816,6 +821,14 @@ information_schema	ROUTINES	ROUTINE_COMMENT
 information_schema	TRIGGERS	ACTION_CONDITION
 information_schema	TRIGGERS	ACTION_STATEMENT
 information_schema	VIEWS	VIEW_DEFINITION
+performance_schema	events_statements_current	SQL_TEXT
+performance_schema	events_statements_current	DIGEST_TEXT
+performance_schema	events_statements_history	SQL_TEXT
+performance_schema	events_statements_history	DIGEST_TEXT
+performance_schema	events_statements_history_long	SQL_TEXT
+performance_schema	events_statements_history_long	DIGEST_TEXT
+performance_schema	events_statements_summary_by_digest	DIGEST_TEXT
+performance_schema	threads	PROCESSLIST_INFO
 select table_name, column_name, data_type from information_schema.columns
 where data_type = 'datetime' and table_name not like 'innodb_%';
 table_name	column_name	data_type
diff --git a/mysql-test/r/information_schema_all_engines.result b/mysql-test/r/information_schema_all_engines.result
index ec9027fa32f..7ced16404a6 100644
--- a/mysql-test/r/information_schema_all_engines.result
+++ b/mysql-test/r/information_schema_all_engines.result
@@ -16,27 +16,21 @@ FILES
 GLOBAL_STATUS
 GLOBAL_VARIABLES
 INDEX_STATISTICS
-INNODB_BUFFER_POOL_PAGES
-INNODB_BUFFER_POOL_PAGES_BLOB
-INNODB_BUFFER_POOL_PAGES_INDEX
 INNODB_CHANGED_PAGES
 INNODB_CMP
 INNODB_CMPMEM
 INNODB_CMPMEM_RESET
+INNODB_CMP_PER_INDEX
 INNODB_CMP_RESET
-INNODB_INDEX_STATS
 INNODB_LOCKS
 INNODB_LOCK_WAITS
-INNODB_RSEG
 INNODB_SYS_COLUMNS
 INNODB_SYS_FIELDS
 INNODB_SYS_FOREIGN
 INNODB_SYS_FOREIGN_COLS
 INNODB_SYS_INDEXES
-INNODB_SYS_STATS
 INNODB_SYS_TABLES
 INNODB_SYS_TABLESTATS
-INNODB_TABLE_STATS
 INNODB_TRX
 KEY_CACHES
 KEY_COLUMN_USAGE
@@ -61,7 +55,9 @@ TRIGGERS
 USER_PRIVILEGES
 USER_STATISTICS
 VIEWS
-XTRADB_ADMIN_COMMAND
+XTRADB_INTERNAL_HASH_TABLES
+XTRADB_READ_VIEW
+XTRADB_RSEG
 SELECT t.table_name, c1.column_name
 FROM information_schema.tables t
 INNER JOIN
@@ -92,27 +88,21 @@ FILES	TABLE_SCHEMA
 GLOBAL_STATUS	VARIABLE_NAME
 GLOBAL_VARIABLES	VARIABLE_NAME
 INDEX_STATISTICS	TABLE_SCHEMA
-INNODB_BUFFER_POOL_PAGES	page_type
-INNODB_BUFFER_POOL_PAGES_BLOB	space_id
-INNODB_BUFFER_POOL_PAGES_INDEX	index_id
 INNODB_CHANGED_PAGES	space_id
 INNODB_CMP	page_size
 INNODB_CMPMEM	page_size
 INNODB_CMPMEM_RESET	page_size
+INNODB_CMP_PER_INDEX	database_name
 INNODB_CMP_RESET	page_size
-INNODB_INDEX_STATS	table_schema
 INNODB_LOCKS	lock_id
 INNODB_LOCK_WAITS	requesting_trx_id
-INNODB_RSEG	rseg_id
 INNODB_SYS_COLUMNS	TABLE_ID
 INNODB_SYS_FIELDS	INDEX_ID
 INNODB_SYS_FOREIGN	ID
 INNODB_SYS_FOREIGN_COLS	ID
 INNODB_SYS_INDEXES	INDEX_ID
-INNODB_SYS_STATS	INDEX_ID
-INNODB_SYS_TABLES	SCHEMA
-INNODB_SYS_TABLESTATS	SCHEMA
-INNODB_TABLE_STATS	table_schema
+INNODB_SYS_TABLES	TABLE_ID
+INNODB_SYS_TABLESTATS	TABLE_ID
 INNODB_TRX	trx_id
 KEY_CACHES	KEY_CACHE_NAME
 KEY_COLUMN_USAGE	CONSTRAINT_SCHEMA
@@ -137,7 +127,9 @@ TRIGGERS	TRIGGER_SCHEMA
 USER_PRIVILEGES	GRANTEE
 USER_STATISTICS	USER
 VIEWS	TABLE_SCHEMA
-XTRADB_ADMIN_COMMAND	result_message
+XTRADB_INTERNAL_HASH_TABLES	INTERNAL_HASH_TABLE_NAME
+XTRADB_READ_VIEW	READ_VIEW_UNDO_NUMBER
+XTRADB_RSEG	rseg_id
 SELECT t.table_name, c1.column_name
 FROM information_schema.tables t
 INNER JOIN
@@ -168,27 +160,21 @@ FILES	TABLE_SCHEMA
 GLOBAL_STATUS	VARIABLE_NAME
 GLOBAL_VARIABLES	VARIABLE_NAME
 INDEX_STATISTICS	TABLE_SCHEMA
-INNODB_BUFFER_POOL_PAGES	page_type
-INNODB_BUFFER_POOL_PAGES_BLOB	space_id
-INNODB_BUFFER_POOL_PAGES_INDEX	index_id
 INNODB_CHANGED_PAGES	space_id
 INNODB_CMP	page_size
 INNODB_CMPMEM	page_size
 INNODB_CMPMEM_RESET	page_size
+INNODB_CMP_PER_INDEX	database_name
 INNODB_CMP_RESET	page_size
-INNODB_INDEX_STATS	table_schema
 INNODB_LOCKS	lock_id
 INNODB_LOCK_WAITS	requesting_trx_id
-INNODB_RSEG	rseg_id
 INNODB_SYS_COLUMNS	TABLE_ID
 INNODB_SYS_FIELDS	INDEX_ID
 INNODB_SYS_FOREIGN	ID
 INNODB_SYS_FOREIGN_COLS	ID
 INNODB_SYS_INDEXES	INDEX_ID
-INNODB_SYS_STATS	INDEX_ID
-INNODB_SYS_TABLES	SCHEMA
-INNODB_SYS_TABLESTATS	SCHEMA
-INNODB_TABLE_STATS	table_schema
+INNODB_SYS_TABLES	TABLE_ID
+INNODB_SYS_TABLESTATS	TABLE_ID
 INNODB_TRX	trx_id
 KEY_CACHES	KEY_CACHE_NAME
 KEY_COLUMN_USAGE	CONSTRAINT_SCHEMA
@@ -213,11 +199,13 @@ TRIGGERS	TRIGGER_SCHEMA
 USER_PRIVILEGES	GRANTEE
 USER_STATISTICS	USER
 VIEWS	TABLE_SCHEMA
-XTRADB_ADMIN_COMMAND	result_message
-select 1 as f1 from information_schema.tables  where "ALL_PLUGINS"=
+XTRADB_INTERNAL_HASH_TABLES	INTERNAL_HASH_TABLE_NAME
+XTRADB_READ_VIEW	READ_VIEW_UNDO_NUMBER
+XTRADB_RSEG	rseg_id
+select 1 as "must be 1" from information_schema.tables  where "ACCOUNTS"=
 (select cast(table_name as char)  from information_schema.tables
 order by table_name limit 1) limit 1;
-f1
+must be 1
 1
 select t.table_name, group_concat(t.table_schema, '.', t.table_name),
 count(*) as num1
@@ -249,27 +237,21 @@ FILES	information_schema.FILES	1
 GLOBAL_STATUS	information_schema.GLOBAL_STATUS	1
 GLOBAL_VARIABLES	information_schema.GLOBAL_VARIABLES	1
 INDEX_STATISTICS	information_schema.INDEX_STATISTICS	1
-INNODB_BUFFER_POOL_PAGES	information_schema.INNODB_BUFFER_POOL_PAGES	1
-INNODB_BUFFER_POOL_PAGES_BLOB	information_schema.INNODB_BUFFER_POOL_PAGES_BLOB	1
-INNODB_BUFFER_POOL_PAGES_INDEX	information_schema.INNODB_BUFFER_POOL_PAGES_INDEX	1
 INNODB_CHANGED_PAGES	information_schema.INNODB_CHANGED_PAGES	1
 INNODB_CMP	information_schema.INNODB_CMP	1
 INNODB_CMPMEM	information_schema.INNODB_CMPMEM	1
 INNODB_CMPMEM_RESET	information_schema.INNODB_CMPMEM_RESET	1
+INNODB_CMP_PER_INDEX	information_schema.INNODB_CMP_PER_INDEX	1
 INNODB_CMP_RESET	information_schema.INNODB_CMP_RESET	1
-INNODB_INDEX_STATS	information_schema.INNODB_INDEX_STATS	1
 INNODB_LOCKS	information_schema.INNODB_LOCKS	1
 INNODB_LOCK_WAITS	information_schema.INNODB_LOCK_WAITS	1
-INNODB_RSEG	information_schema.INNODB_RSEG	1
 INNODB_SYS_COLUMNS	information_schema.INNODB_SYS_COLUMNS	1
 INNODB_SYS_FIELDS	information_schema.INNODB_SYS_FIELDS	1
 INNODB_SYS_FOREIGN	information_schema.INNODB_SYS_FOREIGN	1
 INNODB_SYS_FOREIGN_COLS	information_schema.INNODB_SYS_FOREIGN_COLS	1
 INNODB_SYS_INDEXES	information_schema.INNODB_SYS_INDEXES	1
-INNODB_SYS_STATS	information_schema.INNODB_SYS_STATS	1
 INNODB_SYS_TABLES	information_schema.INNODB_SYS_TABLES	1
 INNODB_SYS_TABLESTATS	information_schema.INNODB_SYS_TABLESTATS	1
-INNODB_TABLE_STATS	information_schema.INNODB_TABLE_STATS	1
 INNODB_TRX	information_schema.INNODB_TRX	1
 KEY_CACHES	information_schema.KEY_CACHES	1
 KEY_COLUMN_USAGE	information_schema.KEY_COLUMN_USAGE	1
@@ -294,6 +276,9 @@ TRIGGERS	information_schema.TRIGGERS	1
 USER_PRIVILEGES	information_schema.USER_PRIVILEGES	1
 USER_STATISTICS	information_schema.USER_STATISTICS	1
 VIEWS	information_schema.VIEWS	1
+XTRADB_INTERNAL_HASH_TABLES	information_schema.XTRADB_INTERNAL_HASH_TABLES	1
+XTRADB_READ_VIEW	information_schema.XTRADB_READ_VIEW	1
+XTRADB_RSEG	information_schema.XTRADB_RSEG	1
 +---------------------------------------+
 +---------------------------------------+
 +---------------------------------------+
@@ -314,27 +299,21 @@ Database: information_schema
 | GLOBAL_STATUS                         |
 | GLOBAL_VARIABLES                      |
 | INDEX_STATISTICS                      |
-| INNODB_BUFFER_POOL_PAGES              |
-| INNODB_BUFFER_POOL_PAGES_BLOB         |
-| INNODB_BUFFER_POOL_PAGES_INDEX        |
 | INNODB_CHANGED_PAGES                  |
 | INNODB_CMP                            |
 | INNODB_CMPMEM                         |
 | INNODB_CMPMEM_RESET                   |
+| INNODB_CMP_PER_INDEX                  |
 | INNODB_CMP_RESET                      |
-| INNODB_INDEX_STATS                    |
 | INNODB_LOCKS                          |
 | INNODB_LOCK_WAITS                     |
-| INNODB_RSEG                           |
 | INNODB_SYS_COLUMNS                    |
 | INNODB_SYS_FIELDS                     |
 | INNODB_SYS_FOREIGN                    |
 | INNODB_SYS_FOREIGN_COLS               |
 | INNODB_SYS_INDEXES                    |
-| INNODB_SYS_STATS                      |
 | INNODB_SYS_TABLES                     |
 | INNODB_SYS_TABLESTATS                 |
-| INNODB_TABLE_STATS                    |
 | INNODB_TRX                            |
 | KEY_CACHES                            |
 | KEY_COLUMN_USAGE                      |
@@ -359,7 +338,9 @@ Database: information_schema
 | USER_PRIVILEGES                       |
 | USER_STATISTICS                       |
 | VIEWS                                 |
-| XTRADB_ADMIN_COMMAND                  |
+| XTRADB_INTERNAL_HASH_TABLES           |
+| XTRADB_READ_VIEW                      |
+| XTRADB_RSEG                           |
 +---------------------------------------+
 +---------------------------------------+
 +---------------------------------------+
@@ -380,27 +361,21 @@ Database: INFORMATION_SCHEMA
 | GLOBAL_STATUS                         |
 | GLOBAL_VARIABLES                      |
 | INDEX_STATISTICS                      |
-| INNODB_BUFFER_POOL_PAGES              |
-| INNODB_BUFFER_POOL_PAGES_BLOB         |
-| INNODB_BUFFER_POOL_PAGES_INDEX        |
 | INNODB_CHANGED_PAGES                  |
 | INNODB_CMP                            |
 | INNODB_CMPMEM                         |
 | INNODB_CMPMEM_RESET                   |
+| INNODB_CMP_PER_INDEX                  |
 | INNODB_CMP_RESET                      |
-| INNODB_INDEX_STATS                    |
 | INNODB_LOCKS                          |
 | INNODB_LOCK_WAITS                     |
-| INNODB_RSEG                           |
 | INNODB_SYS_COLUMNS                    |
 | INNODB_SYS_FIELDS                     |
 | INNODB_SYS_FOREIGN                    |
 | INNODB_SYS_FOREIGN_COLS               |
 | INNODB_SYS_INDEXES                    |
-| INNODB_SYS_STATS                      |
 | INNODB_SYS_TABLES                     |
 | INNODB_SYS_TABLESTATS                 |
-| INNODB_TABLE_STATS                    |
 | INNODB_TRX                            |
 | KEY_CACHES                            |
 | KEY_COLUMN_USAGE                      |
@@ -425,7 +400,9 @@ Database: INFORMATION_SCHEMA
 | USER_PRIVILEGES                       |
 | USER_STATISTICS                       |
 | VIEWS                                 |
-| XTRADB_ADMIN_COMMAND                  |
+| XTRADB_INTERNAL_HASH_TABLES           |
+| XTRADB_READ_VIEW                      |
+| XTRADB_RSEG                           |
 +--------------------+
 +--------------------+
 +--------------------+
@@ -434,5 +411,5 @@ Wildcard: inf_rmation_schema
 | information_schema |
 SELECT table_schema, count(*) FROM information_schema.TABLES WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test', 'mysqltest') AND table_name<>'ndb_binlog_index' AND table_name<>'ndb_apply_status' GROUP BY TABLE_SCHEMA;
 table_schema	count(*)
-information_schema	61
-mysql	28
+information_schema	57
+mysql	30
diff --git a/mysql-test/r/innodb_bug878769,innodb_plugin.rdiff b/mysql-test/r/innodb_bug878769,innodb_plugin.rdiff
deleted file mode 100644
index 0a8ea2d8c67..00000000000
--- a/mysql-test/r/innodb_bug878769,innodb_plugin.rdiff
+++ /dev/null
@@ -1,11 +0,0 @@
---- r/innodb_bug878769.result	2011-11-22 18:50:25.000000000 +0100
-+++ r/innodb_bug878769.reject	2012-02-07 12:45:07.000000000 +0100
-@@ -39,7 +39,7 @@
- GROUP BY 1,2;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	index	col_int_key	col_int_key	5	NULL	12	Using where; Using index; Using temporary; Using filesort
--1	SIMPLE	t1	ref	col_int_key	col_int_key	5	test.t2.col_int_key	1	Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
-+1	SIMPLE	t1	ref	col_int_key	col_int_key	5	test.t2.col_int_key	1	
- SELECT t1.col_time_key, t1.col_varchar_key
- FROM t2 STRAIGHT_JOIN t1 ON t1.col_int_key = t2.col_int_key
- GROUP BY 1,2;
diff --git a/mysql-test/r/innodb_icp,innodb_plugin.rdiff b/mysql-test/r/innodb_icp,innodb_plugin.rdiff
deleted file mode 100644
index 3cb85b79ece..00000000000
--- a/mysql-test/r/innodb_icp,innodb_plugin.rdiff
+++ /dev/null
@@ -1,58 +0,0 @@
---- r/innodb_icp.result	2013-07-16 17:01:00.000000000 +0400
-+++ r/innodb_icp,innodb_plugin.reject	2013-07-16 17:16:53.000000000 +0400
-@@ -213,7 +213,7 @@
- EXPLAIN
- SELECT c1 FROM t3 WHERE c1 >= 'c-1004=w' and c1 <= 'c-1006=w' and i1 > 2;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	t3	range	c1	c1	12	NULL	2	Using index condition; Using where
-+1	SIMPLE	t3	range	c1	c1	12	NULL	2	Using where
- SELECT c1 FROM t3 WHERE c1 >= 'c-1004=w' and c1 <= 'c-1006=w' and i1 > 2;
- c1
- EXPLAIN
-@@ -637,7 +637,7 @@
- WHERE NOT(b = 'Texas') AND b BETWEEN 'wy' AND 'y' OR b = 'Pennsylvania'
-   ORDER BY a;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	t1	range	b	b	13	NULL	2	Using where; Rowid-ordered scan; Using filesort
-+1	SIMPLE	t1	range	b	b	13	NULL	2	Using where; Using filesort
- SELECT * FROM t1 
- WHERE NOT(b = 'Texas') AND b BETWEEN 'wy' AND 'y' OR b = 'Pennsylvania'
-   ORDER BY a;
-@@ -649,7 +649,7 @@
- WHERE NOT(b = 'Texas') AND b BETWEEN 'wy' AND 'y' OR b = 'Pennsylvania'
-   ORDER BY a;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	t1	range	b	b	13	NULL	2	Using index condition; Using where; Rowid-ordered scan; Using filesort
-+1	SIMPLE	t1	range	b	b	13	NULL	2	Using where; Using filesort
- SELECT * FROM t1 
- WHERE NOT(b = 'Texas') AND b BETWEEN 'wy' AND 'y' OR b = 'Pennsylvania'
-   ORDER BY a;
-@@ -825,15 +825,15 @@
- 4	4
- show status like "Handler_icp%";
- Variable_name	Value
--Handler_icp_attempts	2
--Handler_icp_match	1
-+Handler_icp_attempts	0
-+Handler_icp_match	0
- SELECT * FROM t1 WHERE (c2='3' or c2='4') and c2 % 2 = 0 ;
- c1	c2
- 4	4
- show status like "Handler_icp%";
- Variable_name	Value
--Handler_icp_attempts	2
--Handler_icp_match	1
-+Handler_icp_attempts	0
-+Handler_icp_match	0
- DROP TABLE t1;
- create table t1 (a int,b char(5),primary key (a), key (b(1)));
- insert into t1 values ('a','b');
-@@ -868,7 +868,7 @@
- EXPLAIN
- SELECT * FROM t1 FORCE INDEX(idx1)       WHERE (c1='aa' AND c2='x') OR (c1='a'  AND c2='y');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	t1	range	idx1	idx1	10	NULL	2	Using index condition; Using where
-+1	SIMPLE	t1	range	idx1	idx1	10	NULL	2	Using where
- SELECT * FROM t1 FORCE INDEX(idx1)       WHERE (c1='aa' AND c2='x') OR (c1='a'  AND c2='y');
- c1	c2
- aa	x
diff --git a/mysql-test/r/innodb_mrr_cpk,innodb_plugin.rdiff b/mysql-test/r/innodb_mrr_cpk,innodb_plugin.rdiff
deleted file mode 100644
index ab64fc153cc..00000000000
--- a/mysql-test/r/innodb_mrr_cpk,innodb_plugin.rdiff
+++ /dev/null
@@ -1,111 +0,0 @@
---- r/innodb_mrr_cpk.result	2012-02-23 15:57:49.000000000 +0100
-+++ r/innodb_mrr_cpk,innodb_plugin.reject	2012-02-23 19:44:57.000000000 +0100
-@@ -27,13 +27,13 @@
- explain select * from t1, t2 where t1.a=t2.a;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	
- This output must be sorted by value of t1.a:
- select * from t1, t2 where t1.a=t2.a;
- a	b	filler	a
- a-1010=A	b-1010=B	filler	a-1010=A
--a-1020=A	b-1020=B	filler	a-1020=A
- a-1030=A	b-1030=B	filler	a-1030=A
-+a-1020=A	b-1020=B	filler	a-1020=A
- drop table t1, t2;
- create table t1(
- a char(8) character set utf8, b int, filler char(100), 
-@@ -49,24 +49,24 @@
- explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	
- select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- a	b	filler	a	b
- a-1010=A	1010	filler	a-1010=A	1010
--a-1020=A	1020	filler	a-1020=A	1020
- a-1030=A	1030	filler	a-1030=A	1030
-+a-1020=A	1020	filler	a-1020=A	1020
- insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
- explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	5	Using where
--1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	
- select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- a	b	filler	a	b
- a-1010=A	1010	filler	a-1010=A	1010
--a-1020=A	1020	filler	a-1020=A	1020
--a-1020=A	1020	filler	a-1020=A	1020
- a-1030=A	1030	filler	a-1030=A	1030
-+a-1020=A	1020	filler	a-1020=A	1020
- a-1030=A	1030	filler	a-1030=A	1030
-+a-1020=A	1020	filler	a-1020=A	1020
- drop table t1, t2;
- create table t1(
- a varchar(8) character set utf8, b int, filler char(100), 
-@@ -82,21 +82,21 @@
- explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using where; Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using where
- select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- a	b	filler	a	b
- a-1010=A	1010	filler	a-1010=A	1010
--a-1020=A	1020	filler	a-1020=A	1020
- a-1030=A	1030	filler	a-1030=A	1030
-+a-1020=A	1020	filler	a-1020=A	1020
- explain select * from t1, t2 where t1.a=t2.a;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using where; Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using where
- select * from t1, t2 where t1.a=t2.a;
- a	b	filler	a	b
- a-1010=A	1010	filler	a-1010=A	1010
--a-1020=A	1020	filler	a-1020=A	1020
- a-1030=A	1030	filler	a-1030=A	1030
-+a-1020=A	1020	filler	a-1020=A	1020
- drop table t1, t2;
- create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
- insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
-@@ -111,15 +111,15 @@
- explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	
- select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- a	b	c	filler	a	b
-+11	33	124	filler	11	33
-+11	33	125	filler	11	33
-+11	22	1234	filler	11	22
- 11	11	11	filler	11	11
- 11	11	12	filler	11	11
- 11	11	13	filler	11	11
--11	22	1234	filler	11	22
--11	33	124	filler	11	33
--11	33	125	filler	11	33
- set join_cache_level=0;
- select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
- a	b	c	filler	a	b
-@@ -133,14 +133,14 @@
- explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where; Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where
- select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
- a	b	c	filler	a	b
- set optimizer_switch='index_condition_pushdown=off';
- explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
- 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	Using where
--1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where; Using join buffer (flat, BKA join); Key-ordered scan
-+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where
- select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
- a	b	c	filler	a	b
- set optimizer_switch='index_condition_pushdown=on';
diff --git a/mysql-test/r/range_vs_index_merge_innodb,innodb_plugin.rdiff b/mysql-test/r/range_vs_index_merge_innodb,innodb_plugin.rdiff
deleted file mode 100644
index ecae2c809c1..00000000000
--- a/mysql-test/r/range_vs_index_merge_innodb,innodb_plugin.rdiff
+++ /dev/null
@@ -1,280 +0,0 @@
---- ./r/range_vs_index_merge_innodb.result	2012-11-21 19:35:14.000000000 +0100
-+++ ./r/range_vs_index_merge_innodb,innodb_plugin.reject	2012-11-21 20:56:00.000000000 +0100
-@@ -50,14 +50,14 @@
- WHERE (Population >= 100000 OR Name LIKE 'P%') AND Country='CAN' OR
- (Population < 100000 OR Name Like 'T%') AND Country='ARG';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population,Country,Name	Country	3	NULL	106	Using index condition; Using where
-+1	SIMPLE	City	range	Population,Country,Name	Country	3	NULL	106	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE Population < 200000 AND Name LIKE 'P%' AND
- (Population > 300000 OR Name LIKE 'T%') AND
- (Population < 100000 OR Name LIKE 'Pa%');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population,Name	Name	35	NULL	235	Using index condition; Using where
-+1	SIMPLE	City	range	Population,Name	Name	35	NULL	235	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE Population > 100000 AND Name LIKE 'Aba%' OR
-@@ -70,12 +70,12 @@
- SELECT * FROM City
- WHERE (Population > 101000 AND Population < 115000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	458	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	458	Using where
- EXPLAIN 
- SELECT * FROM City
- WHERE (Population > 101000 AND Population < 102000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	38	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	38	Using where
- EXPLAIN 
- SELECT * FROM City
- WHERE ((Name > 'Ca' AND Name < 'Cf') OR (Country > 'E' AND Country < 'F'));
-@@ -92,7 +92,7 @@
- WHERE ((Name > 'Ca' AND Name < 'Cf') OR (Country > 'E' AND Country < 'F'))
- AND (Population > 101000 AND Population < 102000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population,Country,Name	Population	4	NULL	38	Using index condition; Using where
-+1	SIMPLE	City	range	Population,Country,Name	Population	4	NULL	38	Using where
- SELECT * FROM City USE INDEX ()
- WHERE ((Name > 'Ca' AND Name < 'Cf') OR (Country > 'E' AND Country < 'F'))
- AND (Population > 101000 AND Population < 115000);
-@@ -172,37 +172,37 @@
- EXPLAIN
- SELECT  * FROM City WHERE (Name < 'Ac');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	23	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	23	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Name < 'Bb');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	373	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	373	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Country > 'A' AND Country < 'B');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country	Country	3	NULL	106	Using index condition
-+1	SIMPLE	City	range	Country	Country	3	NULL	106	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Name BETWEEN 'P' AND 'Pb');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	71	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	71	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Name BETWEEN 'P' AND 'S');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	384	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	384	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Population > 101000 AND Population < 110000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	327	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	327	Using where
- EXPLAIN
- SELECT  * FROM City WHERE (Population > 103000 AND Population < 104000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	36	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	36	Using where
- EXPLAIN
- SELECT  * FROM City 
- WHERE (Name < 'Ac' AND (Country > 'A' AND Country < 'B')) OR
- (Name BETWEEN 'P' AND 'Pb' AND (Population > 101000 AND Population < 110000));
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population,Country,Name	Name	35	NULL	94	Using index condition; Using where
-+1	SIMPLE	City	range	Population,Country,Name	Name	35	NULL	94	Using where
- EXPLAIN
- SELECT  * FROM City
- WHERE (Name < 'Ac' AND (Country > 'A' AND Country < 'B')) OR
-@@ -340,15 +340,15 @@
- EXPLAIN
- SELECT * FROM City WHERE Country > 'A' AND Country < 'ARG';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country	Country	3	NULL	19	Using index condition
-+1	SIMPLE	City	range	Country	Country	3	NULL	19	Using where
- EXPLAIN
- SELECT * FROM City WHERE Name LIKE 'H%' OR Name LIKE 'P%' ;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	394	Using index condition; Using where
-+1	SIMPLE	City	range	Name	Name	35	NULL	394	Using where
- EXPLAIN
- SELECT * FROM City WHERE Name LIKE 'Ha%' OR Name LIKE 'Pa%' ;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	133	Using index condition; Using where
-+1	SIMPLE	City	range	Name	Name	35	NULL	133	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE ((ID < 10) AND (Name LIKE 'H%' OR (Country > 'A' AND Country < 'ARG')))
-@@ -577,27 +577,27 @@
- EXPLAIN 
- SELECT * FROM City WHERE Population > 101000 AND Population < 102000;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	38	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	38	Using where
- EXPLAIN 
- SELECT * FROM City WHERE Population > 101000 AND Population < 110000;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	327	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	327	Using where
- EXPLAIN 
- SELECT * FROM City WHERE Country < 'C';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country	Country	3	NULL	446	Using index condition
-+1	SIMPLE	City	range	Country	Country	3	NULL	446	Using where
- EXPLAIN 
- SELECT * FROM City WHERE Country < 'AGO';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country	Country	3	NULL	5	Using index condition
-+1	SIMPLE	City	range	Country	Country	3	NULL	5	Using where
- EXPLAIN 
- SELECT * FROM City WHERE Name BETWEEN 'P' AND 'S';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	384	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	384	Using where
- EXPLAIN 
- SELECT * FROM City WHERE Name BETWEEN 'P' AND 'Pb';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	71	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	71	Using where
- EXPLAIN 
- SELECT * FROM City WHERE ID BETWEEN 3400 AND 3800;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-@@ -609,7 +609,7 @@
- EXPLAIN 
- SELECT * FROM City WHERE Name LIKE 'P%';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	235	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	235	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE ((Population > 101000 AND Population < 102000) AND
-@@ -680,23 +680,23 @@
- EXPLAIN
- SELECT * FROM City WHERE Name LIKE 'Pas%';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	8	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	8	Using where
- EXPLAIN
- SELECT * FROM City WHERE Name LIKE 'P%';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	235	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	235	Using where
- EXPLAIN
- SELECT * FROM City WHERE (Population > 101000 AND Population < 103000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	80	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	80	Using where
- EXPLAIN
- SELECT * FROM City WHERE Country='USA';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Country,CountryPopulation	Country	3	const	274	Using index condition
-+1	SIMPLE	City	ref	Country,CountryPopulation	Country	3	const	274	Using where
- EXPLAIN
- SELECT * FROM City WHERE Country='FIN';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Country,CountryPopulation	Country	3	const	7	Using index condition
-+1	SIMPLE	City	ref	Country,CountryPopulation	Country	3	const	7	Using where
- EXPLAIN
- SELECT * FROM City 
- WHERE ((Population > 101000 AND Population < 103000) OR Name LIKE 'Pas%')
-@@ -708,7 +708,7 @@
- WHERE ((Population > 101000 AND Population < 103000) OR Name LIKE 'P%')
- AND Country='FIN';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Population,Country,Name,CountryPopulation	Country	3	const	7	Using index condition; Using where
-+1	SIMPLE	City	ref	Population,Country,Name,CountryPopulation	Country	3	const	7	Using where
- SELECT * FROM City 
- WHERE ((Population > 101000 AND Population < 103000) OR Name LIKE 'Pas%')
- AND Country='USA';
-@@ -753,15 +753,15 @@
- EXPLAIN
- SELECT * FROM City WHERE Country='USA';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	274	Using index condition
-+1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	274	Using where
- EXPLAIN
- SELECT * FROM City WHERE Country='FIN';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	7	Using index condition
-+1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	7	Using where
- EXPLAIN
- SELECT * FROM City WHERE Country='BRA';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	250	Using index condition
-+1	SIMPLE	City	ref	Country,CountryPopulation,CountryName	Country	3	const	250	Using where
- EXPLAIN
- SELECT * FROM City WHERE ID BETWEEN 3790 AND 3800;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-@@ -789,15 +789,15 @@
- EXPLAIN
- SELECT * FROM City WHERE (Population > 101000 AND Population < 102000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	38	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	38	Using where
- EXPLAIN
- SELECT * FROM City WHERE (Population > 101000 AND Population < 103000);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Population	Population	4	NULL	80	Using index condition
-+1	SIMPLE	City	range	Population	Population	4	NULL	80	Using where
- EXPLAIN
- SELECT * FROM City WHERE Name LIKE 'Pa%';
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Name	Name	35	NULL	71	Using index condition
-+1	SIMPLE	City	range	Name	Name	35	NULL	71	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE ((Population > 101000 AND Population < 102000) OR
-@@ -818,7 +818,7 @@
- ID BETWEEN 3500 AND 3800) AND Country='FIN'
-         AND (Name BETWEEN 'P' AND 'T' OR ID BETWEEN 4000 AND 4300);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	ref	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	Country	3	const	7	Using index condition; Using where
-+1	SIMPLE	City	ref	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	Country	3	const	7	Using where
- SELECT * FROM City USE INDEX ()
- WHERE ((Population > 101000 AND Population < 102000) OR
- ID BETWEEN 3790 AND 3800) AND Country='USA'
-@@ -950,14 +950,14 @@
- ID BETWEEN 3500 AND 3800) AND Country='USA'
-         AND (Name LIKE 'P%' OR ID BETWEEN 4000 AND 4300);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	CountryName	38	NULL	18	Using index condition; Using where
-+1	SIMPLE	City	range	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	CountryName	38	NULL	18	Using where
- EXPLAIN
- SELECT * FROM City
- WHERE ((Population > 101000 AND Population < 11000) OR
- ID BETWEEN 3500 AND 3800) AND Country='USA'
-         AND (Name LIKE 'Pho%' OR ID BETWEEN 4000 AND 4300);
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	Name	35	NULL	1	Using index condition; Using where
-+1	SIMPLE	City	range	PRIMARY,Population,Country,Name,CountryPopulation,CountryName	Name	35	NULL	1	Using where
- SELECT * FROM City USE INDEX ()
- WHERE ((Population > 101000 AND Population < 11000) OR
- ID BETWEEN 3500 AND 3800) AND Country='USA'
-@@ -1077,7 +1077,7 @@
- (Name='Samara' AND Country='RUS') OR  
- (Name='Seattle' AND Country='USA');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country,CountryPopulation,CountryName,CityName	CountryName	38	NULL	27	Using index condition; Using where
-+1	SIMPLE	City	range	Country,CountryPopulation,CountryName,CityName	CountryName	38	NULL	27	Using where
- SELECT Name, Country, Population FROM City WHERE
- (Name='Manila' AND Country='PHL') OR
- (Name='Addis Abeba' AND Country='ETH') OR          
-@@ -1164,7 +1164,7 @@
- (Name='Samara' AND Country='RUS') OR  
- (Name='Seattle' AND Country='USA');
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country,CountryPopulation,CountryName,CityName	CountryName	38	NULL	27	Using index condition; Using where
-+1	SIMPLE	City	range	Country,CountryPopulation,CountryName,CityName	CountryName	38	NULL	27	Using where
- SELECT Name, Country, Population FROM City WHERE
- (Name='Manila' AND Country='PHL') OR
- (Name='Addis Abeba' AND Country='ETH') OR          
-@@ -1346,7 +1346,7 @@
- AND (Population >= 100000 AND Population < 120000)
- ORDER BY Population LIMIT 5;
- id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
--1	SIMPLE	City	range	Country,Name,Population	Population	4	NULL	#	Using index condition; Using where
-+1	SIMPLE	City	range	Country,Name,Population	Population	4	NULL	#	Using where
- FLUSH STATUS;
- SELECT * FROM City
- WHERE ((Name > 'Ca' AND Name < 'Cf') OR (Country > 'E' AND Country < 'H'))
diff --git a/mysql-test/r/subselect_sj2_jcl6,innodb_plugin.rdiff b/mysql-test/r/subselect_sj2_jcl6,innodb_plugin.rdiff
deleted file mode 100644
index c32f52fe0c2..00000000000
--- a/mysql-test/r/subselect_sj2_jcl6,innodb_plugin.rdiff
+++ /dev/null
@@ -1,11 +0,0 @@
---- r/subselect_sj2_jcl6.result	2012-04-07 12:45:03.000000000 +0200
-+++ r/subselect_sj2_jcl6,innodb_plugin.reject	2012-04-07 13:10:38.000000000 +0200
-@@ -1009,7 +1009,7 @@
- 1	PRIMARY	t2	ALL	a	NULL	NULL	NULL	38	
- 1	PRIMARY	<subquery2>	eq_ref	distinct_key	distinct_key	8	func,func	1	
- 2	MATERIALIZED	alias1	ALL	a	NULL	NULL	NULL	19	Using where
--2	MATERIALIZED	alias2	ref	a	a	4	test.alias1.a	1	Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
-+2	MATERIALIZED	alias2	ref	a	a	4	test.alias1.a	1	Using where
- SELECT * FROM t2 
- WHERE (a, a) IN (SELECT alias2.b, alias2.a FROM t1 AS alias1, t1 AS alias2 
- WHERE 
diff --git a/mysql-test/suite/funcs_1/r/is_columns_is.result b/mysql-test/suite/funcs_1/r/is_columns_is.result
index 0fb2b8988df..6d79be772cb 100644
--- a/mysql-test/suite/funcs_1/r/is_columns_is.result
+++ b/mysql-test/suite/funcs_1/r/is_columns_is.result
@@ -429,7 +429,20 @@ def	information_schema	VIEWS	TABLE_CATALOG	1		NO	varchar	512	1536	NULL	NULL	NULL
 def	information_schema	VIEWS	TABLE_NAME	3		NO	varchar	64	192	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(64)			select	
 def	information_schema	VIEWS	TABLE_SCHEMA	2		NO	varchar	64	192	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(64)			select	
 def	information_schema	VIEWS	VIEW_DEFINITION	4	NULL	NO	longtext	4294967295	4294967295	NULL	NULL	NULL	utf8	utf8_general_ci	longtext			select	
-def	information_schema	XTRADB_ADMIN_COMMAND	result_message	1		NO	varchar	1024	3072	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(1024)			select	
+def	information_schema	XTRADB_INTERNAL_HASH_TABLES	CONSTANT_MEMORY	3	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_INTERNAL_HASH_TABLES	INTERNAL_HASH_TABLE_NAME	1		NO	varchar	100	300	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(100)			select	
+def	information_schema	XTRADB_INTERNAL_HASH_TABLES	TOTAL_MEMORY	2	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_INTERNAL_HASH_TABLES	VARIABLE_MEMORY	4	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_READ_VIEW	READ_VIEW_LOW_LIMIT_TRX_ID	4		NO	varchar	18	54	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(18)			select	
+def	information_schema	XTRADB_READ_VIEW	READ_VIEW_LOW_LIMIT_TRX_NUMBER	2		NO	varchar	18	54	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(18)			select	
+def	information_schema	XTRADB_READ_VIEW	READ_VIEW_UNDO_NUMBER	1	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_READ_VIEW	READ_VIEW_UPPER_LIMIT_TRX_ID	3		NO	varchar	18	54	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(18)			select	
+def	information_schema	XTRADB_RSEG	curr_size	6	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_RSEG	max_size	5	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_RSEG	page_no	4	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_RSEG	rseg_id	1	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_RSEG	space_id	2	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
+def	information_schema	XTRADB_RSEG	zip_size	3	0	NO	bigint	NULL	NULL	20	0	NULL	NULL	NULL	bigint(21) unsigned			select	
 ##########################################################################
 # Show the quotient of CHARACTER_OCTET_LENGTH and CHARACTER_MAXIMUM_LENGTH
 ##########################################################################
@@ -917,4 +930,17 @@ NULL	information_schema	USER_STATISTICS	EMPTY_QUERIES	bigint	NULL	NULL	NULL	NULL
 3.0000	information_schema	VIEWS	SECURITY_TYPE	varchar	7	21	utf8	utf8_general_ci	varchar(7)
 3.0000	information_schema	VIEWS	CHARACTER_SET_CLIENT	varchar	32	96	utf8	utf8_general_ci	varchar(32)
 3.0000	information_schema	VIEWS	COLLATION_CONNECTION	varchar	32	96	utf8	utf8_general_ci	varchar(32)
-3.0000	information_schema	XTRADB_ADMIN_COMMAND	result_message	varchar	1024	3072	utf8	utf8_general_ci	varchar(1024)
+3.0000	information_schema	XTRADB_INTERNAL_HASH_TABLES	INTERNAL_HASH_TABLE_NAME	varchar	100	300	utf8	utf8_general_ci	varchar(100)
+NULL	information_schema	XTRADB_INTERNAL_HASH_TABLES	TOTAL_MEMORY	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_INTERNAL_HASH_TABLES	CONSTANT_MEMORY	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_INTERNAL_HASH_TABLES	VARIABLE_MEMORY	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_READ_VIEW	READ_VIEW_UNDO_NUMBER	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+3.0000	information_schema	XTRADB_READ_VIEW	READ_VIEW_LOW_LIMIT_TRX_NUMBER	varchar	18	54	utf8	utf8_general_ci	varchar(18)
+3.0000	information_schema	XTRADB_READ_VIEW	READ_VIEW_UPPER_LIMIT_TRX_ID	varchar	18	54	utf8	utf8_general_ci	varchar(18)
+3.0000	information_schema	XTRADB_READ_VIEW	READ_VIEW_LOW_LIMIT_TRX_ID	varchar	18	54	utf8	utf8_general_ci	varchar(18)
+NULL	information_schema	XTRADB_RSEG	rseg_id	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_RSEG	space_id	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_RSEG	zip_size	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_RSEG	page_no	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_RSEG	max_size	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
+NULL	information_schema	XTRADB_RSEG	curr_size	bigint	NULL	NULL	NULL	NULL	bigint(21) unsigned
diff --git a/mysql-test/suite/funcs_1/r/is_columns_is_embedded.result b/mysql-test/suite/funcs_1/r/is_columns_is_embedded.result
index 76b7571fb7d..f46f1c0ce62 100644
--- a/mysql-test/suite/funcs_1/r/is_columns_is_embedded.result
+++ b/mysql-test/suite/funcs_1/r/is_columns_is_embedded.result
@@ -429,7 +429,6 @@ def	information_schema	VIEWS	TABLE_CATALOG	1		NO	varchar	512	1536	NULL	NULL	NULL
 def	information_schema	VIEWS	TABLE_NAME	3		NO	varchar	64	192	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(64)				
 def	information_schema	VIEWS	TABLE_SCHEMA	2		NO	varchar	64	192	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(64)				
 def	information_schema	VIEWS	VIEW_DEFINITION	4	NULL	NO	longtext	4294967295	4294967295	NULL	NULL	NULL	utf8	utf8_general_ci	longtext				
-def	information_schema	XTRADB_ADMIN_COMMAND	result_message	1		NO	varchar	1024	3072	NULL	NULL	NULL	utf8	utf8_general_ci	varchar(1024)				
 ##########################################################################
 # Show the quotient of CHARACTER_OCTET_LENGTH and CHARACTER_MAXIMUM_LENGTH
 ##########################################################################
@@ -917,4 +916,3 @@ NULL	information_schema	USER_STATISTICS	EMPTY_QUERIES	bigint	NULL	NULL	NULL	NULL
 3.0000	information_schema	VIEWS	SECURITY_TYPE	varchar	7	21	utf8	utf8_general_ci	varchar(7)
 3.0000	information_schema	VIEWS	CHARACTER_SET_CLIENT	varchar	32	96	utf8	utf8_general_ci	varchar(32)
 3.0000	information_schema	VIEWS	COLLATION_CONNECTION	varchar	32	96	utf8	utf8_general_ci	varchar(32)
-3.0000	information_schema	XTRADB_ADMIN_COMMAND	result_message	varchar	1024	3072	utf8	utf8_general_ci	varchar(1024)
diff --git a/mysql-test/suite/funcs_1/t/is_engines_innodb.test b/mysql-test/suite/funcs_1/t/is_engines_innodb.test
index 44c7d7652d7..b78cb647514 100644
--- a/mysql-test/suite/funcs_1/t/is_engines_innodb.test
+++ b/mysql-test/suite/funcs_1/t/is_engines_innodb.test
@@ -11,6 +11,6 @@
 let $engine_type= InnoDB;
 --source include/have_innodb.inc
 --vertical_results
---replace_regex /XtraDB engine based on InnoDB plugin. //
+--replace_regex /Percona-XtraDB, //
 eval SELECT * FROM information_schema.engines
 WHERE ENGINE = '$engine_type';
diff --git a/mysql-test/suite/innodb/r/innodb-autoinc-44030.result b/mysql-test/suite/innodb/r/innodb-autoinc-44030.result
index 93e6ede30f2..cf3ca93db27 100644
--- a/mysql-test/suite/innodb/r/innodb-autoinc-44030.result
+++ b/mysql-test/suite/innodb/r/innodb-autoinc-44030.result
@@ -1,4 +1,3 @@
-drop table if exists t1;
 SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
 CREATE TABLE t1 (c1 INT PRIMARY KEY AUTO_INCREMENT) ENGINE=InnoDB;
 INSERT INTO t1 VALUES (null);
@@ -14,14 +13,15 @@ d1
 2
 INSERT INTO t1 VALUES(null);
 ALTER TABLE t1 AUTO_INCREMENT = 3;
+affected rows: 0
+info: Records: 0  Duplicates: 0  Warnings: 0
 SHOW CREATE TABLE t1;
 Table	Create Table
 t1	CREATE TABLE `t1` (
   `d1` int(11) NOT NULL AUTO_INCREMENT,
   PRIMARY KEY (`d1`)
-) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1
+) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=latin1
 INSERT INTO t1 VALUES(null);
-ERROR 23000: Duplicate entry '3' for key 'PRIMARY'
 INSERT INTO t1 VALUES(null);
 SELECT * FROM t1;
 d1
@@ -29,4 +29,5 @@ d1
 2
 3
 4
+5
 DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/r/innodb-index.result b/mysql-test/suite/innodb/r/innodb-index.result
index d3c12ee2828..996bc45f7e2 100644
--- a/mysql-test/suite/innodb/r/innodb-index.result
+++ b/mysql-test/suite/innodb/r/innodb-index.result
@@ -967,12 +967,8 @@ ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
 FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2);
 ERROR HY000: Failed to add the foreign key constaint. Missing index for constraint 'fk_t2_ca' in the referenced table 't1'
 ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
-FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1);
-affected rows: 0
-info: Records: 0  Duplicates: 0  Warnings: 0
-ALTER TABLE t2 DROP FOREIGN KEY fk_t2_ca;
-affected rows: 0
-info: Records: 0  Duplicates: 0  Warnings: 0
+FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1), ALGORITHM=INPLACE;
+ERROR HY000: Failed to add the foreign key constraint on table 't2'. Incorrect options in FOREIGN KEY constraint 'test/fk_t2_ca'
 ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
 FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1), ALGORITHM=COPY;
 ERROR HY000: Can't create table `test`.`#sql-temporary` (errno: 150 "Foreign key constraint is incorrectly formed")
diff --git a/mysql-test/suite/innodb/t/innodb-autoinc-44030.test b/mysql-test/suite/innodb/t/innodb-autoinc-44030.test
index fd90d5d92de..c3754b47ba5 100644
--- a/mysql-test/suite/innodb/t/innodb-autoinc-44030.test
+++ b/mysql-test/suite/innodb/t/innodb-autoinc-44030.test
@@ -1,10 +1,11 @@
 -- source include/have_innodb.inc
-# embedded server ignores 'delayed', so skip this
+# embedded server does not support restarting
 -- source include/not_embedded.inc
 
---disable_warnings
-drop table if exists t1;
---enable_warnings
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
 
 #
 # 44030: Error: (1500) Couldn't read the MAX(ID) autoinc value from
@@ -31,9 +32,12 @@ INSERT INTO t1 VALUES(null);
 # and effectively set AUTO_INCREMENT to 4, because while copying
 # it would write values 1,2,3 to the column.
 # WL#5534 makes this an in-place ALTER, setting AUTO_INCREMENT=3 for real.
+# However, to keep compatibility with ALGORITHM=COPY MySQL 5.6.11 will
+# go back to the original behaviour, setting AUTO_INCREMENT to 4.
+--enable_info
 ALTER TABLE t1 AUTO_INCREMENT = 3;
+--disable_info
 SHOW CREATE TABLE t1;
--- error ER_DUP_ENTRY
 INSERT INTO t1 VALUES(null);
 INSERT INTO t1 VALUES(null);
 SELECT * FROM t1;
diff --git a/mysql-test/suite/innodb/t/innodb-index.test b/mysql-test/suite/innodb/t/innodb-index.test
index ce2d69b7bba..01592ae9bb8 100644
--- a/mysql-test/suite/innodb/t/innodb-index.test
+++ b/mysql-test/suite/innodb/t/innodb-index.test
@@ -1,5 +1,10 @@
 -- source include/have_innodb.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 let $innodb_file_format_orig=`select @@innodb_file_format`;
 let $innodb_file_format_max_orig=`select @@innodb_file_format_max`;
 
@@ -445,10 +450,9 @@ ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
 ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
  FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2);
 
-# FIXME (WL#6251 problem): this should fail, like the ALGORITHM=COPY below
+--error ER_FK_INCORRECT_OPTION
 ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
- FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1);
-ALTER TABLE t2 DROP FOREIGN KEY fk_t2_ca;
+ FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1), ALGORITHM=INPLACE;
 
 # mysqltest first does replace_regex, then replace_result
 --replace_regex /#sql-[0-9a-f_]*`/#sql-temporary`/
diff --git a/mysql-test/suite/innodb_fts/r/fulltext.result b/mysql-test/suite/innodb_fts/r/fulltext.result
index 7e965c9ca44..f3c913110d2 100644
--- a/mysql-test/suite/innodb_fts/r/fulltext.result
+++ b/mysql-test/suite/innodb_fts/r/fulltext.result
@@ -120,8 +120,8 @@ Function MATCH ... AGAINST()	is used to do a search
 Full-text search in MySQL	implements vector space model
 select * from t1 where MATCH(a,b) AGAINST("+search +(support vector)" IN BOOLEAN MODE);
 a	b
-MySQL has now support	for full-text search
 Full-text search in MySQL	implements vector space model
+MySQL has now support	for full-text search
 select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN MODE);
 a	b
 Function MATCH ... AGAINST()	is used to do a search
@@ -164,6 +164,7 @@ select * from t1 where MATCH a,b AGAINST ('"text i"' IN BOOLEAN MODE);
 a	b
 select * from t1 where MATCH a,b AGAINST ('"xt indexes"' IN BOOLEAN MODE);
 a	b
+Full-text indexes	are called collections
 select * from t1 where MATCH a,b AGAINST ('+(support collections) +foobar*' IN BOOLEAN MODE);
 a	b
 select * from t1 where MATCH a,b AGAINST ('+(+(support collections)) +foobar*' IN BOOLEAN MODE);
@@ -201,6 +202,8 @@ a
 aaa10 bbb20
 select * from t1 where match a against ("+(+aaa* +bbb1*)" in boolean mode);
 a
+aaa20 bbb15
+aaa30 bbb10
 select * from t1 where match a against ("(+aaa* +bbb1*)" in boolean mode);
 a
 aaa20 bbb15
@@ -395,6 +398,7 @@ a
 testword''
 SELECT a FROM t1 WHERE MATCH a AGAINST('testword\'\'' IN BOOLEAN MODE);
 a
+testword''
 INSERT INTO t1 VALUES('test\'s');
 SELECT a FROM t1 WHERE MATCH a AGAINST('test' IN BOOLEAN MODE);
 a
@@ -440,7 +444,6 @@ CREATE TABLE t1(a VARCHAR(20), FULLTEXT(a)) ENGINE = InnoDB;
 INSERT INTO t1 VALUES('Offside'),('City Of God');
 SELECT a FROM t1 WHERE MATCH a AGAINST ('+city of*' IN BOOLEAN MODE);
 a
-Offside
 City Of God
 SELECT a FROM t1 WHERE MATCH a AGAINST ('+city (of*)' IN BOOLEAN MODE);
 a
@@ -448,7 +451,6 @@ Offside
 City Of God
 SELECT a FROM t1 WHERE MATCH a AGAINST ('+city* of*' IN BOOLEAN MODE);
 a
-Offside
 City Of God
 DROP TABLE t1;
 create table t1(a text,b date,fulltext index(a)) ENGINE = InnoDB;
diff --git a/mysql-test/suite/innodb_fts/r/fulltext2.result b/mysql-test/suite/innodb_fts/r/fulltext2.result
index 45a9618b100..2aa7d2a6754 100644
--- a/mysql-test/suite/innodb_fts/r/fulltext2.result
+++ b/mysql-test/suite/innodb_fts/r/fulltext2.result
@@ -239,5 +239,6 @@ CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE = INNODB DEFAULT CHARSET=utf
 INSERT INTO t1 VALUES('„MySQL“');
 SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
 a
+„MySQL“
 DROP TABLE t1;
 SET NAMES latin1;
diff --git a/mysql-test/suite/innodb_fts/r/fulltext_var.result b/mysql-test/suite/innodb_fts/r/fulltext_var.result
index 4e4ae3a8380..9fe586210c8 100644
--- a/mysql-test/suite/innodb_fts/r/fulltext_var.result
+++ b/mysql-test/suite/innodb_fts/r/fulltext_var.result
@@ -13,7 +13,6 @@ insert t1 values ('aaaaaa cccccc');
 select * from t1 where match b against ('+aaaaaa bbbbbb' in boolean mode);
 b
 aaaaaa bbbbbb cccccc
-bbbbbb cccccc
 aaaaaa cccccc
 set ft_boolean_syntax=' +-><()~*:""&|';
 ERROR HY000: Variable 'ft_boolean_syntax' is a GLOBAL variable and should be set with SET GLOBAL
@@ -21,16 +20,14 @@ set global ft_boolean_syntax=' +-><()~*:""&|';
 select * from t1 where match b against ('+aaaaaa bbbbbb' in boolean mode);
 b
 aaaaaa bbbbbb cccccc
-bbbbbb cccccc
 aaaaaa cccccc
 set global ft_boolean_syntax='@ -><()~*:""&|';
 select * from t1 where match b against ('+aaaaaa bbbbbb' in boolean mode);
 b
 aaaaaa bbbbbb cccccc
-bbbbbb cccccc
 aaaaaa cccccc
 select * from t1 where match b against ('+aaaaaa @bbbbbb' in boolean mode);
-b
+ERROR 42000: syntax error, unexpected '@', expecting $end
 set global ft_boolean_syntax='@ -><()~*:""@|';
 ERROR 42000: Variable 'ft_boolean_syntax' can't be set to the value of '@ -><()~*:""@|'
 set global ft_boolean_syntax='+ -><()~*:""@!|';
diff --git a/mysql-test/suite/innodb_fts/r/innodb-fts-basic.result b/mysql-test/suite/innodb_fts/r/innodb-fts-basic.result
index 9cfe3119739..fe767476fe6 100644
--- a/mysql-test/suite/innodb_fts/r/innodb-fts-basic.result
+++ b/mysql-test/suite/innodb_fts/r/innodb-fts-basic.result
@@ -134,12 +134,33 @@ SELECT * FROM articles WHERE MATCH (title,body)
 AGAINST ('YourSQL + (+MySQL - (Tricks Security))' IN BOOLEAN MODE);
 id	title	body
 5	MySQL vs. YourSQL	In the following database comparison ...
+1	MySQL Tutorial	DBMS stands for DataBase ...
+2	How To Use MySQL Well	After you went through a ...
+3	Optimizing MySQL	In this tutorial we will show ...
 SELECT * FROM articles WHERE MATCH (title,body)
 AGAINST ('(+MySQL - (Tricks Security)) - YourSQL' IN BOOLEAN MODE);
 id	title	body
 1	MySQL Tutorial	DBMS stands for DataBase ...
 2	How To Use MySQL Well	After you went through a ...
 3	Optimizing MySQL	In this tutorial we will show ...
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('mysql - Security&DBMS' IN BOOLEAN MODE);
+id	title	body
+2	How To Use MySQL Well	After you went through a ...
+3	Optimizing MySQL	In this tutorial we will show ...
+4	1001 MySQL Tricks	1. Never run mysqld as root. 2. ...
+5	MySQL vs. YourSQL	In the following database comparison ...
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('mysql - (Security DBMS)' IN BOOLEAN MODE);
+id	title	body
+2	How To Use MySQL Well	After you went through a ...
+3	Optimizing MySQL	In this tutorial we will show ...
+4	1001 MySQL Tricks	1. Never run mysqld as root. 2. ...
+5	MySQL vs. YourSQL	In the following database comparison ...
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST (' - Security&DBMS + YourSQL' IN BOOLEAN MODE);
+id	title	body
+5	MySQL vs. YourSQL	In the following database comparison ...
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('+YourSQL - Security&DBMS' IN BOOLEAN MODE);
+id	title	body
+5	MySQL vs. YourSQL	In the following database comparison ...
 SELECT COUNT(*) FROM articles
 WHERE MATCH (title,body)
 AGAINST ('database' WITH QUERY EXPANSION);
diff --git a/mysql-test/suite/innodb_fts/r/innodb_fts_misc.result b/mysql-test/suite/innodb_fts/r/innodb_fts_misc.result
index 2a14adc00d5..3f22e79a384 100644
--- a/mysql-test/suite/innodb_fts/r/innodb_fts_misc.result
+++ b/mysql-test/suite/innodb_fts/r/innodb_fts_misc.result
@@ -476,6 +476,7 @@ insert into t50 (s2) values ('FGHIJ'),('KLMNO'),('VÐƷWİ'),('ABCD*');
 select * from t50 where match(s2) against ('abcd*' in natural language
 mode);
 id	s2
+4	ABCD*
 select * from t50 where match(s2) against ('abcd*' in boolean mode);
 id	s2
 4	ABCD*
@@ -659,16 +660,16 @@ Warnings:
 Warning	124	InnoDB rebuilding table to add column FTS_DOC_ID
 INSERT INTO t1 VALUES (1,'ペペペ'),(2,'テテテ'),(3,'ルルル'),(4,'グググ');
 DROP TABLE t1;
-"----------Test15a--------"
-CREATE TABLE t1 (s1 VARCHAR (60) CHARACTER SET UTF8 COLLATE UTF8_UNICODE_CI) ENGINE = MyISAM;
+"----------Test15---------"
+CREATE TABLE t1 (s1 VARCHAR (60) CHARACTER SET UTF8 COLLATE UTF8_UNICODE_520_CI) ENGINE = MyISAM;
 CREATE FULLTEXT INDEX i ON t1 (s1);
 INSERT INTO t1 VALUES
-('a'),('b'),('c'),('d'),('ÓÓÓÓ'),('OOOO'),(NULL),('ÓÓÓÓ ÓÓÓÓ'),('OOOOOOOO');
-SELECT * FROM t1 WHERE MATCH(s1) AGAINST ('OOOO' COLLATE UTF8_UNICODE_CI);
+('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
+SELECT * FROM t1 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
 s1
-ÓÓÓÓ
-OOOO
-ÓÓÓÓ ÓÓÓÓ
+ŁŁŁŁ
+LLLL
+ŁŁŁŁ ŁŁŁŁ
 DROP TABLE if EXISTS t2;
 Warnings:
 Note	1051	Unknown table 'test.t2'
@@ -677,10 +678,10 @@ CREATE FULLTEXT INDEX i ON t2 ( s1);
 Warnings:
 Warning	124	InnoDB rebuilding table to add column FTS_DOC_ID
 INSERT INTO t2 VALUES
-('a'),('b'),('c'),('d'),('ÓÓÓÓ'),('OOOO'),(NULL),('ÓÓÓÓ ÓÓÓÓ'),('OOOOOOOO');
-SELECT * FROM t2 WHERE MATCH(s1) AGAINST ('OOOO' COLLATE UTF8_UNICODE_CI);
+('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
+SELECT * FROM t2 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
 s1
-OOOO
+LLLL
 DROP TABLE t1,t2;
 "----------Test16---------"
 CREATE TABLE t1 (s1 INT, s2 VARCHAR(50) CHARACTER SET UTF8) ENGINE = InnoDB;
@@ -1237,3 +1238,192 @@ DROP TABLE `A B`;
 CREATE TABLE `t-26`(a VARCHAR(10),FULLTEXT KEY(a)) ENGINE=INNODB;
 INSERT INTO `t-26` VALUES('117');
 DROP TABLE `t-26`;
+CREATE TABLE `t1` (
+`id` INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
+`content` TEXT NOT NULL,
+PRIMARY KEY (`id`),
+FULLTEXT INDEX `IDX_CONTEXT_FULLTEXT`(`content`)
+)
+ENGINE = InnoDB;
+insert into t1 (content)
+values
+('This is a story which has has a complicated phrase structure here in the
+middle'),
+('This is a story which doesn''t have that text'),
+('This is a story that has complicated the phrase structure');
+select * from t1
+where match(content) against('"complicated phrase structure"' in boolean
+mode);
+id	content
+1	This is a story which has has a complicated phrase structure here in the
+middle
+select * from t1
+where match(content) against('+"complicated phrase structure"' in boolean
+mode);
+id	content
+1	This is a story which has has a complicated phrase structure here in the
+middle
+select * from t1
+where match(content) against('"complicated the phrase structure"' in boolean
+mode);
+id	content
+3	This is a story that has complicated the phrase structure
+select * from t1 where match(content) against('+"this is a story which" +"complicated the phrase structure"' in boolean mode);
+id	content
+select * from t1 where match(content) against('"the complicated the phrase structure"' in boolean mode);
+id	content
+3	This is a story that has complicated the phrase structure
+select * from t1 where match(content) against('"complicated a phrase structure"' in boolean mode);
+id	content
+DROP TABLE t1;
+CREATE TABLE my (id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+c VARCHAR(32), FULLTEXT(c)) ENGINE = INNODB;
+INSERT INTO my (c) VALUES ('green-iguana');
+SELECT * FROM my WHERE MATCH(c) AGAINST ('green-iguana');
+id	c
+1	green-iguana
+DROP TABLE my;
+CREATE TABLE ift (
+`a` int(11) NOT NULL,
+`b` text,
+PRIMARY KEY (`a`),
+FULLTEXT KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+INSERT INTO ift values (1, "skip");
+INSERT INTO ift values (2, "skip and networking");
+INSERT INTO ift values (3, "--skip-networking");
+INSERT INTO ift values (4, "-donot--skip-networking");
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('--skip-networking');
+a	b
+2	skip and networking
+3	--skip-networking
+4	-donot--skip-networking
+1	skip
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('skip-networking');
+a	b
+2	skip and networking
+3	--skip-networking
+4	-donot--skip-networking
+1	skip
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('----');
+a	b
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('-donot--skip-networking');
+a	b
+4	-donot--skip-networking
+2	skip and networking
+3	--skip-networking
+1	skip
+DROP TABLE ift;
+CREATE TABLE articles (
+id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+title VARCHAR(200),
+body TEXT,
+FULLTEXT (title,body)
+) ENGINE=InnoDB;
+INSERT INTO articles (title,body) VALUES
+('MySQL Tutorial','DBMS stands for DataBase ...')  ,
+('How To Use MySQL Well','After you went through a ...'),
+('Optimizing MySQL','In this tutorial we will show ...'),
+('1001 MySQL Tricks','1. Never run mysqld as root. 2. ...'),
+('MySQL vs. YourSQL','In the following database comparison ...'),
+('( that''s me )','When configured properly, MySQL ...');
+SELECT * FROM articles WHERE MATCH (title,body)
+AGAINST ('( yours''s* )' IN BOOLEAN MODE);
+id	title	body
+5	MySQL vs. YourSQL	In the following database comparison ...
+SELECT * FROM articles WHERE MATCH (title,body)
+AGAINST ('s*' IN BOOLEAN MODE);
+id	title	body
+1	MySQL Tutorial	DBMS stands for DataBase ...
+3	Optimizing MySQL	In this tutorial we will show ...
+SELECT * FROM articles WHERE MATCH (title,body)
+AGAINST ('stands\'] | * | show[@database' IN NATURAL LANGUAGE MODE);
+id	title	body
+1	MySQL Tutorial	DBMS stands for DataBase ...
+3	Optimizing MySQL	In this tutorial we will show ...
+5	MySQL vs. YourSQL	In the following database comparison ...
+DROP TABLE articles;
+CREATE TABLE t1(a TEXT CHARACTER SET LATIN1, FULLTEXT INDEX(a)) ENGINE=INNODB;
+SELECT * FROM t1 WHERE MATCH(a) AGAINST("*");
+ERROR 42000: syntax error, unexpected $end, expecting FTS_TERM or FTS_NUMB or '*'
+DROP TABLE t1;
+CREATE TABLE t1 (
+id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+a VARCHAR(200),
+FULLTEXT (a)
+) ENGINE= InnoDB;
+INSERT INTO t1 (a) VALUES
+('Do you know MySQL is a good database'),
+('How to build a good database'),
+('Do you know'),
+('Do you know MySQL'),
+('How to use MySQL'),
+('Do you feel good'),
+('MySQL is good'),
+('MySQL is good to know'),
+('What is database');
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know mysql"' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql")' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('("know mysql" good)' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+2	How to build a good database
+6	Do you feel good
+7	MySQL is good
+8	MySQL is good to know
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql" good)' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+2	How to build a good database
+6	Do you feel good
+7	MySQL is good
+8	MySQL is good to know
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('(good "know mysql")' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+2	How to build a good database
+6	Do you feel good
+7	MySQL is good
+8	MySQL is good to know
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+(good "know mysql")' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+4	Do you know MySQL
+2	How to build a good database
+6	Do you feel good
+7	MySQL is good
+8	MySQL is good to know
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql" "good database")' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+2	How to build a good database
+4	Do you know MySQL
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know mysql" +"good database"' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know database"@4' IN BOOLEAN MODE);
+id	a
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know database"@8' IN BOOLEAN MODE);
+id	a
+1	Do you know MySQL is a good database
+DROP TABLE t1;
+CREATE TABLE t1 (
+id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+a VARCHAR(200),
+FULLTEXT (a)
+) ENGINE= InnoDB;
+INSERT INTO t1 (a) VALUES
+('know mysql good database');
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"good database"' IN BOOLEAN MODE);
+id	a
+1	know mysql good database
+DROP TABLE t1;
diff --git a/mysql-test/suite/innodb_fts/r/innodb_fts_misc_1.result b/mysql-test/suite/innodb_fts/r/innodb_fts_misc_1.result
index 5b2255e2b03..0f50d6943a8 100644
--- a/mysql-test/suite/innodb_fts/r/innodb_fts_misc_1.result
+++ b/mysql-test/suite/innodb_fts/r/innodb_fts_misc_1.result
@@ -485,7 +485,7 @@ select * from t1 where MATCH(a,b) AGAINST("+tutorial +VÐƷWİ" IN BOOLEAN MODE)
 id	a	b
 1	MySQL Tutorial	DBMS stands for DataBase VÐƷWİ...
 select * from t1 where MATCH(a,b) AGAINST("+-VÐƷWİ" IN BOOLEAN MODE);
-id	a	b
+ERROR 42000: syntax error, unexpected '-'
 select * from t1 where MATCH(a,b) AGAINST("+Mysql +(tricks never)" IN BOOLEAN MODE);
 id	a	b
 4	1001 MySQL Tricks	1. Never run mysqld as root. 2. ...
@@ -669,11 +669,13 @@ SELECT * FROM t1 WHERE MATCH(a,b) AGAINST("+tutorial +(Мога τίποτα)" I
 id	a	b
 SELECT * FROM t1 WHERE MATCH(a,b) AGAINST ("あさきゆめみじ　ゑひもせず");
 id	a	b
+7	いろはにほへど　ちりぬる	あさきゆめみじ　ゑひもせず
 SELECT * FROM t1 WHERE MATCH(a,b) AGAINST ("ちりぬる" WITH QUERY EXPANSION);
 id	a	b
 7	いろはにほへど　ちりぬる	あさきゆめみじ　ゑひもせず
 SELECT * FROM t1 WHERE MATCH(a,b) AGAINST ("+あさきゆめみじ　+ゑひもせず" IN BOOLEAN MODE);
 id	a	b
+7	いろはにほへど　ちりぬる	あさきゆめみじ　ゑひもせず
 SELECT * FROM t1 WHERE MATCH(a,b) AGAINST("うゐのおく*" IN BOOLEAN MODE);
 id	a	b
 6	うゐのおくやま	けふこえて
diff --git a/mysql-test/suite/innodb_fts/r/innodb_fts_proximity.result b/mysql-test/suite/innodb_fts/r/innodb_fts_proximity.result
index 3336af3a092..a61ff47c9a1 100644
--- a/mysql-test/suite/innodb_fts/r/innodb_fts_proximity.result
+++ b/mysql-test/suite/innodb_fts/r/innodb_fts_proximity.result
@@ -51,6 +51,7 @@ SELECT * FROM t1
 WHERE MATCH (a,b)
 AGAINST ('"request docteam@oraclehelp.com"@10' IN BOOLEAN MODE);
 id	a	b
+4	MySQL Tutorial	request docteam@oraclehelp.com ...
 SELECT * FROM t1
 WHERE MATCH (a,b)
 AGAINST ('"1255 minute"@1' IN BOOLEAN MODE);
diff --git a/mysql-test/suite/innodb_fts/t/fulltext.test b/mysql-test/suite/innodb_fts/t/fulltext.test
index db8b1bb29eb..d75a650ca4d 100644
--- a/mysql-test/suite/innodb_fts/t/fulltext.test
+++ b/mysql-test/suite/innodb_fts/t/fulltext.test
@@ -4,6 +4,11 @@
 
 --source include/have_innodb.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --disable_warnings
 drop table if exists t1,t2,t3;
 --enable_warnings
diff --git a/mysql-test/suite/innodb_fts/t/fulltext2.test b/mysql-test/suite/innodb_fts/t/fulltext2.test
index b9da4e334ac..33b6a7ac88e 100644
--- a/mysql-test/suite/innodb_fts/t/fulltext2.test
+++ b/mysql-test/suite/innodb_fts/t/fulltext2.test
@@ -11,6 +11,11 @@
 DROP TABLE IF EXISTS t1;
 --enable_warnings
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 CREATE TABLE t1 (
   i int(10) unsigned not null auto_increment primary key,
   a varchar(255) not null,
diff --git a/mysql-test/suite/innodb_fts/t/fulltext_var.test b/mysql-test/suite/innodb_fts/t/fulltext_var.test
index 27b5e8c3130..03eab7e8557 100644
--- a/mysql-test/suite/innodb_fts/t/fulltext_var.test
+++ b/mysql-test/suite/innodb_fts/t/fulltext_var.test
@@ -7,6 +7,10 @@
 drop table if exists t1;
 --enable_warnings
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
 
 # Save ft_boolean_syntax variable
 let $saved_ft_boolean_syntax=`select @@global.ft_boolean_syntax`;
@@ -27,6 +31,8 @@ set global ft_boolean_syntax=' +-><()~*:""&|';
 select * from t1 where match b against ('+aaaaaa bbbbbb' in boolean mode);
 set global ft_boolean_syntax='@ -><()~*:""&|';
 select * from t1 where match b against ('+aaaaaa bbbbbb' in boolean mode);
+
+--error ER_PARSE_ERROR
 select * from t1 where match b against ('+aaaaaa @bbbbbb' in boolean mode);
 -- error 1231
 set global ft_boolean_syntax='@ -><()~*:""@|';
diff --git a/mysql-test/suite/innodb_fts/t/innodb-fts-basic.test b/mysql-test/suite/innodb_fts/t/innodb-fts-basic.test
index 43e5912e61c..095713130f1 100644
--- a/mysql-test/suite/innodb_fts/t/innodb-fts-basic.test
+++ b/mysql-test/suite/innodb_fts/t/innodb-fts-basic.test
@@ -2,6 +2,11 @@
 
 -- source include/have_innodb.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 # Create FTS table
 CREATE TABLE articles (
 	id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
@@ -113,6 +118,16 @@ SELECT * FROM articles WHERE MATCH (title,body)
 SELECT * FROM articles WHERE MATCH (title,body)
 	AGAINST ('(+MySQL - (Tricks Security)) - YourSQL' IN BOOLEAN MODE);
 
+# Test non-word delimiter combined with negate "-" operator
+# This should return the same result as 'mysql - (Security DBMS)'
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('mysql - Security&DBMS' IN BOOLEAN MODE);
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('mysql - (Security DBMS)' IN BOOLEAN MODE);
+
+# Again, the operator sequence should not matter
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST (' - Security&DBMS + YourSQL' IN BOOLEAN MODE);
+
+SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('+YourSQL - Security&DBMS' IN BOOLEAN MODE);
+
 # Test query expansion
 SELECT COUNT(*) FROM articles
         WHERE MATCH (title,body)
diff --git a/mysql-test/suite/innodb_fts/t/innodb_fts_misc.test b/mysql-test/suite/innodb_fts/t/innodb_fts_misc.test
index 803895d20fc..934d52f764f 100644
--- a/mysql-test/suite/innodb_fts/t/innodb_fts_misc.test
+++ b/mysql-test/suite/innodb_fts/t/innodb_fts_misc.test
@@ -9,6 +9,11 @@ let collation=UTF8_UNICODE_CI;
 drop table if exists t1;
 --enable_warnings
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 # Create FTS table
 CREATE TABLE t1 (
         id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
@@ -600,35 +605,18 @@ INSERT INTO t1 VALUES (1,'ペペペ'),(2,'テテテ'),(3,'ルルル'),(4,'ググ
 DROP TABLE t1;
 
 
-# TODO: uncomment this when utf8_unicode_520_ci is merged
-#--echo "----------Test15---------"
-#CREATE TABLE t1 (s1 VARCHAR (60) CHARACTER SET UTF8 COLLATE UTF8_UNICODE_520_CI) ENGINE = MyISAM;
-#CREATE FULLTEXT INDEX i ON t1 (s1);
-#INSERT INTO t1 VALUES
-#('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
-#SELECT * FROM t1 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
-#DROP TABLE if EXISTS t2;
-#CREATE TABLE t2 (s1 VARCHAR(60) CHARACTER SET UTF8 COLLATE UTF8_POLISH_CI) ENGINE = InnoDB;
-#CREATE FULLTEXT INDEX i ON t2 ( s1);
-#INSERT INTO t2 VALUES
-#('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
-#SELECT * FROM t2 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
-#--disable_warnings
-#DROP TABLE t1,t2;
-#--enable_warnings
-
---echo "----------Test15a--------"
-CREATE TABLE t1 (s1 VARCHAR (60) CHARACTER SET UTF8 COLLATE UTF8_UNICODE_CI) ENGINE = MyISAM;
+--echo "----------Test15---------"
+CREATE TABLE t1 (s1 VARCHAR (60) CHARACTER SET UTF8 COLLATE UTF8_UNICODE_520_CI) ENGINE = MyISAM;
 CREATE FULLTEXT INDEX i ON t1 (s1);
 INSERT INTO t1 VALUES
-('a'),('b'),('c'),('d'),('ÓÓÓÓ'),('OOOO'),(NULL),('ÓÓÓÓ ÓÓÓÓ'),('OOOOOOOO');
-SELECT * FROM t1 WHERE MATCH(s1) AGAINST ('OOOO' COLLATE UTF8_UNICODE_CI);
+('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
+SELECT * FROM t1 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
 DROP TABLE if EXISTS t2;
 CREATE TABLE t2 (s1 VARCHAR(60) CHARACTER SET UTF8 COLLATE UTF8_POLISH_CI) ENGINE = InnoDB;
 CREATE FULLTEXT INDEX i ON t2 ( s1);
 INSERT INTO t2 VALUES
-('a'),('b'),('c'),('d'),('ÓÓÓÓ'),('OOOO'),(NULL),('ÓÓÓÓ ÓÓÓÓ'),('OOOOOOOO');
-SELECT * FROM t2 WHERE MATCH(s1) AGAINST ('OOOO' COLLATE UTF8_UNICODE_CI);
+('a'),('b'),('c'),('d'),('ŁŁŁŁ'),('LLLL'),(NULL),('ŁŁŁŁ ŁŁŁŁ'),('LLLLLLLL');
+SELECT * FROM t2 WHERE MATCH(s1) AGAINST ('LLLL' COLLATE UTF8_UNICODE_520_CI);
 --disable_warnings
 DROP TABLE t1,t2;
 --enable_warnings
@@ -1179,3 +1167,170 @@ DROP TABLE `A B`;
 CREATE TABLE `t-26`(a VARCHAR(10),FULLTEXT KEY(a)) ENGINE=INNODB;
 INSERT INTO `t-26` VALUES('117');
 DROP TABLE `t-26`;
+
+# Test on phrase search with stopwords contained in the search string
+CREATE TABLE `t1` (
+  `id` INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
+  `content` TEXT NOT NULL,
+  PRIMARY KEY (`id`),
+  FULLTEXT INDEX `IDX_CONTEXT_FULLTEXT`(`content`)
+)
+ENGINE = InnoDB;
+
+insert into t1 (content)
+values
+('This is a story which has has a complicated phrase structure here in the
+middle'),
+('This is a story which doesn''t have that text'),
+('This is a story that has complicated the phrase structure');
+
+select * from t1
+where match(content) against('"complicated phrase structure"' in boolean
+mode);
+
+# Test single phrase search with "+" symbol, one row should be returned
+select * from t1
+where match(content) against('+"complicated phrase structure"' in boolean
+mode);
+
+# Test phrase search with stopwords in between, one row should be returned
+select * from t1
+where match(content) against('"complicated the phrase structure"' in boolean
+mode);
+
+# Test phrase search with multiple "+" symbols
+select * from t1 where match(content) against('+"this is a story which" +"complicated the phrase structure"' in boolean mode);
+
+# Test phrase search whose leading word is a stopword; the leading stopword
+# is ignored
+select * from t1 where match(content) against('"the complicated the phrase structure"' in boolean mode);
+
+# Test phrase search with a non-matching stopword in between; no row should be
+# returned
+select * from t1 where match(content) against('"complicated a phrase structure"' in boolean mode);
+
+DROP TABLE t1;
+
+CREATE TABLE my (id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+c VARCHAR(32), FULLTEXT(c)) ENGINE = INNODB;
+
+INSERT INTO my (c) VALUES ('green-iguana');
+
+SELECT * FROM my WHERE MATCH(c) AGAINST ('green-iguana');
+
+DROP TABLE my;
+
+CREATE TABLE ift (
+  `a` int(11) NOT NULL,
+  `b` text,
+  PRIMARY KEY (`a`),
+  FULLTEXT KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+
+INSERT INTO ift values (1, "skip");
+INSERT INTO ift values (2, "skip and networking");
+INSERT INTO ift values (3, "--skip-networking");
+INSERT INTO ift values (4, "-donot--skip-networking");
+
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('--skip-networking');
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('skip-networking');
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('----');
+SELECT * FROM ift WHERE MATCH (b) AGAINST ('-donot--skip-networking');
+
+DROP TABLE ift;
+
+# Test special cases of wildcard search.
+# Create FTS table
+CREATE TABLE articles (
+        id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+        title VARCHAR(200),
+        body TEXT,
+        FULLTEXT (title,body)
+        ) ENGINE=InnoDB;
+
+# Insert six rows
+INSERT INTO articles (title,body) VALUES
+        ('MySQL Tutorial','DBMS stands for DataBase ...')  ,
+        ('How To Use MySQL Well','After you went through a ...'),
+        ('Optimizing MySQL','In this tutorial we will show ...'),
+        ('1001 MySQL Tricks','1. Never run mysqld as root. 2. ...'),
+        ('MySQL vs. YourSQL','In the following database comparison ...'),
+        ('( that''s me )','When configured properly, MySQL ...');
+
+SELECT * FROM articles WHERE MATCH (title,body)
+        AGAINST ('( yours''s* )' IN BOOLEAN MODE);
+
+SELECT * FROM articles WHERE MATCH (title,body)
+	AGAINST ('s*' IN BOOLEAN MODE);
+
+SELECT * FROM articles WHERE MATCH (title,body)
+        AGAINST ('stands\'] | * | show[@database' IN NATURAL LANGUAGE MODE);
+
+DROP TABLE articles;
+
+# Test for BUG#16429688 - FTS: SYNTAX ERROR, UNEXPECTED '*', EXPECTING $END
+CREATE TABLE t1(a TEXT CHARACTER SET LATIN1, FULLTEXT INDEX(a)) ENGINE=INNODB;
+
+--error ER_PARSE_ERROR
+SELECT * FROM t1 WHERE MATCH(a) AGAINST("*");
+
+DROP TABLE t1;
+
+# Test for BUG#16516193 - LITERAL PHRASES CANNOT BE COMBINED WITH + OR - OPERATOR
+# Create FTS table
+CREATE TABLE t1 (
+	id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+	a VARCHAR(200),
+	FULLTEXT (a)
+	) ENGINE= InnoDB;
+
+# Insert rows
+INSERT INTO t1 (a) VALUES
+	('Do you know MySQL is a good database'),
+	('How to build a good database'),
+	('Do you know'),
+	('Do you know MySQL'),
+	('How to use MySQL'),
+	('Do you feel good'),
+	('MySQL is good'),
+	('MySQL is good to know'),
+	('What is database');
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know mysql"' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql")' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('("know mysql" good)' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql" good)' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('(good "know mysql")' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+(good "know mysql")' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+("know mysql" "good database")' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know mysql" +"good database"' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know database"@4' IN BOOLEAN MODE);
+
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"know database"@8' IN BOOLEAN MODE);
+
+# Drop table
+DROP TABLE t1;
+
+# Test for BUG#16885178 - INNODB FULLTEXT PHRASE SEARCH VALGRIND ERROR
+CREATE TABLE t1 (
+	id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+	a VARCHAR(200),
+	FULLTEXT (a)
+	) ENGINE= InnoDB;
+
+# Insert a special row
+INSERT INTO t1 (a) VALUES
+        ('know mysql good database');
+
+# This phrase search triggered a Valgrind error before the fix.
+SELECT * FROM t1 WHERE MATCH (a) AGAINST ('+"good database"' IN BOOLEAN MODE);
+
+DROP TABLE t1;
diff --git a/mysql-test/suite/innodb_fts/t/innodb_fts_misc_1.test b/mysql-test/suite/innodb_fts/t/innodb_fts_misc_1.test
index 80c39a048d7..6ef0452f4c6 100644
--- a/mysql-test/suite/innodb_fts/t/innodb_fts_misc_1.test
+++ b/mysql-test/suite/innodb_fts/t/innodb_fts_misc_1.test
@@ -3,6 +3,11 @@
 #-------------------------------------------------------------------------------
 --source include/have_innodb.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --disable_warnings
 drop table if exists t2,t1;
 --enable_warnings
@@ -475,6 +480,7 @@ SELECT * FROM t1 WHERE MATCH (a,b)
 
 # boolean mode
 select * from t1 where MATCH(a,b) AGAINST("+tutorial +VÐƷWİ" IN BOOLEAN MODE);
+--error ER_PARSE_ERROR
 select * from t1 where MATCH(a,b) AGAINST("+-VÐƷWİ" IN BOOLEAN MODE);
 select * from t1 where MATCH(a,b) AGAINST("+Mysql +(tricks never)" IN BOOLEAN MODE);
 select * from t1 where MATCH(a,b) AGAINST("+mysql -(tricks never)" IN BOOLEAN MODE);
diff --git a/mysql-test/suite/innodb_fts/t/innodb_fts_proximity.test b/mysql-test/suite/innodb_fts/t/innodb_fts_proximity.test
index 3dc05be3365..b2ac81e2840 100644
--- a/mysql-test/suite/innodb_fts/t/innodb_fts_proximity.test
+++ b/mysql-test/suite/innodb_fts/t/innodb_fts_proximity.test
@@ -3,6 +3,11 @@
 # and try search default words
 --source include/have_innodb.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --disable_warnings
 drop table if exists t1;
 --enable_warnings
diff --git a/mysql-test/suite/percona/innodb_fix_misc_bug51325.result b/mysql-test/suite/percona/innodb_fix_misc_bug51325.result
deleted file mode 100644
index c63a33accdd..00000000000
--- a/mysql-test/suite/percona/innodb_fix_misc_bug51325.result
+++ /dev/null
@@ -1,13 +0,0 @@
-DROP TABLE IF EXISTS t1;
-SET GLOBAL innodb_file_per_table=ON;
-SHOW VARIABLES LIKE 'innodb_lazy_drop_table';
-Variable_name	Value
-innodb_lazy_drop_table	0
-SET GLOBAL innodb_lazy_drop_table=1;
-SHOW VARIABLES LIKE 'innodb_lazy_drop_table';
-Variable_name	Value
-innodb_lazy_drop_table	1
-CREATE TABLE t1 (a INT) ENGINE=InnoDB;
-DROP TABLE t1;
-SET GLOBAL innodb_lazy_drop_table=default;
-SET GLOBAL innodb_file_per_table=default;
diff --git a/mysql-test/suite/percona/innodb_fix_misc_bug51325.test b/mysql-test/suite/percona/innodb_fix_misc_bug51325.test
deleted file mode 100644
index 54fa3a80179..00000000000
--- a/mysql-test/suite/percona/innodb_fix_misc_bug51325.test
+++ /dev/null
@@ -1,13 +0,0 @@
-# Test for 'innodb_lazy_drop_table' variable
---source include/have_xtradb.inc
---disable_warnings
-DROP TABLE IF EXISTS t1; 
---enable_warnings
-SET GLOBAL innodb_file_per_table=ON;
-SHOW VARIABLES LIKE 'innodb_lazy_drop_table';
-SET GLOBAL innodb_lazy_drop_table=1;
-SHOW VARIABLES LIKE 'innodb_lazy_drop_table';
-CREATE TABLE t1 (a INT) ENGINE=InnoDB;
-DROP TABLE t1;
-SET GLOBAL innodb_lazy_drop_table=default;
-SET GLOBAL innodb_file_per_table=default;
diff --git a/mysql-test/suite/percona/innodb_sys_index.result b/mysql-test/suite/percona/innodb_sys_index.result
index 8bf4fa745ba..67604236366 100644
--- a/mysql-test/suite/percona/innodb_sys_index.result
+++ b/mysql-test/suite/percona/innodb_sys_index.result
@@ -1,6 +1,6 @@
 drop table if exists t1;
 Warnings:
-Note	1051	Unknown table 't1'
+Note	1051	Unknown table 'test.t1'
 select @@version_comment limit 1 ;
 @@version_comment
 Source distribution
diff --git a/mysql-test/suite/percona/percona_innodb_doublewrite_file-master.opt b/mysql-test/suite/percona/percona_innodb_doublewrite_file-master.opt
deleted file mode 100644
index 54f9f550277..00000000000
--- a/mysql-test/suite/percona/percona_innodb_doublewrite_file-master.opt
+++ /dev/null
@@ -1 +0,0 @@
---loose-innodb_doublewrite_file=ib_doublewrite
diff --git a/mysql-test/suite/percona/percona_innodb_doublewrite_file.result b/mysql-test/suite/percona/percona_innodb_doublewrite_file.result
deleted file mode 100644
index 4d086cc4498..00000000000
--- a/mysql-test/suite/percona/percona_innodb_doublewrite_file.result
+++ /dev/null
@@ -1,4 +0,0 @@
-show variables like 'innodb_doublewrite%';
-Variable_name	Value
-innodb_doublewrite	ON
-innodb_doublewrite_file	ib_doublewrite
diff --git a/mysql-test/suite/percona/percona_innodb_doublewrite_file.test b/mysql-test/suite/percona/percona_innodb_doublewrite_file.test
deleted file mode 100644
index d9e94db8463..00000000000
--- a/mysql-test/suite/percona/percona_innodb_doublewrite_file.test
+++ /dev/null
@@ -1,2 +0,0 @@
---source include/have_xtradb.inc
-show variables like 'innodb_doublewrite%';
diff --git a/mysql-test/suite/percona/percona_innodb_fake_changes.result b/mysql-test/suite/percona/percona_innodb_fake_changes.result
index 434b7283146..95f0c07cd11 100644
--- a/mysql-test/suite/percona/percona_innodb_fake_changes.result
+++ b/mysql-test/suite/percona/percona_innodb_fake_changes.result
@@ -45,7 +45,7 @@ BEGIN;
 CREATE TABLE t2 (a INT) ENGINE=InnoDB;
 ERROR HY000: Can't create table `test`.`t2` (errno: 131 "Command not supported by database")
 DROP TABLE t1;
-ERROR 42S02: Unknown table 't1'
+ERROR 42S02: Unknown table 'test.t1'
 TRUNCATE TABLE t1;
 ERROR HY000: Got error 131 "Command not supported by database" during COMMIT
 ALTER TABLE t1 ENGINE=MyISAM;
diff --git a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table-master.opt b/mysql-test/suite/percona/percona_innodb_use_sys_stats_table-master.opt
deleted file mode 100644
index 7479e2036aa..00000000000
--- a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table-master.opt
+++ /dev/null
@@ -1 +0,0 @@
---loose-innodb_use_sys_stats_table
diff --git a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.result b/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.result
deleted file mode 100644
index cb64de41901..00000000000
--- a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.result
+++ /dev/null
@@ -1,3 +0,0 @@
-show variables like 'innodb_use_sys_stats%';
-Variable_name	Value
-innodb_use_sys_stats_table	ON
diff --git a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.test b/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.test
deleted file mode 100644
index 534b26a3b45..00000000000
--- a/mysql-test/suite/percona/percona_innodb_use_sys_stats_table.test
+++ /dev/null
@@ -1,2 +0,0 @@
---source include/have_xtradb.inc
-show variables like 'innodb_use_sys_stats%';
diff --git a/mysql-test/suite/percona/percona_xtradb_admin_command.result b/mysql-test/suite/percona/percona_xtradb_admin_command.result
deleted file mode 100644
index 26ba14f2f3b..00000000000
--- a/mysql-test/suite/percona/percona_xtradb_admin_command.result
+++ /dev/null
@@ -1,6 +0,0 @@
-select * from information_schema.XTRADB_ADMIN_COMMAND;
-result_message
-No XTRA_* command in the SQL statement. Please add /*!XTRA_xxxx*/ to the SQL.
-select * from information_schema.XTRADB_ADMIN_COMMAND /*!XTRA_HELLO*/;
-result_message
-Hello!
diff --git a/mysql-test/suite/percona/percona_xtradb_admin_command.test b/mysql-test/suite/percona/percona_xtradb_admin_command.test
deleted file mode 100644
index b675c2afa21..00000000000
--- a/mysql-test/suite/percona/percona_xtradb_admin_command.test
+++ /dev/null
@@ -1,3 +0,0 @@
---source include/have_xtradb.inc
-select * from information_schema.XTRADB_ADMIN_COMMAND;
-select * from information_schema.XTRADB_ADMIN_COMMAND /*!XTRA_HELLO*/;
diff --git a/mysql-test/suite/plugins/t/cassandra.opt b/mysql-test/suite/plugins/t/cassandra.opt
index 98a4a081de5..76639e7d3e2 100644
--- a/mysql-test/suite/plugins/t/cassandra.opt
+++ b/mysql-test/suite/plugins/t/cassandra.opt
@@ -1 +1 @@
---plugin-load=$HA_CASSANDRA_SO --loose-cassandra=on
+--plugin-load-add=$HA_CASSANDRA_SO --loose-cassandra=on
diff --git a/mysql-test/suite/plugins/t/cassandra_qcache.opt b/mysql-test/suite/plugins/t/cassandra_qcache.opt
index 98a4a081de5..76639e7d3e2 100644
--- a/mysql-test/suite/plugins/t/cassandra_qcache.opt
+++ b/mysql-test/suite/plugins/t/cassandra_qcache.opt
@@ -1 +1 @@
---plugin-load=$HA_CASSANDRA_SO --loose-cassandra=on
+--plugin-load-add=$HA_CASSANDRA_SO --loose-cassandra=on
diff --git a/mysql-test/suite/plugins/t/locales.opt b/mysql-test/suite/plugins/t/locales.opt
index 594283f8c65..21f53ca3f0b 100644
--- a/mysql-test/suite/plugins/t/locales.opt
+++ b/mysql-test/suite/plugins/t/locales.opt
@@ -1,3 +1,3 @@
 --loose-locale
---plugin-load=$LOCALES_SO
+--plugin-load-add=$LOCALES_SO
 
diff --git a/mysql-test/suite/plugins/t/qc_info_init.opt b/mysql-test/suite/plugins/t/qc_info_init.opt
index 663de4da7d7..53b4ff314df 100644
--- a/mysql-test/suite/plugins/t/qc_info_init.opt
+++ b/mysql-test/suite/plugins/t/qc_info_init.opt
@@ -1,2 +1,2 @@
 --loose-query_cache_info
---plugin-load=$QUERY_CACHE_INFO_SO
+--plugin-load-add=$QUERY_CACHE_INFO_SO
diff --git a/mysql-test/suite/rpl/r/rpl_mdev382.result b/mysql-test/suite/rpl/r/rpl_mdev382.result
index b339188f8b8..6cd6aa09ede 100644
--- a/mysql-test/suite/rpl/r/rpl_mdev382.result
+++ b/mysql-test/suite/rpl/r/rpl_mdev382.result
@@ -315,7 +315,7 @@ CREATE TABLE `t``1` ( `a``` INT PRIMARY KEY) ENGINE=innodb;
 CREATE TABLE `t``2` ( `b``` INT PRIMARY KEY, `c``` INT NOT NULL,
 FOREIGN KEY fk (`c```) REFERENCES `t``1`(`a```)) ENGINE=innodb;
 TRUNCATE `t``1`;
-ERROR 42000: Cannot truncate a table referenced in a foreign key constraint (`db1``; select 'oops!'`.`t``2`, CONSTRAINT `t@00602_ibfk_1` FOREIGN KEY (`c```) REFERENCES `db1``; select 'oops!'`.`t``1` (`a```))
+ERROR 42000: Cannot truncate a table referenced in a foreign key constraint (`db1``; select 'oops!'`.`t``2`, CONSTRAINT `t``2_ibfk_1` FOREIGN KEY (`c```) REFERENCES `db1``; select 'oops!'`.`t``1` (`a```))
 DROP TABLE `t``2`;
 DROP TABLE `t``1`;
 *** Test correct quoting of DELETE FROM statement binlogged for HEAP table that is emptied due to server restart
diff --git a/mysql-test/suite/rpl/t/rpl_mdev382.test b/mysql-test/suite/rpl/t/rpl_mdev382.test
index 3ec877cdb1a..cb67052b47d 100644
--- a/mysql-test/suite/rpl/t/rpl_mdev382.test
+++ b/mysql-test/suite/rpl/t/rpl_mdev382.test
@@ -2,6 +2,11 @@
 --source include/have_binlog_format_statement.inc
 --source include/master-slave.inc
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 # MDEV-382: multiple SQL injections in replication code.
 
 # Test previous SQL injection attack against binlog for SAVEPOINT statement.
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_eviction_factor_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_eviction_factor_basic.result
new file mode 100644
index 00000000000..8f017ea40ec
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_eviction_factor_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_eviction_factor;
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+@@GLOBAL.innodb_cleaner_eviction_factor
+0
+SELECT @@SESSION.innodb_cleaner_eviction_factor;
+ERROR HY000: Variable 'innodb_cleaner_eviction_factor' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_eviction_factor='OFF';
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+@@GLOBAL.innodb_cleaner_eviction_factor
+0
+SET GLOBAL innodb_cleaner_eviction_factor='ON';
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+@@GLOBAL.innodb_cleaner_eviction_factor
+1
+SET GLOBAL innodb_cleaner_eviction_factor=0;
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+@@GLOBAL.innodb_cleaner_eviction_factor
+0
+SET GLOBAL innodb_cleaner_eviction_factor=1;
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+@@GLOBAL.innodb_cleaner_eviction_factor
+1
+SET GLOBAL innodb_cleaner_eviction_factor=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_eviction_factor'
+SET GLOBAL innodb_cleaner_eviction_factor=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_eviction_factor'
+SET GLOBAL innodb_cleaner_eviction_factor=2;
+ERROR 42000: Variable 'innodb_cleaner_eviction_factor' can't be set to the value of '2'
+SET GLOBAL innodb_cleaner_eviction_factor='foo';
+ERROR 42000: Variable 'innodb_cleaner_eviction_factor' can't be set to the value of 'foo'
+SET GLOBAL innodb_cleaner_eviction_factor = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_flush_chunk_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_flush_chunk_size_basic.result
new file mode 100644
index 00000000000..651023d7a38
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_flush_chunk_size_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_flush_chunk_size;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+@@GLOBAL.innodb_cleaner_flush_chunk_size
+100
+SELECT @@SESSION.innodb_cleaner_flush_chunk_size;
+ERROR HY000: Variable 'innodb_cleaner_flush_chunk_size' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_flush_chunk_size=1;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+@@GLOBAL.innodb_cleaner_flush_chunk_size
+1
+SET GLOBAL innodb_cleaner_flush_chunk_size=1000;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+@@GLOBAL.innodb_cleaner_flush_chunk_size
+1000
+SET GLOBAL innodb_cleaner_flush_chunk_size=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+@@GLOBAL.innodb_cleaner_flush_chunk_size
+4294967295
+SET GLOBAL innodb_cleaner_flush_chunk_size=0;
+Warnings:
+Warning	1292	Truncated incorrect innodb_cleaner_flush_chunk_size value: '0'
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+@@GLOBAL.innodb_cleaner_flush_chunk_size
+1
+SET GLOBAL innodb_cleaner_flush_chunk_size=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_flush_chunk_size'
+SET GLOBAL innodb_cleaner_flush_chunk_size=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_flush_chunk_size'
+SET GLOBAL innodb_cleaner_flush_chunk_size='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_flush_chunk_size'
+SET GLOBAL innodb_cleaner_flush_chunk_size = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_free_list_lwm_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_free_list_lwm_basic.result
new file mode 100644
index 00000000000..2d7883b7d83
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_free_list_lwm_basic.result
@@ -0,0 +1,35 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_free_list_lwm;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+@@GLOBAL.innodb_cleaner_free_list_lwm
+10
+SELECT @@SESSION.innodb_cleaner_free_list_lwm;
+ERROR HY000: Variable 'innodb_cleaner_free_list_lwm' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_free_list_lwm=0;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+@@GLOBAL.innodb_cleaner_free_list_lwm
+0
+SET GLOBAL innodb_cleaner_free_list_lwm=1;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+@@GLOBAL.innodb_cleaner_free_list_lwm
+1
+SET GLOBAL innodb_cleaner_free_list_lwm=99;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+@@GLOBAL.innodb_cleaner_free_list_lwm
+99
+SET GLOBAL innodb_cleaner_free_list_lwm=100;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+@@GLOBAL.innodb_cleaner_free_list_lwm
+100
+SET GLOBAL innodb_cleaner_free_list_lwm=101;
+Warnings:
+Warning	1292	Truncated incorrect innodb_cleaner_free_list_lwm value: '101'
+SELECT @@innodb_cleaner_free_list_lwm;
+@@innodb_cleaner_free_list_lwm
+100
+SET GLOBAL innodb_cleaner_free_list_lwm=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_free_list_lwm'
+SET GLOBAL innodb_cleaner_free_list_lwm=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_free_list_lwm'
+SET GLOBAL innodb_cleaner_free_list_lwm='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_free_list_lwm'
+SET GLOBAL innodb_cleaner_free_list_lwm = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_lru_chunk_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_lru_chunk_size_basic.result
new file mode 100644
index 00000000000..5dfc6738e11
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_lru_chunk_size_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_lru_chunk_size;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+@@GLOBAL.innodb_cleaner_lru_chunk_size
+100
+SELECT @@SESSION.innodb_cleaner_lru_chunk_size;
+ERROR HY000: Variable 'innodb_cleaner_lru_chunk_size' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_lru_chunk_size=1;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+@@GLOBAL.innodb_cleaner_lru_chunk_size
+1
+SET GLOBAL innodb_cleaner_lru_chunk_size=1000;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+@@GLOBAL.innodb_cleaner_lru_chunk_size
+1000
+SET GLOBAL innodb_cleaner_lru_chunk_size=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+@@GLOBAL.innodb_cleaner_lru_chunk_size
+4294967295
+SET GLOBAL innodb_cleaner_lru_chunk_size=0;
+Warnings:
+Warning	1292	Truncated incorrect innodb_cleaner_lru_chunk_size value: '0'
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+@@GLOBAL.innodb_cleaner_lru_chunk_size
+1
+SET GLOBAL innodb_cleaner_lru_chunk_size=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_lru_chunk_size'
+SET GLOBAL innodb_cleaner_lru_chunk_size=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_lru_chunk_size'
+SET GLOBAL innodb_cleaner_lru_chunk_size='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_lru_chunk_size'
+SET GLOBAL innodb_cleaner_lru_chunk_size = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_lsn_age_factor_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_lsn_age_factor_basic.result
new file mode 100644
index 00000000000..6dd1b6dc489
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_lsn_age_factor_basic.result
@@ -0,0 +1,21 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_lsn_age_factor;
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+@@GLOBAL.innodb_cleaner_lsn_age_factor
+high_checkpoint
+SELECT @@SESSION.innodb_cleaner_lsn_age_factor;
+ERROR HY000: Variable 'innodb_cleaner_lsn_age_factor' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_lsn_age_factor='legacy';
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+@@GLOBAL.innodb_cleaner_lsn_age_factor
+legacy
+SET GLOBAL innodb_cleaner_lsn_age_factor='high_checkpoint';
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+@@GLOBAL.innodb_cleaner_lsn_age_factor
+high_checkpoint
+SET GLOBAL innodb_cleaner_lsn_age_factor=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_lsn_age_factor'
+SET GLOBAL innodb_cleaner_lsn_age_factor=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_lsn_age_factor'
+SET GLOBAL innodb_cleaner_lsn_age_factor='foo';
+ERROR 42000: Variable 'innodb_cleaner_lsn_age_factor' can't be set to the value of 'foo'
+SET GLOBAL innodb_cleaner_lsn_age_factor = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_max_flush_time_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_max_flush_time_basic.result
new file mode 100644
index 00000000000..e4a3fa26e73
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_max_flush_time_basic.result
@@ -0,0 +1,25 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_max_flush_time;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+@@GLOBAL.innodb_cleaner_max_flush_time
+1000
+SELECT @@SESSION.innodb_cleaner_max_flush_time;
+ERROR HY000: Variable 'innodb_cleaner_max_flush_time' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_max_flush_time=0;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+@@GLOBAL.innodb_cleaner_max_flush_time
+0
+SET GLOBAL innodb_cleaner_max_flush_time=1000;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+@@GLOBAL.innodb_cleaner_max_flush_time
+1000
+SET GLOBAL innodb_cleaner_max_flush_time=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+@@GLOBAL.innodb_cleaner_max_flush_time
+4294967295
+SET GLOBAL innodb_cleaner_max_flush_time=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_flush_time'
+SET GLOBAL innodb_cleaner_max_flush_time=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_flush_time'
+SET GLOBAL innodb_cleaner_max_flush_time='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_flush_time'
+SET GLOBAL innodb_cleaner_max_flush_time = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_cleaner_max_lru_time_basic.result b/mysql-test/suite/sys_vars/r/innodb_cleaner_max_lru_time_basic.result
new file mode 100644
index 00000000000..f7bacbbd62e
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_cleaner_max_lru_time_basic.result
@@ -0,0 +1,25 @@
+SET @start_value = @@GLOBAL.innodb_cleaner_max_lru_time;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+@@GLOBAL.innodb_cleaner_max_lru_time
+1000
+SELECT @@SESSION.innodb_cleaner_max_lru_time;
+ERROR HY000: Variable 'innodb_cleaner_max_lru_time' is a GLOBAL variable
+SET GLOBAL innodb_cleaner_max_lru_time=0;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+@@GLOBAL.innodb_cleaner_max_lru_time
+0
+SET GLOBAL innodb_cleaner_max_lru_time=1000;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+@@GLOBAL.innodb_cleaner_max_lru_time
+1000
+SET GLOBAL innodb_cleaner_max_lru_time=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+@@GLOBAL.innodb_cleaner_max_lru_time
+4294967295
+SET GLOBAL innodb_cleaner_max_lru_time=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_lru_time'
+SET GLOBAL innodb_cleaner_max_lru_time=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_lru_time'
+SET GLOBAL innodb_cleaner_max_lru_time='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_cleaner_max_lru_time'
+SET GLOBAL innodb_cleaner_max_lru_time = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_empty_free_list_algorithm_basic.result b/mysql-test/suite/sys_vars/r/innodb_empty_free_list_algorithm_basic.result
new file mode 100644
index 00000000000..f95553e3fa2
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_empty_free_list_algorithm_basic.result
@@ -0,0 +1,23 @@
+SET @start_value = @@GLOBAL.innodb_empty_free_list_algorithm;
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+@@GLOBAL.innodb_empty_free_list_algorithm
+backoff
+SELECT @@SESSION.innodb_empty_free_list_algorithm;
+ERROR HY000: Variable 'innodb_empty_free_list_algorithm' is a GLOBAL variable
+SET GLOBAL innodb_empty_free_list_algorithm='legacy';
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+@@GLOBAL.innodb_empty_free_list_algorithm
+legacy
+SET GLOBAL innodb_empty_free_list_algorithm='backoff';
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+@@GLOBAL.innodb_empty_free_list_algorithm
+backoff
+SET GLOBAL innodb_empty_free_list_algorithm=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_empty_free_list_algorithm'
+SET GLOBAL innodb_empty_free_list_algorithm=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_empty_free_list_algorithm'
+SET GLOBAL innodb_empty_free_list_algorithm=2;
+ERROR 42000: Variable 'innodb_empty_free_list_algorithm' can't be set to the value of '2'
+SET GLOBAL innodb_empty_free_list_algorithm='foo';
+ERROR 42000: Variable 'innodb_empty_free_list_algorithm' can't be set to the value of 'foo'
+SET GLOBAL innodb_empty_free_list_algorithm = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_foreground_preflush_basic.result b/mysql-test/suite/sys_vars/r/innodb_foreground_preflush_basic.result
new file mode 100644
index 00000000000..4ad0ce8d78e
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_foreground_preflush_basic.result
@@ -0,0 +1,23 @@
+SET @start_value = @@GLOBAL.innodb_foreground_preflush;
+SELECT @@GLOBAL.innodb_foreground_preflush;
+@@GLOBAL.innodb_foreground_preflush
+exponential_backoff
+SELECT @@SESSION.innodb_foreground_preflush;
+ERROR HY000: Variable 'innodb_foreground_preflush' is a GLOBAL variable
+SET GLOBAL innodb_foreground_preflush='sync_preflush';
+SELECT @@GLOBAL.innodb_foreground_preflush;
+@@GLOBAL.innodb_foreground_preflush
+sync_preflush
+SET GLOBAL innodb_foreground_preflush='exponential_backoff';
+SELECT @@GLOBAL.innodb_foreground_preflush;
+@@GLOBAL.innodb_foreground_preflush
+exponential_backoff
+SET GLOBAL innodb_foreground_preflush=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_foreground_preflush'
+SET GLOBAL innodb_foreground_preflush=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_foreground_preflush'
+SET GLOBAL innodb_foreground_preflush=2;
+ERROR 42000: Variable 'innodb_foreground_preflush' can't be set to the value of '2'
+SET GLOBAL innodb_foreground_preflush='foo';
+ERROR 42000: Variable 'innodb_foreground_preflush' can't be set to the value of 'foo'
+SET GLOBAL innodb_foreground_preflush = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_ft_result_cache_limit_basic.result b/mysql-test/suite/sys_vars/r/innodb_ft_result_cache_limit_basic.result
new file mode 100644
index 00000000000..0aefabd48f7
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_ft_result_cache_limit_basic.result
@@ -0,0 +1,32 @@
+select @@global.innodb_ft_result_cache_limit;
+@@global.innodb_ft_result_cache_limit
+2000000000
+select @@session.innodb_ft_result_cache_limit;
+ERROR HY000: Variable 'innodb_ft_result_cache_limit' is a GLOBAL variable
+show global variables like 'innodb_ft_result_cache_limit';
+Variable_name	Value
+innodb_ft_result_cache_limit	2000000000
+show session variables like 'innodb_ft_result_cache_limit';
+Variable_name	Value
+innodb_ft_result_cache_limit	2000000000
+select * from information_schema.global_variables where variable_name='innodb_ft_result_cache_limit';
+VARIABLE_NAME	VARIABLE_VALUE
+INNODB_FT_RESULT_CACHE_LIMIT	2000000000
+select * from information_schema.session_variables where variable_name='innodb_ft_result_cache_limit';
+VARIABLE_NAME	VARIABLE_VALUE
+INNODB_FT_RESULT_CACHE_LIMIT	2000000000
+set global innodb_ft_result_cache_limit=900000;
+Warnings:
+Warning	1292	Truncated incorrect innodb_ft_result_cache_limit value: '900000'
+select @@innodb_ft_result_cache_limit;
+@@innodb_ft_result_cache_limit
+1000000
+set global innodb_ft_result_cache_limit=1000000;
+select @@innodb_ft_result_cache_limit;
+@@innodb_ft_result_cache_limit
+1000000
+set global innodb_ft_result_cache_limit=4000000000;
+select @@innodb_ft_result_cache_limit;
+@@innodb_ft_result_cache_limit
+4000000000
+set global innodb_ft_result_cache_limit=2000000000;
diff --git a/mysql-test/suite/sys_vars/r/innodb_ft_total_cache_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_ft_total_cache_size_basic.result
new file mode 100644
index 00000000000..ff234a1fcbf
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_ft_total_cache_size_basic.result
@@ -0,0 +1,21 @@
+select @@global.innodb_ft_total_cache_size;
+@@global.innodb_ft_total_cache_size
+640000000
+select @@session.innodb_ft_total_cache_size;
+ERROR HY000: Variable 'innodb_ft_total_cache_size' is a GLOBAL variable
+show global variables like 'innodb_ft_total_cache_size';
+Variable_name	Value
+innodb_ft_total_cache_size	640000000
+show session variables like 'innodb_ft_total_cache_size';
+Variable_name	Value
+innodb_ft_total_cache_size	640000000
+select * from information_schema.global_variables where variable_name='innodb_ft_total_cache_size';
+VARIABLE_NAME	VARIABLE_VALUE
+INNODB_FT_TOTAL_CACHE_SIZE	640000000
+select * from information_schema.session_variables where variable_name='innodb_ft_total_cache_size';
+VARIABLE_NAME	VARIABLE_VALUE
+INNODB_FT_TOTAL_CACHE_SIZE	640000000
+set global innodb_ft_total_cache_size=1;
+ERROR HY000: Variable 'innodb_ft_total_cache_size' is a read only variable
+set session innodb_ft_total_cache_size=1;
+ERROR HY000: Variable 'innodb_ft_total_cache_size' is a read only variable
diff --git a/mysql-test/suite/sys_vars/r/innodb_log_arch_dir_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_arch_dir_basic.result
new file mode 100644
index 00000000000..bead0303520
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_log_arch_dir_basic.result
@@ -0,0 +1,38 @@
+SELECT @@GLOBAL.innodb_log_arch_dir;
+@@GLOBAL.innodb_log_arch_dir
+./
+NULL Expected
+SET @@GLOBAL.innodb_log_arch_dir=1;
+ERROR HY000: Variable 'innodb_log_arch_dir' is a read only variable
+Expected error 'Read only variable'
+SELECT @@GLOBAL.innodb_log_arch_dir;
+@@GLOBAL.innodb_log_arch_dir
+./
+NULL Expected
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_arch_dir';
+VARIABLE_VALUE
+./
+empty string Expected
+SELECT @@innodb_log_arch_dir;
+@@innodb_log_arch_dir
+./
+NULL Expected
+SELECT @@innodb_log_arch_dir;
+@@innodb_log_arch_dir
+./
+NULL Expected
+SELECT @@local.innodb_log_arch_dir;
+ERROR HY000: Variable 'innodb_log_arch_dir' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@SESSION.innodb_log_arch_dir;
+ERROR HY000: Variable 'innodb_log_arch_dir' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@GLOBAL.innodb_log_arch_dir;
+@@GLOBAL.innodb_log_arch_dir
+./
+NULL Expected
+SELECT innodb_log_arch_dir = @@SESSION.innodb_log_arch_dir;
+ERROR 42S22: Unknown column 'innodb_log_arch_dir' in 'field list'
+Expected error Unknown column 'innodb_log_arch_dir' in 'field list'
diff --git a/mysql-test/suite/sys_vars/r/innodb_log_arch_expire_sec_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_arch_expire_sec_basic.result
new file mode 100644
index 00000000000..97bff097252
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_log_arch_expire_sec_basic.result
@@ -0,0 +1,38 @@
+SELECT @@GLOBAL.innodb_log_arch_expire_sec INTO @save;
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+@@GLOBAL.innodb_log_arch_expire_sec
+0
+0 Expected
+SET @@GLOBAL.innodb_log_arch_expire_sec=1;
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+@@GLOBAL.innodb_log_arch_expire_sec
+1
+1 Expected
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_arch_expire_sec';
+VARIABLE_VALUE
+1
+1 Expected
+SELECT @@innodb_log_arch_expire_sec;
+@@innodb_log_arch_expire_sec
+1
+1 Expected
+SELECT @@innodb_log_arch_expire_sec;
+@@innodb_log_arch_expire_sec
+1
+1 Expected
+SELECT @@local.innodb_log_arch_expire_sec;
+ERROR HY000: Variable 'innodb_log_arch_expire_sec' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@SESSION.innodb_log_arch_expire_sec;
+ERROR HY000: Variable 'innodb_log_arch_expire_sec' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+@@GLOBAL.innodb_log_arch_expire_sec
+1
+1 Expected
+SELECT innodb_log_arch_expire_sec = @@SESSION.innodb_log_arch_expire_sec;
+ERROR 42S22: Unknown column 'innodb_log_arch_expire_sec' in 'field list'
+Expected error Unknown column 'innodb_log_arch_expire_sec' in 'field list'
+SET @@GLOBAL.innodb_log_arch_expire_sec = @save;
diff --git a/mysql-test/suite/sys_vars/r/innodb_log_archive_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_archive_basic.result
new file mode 100644
index 00000000000..bb9b53482fa
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_log_archive_basic.result
@@ -0,0 +1,38 @@
+SELECT @@GLOBAL.innodb_log_archive;
+@@GLOBAL.innodb_log_archive
+0
+0 Expected
+SET @save_innodb_log_archive = @@GLOBAL.innodb_log_archive;
+SET @@GLOBAL.innodb_log_archive=1;
+SELECT @@GLOBAL.innodb_log_archive;
+@@GLOBAL.innodb_log_archive
+1
+1 Expected
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_archive';
+VARIABLE_VALUE
+ON
+ON Expected
+SET @@GLOBAL.innodb_log_archive = @save_innodb_log_archive;
+SELECT @@innodb_log_archive;
+@@innodb_log_archive
+0
+0 Expected
+SELECT @@innodb_log_archive;
+@@innodb_log_archive
+0
+0 Expected
+SELECT @@local.innodb_log_archive;
+ERROR HY000: Variable 'innodb_log_archive' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@SESSION.innodb_log_archive;
+ERROR HY000: Variable 'innodb_log_archive' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT @@GLOBAL.innodb_log_archive;
+@@GLOBAL.innodb_log_archive
+0
+0 Expected
+SELECT innodb_log_archive = @@SESSION.innodb_log_archive;
+ERROR 42S22: Unknown column 'innodb_log_archive' in 'field list'
+Expected error Unknown column 'innodb_log_archive' in 'field list'
diff --git a/mysql-test/suite/sys_vars/r/innodb_log_checksum_algorithm_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_checksum_algorithm_basic.result
new file mode 100644
index 00000000000..cb03046c84d
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_log_checksum_algorithm_basic.result
@@ -0,0 +1,47 @@
+SET @orig = @@global.innodb_log_checksum_algorithm;
+SELECT @orig;
+@orig
+innodb
+SET GLOBAL innodb_log_checksum_algorithm = 'crc32';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+crc32
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_crc32';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_crc32
+SET GLOBAL innodb_log_checksum_algorithm = 'innodb';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+innodb
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_innodb';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_innodb
+SET GLOBAL innodb_log_checksum_algorithm = 'none';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+none
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_none';
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_none
+SET GLOBAL innodb_log_checksum_algorithm = '';
+ERROR 42000: Variable 'innodb_log_checksum_algorithm' can't be set to the value of ''
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_none
+SET GLOBAL innodb_log_checksum_algorithm = 'foobar';
+ERROR 42000: Variable 'innodb_log_checksum_algorithm' can't be set to the value of 'foobar'
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_none
+SET GLOBAL innodb_log_checksum_algorithm = 123;
+ERROR 42000: Variable 'innodb_log_checksum_algorithm' can't be set to the value of '123'
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+strict_none
+SET GLOBAL innodb_log_checksum_algorithm = @orig;
+SELECT @@global.innodb_log_checksum_algorithm;
+@@global.innodb_log_checksum_algorithm
+innodb
diff --git a/mysql-test/suite/sys_vars/r/innodb_log_compressed_pages_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_compressed_pages_basic.result
new file mode 100644
index 00000000000..8cb8d900b59
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_log_compressed_pages_basic.result
@@ -0,0 +1,69 @@
+SET @start_global_value = @@global.innodb_log_compressed_pages;
+SELECT @start_global_value;
+@start_global_value
+1
+'#---------------------BS_STVARS_028_01----------------------#'
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+COUNT(@@GLOBAL.innodb_log_compressed_pages)
+1
+1 Expected
+'#---------------------BS_STVARS_028_02----------------------#'
+SET @@global.innodb_log_compressed_pages = 0;
+SELECT @@global.innodb_log_compressed_pages;
+@@global.innodb_log_compressed_pages
+0
+SET @@global.innodb_log_compressed_pages ='On' ;
+SELECT @@global.innodb_log_compressed_pages;
+@@global.innodb_log_compressed_pages
+1
+SET @@global.innodb_log_compressed_pages ='Off' ;
+SELECT @@global.innodb_log_compressed_pages;
+@@global.innodb_log_compressed_pages
+0
+SET @@global.innodb_log_compressed_pages = 1;
+SELECT @@global.innodb_log_compressed_pages;
+@@global.innodb_log_compressed_pages
+1
+'#---------------------BS_STVARS_028_03----------------------#'
+SELECT IF(@@GLOBAL.innodb_log_compressed_pages,'ON','OFF') = VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
+WHERE VARIABLE_NAME='innodb_log_compressed_pages';
+IF(@@GLOBAL.innodb_log_compressed_pages,'ON','OFF') = VARIABLE_VALUE
+1
+1 Expected
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+COUNT(@@GLOBAL.innodb_log_compressed_pages)
+1
+1 Expected
+SELECT COUNT(VARIABLE_VALUE)
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
+WHERE VARIABLE_NAME='innodb_log_compressed_pages';
+COUNT(VARIABLE_VALUE)
+1
+1 Expected
+'#---------------------BS_STVARS_028_04----------------------#'
+SELECT @@innodb_log_compressed_pages = @@GLOBAL.innodb_log_compressed_pages;
+@@innodb_log_compressed_pages = @@GLOBAL.innodb_log_compressed_pages
+1
+1 Expected
+'#---------------------BS_STVARS_028_05----------------------#'
+SELECT COUNT(@@innodb_log_compressed_pages);
+COUNT(@@innodb_log_compressed_pages)
+1
+1 Expected
+SELECT COUNT(@@local.innodb_log_compressed_pages);
+ERROR HY000: Variable 'innodb_log_compressed_pages' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT COUNT(@@SESSION.innodb_log_compressed_pages);
+ERROR HY000: Variable 'innodb_log_compressed_pages' is a GLOBAL variable
+Expected error 'Variable is a GLOBAL variable'
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+COUNT(@@GLOBAL.innodb_log_compressed_pages)
+1
+1 Expected
+SELECT innodb_log_compressed_pages = @@SESSION.innodb_log_compressed_pages;
+ERROR 42S22: Unknown column 'innodb_log_compressed_pages' in 'field list'
+SET @@global.innodb_log_compressed_pages = @start_global_value;
+SELECT @@global.innodb_log_compressed_pages;
+@@global.innodb_log_compressed_pages
+1
diff --git a/mysql-test/suite/sys_vars/r/innodb_merge_sort_block_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_merge_sort_block_size_basic.result
deleted file mode 100644
index 90c2954e43d..00000000000
--- a/mysql-test/suite/sys_vars/r/innodb_merge_sort_block_size_basic.result
+++ /dev/null
@@ -1,24 +0,0 @@
-SELECT @@global.innodb_merge_sort_block_size;
-@@global.innodb_merge_sort_block_size
-1048576
-SELECT @@session.innodb_merge_sort_block_size;
-@@session.innodb_merge_sort_block_size
-1048576
-SET @old_global=@@global.innodb_merge_sort_block_size;
-SET @old_session=@@session.innodb_merge_sort_block_size;
-SET @@global.innodb_merge_sort_block_size = 2*1024*1024;
-SET @@session.innodb_merge_sort_block_size = 4*1024*1024;
-SELECT @@global.innodb_merge_sort_block_size;
-@@global.innodb_merge_sort_block_size
-2097152
-SELECT @@session.innodb_merge_sort_block_size;
-@@session.innodb_merge_sort_block_size
-4194304
-SET @@global.innodb_merge_sort_block_size = 1024*1024*1024+1;
-Warnings:
-Warning	1292	Truncated incorrect innodb_merge_sort_block_size value: '1073741825'
-SELECT @@global.innodb_merge_sort_block_size;
-@@global.innodb_merge_sort_block_size
-1073741824
-SET @@global.innodb_merge_sort_block_size=@old_global;
-SET @@session.innodb_merge_sort_block_size=@old_session;
diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result
index 7a7c0a6b6a2..ce57dbb2fdc 100644
--- a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result
@@ -43,6 +43,7 @@ buffer_data_written	disabled
 buffer_flush_batch_scanned	disabled
 buffer_flush_batch_num_scan	disabled
 buffer_flush_batch_scanned_per_call	disabled
+buffer_flush_batch_rescan	disabled
 buffer_flush_batch_total_pages	disabled
 buffer_flush_batches	disabled
 buffer_flush_batch_pages	disabled
diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result
index 7a7c0a6b6a2..ce57dbb2fdc 100644
--- a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result
@@ -43,6 +43,7 @@ buffer_data_written	disabled
 buffer_flush_batch_scanned	disabled
 buffer_flush_batch_num_scan	disabled
 buffer_flush_batch_scanned_per_call	disabled
+buffer_flush_batch_rescan	disabled
 buffer_flush_batch_total_pages	disabled
 buffer_flush_batches	disabled
 buffer_flush_batch_pages	disabled
diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result
index 7a7c0a6b6a2..ce57dbb2fdc 100644
--- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result
@@ -43,6 +43,7 @@ buffer_data_written	disabled
 buffer_flush_batch_scanned	disabled
 buffer_flush_batch_num_scan	disabled
 buffer_flush_batch_scanned_per_call	disabled
+buffer_flush_batch_rescan	disabled
 buffer_flush_batch_total_pages	disabled
 buffer_flush_batches	disabled
 buffer_flush_batch_pages	disabled
diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result
index 7a7c0a6b6a2..ce57dbb2fdc 100644
--- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result
@@ -43,6 +43,7 @@ buffer_data_written	disabled
 buffer_flush_batch_scanned	disabled
 buffer_flush_batch_num_scan	disabled
 buffer_flush_batch_scanned_per_call	disabled
+buffer_flush_batch_rescan	disabled
 buffer_flush_batch_total_pages	disabled
 buffer_flush_batches	disabled
 buffer_flush_batch_pages	disabled
diff --git a/mysql-test/suite/sys_vars/r/innodb_persistent_stats_root_page_basic.result b/mysql-test/suite/sys_vars/r/innodb_persistent_stats_root_page_basic.result
deleted file mode 100644
index 38347ef8c68..00000000000
--- a/mysql-test/suite/sys_vars/r/innodb_persistent_stats_root_page_basic.result
+++ /dev/null
@@ -1,24 +0,0 @@
-SELECT @@global.innodb_persistent_stats_root_page;
-@@global.innodb_persistent_stats_root_page
-0
-SELECT COUNT(@@global.innodb_persistent_stats_root_page);
-COUNT(@@global.innodb_persistent_stats_root_page)
-1
-SET @@global.innodb_persistent_stats_root_page=100;
-ERROR HY000: Variable 'innodb_persistent_stats_root_page' is a read only variable
-SELECT @@global.innodb_persistent_stats_root_page = VARIABLE_VALUE
-FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
-WHERE VARIABLE_NAME='innodb_persistent_stats_root_page';
-@@global.innodb_persistent_stats_root_page = VARIABLE_VALUE
-1
-SELECT COUNT(*) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
-WHERE VARIABLE_NAME='innodb_persistent_stats_root_page';
-COUNT(*)
-1
-SELECT @@innodb_persistent_stats_root_page = @@global.innodb_persistent_stats_root_page;
-@@innodb_persistent_stats_root_page = @@global.innodb_persistent_stats_root_page
-1
-SELECT COUNT(@@local.innodb_persistent_stats_root_page);
-ERROR HY000: Variable 'innodb_persistent_stats_root_page' is a GLOBAL variable
-SELECT COUNT(@@session.innodb_persistent_stats_root_page);
-ERROR HY000: Variable 'innodb_persistent_stats_root_page' is a GLOBAL variable
diff --git a/mysql-test/suite/sys_vars/r/innodb_priority_cleaner_basic.result b/mysql-test/suite/sys_vars/r/innodb_priority_cleaner_basic.result
new file mode 100644
index 00000000000..ae5e12ee64a
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_priority_cleaner_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_priority_cleaner;
+SELECT @@GLOBAL.innodb_priority_cleaner;
+@@GLOBAL.innodb_priority_cleaner
+0
+SELECT @@SESSION.innodb_priority_cleaner;
+ERROR HY000: Variable 'innodb_priority_cleaner' is a GLOBAL variable
+SET GLOBAL innodb_priority_cleaner='OFF';
+SELECT @@GLOBAL.innodb_priority_cleaner;
+@@GLOBAL.innodb_priority_cleaner
+0
+SET GLOBAL innodb_priority_cleaner='ON';
+SELECT @@GLOBAL.innodb_priority_cleaner;
+@@GLOBAL.innodb_priority_cleaner
+1
+SET GLOBAL innodb_priority_cleaner=0;
+SELECT @@GLOBAL.innodb_priority_cleaner;
+@@GLOBAL.innodb_priority_cleaner
+0
+SET GLOBAL innodb_priority_cleaner=1;
+SELECT @@GLOBAL.innodb_priority_cleaner;
+@@GLOBAL.innodb_priority_cleaner
+1
+SET GLOBAL innodb_priority_cleaner=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_cleaner'
+SET GLOBAL innodb_priority_cleaner=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_cleaner'
+SET GLOBAL innodb_priority_cleaner=2;
+ERROR 42000: Variable 'innodb_priority_cleaner' can't be set to the value of '2'
+SET GLOBAL innodb_priority_cleaner='foo';
+ERROR 42000: Variable 'innodb_priority_cleaner' can't be set to the value of 'foo'
+SET GLOBAL innodb_priority_cleaner = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_priority_io_basic.result b/mysql-test/suite/sys_vars/r/innodb_priority_io_basic.result
new file mode 100644
index 00000000000..70ccb5e4cf4
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_priority_io_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_priority_io;
+SELECT @@GLOBAL.innodb_priority_io;
+@@GLOBAL.innodb_priority_io
+0
+SELECT @@SESSION.innodb_priority_io;
+ERROR HY000: Variable 'innodb_priority_io' is a GLOBAL variable
+SET GLOBAL innodb_priority_io='OFF';
+SELECT @@GLOBAL.innodb_priority_io;
+@@GLOBAL.innodb_priority_io
+0
+SET GLOBAL innodb_priority_io='ON';
+SELECT @@GLOBAL.innodb_priority_io;
+@@GLOBAL.innodb_priority_io
+1
+SET GLOBAL innodb_priority_io=0;
+SELECT @@GLOBAL.innodb_priority_io;
+@@GLOBAL.innodb_priority_io
+0
+SET GLOBAL innodb_priority_io=1;
+SELECT @@GLOBAL.innodb_priority_io;
+@@GLOBAL.innodb_priority_io
+1
+SET GLOBAL innodb_priority_io=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_io'
+SET GLOBAL innodb_priority_io=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_io'
+SET GLOBAL innodb_priority_io=2;
+ERROR 42000: Variable 'innodb_priority_io' can't be set to the value of '2'
+SET GLOBAL innodb_priority_io='foo';
+ERROR 42000: Variable 'innodb_priority_io' can't be set to the value of 'foo'
+SET GLOBAL innodb_priority_io = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_priority_master_basic.result b/mysql-test/suite/sys_vars/r/innodb_priority_master_basic.result
new file mode 100644
index 00000000000..d26ead2ff7e
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_priority_master_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_priority_master;
+SELECT @@GLOBAL.innodb_priority_master;
+@@GLOBAL.innodb_priority_master
+0
+SELECT @@SESSION.innodb_priority_master;
+ERROR HY000: Variable 'innodb_priority_master' is a GLOBAL variable
+SET GLOBAL innodb_priority_master='OFF';
+SELECT @@GLOBAL.innodb_priority_master;
+@@GLOBAL.innodb_priority_master
+0
+SET GLOBAL innodb_priority_master='ON';
+SELECT @@GLOBAL.innodb_priority_master;
+@@GLOBAL.innodb_priority_master
+1
+SET GLOBAL innodb_priority_master=0;
+SELECT @@GLOBAL.innodb_priority_master;
+@@GLOBAL.innodb_priority_master
+0
+SET GLOBAL innodb_priority_master=1;
+SELECT @@GLOBAL.innodb_priority_master;
+@@GLOBAL.innodb_priority_master
+1
+SET GLOBAL innodb_priority_master=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_master'
+SET GLOBAL innodb_priority_master=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_master'
+SET GLOBAL innodb_priority_master=2;
+ERROR 42000: Variable 'innodb_priority_master' can't be set to the value of '2'
+SET GLOBAL innodb_priority_master='foo';
+ERROR 42000: Variable 'innodb_priority_master' can't be set to the value of 'foo'
+SET GLOBAL innodb_priority_master = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_priority_purge_basic.result b/mysql-test/suite/sys_vars/r/innodb_priority_purge_basic.result
new file mode 100644
index 00000000000..57153ebf82a
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_priority_purge_basic.result
@@ -0,0 +1,31 @@
+SET @start_value = @@GLOBAL.innodb_priority_purge;
+SELECT @@GLOBAL.innodb_priority_purge;
+@@GLOBAL.innodb_priority_purge
+0
+SELECT @@SESSION.innodb_priority_purge;
+ERROR HY000: Variable 'innodb_priority_purge' is a GLOBAL variable
+SET GLOBAL innodb_priority_purge='OFF';
+SELECT @@GLOBAL.innodb_priority_purge;
+@@GLOBAL.innodb_priority_purge
+0
+SET GLOBAL innodb_priority_purge='ON';
+SELECT @@GLOBAL.innodb_priority_purge;
+@@GLOBAL.innodb_priority_purge
+1
+SET GLOBAL innodb_priority_purge=0;
+SELECT @@GLOBAL.innodb_priority_purge;
+@@GLOBAL.innodb_priority_purge
+0
+SET GLOBAL innodb_priority_purge=1;
+SELECT @@GLOBAL.innodb_priority_purge;
+@@GLOBAL.innodb_priority_purge
+1
+SET GLOBAL innodb_priority_purge=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_purge'
+SET GLOBAL innodb_priority_purge=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_priority_purge'
+SET GLOBAL innodb_priority_purge=2;
+ERROR 42000: Variable 'innodb_priority_purge' can't be set to the value of '2'
+SET GLOBAL innodb_priority_purge='foo';
+ERROR 42000: Variable 'innodb_priority_purge' can't be set to the value of 'foo'
+SET GLOBAL innodb_priority_purge = @start_value;
diff --git a/mysql-test/suite/sys_vars/r/innodb_sched_priority_cleaner_basic.result b/mysql-test/suite/sys_vars/r/innodb_sched_priority_cleaner_basic.result
new file mode 100644
index 00000000000..1183fb27732
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_sched_priority_cleaner_basic.result
@@ -0,0 +1,30 @@
+SET @start_value = @@GLOBAL.innodb_sched_priority_cleaner;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+@@GLOBAL.innodb_sched_priority_cleaner
+19
+SELECT @@SESSION.innodb_sched_priority_cleaner;
+ERROR HY000: Variable 'innodb_sched_priority_cleaner' is a GLOBAL variable
+SET GLOBAL innodb_sched_priority_cleaner=19;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+@@GLOBAL.innodb_sched_priority_cleaner
+19
+SET GLOBAL innodb_sched_priority_cleaner=5;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+@@GLOBAL.innodb_sched_priority_cleaner
+5
+SET GLOBAL innodb_sched_priority_cleaner=0;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+@@GLOBAL.innodb_sched_priority_cleaner
+0
+SET GLOBAL innodb_sched_priority_cleaner=-1;
+Warnings:
+Warning	1292	Truncated incorrect innodb_sched_priority_cleaner value: '-1'
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+@@GLOBAL.innodb_sched_priority_cleaner
+0
+SET GLOBAL innodb_sched_priority_cleaner=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_cleaner'
+SET GLOBAL innodb_sched_priority_cleaner=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_cleaner'
+SET GLOBAL innodb_sched_priority_cleaner='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_cleaner'
diff --git a/mysql-test/suite/sys_vars/r/innodb_sched_priority_io_basic.result b/mysql-test/suite/sys_vars/r/innodb_sched_priority_io_basic.result
new file mode 100644
index 00000000000..5fd7705bfaf
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_sched_priority_io_basic.result
@@ -0,0 +1,30 @@
+SET @start_value = @@GLOBAL.innodb_sched_priority_io;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+@@GLOBAL.innodb_sched_priority_io
+19
+SELECT @@SESSION.innodb_sched_priority_io;
+ERROR HY000: Variable 'innodb_sched_priority_io' is a GLOBAL variable
+SET GLOBAL innodb_sched_priority_io=19;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+@@GLOBAL.innodb_sched_priority_io
+19
+SET GLOBAL innodb_sched_priority_io=5;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+@@GLOBAL.innodb_sched_priority_io
+5
+SET GLOBAL innodb_sched_priority_io=0;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+@@GLOBAL.innodb_sched_priority_io
+0
+SET GLOBAL innodb_sched_priority_io=-1;
+Warnings:
+Warning	1292	Truncated incorrect innodb_sched_priority_io value: '-1'
+SELECT @@GLOBAL.innodb_sched_priority_io;
+@@GLOBAL.innodb_sched_priority_io
+0
+SET GLOBAL innodb_sched_priority_io=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_io'
+SET GLOBAL innodb_sched_priority_io=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_io'
+SET GLOBAL innodb_sched_priority_io='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_io'
diff --git a/mysql-test/suite/sys_vars/r/innodb_sched_priority_master_basic.result b/mysql-test/suite/sys_vars/r/innodb_sched_priority_master_basic.result
new file mode 100644
index 00000000000..caa72f58369
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_sched_priority_master_basic.result
@@ -0,0 +1,30 @@
+SET @start_value = @@GLOBAL.innodb_sched_priority_master;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+@@GLOBAL.innodb_sched_priority_master
+19
+SELECT @@SESSION.innodb_sched_priority_master;
+ERROR HY000: Variable 'innodb_sched_priority_master' is a GLOBAL variable
+SET GLOBAL innodb_sched_priority_master=19;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+@@GLOBAL.innodb_sched_priority_master
+19
+SET GLOBAL innodb_sched_priority_master=5;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+@@GLOBAL.innodb_sched_priority_master
+5
+SET GLOBAL innodb_sched_priority_master=0;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+@@GLOBAL.innodb_sched_priority_master
+0
+SET GLOBAL innodb_sched_priority_master=-1;
+Warnings:
+Warning	1292	Truncated incorrect innodb_sched_priority_master value: '-1'
+SELECT @@GLOBAL.innodb_sched_priority_master;
+@@GLOBAL.innodb_sched_priority_master
+0
+SET GLOBAL innodb_sched_priority_master=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_master'
+SET GLOBAL innodb_sched_priority_master=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_master'
+SET GLOBAL innodb_sched_priority_master='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_master'
diff --git a/mysql-test/suite/sys_vars/r/innodb_sched_priority_purge_basic.result b/mysql-test/suite/sys_vars/r/innodb_sched_priority_purge_basic.result
new file mode 100644
index 00000000000..e3b9c627214
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/innodb_sched_priority_purge_basic.result
@@ -0,0 +1,30 @@
+SET @start_value = @@GLOBAL.innodb_sched_priority_purge;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+@@GLOBAL.innodb_sched_priority_purge
+19
+SELECT @@SESSION.innodb_sched_priority_purge;
+ERROR HY000: Variable 'innodb_sched_priority_purge' is a GLOBAL variable
+SET GLOBAL innodb_sched_priority_purge=19;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+@@GLOBAL.innodb_sched_priority_purge
+19
+SET GLOBAL innodb_sched_priority_purge=5;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+@@GLOBAL.innodb_sched_priority_purge
+5
+SET GLOBAL innodb_sched_priority_purge=0;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+@@GLOBAL.innodb_sched_priority_purge
+0
+SET GLOBAL innodb_sched_priority_purge=-1;
+Warnings:
+Warning	1292	Truncated incorrect innodb_sched_priority_purge value: '-1'
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+@@GLOBAL.innodb_sched_priority_purge
+0
+SET GLOBAL innodb_sched_priority_purge=1.1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_purge'
+SET GLOBAL innodb_sched_priority_purge=1e1;
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_purge'
+SET GLOBAL innodb_sched_priority_purge='foo';
+ERROR 42000: Incorrect argument type to variable 'innodb_sched_priority_purge'
diff --git a/mysql-test/suite/sys_vars/r/innodb_sync_array_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_sync_array_size_basic.result
index df626e3373e..526dd7d8350 100644
--- a/mysql-test/suite/sys_vars/r/innodb_sync_array_size_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_sync_array_size_basic.result
@@ -1,7 +1,3 @@
-SET @start_global_value = @@global.innodb_sync_array_size;
-SELECT @start_global_value;
-@start_global_value
-1
 Valid values are between 0 and 1024
 SELECT @@global.innodb_sync_array_size between 0 and 1024;
 @@global.innodb_sync_array_size between 0 and 1024
@@ -11,69 +7,24 @@ SELECT @@global.innodb_sync_array_size;
 1
 SELECT @@session.innodb_sync_array_size;
 ERROR HY000: Variable 'innodb_sync_array_size' is a GLOBAL variable
-SHOW global variables LIKE 'innodb_sync_array_size';
+SHOW GLOBAL variables LIKE 'innodb_sync_array_size';
 Variable_name	Value
 innodb_sync_array_size	1
-SHOW session variables LIKE 'innodb_sync_array_size';
+SHOW SESSION variables LIKE 'innodb_sync_array_size';
 Variable_name	Value
 innodb_sync_array_size	1
 SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
+WHERE variable_name='innodb_sync_array_size';
 VARIABLE_NAME	VARIABLE_VALUE
 INNODB_SYNC_ARRAY_SIZE	1
 SELECT * FROM information_schema.session_variables 
-where variable_name='innodb_sync_array_size';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_SYNC_ARRAY_SIZE	1
-SET global innodb_sync_array_size=10;
-SELECT @@global.innodb_sync_array_size;
-@@global.innodb_sync_array_size
-10
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_SYNC_ARRAY_SIZE	10
-SELECT * FROM information_schema.session_variables 
-where variable_name='innodb_sync_array_size';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_SYNC_ARRAY_SIZE	10
-SET session innodb_sync_array_size=1;
-ERROR HY000: Variable 'innodb_sync_array_size' is a GLOBAL variable and should be set with SET GLOBAL
-SET global innodb_sync_array_size=1.1;
-ERROR 42000: Incorrect argument type to variable 'innodb_sync_array_size'
-SET global innodb_sync_array_size=1e1;
-ERROR 42000: Incorrect argument type to variable 'innodb_sync_array_size'
-SET global innodb_sync_array_size="foo";
-ERROR 42000: Incorrect argument type to variable 'innodb_sync_array_size'
-SET global innodb_sync_array_size=-7;
-Warnings:
-Warning	1292	Truncated incorrect innodb_sync_array_size value: '-7'
-SELECT @@global.innodb_sync_array_size;
-@@global.innodb_sync_array_size
-1
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
+WHERE variable_name='innodb_sync_array_size';
 VARIABLE_NAME	VARIABLE_VALUE
 INNODB_SYNC_ARRAY_SIZE	1
-SET global innodb_sync_array_size=96;
-SELECT @@global.innodb_sync_array_size;
-@@global.innodb_sync_array_size
-96
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_SYNC_ARRAY_SIZE	96
-SET global innodb_sync_array_size=0;
-Warnings:
-Warning	1292	Truncated incorrect innodb_sync_array_size value: '0'
-SELECT @@global.innodb_sync_array_size;
-@@global.innodb_sync_array_size
-1
-SET global innodb_sync_array_size=64;
-SELECT @@global.innodb_sync_array_size;
-@@global.innodb_sync_array_size
-64
-SET @@global.innodb_sync_array_size = @start_global_value;
+SET GLOBAL innodb_sync_array_size=10;
+ERROR HY000: Variable 'innodb_sync_array_size' is a read only variable
+SET SESSION innodb_sync_array_size=10;
+ERROR HY000: Variable 'innodb_sync_array_size' is a read only variable
 SELECT @@global.innodb_sync_array_size;
 @@global.innodb_sync_array_size
 1
diff --git a/mysql-test/suite/sys_vars/t/innodb_buffer_pool_evict_basic.test b/mysql-test/suite/sys_vars/t/innodb_buffer_pool_evict_basic.test
index ce42f64395f..f988292b21e 100644
--- a/mysql-test/suite/sys_vars/t/innodb_buffer_pool_evict_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_buffer_pool_evict_basic.test
@@ -2,9 +2,9 @@
 # This is a debug variable for now
 -- source include/have_debug.inc
 
-if (`select plugin_auth_version <= "5.5.31-MariaDB-30.2" from information_schema.plugins where plugin_name='innodb'`)
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
 {
-  --skip Not fixed in XtraDB 5.5.31-MariaDB-30.2 or earlier
+  --skip Not fixed in InnoDB 5.6.10 or earlier
 }
 
 SELECT @@global.innodb_buffer_pool_evict;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_eviction_factor_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_eviction_factor_basic.test
new file mode 100644
index 00000000000..8e0af20a47e
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_eviction_factor_basic.test
@@ -0,0 +1,35 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_eviction_factor;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_eviction_factor;
+
+# Correct values
+SET GLOBAL innodb_cleaner_eviction_factor='OFF';
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+SET GLOBAL innodb_cleaner_eviction_factor='ON';
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+SET GLOBAL innodb_cleaner_eviction_factor=0;
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+SET GLOBAL innodb_cleaner_eviction_factor=1;
+SELECT @@GLOBAL.innodb_cleaner_eviction_factor;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_eviction_factor=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_eviction_factor=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_cleaner_eviction_factor=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_cleaner_eviction_factor='foo';
+
+SET GLOBAL innodb_cleaner_eviction_factor = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_flush_chunk_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_flush_chunk_size_basic.test
new file mode 100644
index 00000000000..c65fc63c20f
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_flush_chunk_size_basic.test
@@ -0,0 +1,33 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_flush_chunk_size;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_flush_chunk_size;
+
+# Correct values
+SET GLOBAL innodb_cleaner_flush_chunk_size=1;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+SET GLOBAL innodb_cleaner_flush_chunk_size=1000;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+SET GLOBAL innodb_cleaner_flush_chunk_size=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+
+# Incorrect values: the SET to 0 has no --error, so it must succeed; the SELECT records the value actually stored
+SET GLOBAL innodb_cleaner_flush_chunk_size=0;
+SELECT @@GLOBAL.innodb_cleaner_flush_chunk_size;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_flush_chunk_size=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_flush_chunk_size=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_flush_chunk_size='foo';
+
+SET GLOBAL innodb_cleaner_flush_chunk_size = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_free_list_lwm_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_free_list_lwm_basic.test
new file mode 100644
index 00000000000..fa9d1e9f574
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_free_list_lwm_basic.test
@@ -0,0 +1,35 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_free_list_lwm;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_free_list_lwm;
+
+# Correct values
+SET GLOBAL innodb_cleaner_free_list_lwm=0;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+SET GLOBAL innodb_cleaner_free_list_lwm=1;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+SET GLOBAL innodb_cleaner_free_list_lwm=99;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+SET GLOBAL innodb_cleaner_free_list_lwm=100;
+SELECT @@GLOBAL.innodb_cleaner_free_list_lwm;
+
+# Incorrect values: the SET to 101 has no --error, so it must succeed; the SELECT records the value actually stored
+SET GLOBAL innodb_cleaner_free_list_lwm=101;
+SELECT @@innodb_cleaner_free_list_lwm;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_free_list_lwm=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_free_list_lwm=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_free_list_lwm='foo';
+
+SET GLOBAL innodb_cleaner_free_list_lwm = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_lru_chunk_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_lru_chunk_size_basic.test
new file mode 100644
index 00000000000..12da590446c
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_lru_chunk_size_basic.test
@@ -0,0 +1,33 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_lru_chunk_size;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_lru_chunk_size;
+
+# Correct values
+SET GLOBAL innodb_cleaner_lru_chunk_size=1;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+SET GLOBAL innodb_cleaner_lru_chunk_size=1000;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+SET GLOBAL innodb_cleaner_lru_chunk_size=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+
+# Incorrect values: the SET to 0 has no --error, so it must succeed; the SELECT records the value actually stored
+SET GLOBAL innodb_cleaner_lru_chunk_size=0;
+SELECT @@GLOBAL.innodb_cleaner_lru_chunk_size;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_lru_chunk_size=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_lru_chunk_size=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_lru_chunk_size='foo';
+
+SET GLOBAL innodb_cleaner_lru_chunk_size = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_lsn_age_factor_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_lsn_age_factor_basic.test
new file mode 100644
index 00000000000..b34fcc94494
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_lsn_age_factor_basic.test
@@ -0,0 +1,28 @@
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_lsn_age_factor;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_lsn_age_factor;
+
+# Correct values
+SET GLOBAL innodb_cleaner_lsn_age_factor='legacy';
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+SET GLOBAL innodb_cleaner_lsn_age_factor='high_checkpoint';
+SELECT @@GLOBAL.innodb_cleaner_lsn_age_factor;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_lsn_age_factor=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_lsn_age_factor=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_cleaner_lsn_age_factor='foo';
+
+SET GLOBAL innodb_cleaner_lsn_age_factor = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_max_flush_time_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_max_flush_time_basic.test
new file mode 100644
index 00000000000..283c651d0c5
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_max_flush_time_basic.test
@@ -0,0 +1,31 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_max_flush_time;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_max_flush_time;
+
+# Correct values
+SET GLOBAL innodb_cleaner_max_flush_time=0;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+SET GLOBAL innodb_cleaner_max_flush_time=1000;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+SET GLOBAL innodb_cleaner_max_flush_time=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_max_flush_time;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_flush_time=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_flush_time=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_flush_time='foo';
+
+SET GLOBAL innodb_cleaner_max_flush_time = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_cleaner_max_lru_time_basic.test b/mysql-test/suite/sys_vars/t/innodb_cleaner_max_lru_time_basic.test
new file mode 100644
index 00000000000..d0621e77df3
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_cleaner_max_lru_time_basic.test
@@ -0,0 +1,31 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_cleaner_max_lru_time;
+
+# Default value
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_cleaner_max_lru_time;
+
+# Correct values
+SET GLOBAL innodb_cleaner_max_lru_time=0;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+SET GLOBAL innodb_cleaner_max_lru_time=1000;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+SET GLOBAL innodb_cleaner_max_lru_time=4294967295;
+SELECT @@GLOBAL.innodb_cleaner_max_lru_time;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_lru_time=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_lru_time=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_cleaner_max_lru_time='foo';
+
+SET GLOBAL innodb_cleaner_max_lru_time = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_empty_free_list_algorithm_basic.test b/mysql-test/suite/sys_vars/t/innodb_empty_free_list_algorithm_basic.test
new file mode 100644
index 00000000000..6bb34f36a4f
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_empty_free_list_algorithm_basic.test
@@ -0,0 +1,30 @@
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_empty_free_list_algorithm;
+
+# Default value
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_empty_free_list_algorithm;
+
+# Correct values
+SET GLOBAL innodb_empty_free_list_algorithm='legacy';
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+SET GLOBAL innodb_empty_free_list_algorithm='backoff';
+SELECT @@GLOBAL.innodb_empty_free_list_algorithm;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_empty_free_list_algorithm=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_empty_free_list_algorithm=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_empty_free_list_algorithm=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_empty_free_list_algorithm='foo';
+
+SET GLOBAL innodb_empty_free_list_algorithm = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_foreground_preflush_basic.test b/mysql-test/suite/sys_vars/t/innodb_foreground_preflush_basic.test
new file mode 100644
index 00000000000..f388b392f9b
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_foreground_preflush_basic.test
@@ -0,0 +1,30 @@
+--source include/have_xtradb.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_foreground_preflush;
+
+# Default value
+SELECT @@GLOBAL.innodb_foreground_preflush;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_foreground_preflush;
+
+# Correct values
+SET GLOBAL innodb_foreground_preflush='sync_preflush';
+SELECT @@GLOBAL.innodb_foreground_preflush;
+SET GLOBAL innodb_foreground_preflush='exponential_backoff';
+SELECT @@GLOBAL.innodb_foreground_preflush;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_foreground_preflush=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_foreground_preflush=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_foreground_preflush=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_foreground_preflush='foo';
+
+SET GLOBAL innodb_foreground_preflush = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_ft_result_cache_limit_basic.test b/mysql-test/suite/sys_vars/t/innodb_ft_result_cache_limit_basic.test
new file mode 100644
index 00000000000..245ed4abdfb
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_ft_result_cache_limit_basic.test
@@ -0,0 +1,38 @@
+
+#
+#  2013-05-09 - Added
+#
+
+--source include/have_innodb.inc
+
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
+#
+# show the global and session values;
+#
+select @@global.innodb_ft_result_cache_limit;
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+select @@session.innodb_ft_result_cache_limit;
+show global variables like 'innodb_ft_result_cache_limit';
+show session variables like 'innodb_ft_result_cache_limit';
+select * from information_schema.global_variables where variable_name='innodb_ft_result_cache_limit';
+select * from information_schema.session_variables where variable_name='innodb_ft_result_cache_limit';
+
+#
+# test default, min, max value
+#
+let $innodb_ft_result_cache_limit_orig=`select @@innodb_ft_result_cache_limit`;
+
+set global innodb_ft_result_cache_limit=900000;
+select @@innodb_ft_result_cache_limit;
+
+set global innodb_ft_result_cache_limit=1000000;
+select @@innodb_ft_result_cache_limit;
+
+set global innodb_ft_result_cache_limit=4000000000;
+select @@innodb_ft_result_cache_limit;
+
+eval set global innodb_ft_result_cache_limit=$innodb_ft_result_cache_limit_orig;
diff --git a/mysql-test/suite/sys_vars/t/innodb_ft_total_cache_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_ft_total_cache_size_basic.test
new file mode 100644
index 00000000000..772ec5a1919
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_ft_total_cache_size_basic.test
@@ -0,0 +1,27 @@
+--source include/have_innodb.inc
+
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
+#
+# show the global and session values;
+#
+select @@global.innodb_ft_total_cache_size;
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+select @@session.innodb_ft_total_cache_size;
+show global variables like 'innodb_ft_total_cache_size';
+show session variables like 'innodb_ft_total_cache_size';
+select * from information_schema.global_variables where variable_name='innodb_ft_total_cache_size';
+select * from information_schema.session_variables where variable_name='innodb_ft_total_cache_size';
+
+#
+# show that it's read-only
+#
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+set global innodb_ft_total_cache_size=1;
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+set session innodb_ft_total_cache_size=1;
+
+
diff --git a/mysql-test/suite/sys_vars/t/innodb_log_arch_dir_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_arch_dir_basic.test
new file mode 100644
index 00000000000..084d97fa460
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_log_arch_dir_basic.test
@@ -0,0 +1,68 @@
+#######################################################
+# Basic test for innodb_log_arch_dir variable #
+#######################################################
+
+--source include/have_xtradb.inc
+
+let $datadir= `select @@datadir`;
+
+####################################################################
+#   Displaying default value                                       #
+####################################################################
+--replace_result $datadir ./
+SELECT @@GLOBAL.innodb_log_arch_dir;
+--echo NULL Expected
+
+
+####################################################################
+#   Check if value can be set                                      #
+####################################################################
+
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SET @@GLOBAL.innodb_log_arch_dir=1;
+--echo Expected error 'Read only variable'
+
+--replace_result $datadir ./
+SELECT @@GLOBAL.innodb_log_arch_dir;
+--echo NULL Expected
+
+--replace_result $datadir ./
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_arch_dir';
+--echo empty string Expected
+
+############################################
+#  Check accessing variable without GLOBAL #
+############################################
+--replace_result $datadir ./
+SELECT @@innodb_log_arch_dir;
+--echo NULL Expected
+
+
+
+##########################################################################
+#   Check if innodb_log_arch_dir can be accessed with and without @@ sign #
+##########################################################################
+
+--replace_result $datadir ./
+SELECT @@innodb_log_arch_dir;
+--echo NULL Expected
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@local.innodb_log_arch_dir;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_log_arch_dir;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+--replace_result $datadir ./
+SELECT @@GLOBAL.innodb_log_arch_dir;
+--echo NULL Expected
+
+--Error ER_BAD_FIELD_ERROR
+SELECT innodb_log_arch_dir = @@SESSION.innodb_log_arch_dir;
+--echo Expected error Unknown column 'innodb_log_arch_dir' in 'field list'
+
+
diff --git a/mysql-test/suite/sys_vars/t/innodb_log_arch_expire_sec_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_arch_expire_sec_basic.test
new file mode 100644
index 00000000000..87c374ea886
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_log_arch_expire_sec_basic.test
@@ -0,0 +1,60 @@
+###############################################################
+# Basic test for innodb_log_arch_expire_sec variable #
+###############################################################
+
+--source include/have_xtradb.inc
+
+SELECT @@GLOBAL.innodb_log_arch_expire_sec INTO @save;
+
+####################################################################
+#   Displaying default value                                       #
+####################################################################
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+--echo 0 Expected
+
+
+####################################################################
+#   Check if value can be set                                      #
+####################################################################
+
+SET @@GLOBAL.innodb_log_arch_expire_sec=1;
+
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+--echo 1 Expected
+
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_arch_expire_sec';
+--echo 1 Expected
+
+############################################
+#  Check accessing variable without GLOBAL #
+############################################
+SELECT @@innodb_log_arch_expire_sec;
+--echo 1 Expected
+
+
+
+##################################################################################
+#   Check if innodb_log_arch_expire_sec can be accessed with and without @@ sign #
+##################################################################################
+
+SELECT @@innodb_log_arch_expire_sec;
+--echo 1 Expected
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@local.innodb_log_arch_expire_sec;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_log_arch_expire_sec;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+SELECT @@GLOBAL.innodb_log_arch_expire_sec;
+--echo 1 Expected
+
+--Error ER_BAD_FIELD_ERROR
+SELECT innodb_log_arch_expire_sec = @@SESSION.innodb_log_arch_expire_sec;
+--echo Expected error Unknown column 'innodb_log_arch_expire_sec' in 'field list'
+
+SET @@GLOBAL.innodb_log_arch_expire_sec = @save;
diff --git a/mysql-test/suite/sys_vars/t/innodb_log_archive_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_archive_basic.test
new file mode 100644
index 00000000000..cbc885123ce
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_log_archive_basic.test
@@ -0,0 +1,61 @@
+###################################################
+# Basic test for innodb_log_archive variable #
+###################################################
+
+--source include/have_xtradb.inc
+
+####################################################################
+#   Displaying default value                                       #
+####################################################################
+SELECT @@GLOBAL.innodb_log_archive;
+--echo 0 Expected
+
+
+####################################################################
+#   Check if value can be set                                      #
+####################################################################
+
+SET @save_innodb_log_archive = @@GLOBAL.innodb_log_archive;
+SET @@GLOBAL.innodb_log_archive=1;
+
+SELECT @@GLOBAL.innodb_log_archive;
+--echo 1 Expected
+
+SELECT VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES 
+WHERE VARIABLE_NAME='innodb_log_archive';
+--echo ON Expected
+
+SET @@GLOBAL.innodb_log_archive = @save_innodb_log_archive;
+
+############################################
+#  Check accessing variable without GLOBAL #
+############################################
+SELECT @@innodb_log_archive;
+--echo 0 Expected
+
+
+
+##########################################################################
+#   Check if innodb_log_archive can be accessed with and without @@ sign #
+##########################################################################
+
+SELECT @@innodb_log_archive;
+--echo 0 Expected
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@local.innodb_log_archive;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_log_archive;
+--echo Expected error 'Variable is a GLOBAL variable'
+
+SELECT @@GLOBAL.innodb_log_archive;
+--echo 0 Expected
+
+--Error ER_BAD_FIELD_ERROR
+SELECT innodb_log_archive = @@SESSION.innodb_log_archive;
+--echo Expected error Unknown column 'innodb_log_archive' in 'field list'
+
+
diff --git a/mysql-test/suite/sys_vars/t/innodb_log_checksum_algorithm_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_checksum_algorithm_basic.test
new file mode 100644
index 00000000000..1a83d4f2602
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_log_checksum_algorithm_basic.test
@@ -0,0 +1,38 @@
+--source include/have_xtradb.inc
+
+# Check the default value
+SET @orig = @@global.innodb_log_checksum_algorithm;
+SELECT @orig;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'crc32';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_crc32';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'innodb';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_innodb';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'none';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = 'strict_none';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+-- error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_log_checksum_algorithm = '';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+-- error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_log_checksum_algorithm = 'foobar';
+SELECT @@global.innodb_log_checksum_algorithm;
+
+-- error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_log_checksum_algorithm = 123;
+SELECT @@global.innodb_log_checksum_algorithm;
+
+SET GLOBAL innodb_log_checksum_algorithm = @orig;
+SELECT @@global.innodb_log_checksum_algorithm;
diff --git a/mysql-test/suite/sys_vars/t/innodb_log_compressed_pages_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_compressed_pages_basic.test
new file mode 100644
index 00000000000..8d10309ae02
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_log_compressed_pages_basic.test
@@ -0,0 +1,93 @@
+--source include/have_innodb.inc
+
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
+SET @start_global_value = @@global.innodb_log_compressed_pages;
+SELECT @start_global_value;
+
+
+--echo '#---------------------BS_STVARS_028_01----------------------#'
+####################################################################
+#   Displaying default value                                       #
+####################################################################
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+--echo 1 Expected
+
+
+--echo '#---------------------BS_STVARS_028_02----------------------#'
+####################################################################
+#   Check if value can be set                                      #
+####################################################################
+
+SET @@global.innodb_log_compressed_pages = 0;
+SELECT @@global.innodb_log_compressed_pages;
+
+SET @@global.innodb_log_compressed_pages ='On' ;
+SELECT @@global.innodb_log_compressed_pages;
+
+SET @@global.innodb_log_compressed_pages ='Off' ;
+SELECT @@global.innodb_log_compressed_pages;
+
+SET @@global.innodb_log_compressed_pages = 1;
+SELECT @@global.innodb_log_compressed_pages;
+
+--echo '#---------------------BS_STVARS_028_03----------------------#'
+#################################################################
+# Check if the value in GLOBAL Table matches value in variable  #
+#################################################################
+
+SELECT IF(@@GLOBAL.innodb_log_compressed_pages,'ON','OFF') = VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
+WHERE VARIABLE_NAME='innodb_log_compressed_pages';
+--echo 1 Expected
+
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+--echo 1 Expected
+
+SELECT COUNT(VARIABLE_VALUE)
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
+WHERE VARIABLE_NAME='innodb_log_compressed_pages';
+--echo 1 Expected
+
+
+
+--echo '#---------------------BS_STVARS_028_04----------------------#'
+################################################################################
+#  Check if accessing variable with and without GLOBAL point to same variable  #
+################################################################################
+SELECT @@innodb_log_compressed_pages = @@GLOBAL.innodb_log_compressed_pages;
+--echo 1 Expected
+
+
+
+--echo '#---------------------BS_STVARS_028_05----------------------#'
+################################################################################
+# Check if innodb_log_compressed_pages can be accessed with and without @@ sign#
+################################################################################
+
+SELECT COUNT(@@innodb_log_compressed_pages);
+--echo 1 Expected
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT COUNT(@@local.innodb_log_compressed_pages);
+--echo Expected error 'Variable is a GLOBAL variable'
+
+--Error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT COUNT(@@SESSION.innodb_log_compressed_pages);
+--echo Expected error 'Variable is a GLOBAL variable'
+
+SELECT COUNT(@@GLOBAL.innodb_log_compressed_pages);
+--echo 1 Expected
+
+--Error ER_BAD_FIELD_ERROR
+SELECT innodb_log_compressed_pages = @@SESSION.innodb_log_compressed_pages;
+
+#
+# Cleanup
+#
+
+SET @@global.innodb_log_compressed_pages = @start_global_value;
+SELECT @@global.innodb_log_compressed_pages;
diff --git a/mysql-test/suite/sys_vars/t/innodb_merge_sort_block_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_merge_sort_block_size_basic.test
deleted file mode 100644
index 2ec4870f345..00000000000
--- a/mysql-test/suite/sys_vars/t/innodb_merge_sort_block_size_basic.test
+++ /dev/null
@@ -1,19 +0,0 @@
---source include/have_xtradb.inc
-
-SELECT @@global.innodb_merge_sort_block_size;
-SELECT @@session.innodb_merge_sort_block_size;
-
-SET @old_global=@@global.innodb_merge_sort_block_size;
-SET @old_session=@@session.innodb_merge_sort_block_size;
-
-SET @@global.innodb_merge_sort_block_size = 2*1024*1024;
-SET @@session.innodb_merge_sort_block_size = 4*1024*1024;
-
-SELECT @@global.innodb_merge_sort_block_size;
-SELECT @@session.innodb_merge_sort_block_size;
-
-SET @@global.innodb_merge_sort_block_size = 1024*1024*1024+1;
-SELECT @@global.innodb_merge_sort_block_size;
-
-SET @@global.innodb_merge_sort_block_size=@old_global;
-SET @@session.innodb_merge_sort_block_size=@old_session;
diff --git a/mysql-test/suite/sys_vars/t/innodb_monitor_disable_basic.test b/mysql-test/suite/sys_vars/t/innodb_monitor_disable_basic.test
index 1b23ae14e49..0615d62a0e1 100644
--- a/mysql-test/suite/sys_vars/t/innodb_monitor_disable_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_monitor_disable_basic.test
@@ -2,6 +2,11 @@
 # Test the metrics monitor system's control system
 # and counter accuracy.
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --source include/have_innodb.inc
 set global innodb_monitor_disable = All;
 # Test turn on/off the monitor counter  with "all" option
diff --git a/mysql-test/suite/sys_vars/t/innodb_monitor_enable_basic.test b/mysql-test/suite/sys_vars/t/innodb_monitor_enable_basic.test
index 1b23ae14e49..0615d62a0e1 100644
--- a/mysql-test/suite/sys_vars/t/innodb_monitor_enable_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_monitor_enable_basic.test
@@ -2,6 +2,11 @@
 # Test the metrics monitor system's control system
 # and counter accuracy.
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --source include/have_innodb.inc
 set global innodb_monitor_disable = All;
 # Test turn on/off the monitor counter  with "all" option
diff --git a/mysql-test/suite/sys_vars/t/innodb_monitor_reset_all_basic.test b/mysql-test/suite/sys_vars/t/innodb_monitor_reset_all_basic.test
index 1b23ae14e49..868f69300fa 100644
--- a/mysql-test/suite/sys_vars/t/innodb_monitor_reset_all_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_monitor_reset_all_basic.test
@@ -2,6 +2,11 @@
 # Test the metrics monitor system's control system
 # and counter accuracy.
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --source include/have_innodb.inc
 set global innodb_monitor_disable = All;
 # Test turn on/off the monitor counter  with "all" option
diff --git a/mysql-test/suite/sys_vars/t/innodb_monitor_reset_basic.test b/mysql-test/suite/sys_vars/t/innodb_monitor_reset_basic.test
index 1b23ae14e49..868f69300fa 100644
--- a/mysql-test/suite/sys_vars/t/innodb_monitor_reset_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_monitor_reset_basic.test
@@ -2,6 +2,11 @@
 # Test the metrics monitor system's control system
 # and counter accuracy.
 
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
+
 --source include/have_innodb.inc
 set global innodb_monitor_disable = All;
 # Test turn on/off the monitor counter  with "all" option
diff --git a/mysql-test/suite/sys_vars/t/innodb_persistent_stats_root_page_basic.test b/mysql-test/suite/sys_vars/t/innodb_persistent_stats_root_page_basic.test
deleted file mode 100644
index 2e216e10521..00000000000
--- a/mysql-test/suite/sys_vars/t/innodb_persistent_stats_root_page_basic.test
+++ /dev/null
@@ -1,26 +0,0 @@
---source include/have_debug.inc
---source include/have_xtradb.inc
-
-SELECT @@global.innodb_persistent_stats_root_page;
-SELECT COUNT(@@global.innodb_persistent_stats_root_page);
-
-# Read-only variable
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-SET @@global.innodb_persistent_stats_root_page=100;
-
-# Check if INFORMATION_SCHEMA agrees with the var
-SELECT @@global.innodb_persistent_stats_root_page = VARIABLE_VALUE
-FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
-WHERE VARIABLE_NAME='innodb_persistent_stats_root_page';
-
-SELECT COUNT(*) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
-WHERE VARIABLE_NAME='innodb_persistent_stats_root_page';
-
-# Check if accessing the var without GLOBAL points to the same
-SELECT @@innodb_persistent_stats_root_page = @@global.innodb_persistent_stats_root_page;
-
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-SELECT COUNT(@@local.innodb_persistent_stats_root_page);
-
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-SELECT COUNT(@@session.innodb_persistent_stats_root_page);
diff --git a/mysql-test/suite/sys_vars/t/innodb_priority_cleaner_basic.test b/mysql-test/suite/sys_vars/t/innodb_priority_cleaner_basic.test
new file mode 100644
index 00000000000..a305978a280
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_priority_cleaner_basic.test
@@ -0,0 +1,36 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_priority_cleaner;
+
+# Default value
+SELECT @@GLOBAL.innodb_priority_cleaner;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_priority_cleaner;
+
+# Correct values
+SET GLOBAL innodb_priority_cleaner='OFF';
+SELECT @@GLOBAL.innodb_priority_cleaner;
+SET GLOBAL innodb_priority_cleaner='ON';
+SELECT @@GLOBAL.innodb_priority_cleaner;
+SET GLOBAL innodb_priority_cleaner=0;
+SELECT @@GLOBAL.innodb_priority_cleaner;
+SET GLOBAL innodb_priority_cleaner=1;
+SELECT @@GLOBAL.innodb_priority_cleaner;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_cleaner=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_cleaner=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_cleaner=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_cleaner='foo';
+
+SET GLOBAL innodb_priority_cleaner = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_priority_io_basic.test b/mysql-test/suite/sys_vars/t/innodb_priority_io_basic.test
new file mode 100644
index 00000000000..d8a04cccf1e
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_priority_io_basic.test
@@ -0,0 +1,36 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_priority_io;
+
+# Default value
+SELECT @@GLOBAL.innodb_priority_io;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_priority_io;
+
+# Correct values
+SET GLOBAL innodb_priority_io='OFF';
+SELECT @@GLOBAL.innodb_priority_io;
+SET GLOBAL innodb_priority_io='ON';
+SELECT @@GLOBAL.innodb_priority_io;
+SET GLOBAL innodb_priority_io=0;
+SELECT @@GLOBAL.innodb_priority_io;
+SET GLOBAL innodb_priority_io=1;
+SELECT @@GLOBAL.innodb_priority_io;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_io=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_io=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_io=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_io='foo';
+
+SET GLOBAL innodb_priority_io = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_priority_master_basic.test b/mysql-test/suite/sys_vars/t/innodb_priority_master_basic.test
new file mode 100644
index 00000000000..f202738f4e1
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_priority_master_basic.test
@@ -0,0 +1,36 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_priority_master;
+
+# Default value
+SELECT @@GLOBAL.innodb_priority_master;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_priority_master;
+
+# Correct values
+SET GLOBAL innodb_priority_master='OFF';
+SELECT @@GLOBAL.innodb_priority_master;
+SET GLOBAL innodb_priority_master='ON';
+SELECT @@GLOBAL.innodb_priority_master;
+SET GLOBAL innodb_priority_master=0;
+SELECT @@GLOBAL.innodb_priority_master;
+SET GLOBAL innodb_priority_master=1;
+SELECT @@GLOBAL.innodb_priority_master;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_master=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_master=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_master=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_master='foo';
+
+SET GLOBAL innodb_priority_master = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_priority_purge_basic.test b/mysql-test/suite/sys_vars/t/innodb_priority_purge_basic.test
new file mode 100644
index 00000000000..b17a97838a5
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_priority_purge_basic.test
@@ -0,0 +1,36 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_priority_purge;
+
+# Default value
+SELECT @@GLOBAL.innodb_priority_purge;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_priority_purge;
+
+# Correct values
+SET GLOBAL innodb_priority_purge='OFF';
+SELECT @@GLOBAL.innodb_priority_purge;
+SET GLOBAL innodb_priority_purge='ON';
+SELECT @@GLOBAL.innodb_priority_purge;
+SET GLOBAL innodb_priority_purge=0;
+SELECT @@GLOBAL.innodb_priority_purge;
+SET GLOBAL innodb_priority_purge=1;
+SELECT @@GLOBAL.innodb_priority_purge;
+
+# Incorrect values
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_purge=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_priority_purge=1e1;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_purge=2;
+--error ER_WRONG_VALUE_FOR_VAR
+SET GLOBAL innodb_priority_purge='foo';
+
+SET GLOBAL innodb_priority_purge = @start_value;
diff --git a/mysql-test/suite/sys_vars/t/innodb_sched_priority_cleaner_basic.test b/mysql-test/suite/sys_vars/t/innodb_sched_priority_cleaner_basic.test
new file mode 100644
index 00000000000..b2382fd7844
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_sched_priority_cleaner_basic.test
@@ -0,0 +1,43 @@
+--source include/have_xtradb.inc
+--source include/linux.inc
+--source include/not_embedded.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_sched_priority_cleaner;
+
+# Default value
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_sched_priority_cleaner;
+
+# Correct values
+# The high priority values may need permissions, thus do not test them
+# SET GLOBAL innodb_sched_priority_cleaner=39;
+# SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+#SET GLOBAL innodb_sched_priority_cleaner=34;
+# SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+SET GLOBAL innodb_sched_priority_cleaner=19;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+SET GLOBAL innodb_sched_priority_cleaner=5;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+SET GLOBAL innodb_sched_priority_cleaner=0;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+
+# Incorrect values
+SET GLOBAL innodb_sched_priority_cleaner=-1;
+SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+#SET GLOBAL innodb_sched_priority_cleaner=40;
+#SELECT @@GLOBAL.innodb_sched_priority_cleaner;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_cleaner=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_cleaner=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_cleaner='foo';
+
+# If we are lacking permissions, then we cannot restore the startup value
+# at the end.
+--source include/restart_mysqld.inc
diff --git a/mysql-test/suite/sys_vars/t/innodb_sched_priority_io_basic.test b/mysql-test/suite/sys_vars/t/innodb_sched_priority_io_basic.test
new file mode 100644
index 00000000000..f77816e84c2
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_sched_priority_io_basic.test
@@ -0,0 +1,44 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+--source include/not_embedded.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_sched_priority_io;
+
+# Default value
+SELECT @@GLOBAL.innodb_sched_priority_io;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_sched_priority_io;
+
+# Correct values
+# The high priority values may need permissions, thus do not test them
+# SET GLOBAL innodb_sched_priority_io=39;
+# SELECT @@GLOBAL.innodb_sched_priority_io;
+#SET GLOBAL innodb_sched_priority_io=34;
+# SELECT @@GLOBAL.innodb_sched_priority_io;
+SET GLOBAL innodb_sched_priority_io=19;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+SET GLOBAL innodb_sched_priority_io=5;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+SET GLOBAL innodb_sched_priority_io=0;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+
+# Incorrect values
+SET GLOBAL innodb_sched_priority_io=-1;
+SELECT @@GLOBAL.innodb_sched_priority_io;
+#SET GLOBAL innodb_sched_priority_io=40;
+#SELECT @@GLOBAL.innodb_sched_priority_io;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_io=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_io=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_io='foo';
+
+# If we are lacking permissions, then we cannot restore the startup value
+# at the end.
+--source include/restart_mysqld.inc
diff --git a/mysql-test/suite/sys_vars/t/innodb_sched_priority_master_basic.test b/mysql-test/suite/sys_vars/t/innodb_sched_priority_master_basic.test
new file mode 100644
index 00000000000..150e7e5793d
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_sched_priority_master_basic.test
@@ -0,0 +1,44 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+--source include/not_embedded.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_sched_priority_master;
+
+# Default value
+SELECT @@GLOBAL.innodb_sched_priority_master;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_sched_priority_master;
+
+# Correct values
+# The high priority values may need permissions, thus do not test them
+# SET GLOBAL innodb_sched_priority_master=39;
+# SELECT @@GLOBAL.innodb_sched_priority_master;
+#SET GLOBAL innodb_sched_priority_master=34;
+# SELECT @@GLOBAL.innodb_sched_priority_master;
+SET GLOBAL innodb_sched_priority_master=19;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+SET GLOBAL innodb_sched_priority_master=5;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+SET GLOBAL innodb_sched_priority_master=0;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+
+# Incorrect values
+SET GLOBAL innodb_sched_priority_master=-1;
+SELECT @@GLOBAL.innodb_sched_priority_master;
+#SET GLOBAL innodb_sched_priority_master=40;
+#SELECT @@GLOBAL.innodb_sched_priority_master;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_master=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_master=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_master='foo';
+
+# If we are lacking permissions, then we cannot restore the startup value
+# at the end.
+--source include/restart_mysqld.inc
diff --git a/mysql-test/suite/sys_vars/t/innodb_sched_priority_purge_basic.test b/mysql-test/suite/sys_vars/t/innodb_sched_priority_purge_basic.test
new file mode 100644
index 00000000000..bc37e4ee568
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/innodb_sched_priority_purge_basic.test
@@ -0,0 +1,44 @@
+--source include/have_debug.inc
+--source include/have_xtradb.inc
+--source include/linux.inc
+--source include/not_embedded.inc
+
+# A dynamic, global variable
+
+SET @start_value = @@GLOBAL.innodb_sched_priority_purge;
+
+# Default value
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+
+# Global only
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.innodb_sched_priority_purge;
+
+# Correct values
+# The high priority values may need permissions, thus do not test them
+# SET GLOBAL innodb_sched_priority_purge=39;
+# SELECT @@GLOBAL.innodb_sched_priority_purge;
+#SET GLOBAL innodb_sched_priority_purge=34;
+# SELECT @@GLOBAL.innodb_sched_priority_purge;
+SET GLOBAL innodb_sched_priority_purge=19;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+SET GLOBAL innodb_sched_priority_purge=5;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+SET GLOBAL innodb_sched_priority_purge=0;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+
+# Incorrect values
+SET GLOBAL innodb_sched_priority_purge=-1;
+SELECT @@GLOBAL.innodb_sched_priority_purge;
+#SET GLOBAL innodb_sched_priority_purge=40;
+#SELECT @@GLOBAL.innodb_sched_priority_purge;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_purge=1.1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_purge=1e1;
+--error ER_WRONG_TYPE_FOR_VAR
+SET GLOBAL innodb_sched_priority_purge='foo';
+
+# If we are lacking permissions, then we cannot restore the startup value
+# at the end.
+--source include/restart_mysqld.inc
diff --git a/mysql-test/suite/sys_vars/t/innodb_sync_array_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_sync_array_size_basic.test
index 27baf47422d..53011acb576 100644
--- a/mysql-test/suite/sys_vars/t/innodb_sync_array_size_basic.test
+++ b/mysql-test/suite/sys_vars/t/innodb_sync_array_size_basic.test
@@ -1,66 +1,31 @@
-
-
 # 2010-01-27 - Added
-#
 
 --source include/have_innodb.inc
 
-SET @start_global_value = @@global.innodb_sync_array_size;
-SELECT @start_global_value;
+if (`select plugin_auth_version <= "5.6.10" from information_schema.plugins where plugin_name='innodb'`)
+{
+  --skip Not fixed in InnoDB 5.6.10 or earlier
+}
 
-#
-# exists as global only
+# Exists as global only
 #
 --echo Valid values are between 0 and 1024
 SELECT @@global.innodb_sync_array_size between 0 and 1024;
 SELECT @@global.innodb_sync_array_size;
 --error ER_INCORRECT_GLOBAL_LOCAL_VAR
 SELECT @@session.innodb_sync_array_size;
-SHOW global variables LIKE 'innodb_sync_array_size';
-SHOW session variables LIKE 'innodb_sync_array_size';
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
-SELECT * FROM information_schema.session_variables 
-where variable_name='innodb_sync_array_size';
-
-#
-# SHOW that it's writable
-#
-SET global innodb_sync_array_size=10;
-SELECT @@global.innodb_sync_array_size;
+SHOW GLOBAL variables LIKE 'innodb_sync_array_size';
+SHOW SESSION variables LIKE 'innodb_sync_array_size';
 SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
+WHERE variable_name='innodb_sync_array_size';
 SELECT * FROM information_schema.session_variables 
-where variable_name='innodb_sync_array_size';
---error ER_GLOBAL_VARIABLE
-SET session innodb_sync_array_size=1;
+WHERE variable_name='innodb_sync_array_size';
 
 #
-# incorrect types
+# Show that it's read-only
 #
---error ER_WRONG_TYPE_FOR_VAR
-SET global innodb_sync_array_size=1.1;
---error ER_WRONG_TYPE_FOR_VAR
-SET global innodb_sync_array_size=1e1;
---error ER_WRONG_TYPE_FOR_VAR
-SET global innodb_sync_array_size="foo";
-
-SET global innodb_sync_array_size=-7;
-SELECT @@global.innodb_sync_array_size;
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
-SET global innodb_sync_array_size=96;
-SELECT @@global.innodb_sync_array_size;
-SELECT * FROM information_schema.global_variables 
-where variable_name='innodb_sync_array_size';
-
-#
-# min/max values
-#
-SET global innodb_sync_array_size=0;
-SELECT @@global.innodb_sync_array_size;
-SET global innodb_sync_array_size=64;
-SELECT @@global.innodb_sync_array_size;
-
-SET @@global.innodb_sync_array_size = @start_global_value;
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SET GLOBAL innodb_sync_array_size=10;
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SET SESSION innodb_sync_array_size=10;
 SELECT @@global.innodb_sync_array_size;
diff --git a/mysql-test/t/information_schema_all_engines-master.opt b/mysql-test/t/information_schema_all_engines-master.opt
index 0a9fa574e49..e37aeaac933 100644
--- a/mysql-test/t/information_schema_all_engines-master.opt
+++ b/mysql-test/t/information_schema_all_engines-master.opt
@@ -1,19 +1,15 @@
 --loose-skip-safemalloc --loose-mutex-deadlock-detector=0
---loose-innodb-buffer-pool-pages
---loose-innodb-buffer-pool-pages-blob
---loose-innodb-buffer-pool-pages-index
 --loose-innodb-changed-pages
 --loose-innodb-cmp
+--loose-innodb-cmp-per-index
 --loose-innodb-cmp-reset
+--loose-innodb-cmpmem
 --loose-innodb-cmpmem-reset
 --loose-innodb-index-stats
 --loose-innodb-lock-waits
---loose-innodb-rseg
 --loose-innodb-sys-columns
 --loose-innodb-sys-fields
 --loose-innodb-sys-foreign
 --loose-innodb-sys-foreign-cols
---loose-innodb-sys-stats
 --loose-innodb-sys-tables
 --loose-innodb-sys-tablestats
---loose-innodb-table-stats
diff --git a/mysql-test/t/information_schema_all_engines.test b/mysql-test/t/information_schema_all_engines.test
index 73ed00376ab..c7955a38e1b 100644
--- a/mysql-test/t/information_schema_all_engines.test
+++ b/mysql-test/t/information_schema_all_engines.test
@@ -49,7 +49,7 @@ SELECT t.table_name, c1.column_name
 #
 # Bug#24630  Subselect query crashes mysqld
 #
-select 1 as f1 from information_schema.tables  where "ALL_PLUGINS"=
+select 1 as "must be 1" from information_schema.tables  where "ACCOUNTS"=
 (select cast(table_name as char)  from information_schema.tables
  order by table_name limit 1) limit 1;
 
diff --git a/plugin/metadata_lock_info/mysql-test/metadata_lock_info/suite.opt b/plugin/metadata_lock_info/mysql-test/metadata_lock_info/suite.opt
index 638f267c166..47a7881bf33 100644
--- a/plugin/metadata_lock_info/mysql-test/metadata_lock_info/suite.opt
+++ b/plugin/metadata_lock_info/mysql-test/metadata_lock_info/suite.opt
@@ -1,2 +1,2 @@
 --loose-metadata_lock_info
---plugin-load=$METADATA_LOCK_INFO_SO
+--plugin-load-add=$METADATA_LOCK_INFO_SO
diff --git a/plugin/query_response_time/mysql-test/query_response_time/suite.opt b/plugin/query_response_time/mysql-test/query_response_time/suite.opt
index dcf875578c1..7283ce84e33 100644
--- a/plugin/query_response_time/mysql-test/query_response_time/suite.opt
+++ b/plugin/query_response_time/mysql-test/query_response_time/suite.opt
@@ -1 +1 @@
---plugin-load=$QUERY_RESPONSE_TIME_SO  --plugin-query-response-time=ON --plugin-query-response-time-audit=ON
+--plugin-load-add=$QUERY_RESPONSE_TIME_SO  --plugin-query-response-time=ON --plugin-query-response-time-audit=ON
diff --git a/sql/handler.h b/sql/handler.h
index 360120615f3..69b9c3e071e 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1369,7 +1369,7 @@ static inline sys_var *find_hton_sysvar(handlerton *hton, st_mysql_sys_var *var)
   Schema which have no meaning for replication.
 */
 #define HTON_NO_BINLOG_ROW_OPT       (1 << 9)
-#define HTON_EXTENDED_KEYS           (1 <<10) //supports extended keys
+#define HTON_SUPPORTS_EXTENDED_KEYS  (1 <<10) //supports extended keys
 
 class Ha_trx_info;
 
diff --git a/sql/innodb_priv.h b/sql/innodb_priv.h
index 82d74236ff9..b9e471b3b13 100644
--- a/sql/innodb_priv.h
+++ b/sql/innodb_priv.h
@@ -31,4 +31,6 @@ uint strconvert(CHARSET_INFO *from_cs, const char *from, uint from_length,
                 uint *errors);
 void sql_print_error(const char *format, ...);
 
+#define thd_binlog_pos(X, Y, Z) mysql_bin_log_commit_pos(X, Z, Y)
+
 #endif /* INNODB_PRIV_INCLUDED */
diff --git a/sql/mysqld.h b/sql/mysqld.h
index 1b718cb8f05..12b5d857e6e 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -140,7 +140,8 @@ extern char *opt_backup_history_logname, *opt_backup_progress_logname,
 extern const char *log_output_str;
 extern const char *log_backup_output_str;
 extern char *mysql_home_ptr, *pidfile_name_ptr;
-extern char glob_hostname[FN_REFLEN], mysql_home[FN_REFLEN];
+extern MYSQL_PLUGIN_IMPORT char glob_hostname[FN_REFLEN];
+extern char mysql_home[FN_REFLEN];
 extern char pidfile_name[FN_REFLEN], system_time_zone[30], *opt_init_file;
 extern char default_logfile_name[FN_REFLEN];
 extern char log_error_file[FN_REFLEN], *opt_tc_log_file;
diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
index 6a01ffe4ffb..2d061fc314c 100644
--- a/sql/share/errmsg-utf8.txt
+++ b/sql/share/errmsg-utf8.txt
@@ -6932,6 +6932,12 @@ ER_ROW_IN_WRONG_PARTITION
   eng "Found a row in wrong partition %s"
   swe "Hittade en rad i fel partition %s"
 
+ER_MTS_EVENT_BIGGER_PENDING_JOBS_SIZE_MAX
+  eng "Cannot schedule event %s, relay-log name %s, position %s to Worker thread because its size %lu exceeds %lu of slave_pending_jobs_size_max."
+
+ER_INNODB_NO_FT_USES_PARSER
+  eng "Cannot CREATE FULLTEXT INDEX WITH PARSER on InnoDB table"
+
 #
 # MariaDB error messages section starts here
 #
diff --git a/sql/table.cc b/sql/table.cc
index a4065f2c393..ecab578d97d 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -1191,7 +1191,7 @@ int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write,
       next_chunk+= str_db_type_length + 2;
     }
 
-    share->set_use_ext_keys_flag(plugin_hton(se_plugin)->flags & HTON_EXTENDED_KEYS);
+    share->set_use_ext_keys_flag(plugin_hton(se_plugin)->flags & HTON_SUPPORTS_EXTENDED_KEYS);
 
     if (create_key_infos(disk_buff + 6, frm_image_end, keys, keyinfo,
                          new_frm_ver, ext_key_parts,
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 3aab8662376..ee7ea4246f9 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -100,7 +100,12 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "fts0priv.h"
 #include "page0zip.h"
 
-extern "C" enum_tx_isolation thd_get_trx_isolation(const THD* thd);
+#define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+#define tc_size 400
+#define tdc_size 400
+#endif
 
 #include "ha_innodb.h"
 #include "i_s.h"
@@ -2798,7 +2803,7 @@ innobase_init(
 
 	innobase_hton->flush_logs = innobase_flush_logs;
 	innobase_hton->show_status = innobase_show_status;
-        innobase_hton->flags = HTON_EXTENDED_KEYS;
+	innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS;
 
 	innobase_hton->release_temporary_latches =
 		innobase_release_temporary_latches;
@@ -3467,8 +3472,7 @@ innobase_commit_ordered_2(
 	trx_t*	trx, 	/*!< in: Innodb transaction */
 	THD*	thd)	/*!< in: MySQL thread handle */
 {
-	ulonglong tmp_pos;
-	DBUG_ENTER("innobase_commit_ordered");
+	DBUG_ENTER("innobase_commit_ordered_2");
 
 	/* We need current binlog position for ibbackup to work.
 	Note, the position is current because commit_ordered is guaranteed
@@ -3491,9 +3495,9 @@ retry:
 		}
 	}
 
-	mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
-	trx->mysql_log_offset = (ib_int64_t) tmp_pos;
-
+        unsigned long long pos;
+        thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos);
+        trx->mysql_log_offset= static_cast<ib_int64_t>(pos);
 	/* Don't do write + flush right now. For group commit
 	   to work we want to do the flush in the innobase_commit()
 	   method, which runs without holding any locks. */
@@ -3818,7 +3822,7 @@ innobase_checkpoint_request(
 Log code calls this whenever log has been written and/or flushed up
 to a new position. We use this to notify upper layer of a new commit
 checkpoint when necessary.*/
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_mysql_log_notify(
 /*===============*/
@@ -9443,10 +9447,7 @@ ha_innobase::parse_table_name(
 		}
 
 		if (ignore) {
-			push_warning_printf(
-				thd, Sql_condition::WARN_LEVEL_WARN,
-				WARN_OPTION_IGNORED,
-				ER_DEFAULT(WARN_OPTION_IGNORED),
+			my_error(WARN_OPTION_IGNORED, ME_JUST_WARNING,
 				"DATA DIRECTORY");
 		} else {
 			strncpy(remote_path, create_info->data_file_name,
@@ -9455,10 +9456,7 @@ ha_innobase::parse_table_name(
 	}
 
 	if (create_info->index_file_name) {
-		push_warning_printf(
-			thd, Sql_condition::WARN_LEVEL_WARN,
-			WARN_OPTION_IGNORED,
-			ER_DEFAULT(WARN_OPTION_IGNORED),
+		my_error(WARN_OPTION_IGNORED, ME_JUST_WARNING,
 			"INDEX DIRECTORY");
 	}
 
@@ -9512,6 +9510,11 @@ innobase_table_flags(
 				DBUG_RETURN(false);
 			}
 
+			if (key->flags & HA_USES_PARSER) {
+				my_error(ER_INNODB_NO_FT_USES_PARSER, MYF(0));
+                                DBUG_RETURN(false);
+			}
+
 			if (fts_doc_id_index_bad) {
 				goto index_bad;
 			}
@@ -10601,7 +10604,6 @@ ha_innobase::records_in_range(
 	ib_int64_t	n_rows;
 	ulint		mode1;
 	ulint		mode2;
-        uint key_parts;
 	mem_heap_t*	heap;
 
 	DBUG_ENTER("records_in_range");
@@ -10637,19 +10639,14 @@ ha_innobase::records_in_range(
 		goto func_exit;
 	}
 
-        key_parts= key->ext_key_parts;
-        if ((min_key && min_key->keypart_map>=(key_part_map) (1<<key_parts)) ||
-            (max_key && max_key->keypart_map>=(key_part_map) (1<<key_parts)))
-          key_parts= key->ext_key_parts;
-
-	heap = mem_heap_create(2 * (key_parts * sizeof(dfield_t)
+	heap = mem_heap_create(2 * (key->ext_key_parts * sizeof(dfield_t)
 				    + sizeof(dtuple_t)));
 
-	range_start = dtuple_create(heap, key_parts);
-	dict_index_copy_types(range_start, index, key_parts);
+	range_start = dtuple_create(heap, key->ext_key_parts);
+	dict_index_copy_types(range_start, index, key->ext_key_parts);
 
-	range_end = dtuple_create(heap, key_parts);
-	dict_index_copy_types(range_end, index, key_parts);
+	range_end = dtuple_create(heap, key->ext_key_parts);
+	dict_index_copy_types(range_end, index, key->ext_key_parts);
 
 	row_sel_convert_mysql_key_to_innobase(
 				range_start,
@@ -11344,52 +11341,6 @@ ha_innobase::info_low(
 				  (ulong) rec_per_key;
 			}
 
-                        KEY *key_info= table->key_info+i; 
-                        key_part_map ext_key_part_map=
-                                             key_info->ext_key_part_map;
-
-                        if (key_info->user_defined_key_parts !=
-                            key_info->ext_key_parts)
-                        {
-
-                                KEY *pk_key_info= key_info+
-                                                  table->s->primary_key;
-                                uint k = key_info->user_defined_key_parts;
-                                ha_rows k_rec_per_key = rec_per_key;
-                                uint pk_parts = pk_key_info->user_defined_key_parts;
-                          
-		                index= innobase_get_index(
-                                        table->s->primary_key);
-                                
-                                n_rows= ib_table->stat_n_rows;
-    
-                                for (j = 0; j < pk_parts; j++) {
- 
-				         if (ext_key_part_map & 1<<j) {
-
-                                                rec_per_key =
-						innodb_rec_per_key(index,
-                                                        j, stats.records);
-                               
-				                if (rec_per_key == 0) {
-					                rec_per_key = 1;
-				                }
-                                                else if (rec_per_key > 1) {
-                                                        rec_per_key =
-                                                        (ha_rows)
-                                                          (k_rec_per_key *
-						          (double)rec_per_key /
-                                                           n_rows);
-						}
-                                                
-				                key_info->rec_per_key[k++]=
-				                rec_per_key >= ~(ulong) 0 ?
-                                                ~(ulong) 0 :
-                                                (ulong) rec_per_key;
-
-					} 
-				}
-			}                                         
 		}
 
 		if (!(flag & HA_STATUS_NO_LOCK)) {
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index ea71b9ab383..a16ce656f04 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -183,7 +183,7 @@ innobase_mysql_cmp(
 Log code calls this whenever log has been written and/or flushed up
 to a new position. We use this to notify upper layer of a new commit
 checkpoint when necessary.*/
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_mysql_log_notify(
 /*===============*/
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 40c1cd14e9e..65f289eda35 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -42,9 +42,9 @@ Created 1/20/1994 Heikki Tuuri
 #define _IB_TO_STR(s)	#s
 #define IB_TO_STR(s)	_IB_TO_STR(s)
 
-#define INNODB_VERSION_MAJOR	1
-#define INNODB_VERSION_MINOR	2
-#define INNODB_VERSION_BUGFIX	MYSQL_VERSION_PATCH
+#define INNODB_VERSION_MAJOR	5
+#define INNODB_VERSION_MINOR	6
+#define INNODB_VERSION_BUGFIX	10
 
 /* The following is the InnoDB version as shown in
 SELECT plugin_version FROM information_schema.plugins;
@@ -60,7 +60,9 @@ component, i.e. we show M.N.P as M.N */
 	IB_TO_STR(INNODB_VERSION_MINOR) "."	\
 	IB_TO_STR(INNODB_VERSION_BUGFIX)
 
-#define REFMAN "http://dev.mysql.com/doc/refman/5.6/en/"
+#define REFMAN "http://dev.mysql.com/doc/refman/"	\
+	IB_TO_STR(INNODB_VERSION_MAJOR) "."		\
+	IB_TO_STR(INNODB_VERSION_MINOR) "/en/"
 
 #ifdef MYSQL_DYNAMIC_PLUGIN
 /* In the dynamic plugin, redefine some externally visible symbols
diff --git a/storage/sequence/mysql-test/sequence/inc.opt b/storage/sequence/mysql-test/sequence/inc.opt
index 5b96925ff59..a6db8dd1482 100644
--- a/storage/sequence/mysql-test/sequence/inc.opt
+++ b/storage/sequence/mysql-test/sequence/inc.opt
@@ -1,2 +1,2 @@
---plugin-load=$HA_SEQUENCE_SO
+--plugin-load-add=$HA_SEQUENCE_SO
 --loose-sequence
diff --git a/storage/test_sql_discovery/mysql-test/sql_discovery/inc.opt b/storage/test_sql_discovery/mysql-test/sql_discovery/inc.opt
index 7d5c2404a50..63f3eec3435 100644
--- a/storage/test_sql_discovery/mysql-test/sql_discovery/inc.opt
+++ b/storage/test_sql_discovery/mysql-test/sql_discovery/inc.opt
@@ -1,2 +1,2 @@
---plugin-load=$HA_TEST_SQL_DISCOVERY_SO
+--plugin-load-add=$HA_TEST_SQL_DISCOVERY_SO
 --loose-test-sql-discovery
diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc
index 1a1b2835fa3..911b274a302 100644
--- a/storage/tokudb/hatoku_hton.cc
+++ b/storage/tokudb/hatoku_hton.cc
@@ -332,7 +332,7 @@ static int tokudb_init_func(void *p) {
 
     tokudb_hton->state = SHOW_OPTION_YES;
     // tokudb_hton->flags= HTON_CAN_RECREATE;  // QQQ this came from skeleton
-    tokudb_hton->flags = HTON_CLOSE_CURSORS_AT_COMMIT | HTON_EXTENDED_KEYS;
+    tokudb_hton->flags = HTON_CLOSE_CURSORS_AT_COMMIT | HTON_SUPPORTS_EXTENDED_KEYS;
 
 #if TOKU_INCLUDE_OTHER_DB_TYPE
     // we have historically been a dynamic storage engine, so we set db_type according.
diff --git a/storage/tokudb/mysql-test/rpl/include/have_tokudb.opt b/storage/tokudb/mysql-test/rpl/include/have_tokudb.opt
index 39d3fa12ec5..976f96f3f48 100644
--- a/storage/tokudb/mysql-test/rpl/include/have_tokudb.opt
+++ b/storage/tokudb/mysql-test/rpl/include/have_tokudb.opt
@@ -1 +1 @@
---loose-tokudb --plugin-load=$HA_TOKUDB_SO
+--loose-tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/tokudb/mysql-test/tokudb/suite.opt b/storage/tokudb/mysql-test/tokudb/suite.opt
index b385af22b1d..8cfa7cacb1f 100644
--- a/storage/tokudb/mysql-test/tokudb/suite.opt
+++ b/storage/tokudb/mysql-test/tokudb/suite.opt
@@ -1 +1 @@
---tokudb --plugin-load=$HA_TOKUDB_SO
+--tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/tokudb/mysql-test/tokudb_add_index/suite.opt b/storage/tokudb/mysql-test/tokudb_add_index/suite.opt
index b385af22b1d..8cfa7cacb1f 100644
--- a/storage/tokudb/mysql-test/tokudb_add_index/suite.opt
+++ b/storage/tokudb/mysql-test/tokudb_add_index/suite.opt
@@ -1 +1 @@
---tokudb --plugin-load=$HA_TOKUDB_SO
+--tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/tokudb/mysql-test/tokudb_alter_table/suite.opt b/storage/tokudb/mysql-test/tokudb_alter_table/suite.opt
index b385af22b1d..8cfa7cacb1f 100644
--- a/storage/tokudb/mysql-test/tokudb_alter_table/suite.opt
+++ b/storage/tokudb/mysql-test/tokudb_alter_table/suite.opt
@@ -1 +1 @@
---tokudb --plugin-load=$HA_TOKUDB_SO
+--tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/suite.opt b/storage/tokudb/mysql-test/tokudb_bugs/suite.opt
index b385af22b1d..8cfa7cacb1f 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/suite.opt
+++ b/storage/tokudb/mysql-test/tokudb_bugs/suite.opt
@@ -1 +1 @@
---tokudb --plugin-load=$HA_TOKUDB_SO
+--tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/tokudb/mysql-test/tokudb_mariadb/suite.opt b/storage/tokudb/mysql-test/tokudb_mariadb/suite.opt
index b385af22b1d..8cfa7cacb1f 100644
--- a/storage/tokudb/mysql-test/tokudb_mariadb/suite.opt
+++ b/storage/tokudb/mysql-test/tokudb_mariadb/suite.opt
@@ -1 +1 @@
---tokudb --plugin-load=$HA_TOKUDB_SO
+--tokudb --plugin-load-add=$HA_TOKUDB_SO
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index f128833f1e7..282db2ddf31 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -1,14 +1,14 @@
 # Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
-# 
+#
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
@@ -36,19 +36,31 @@ IF(UNIX)
     ENDIF()
     ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
   ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
-    ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
+    ADD_DEFINITIONS("-DUNIV_HPUX")
   ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
-    ADD_DEFINITIONS("-DUNIV_AIX -DUNIX_MUST_NOT_INLINE")
+    ADD_DEFINITIONS("-DUNIV_AIX")
   ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
     ADD_DEFINITIONS("-DUNIV_SOLARIS")
-  ELSE()
-   ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
   ENDIF()
 ENDIF()
 
-# Enable InnoDB's UNIV_DEBUG for debug builds
-SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DUNIV_DEBUG")
-SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG")
+IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+# After: WL#5825 Using C++ Standard Library with MySQL code
+#       we no longer use -fno-exceptions
+#	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG and UNIV_SYNC_DEBUG in debug builds
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG -DUNIV_SYNC_DEBUG")
+
+# Add -Wconversion if compiling with GCC
+## As of Mar 15 2011 this flag causes 3573+ warnings. If you are reading this
+## please fix them and enable the following code:
+#IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion")
+#ENDIF()
+
+CHECK_FUNCTION_EXISTS(sched_getcpu  HAVE_SCHED_GETCPU)
 
 IF(NOT MSVC)
 # either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
@@ -92,26 +104,21 @@ IF(NOT CMAKE_CROSSCOMPILING)
   HAVE_IB_GCC_ATOMIC_BUILTINS
   )
   CHECK_C_SOURCE_RUNS(
-  "
-  #include <stdint.h>
+  "#include<stdint.h>
   int main()
   {
-    int64_t x, y, res;
+    int64_t	x,y,res;
 
     x = 10;
     y = 123;
-    res = __sync_bool_compare_and_swap(&x, x, y);
-    if (!res || x != y) {
+    res = __sync_sub_and_fetch(&y, x);
+    if (res != y || y != 113) {
       return(1);
     }
-
-    x = 10;
-    y = 123;
-    res = __sync_add_and_fetch(&x, y);
-    if (res != 123 + 10 || x != 123 + 10) {
+    res = __sync_add_and_fetch(&y, x);
+    if (res != y || y != 123) {
       return(1);
     }
-
     return(0);
   }"
   HAVE_IB_GCC_ATOMIC_BUILTINS_64
@@ -120,6 +127,7 @@ ENDIF()
 
 IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
  ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
+ SET(XTRADB_OK 1)
 ENDIF()
 
 IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64)
@@ -154,6 +162,9 @@ ENDIF()
 
 ENDIF(NOT MSVC)
 
+CHECK_FUNCTION_EXISTS(asprintf  HAVE_ASPRINTF)
+CHECK_FUNCTION_EXISTS(vasprintf  HAVE_VASPRINTF)
+
 # Solaris atomics
 IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
   CHECK_FUNCTION_EXISTS(atomic_cas_ulong  HAVE_ATOMIC_CAS_ULONG)
@@ -168,9 +179,10 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
      HAVE_ATOMIC_SWAP_UCHAR)
     SET(HAVE_IB_SOLARIS_ATOMICS 1)
   ENDIF()
-  
+
   IF(HAVE_IB_SOLARIS_ATOMICS)
     ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
+    SET(XTRADB_OK 1)
   ENDIF()
 
   IF(NOT CMAKE_CROSSCOMPILING)
@@ -189,15 +201,15 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
         memset(&x3, 0x0, sizeof(x3));
 
         if (sizeof(pthread_t) == 4) {
-        
+
           atomic_cas_32(&x1, x2, x3);
-        
+
         } else if (sizeof(pthread_t) == 8) {
-        
+
           atomic_cas_64(&x1, x2, x3);
-        
+
         } else {
-        
+
           return(1);
         }
 
@@ -225,6 +237,7 @@ ENDIF()
 
 IF(MSVC)
   ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS)
+  #SET(XTRADB_OK 1)
 
   # Avoid "unreferenced label" warning in generated file
   GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH)
@@ -240,60 +253,151 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/xtradb/include
 		    ${CMAKE_SOURCE_DIR}/storage/xtradb/handler)
 
 # Sun Studio bug with -xO2
-IF(CMAKE_C_COMPILER_ID MATCHES "SunPro" 
-	AND CMAKE_C_FLAGS_RELEASE MATCHES "O2" 
+IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
+	AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2"
 	AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 	# Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
 	# -xO3
-	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.c 
+	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc
     PROPERTIES COMPILE_FLAGS -xO3)
 ENDIF()
 
 # Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
 # due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
 IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.c mem/mem0pool.c
+	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.cc mem/mem0pool.cc
 				    PROPERTIES COMPILE_FLAGS -Od)
 ENDIF()
 
-SET(INNOBASE_SOURCES	btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
-			buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
-			data/data0data.c data/data0type.c
-			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
-			dyn/dyn0dyn.c
-			eval/eval0eval.c eval/eval0proc.c
-			fil/fil0fil.c
-			fsp/fsp0fsp.c
-			fut/fut0fut.c fut/fut0lst.c
-			ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c
-			ibuf/ibuf0ibuf.c
-			pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
-			lock/lock0lock.c lock/lock0iter.c
-			log/log0log.c log/log0recv.c log/log0online.c
-			mach/mach0data.c
-			mem/mem0mem.c mem/mem0pool.c
-			mtr/mtr0log.c mtr/mtr0mtr.c
-			os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c
-			os/os0stacktrace.c
-			page/page0cur.c page/page0page.c page/page0zip.c
-			que/que0que.c
-			handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc
-			read/read0read.c
-			rem/rem0cmp.c rem/rem0rec.c
-			row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c row/row0purge.c row/row0row.c
-			row/row0sel.c row/row0uins.c row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c
-			srv/srv0srv.c srv/srv0start.c
-			sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c
-			trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
-			trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
-			usr/usr0sess.c
-			ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
-			ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c)
+SET(INNOBASE_SOURCES
+	api/api0api.cc
+	api/api0misc.cc
+	btr/btr0btr.cc
+	btr/btr0cur.cc
+	btr/btr0pcur.cc
+	btr/btr0sea.cc
+	buf/buf0buddy.cc
+	buf/buf0buf.cc
+	buf/buf0dblwr.cc
+	buf/buf0checksum.cc
+	buf/buf0dump.cc
+	buf/buf0flu.cc
+	buf/buf0lru.cc
+	buf/buf0rea.cc
+	data/data0data.cc
+	data/data0type.cc
+	dict/dict0boot.cc
+	dict/dict0crea.cc
+	dict/dict0dict.cc
+	dict/dict0load.cc
+	dict/dict0mem.cc
+	dict/dict0stats.cc
+	dict/dict0stats_bg.cc
+	dyn/dyn0dyn.cc
+	eval/eval0eval.cc
+	eval/eval0proc.cc
+	fil/fil0fil.cc
+	fsp/fsp0fsp.cc
+	fut/fut0fut.cc
+	fut/fut0lst.cc
+	ha/ha0ha.cc
+	ha/ha0storage.cc
+	ha/hash0hash.cc
+	fts/fts0fts.cc
+	fts/fts0ast.cc
+	fts/fts0blex.cc
+	fts/fts0config.cc
+	fts/fts0opt.cc
+	fts/fts0pars.cc
+	fts/fts0que.cc
+	fts/fts0sql.cc
+	fts/fts0tlex.cc
+	handler/ha_innodb.cc
+	handler/handler0alter.cc
+	handler/i_s.cc
+	handler/xtradb_i_s.cc
+	ibuf/ibuf0ibuf.cc
+	lock/lock0iter.cc
+	lock/lock0lock.cc
+	lock/lock0wait.cc
+	log/log0log.cc
+	log/log0online.cc
+	log/log0recv.cc
+	mach/mach0data.cc
+	mem/mem0mem.cc
+	mem/mem0pool.cc
+	mtr/mtr0log.cc
+	mtr/mtr0mtr.cc
+	os/os0file.cc
+	os/os0proc.cc
+	os/os0sync.cc
+	os/os0thread.cc
+	os/os0stacktrace.cc
+	page/page0cur.cc
+	page/page0page.cc
+	page/page0zip.cc
+	pars/lexyy.cc
+	pars/pars0grm.cc
+	pars/pars0opt.cc
+	pars/pars0pars.cc
+	pars/pars0sym.cc
+	que/que0que.cc
+	read/read0read.cc
+	rem/rem0cmp.cc
+	rem/rem0rec.cc
+	row/row0ext.cc
+	row/row0ftsort.cc
+	row/row0import.cc
+	row/row0ins.cc
+	row/row0merge.cc
+	row/row0mysql.cc
+	row/row0log.cc
+	row/row0purge.cc
+	row/row0row.cc
+	row/row0sel.cc
+	row/row0uins.cc
+	row/row0umod.cc
+	row/row0undo.cc
+	row/row0upd.cc
+	row/row0quiesce.cc
+	row/row0vers.cc
+	srv/srv0conc.cc
+	srv/srv0mon.cc
+	srv/srv0srv.cc
+	srv/srv0start.cc
+	sync/sync0arr.cc
+	sync/sync0rw.cc
+	sync/sync0sync.cc
+	trx/trx0i_s.cc
+	trx/trx0purge.cc
+	trx/trx0rec.cc
+	trx/trx0roll.cc
+	trx/trx0rseg.cc
+	trx/trx0sys.cc
+	trx/trx0trx.cc
+	trx/trx0undo.cc
+	usr/usr0sess.cc
+	ut/ut0bh.cc
+	ut/ut0byte.cc
+	ut/ut0crc32.cc
+	ut/ut0dbg.cc
+	ut/ut0list.cc
+	ut/ut0mem.cc
+	ut/ut0rbt.cc
+	ut/ut0rnd.cc
+	ut/ut0ut.cc
+	ut/ut0vec.cc
+	ut/ut0wqueue.cc)
 
 IF(WITH_INNODB)
   # Legacy option
   SET(WITH_INNOBASE_STORAGE_ENGINE TRUE)
 ENDIF()
 
-MYSQL_ADD_PLUGIN(xtradb ${INNOBASE_SOURCES} STORAGE_ENGINE DEFAULT
-  LINK_LIBRARIES ${ZLIB_LIBRARY} RECOMPILE_FOR_EMBEDDED)
+IF(XTRADB_OK)
+  MYSQL_ADD_PLUGIN(xtradb ${INNOBASE_SOURCES} STORAGE_ENGINE
+    MODULE_ONLY RECOMPILE_FOR_EMBEDDED
+    LINK_LIBRARIES ${ZLIB_LIBRARY})
+ELSE()
+  MESSAGE("Percona XtraDB is not supported on this platform")
+ENDIF()
diff --git a/storage/xtradb/CMakeLists.txt-disabled b/storage/xtradb/CMakeLists.txt-disabled
deleted file mode 100644
index d3b15275d6e..00000000000
--- a/storage/xtradb/CMakeLists.txt-disabled
+++ /dev/null
@@ -1,303 +0,0 @@
-# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
-# 
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; version 2 of the License.
-# 
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-# 
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
-
-# This is the CMakeLists for XtraDB
-
-IF(NOT WITH_XTRADB_STORAGE_ENGINE)
-  SET(WITHOUT_XTRADB 1)
-ENDIF(NOT WITH_XTRADB_STORAGE_ENGINE)
-
-INCLUDE(CheckFunctionExists)
-INCLUDE(CheckCSourceCompiles)
-INCLUDE(CheckCSourceRuns)
-
-# OS tests
-IF(UNIX)
-  IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
-    IF (XTRADB_PREFER_STATIC_LIBAIO)
-      SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
-    ENDIF()
-    FIND_LIBRARY(AIO_LIBRARY aio)
-    IF(AIO_LIBRARY)
-      CHECK_LIBRARY_EXISTS(${AIO_LIBRARY} io_queue_init "" HAVE_LIBAIO)
-      IF(HAVE_LIBAIO AND HAVE_LIBAIO_H)
-        ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
-      ENDIF()
-      LINK_LIBRARIES(${AIO_LIBRARY})
-    ENDIF()
-    ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
-  ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
-    ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
-  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
-    ADD_DEFINITIONS("-DUNIV_AIX -DUNIX_MUST_NOT_INLINE")
-  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
-    ADD_DEFINITIONS("-DUNIV_SOLARIS")
-  ELSE()
-   ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
-  ENDIF()
-ENDIF()
-
-# Enable InnoDB's UNIV_DEBUG for debug builds
-SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DUNIV_DEBUG")
-SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG")
-
-IF(NOT MSVC)
-# either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
-IF(NOT CMAKE_CROSSCOMPILING)
-  CHECK_C_SOURCE_RUNS(
-  "
-  int main()
-  {
-    long	x;
-    long	y;
-    long	res;
-    char	c;
-
-    x = 10;
-    y = 123;
-    res = __sync_bool_compare_and_swap(&x, x, y);
-    if (!res || x != y) {
-      return(1);
-    }
-
-    x = 10;
-    y = 123;
-    res = __sync_bool_compare_and_swap(&x, x + 1, y);
-    if (res || x != 10) {
-      return(1);
-    }
-    x = 10;
-    y = 123;
-    res = __sync_add_and_fetch(&x, y);
-    if (res != 123 + 10 || x != 123 + 10) {
-      return(1);
-    }
-
-    c = 10;
-    res = __sync_lock_test_and_set(&c, 123);
-    if (res != 10 || c != 123) {
-      return(1);
-    }
-    return(0);
-  }"
-  HAVE_IB_GCC_ATOMIC_BUILTINS
-  )
-  CHECK_C_SOURCE_RUNS(
-  "
-  #include <stdint.h>
-  int main()
-  {
-    int64_t x, y, res;
-
-    x = 10;
-    y = 123;
-    res = __sync_bool_compare_and_swap(&x, x, y);
-    if (!res || x != y) {
-      return(1);
-    }
-
-    x = 10;
-    y = 123;
-    res = __sync_add_and_fetch(&x, y);
-    if (res != 123 + 10 || x != 123 + 10) {
-      return(1);
-    }
-
-    return(0);
-  }"
-  HAVE_IB_GCC_ATOMIC_BUILTINS_64
-  )
-ENDIF()
-
-IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
- ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
-ENDIF()
-
-IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64)
- ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1)
-ENDIF()
-
- # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
-IF(NOT CMAKE_CROSSCOMPILING)
-  CHECK_C_SOURCE_RUNS(
-  "
-  #include <pthread.h>
-  #include <string.h>
-
-  int main() {
-    pthread_t       x1;
-    pthread_t       x2;
-    pthread_t       x3;
-
-    memset(&x1, 0x0, sizeof(x1));
-    memset(&x2, 0x0, sizeof(x2));
-    memset(&x3, 0x0, sizeof(x3));
-
-    __sync_bool_compare_and_swap(&x1, x2, x3);
-
-    return(0);
-  }"
-  HAVE_IB_ATOMIC_PTHREAD_T_GCC)
-ENDIF()
-IF(HAVE_IB_ATOMIC_PTHREAD_T_GCC)
-  ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_GCC=1)
-ENDIF()
-
-ENDIF(NOT MSVC)
-
-# Solaris atomics
-IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
-  CHECK_FUNCTION_EXISTS(atomic_cas_ulong  HAVE_ATOMIC_CAS_ULONG)
-  CHECK_FUNCTION_EXISTS(atomic_cas_32 HAVE_ATOMIC_CAS_32)
-  CHECK_FUNCTION_EXISTS(atomic_cas_64 HAVE_ATOMIC_CAS_64)
-  CHECK_FUNCTION_EXISTS(atomic_add_long_nv HAVE_ATOMIC_ADD_LONG_NV)
-  CHECK_FUNCTION_EXISTS(atomic_swap_uchar HAVE_ATOMIC_SWAP_UCHAR)
-  IF(HAVE_ATOMIC_CAS_ULONG AND
-     HAVE_ATOMIC_CAS_32 AND
-     HAVE_ATOMIC_CAS_64 AND
-     HAVE_ATOMIC_ADD_LONG_NV AND
-     HAVE_ATOMIC_SWAP_UCHAR)
-    SET(HAVE_IB_SOLARIS_ATOMICS 1)
-  ENDIF()
-  
-  IF(HAVE_IB_SOLARIS_ATOMICS)
-    ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
-  ENDIF()
-
-  IF(NOT CMAKE_CROSSCOMPILING)
-  # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
-  CHECK_C_SOURCE_COMPILES(
-  "   #include <pthread.h>
-      #include <string.h>
-
-      int main(int argc, char** argv) {
-        pthread_t       x1;
-        pthread_t       x2;
-        pthread_t       x3;
-
-        memset(&x1, 0x0, sizeof(x1));
-        memset(&x2, 0x0, sizeof(x2));
-        memset(&x3, 0x0, sizeof(x3));
-
-        if (sizeof(pthread_t) == 4) {
-        
-          atomic_cas_32(&x1, x2, x3);
-        
-        } else if (sizeof(pthread_t) == 8) {
-        
-          atomic_cas_64(&x1, x2, x3);
-        
-        } else {
-        
-          return(1);
-        }
-
-      return(0);
-    }
-  " HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
-  ENDIF()
-  IF(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
-    ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_SOLARIS=1)
-  ENDIF()
-ENDIF()
-
-
-IF(UNIX)
-# this is needed to know which one of atomic_cas_32() or atomic_cas_64()
-# to use in the source
-SET(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
-CHECK_TYPE_SIZE(pthread_t SIZEOF_PTHREAD_T)
-SET(CMAKE_EXTRA_INCLUDE_FILES)
-ENDIF()
-
-IF(SIZEOF_PTHREAD_T)
-  ADD_DEFINITIONS(-DSIZEOF_PTHREAD_T=${SIZEOF_PTHREAD_T})
-ENDIF()
-
-IF(MSVC)
-  ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS)
-
-  # Avoid "unreferenced label" warning in generated file
-  GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH)
-  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c
-          PROPERTIES COMPILE_FLAGS "/wd4102")
-  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c
-          PROPERTIES COMPILE_FLAGS "/wd4003")
-ENDIF()
-
-
-# Include directories under xtradb
-INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/xtradb/include
-		    ${CMAKE_SOURCE_DIR}/storage/xtradb/handler)
-
-# Sun Studio bug with -xO2
-IF(CMAKE_C_COMPILER_ID MATCHES "SunPro" 
-	AND CMAKE_C_FLAGS_RELEASE MATCHES "O2" 
-	AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
-	# Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
-	# -xO3
-	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.c 
-    PROPERTIES COMPILE_FLAGS -xO3)
-ENDIF()
-
-# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
-# due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
-IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.c mem/mem0pool.c
-				    PROPERTIES COMPILE_FLAGS -Od)
-ENDIF()
-
-SET(INNOBASE_SOURCES	btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
-			buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
-			data/data0data.c data/data0type.c
-			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
-			dyn/dyn0dyn.c
-			eval/eval0eval.c eval/eval0proc.c
-			fil/fil0fil.c
-			fsp/fsp0fsp.c
-			fut/fut0fut.c fut/fut0lst.c
-			ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c
-			ibuf/ibuf0ibuf.c
-			pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
-			lock/lock0lock.c lock/lock0iter.c
-			log/log0log.c log/log0recv.c log/log0online.c
-			mach/mach0data.c
-			mem/mem0mem.c mem/mem0pool.c
-			mtr/mtr0log.c mtr/mtr0mtr.c
-			os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c
-			os/os0stacktrace.c
-			page/page0cur.c page/page0page.c page/page0zip.c
-			que/que0que.c
-			handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc
-			read/read0read.c
-			rem/rem0cmp.c rem/rem0rec.c
-			row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c row/row0purge.c row/row0row.c
-			row/row0sel.c row/row0uins.c row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c
-			srv/srv0srv.c srv/srv0start.c
-			sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c
-			trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
-			trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
-			usr/usr0sess.c
-			ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
-			ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c)
-
-IF(WITH_INNODB)
-  # Legacy option
-  SET(WITH_INNOBASE_STORAGE_ENGINE TRUE)
-ENDIF()
-
-MYSQL_ADD_PLUGIN(xtradb ${INNOBASE_SOURCES} STORAGE_ENGINE DEFAULT
-  LINK_LIBRARIES ${ZLIB_LIBRARY} RECOMPILE_FOR_EMBEDDED)
diff --git a/storage/xtradb/api/api0api.cc b/storage/xtradb/api/api0api.cc
new file mode 100644
index 00000000000..647ebcde6f0
--- /dev/null
+++ b/storage/xtradb/api/api0api.cc
@@ -0,0 +1,3948 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file api/api0api.cc
+InnoDB Native API
+
+2008-08-01 Created Sunny Bains
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "api0api.h"
+#include "api0misc.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "btr0pcur.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "trx0roll.h"
+#include "dict0crea.h"
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "lock0types.h"
+#include "row0sel.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "ut0dbg.h"
+#include "dict0priv.h"
+#include "ut0ut.h"
+#include "ha_prototypes.h"
+#include "trx0roll.h"
+
+/** configure variable for binlog option with InnoDB APIs */
+my_bool ib_binlog_enabled = FALSE;
+
+/** configure variable for MDL option with InnoDB APIs */
+my_bool ib_mdl_enabled = FALSE;
+
+/** configure variable for disable rowlock with InnoDB APIs */
+my_bool ib_disable_row_lock = FALSE;
+
+/** configure variable for Transaction isolation levels */
+ulong ib_trx_level_setting = IB_TRX_READ_UNCOMMITTED;
+
+/** configure variable for background commit interval in seconds */
+ulong ib_bk_commit_interval = 0;
+
+/** InnoDB tuple types. */
+enum ib_tuple_type_t{
+	TPL_TYPE_ROW,			/*!< Data row tuple */
+	TPL_TYPE_KEY			/*!< Index key tuple */
+};
+
+/** Query types supported. */
+enum ib_qry_type_t{
+	QRY_NON,			/*!< None/Sentinel */
+	QRY_INS,			/*!< Insert operation */
+	QRY_UPD,			/*!< Update operation */
+	QRY_SEL				/*!< Select operation */
+};
+
+/** Query graph types. */
+struct ib_qry_grph_t {
+	que_fork_t*	ins;		/*!< Innobase SQL query graph used
+					in inserts */
+	que_fork_t*	upd;		/*!< Innobase SQL query graph used
+					in updates or deletes */
+	que_fork_t*	sel;		/*!< dummy query graph used in
+					selects */
+};
+
+/** Query node types. */
+struct ib_qry_node_t {
+	ins_node_t*	ins;		/*!< Innobase SQL insert node
+					used to perform inserts to the table */
+	upd_node_t*	upd;		/*!< Innobase SQL update node
+					used to perform updates and deletes */
+	sel_node_t*	sel;		/*!< Innobase SQL select node
+					used to perform selects on the table */
+};
+
+/** Query processing fields. */
+struct ib_qry_proc_t {
+
+	ib_qry_node_t	node;		/*!< Query node*/
+
+	ib_qry_grph_t	grph;		/*!< Query graph */
+};
+
+/** Cursor instance for traversing tables/indexes. This will eventually
+become row_prebuilt_t. */
+struct ib_cursor_t {
+	mem_heap_t*	heap;		/*!< Instance heap */
+
+	mem_heap_t*	query_heap;	/*!< Heap to use for query graphs */
+
+	ib_qry_proc_t	q_proc;		/*!< Query processing info */
+
+	ib_match_mode_t	match_mode;	/*!< ib_cursor_moveto match mode */
+
+	row_prebuilt_t*	prebuilt;	/*!< For reading rows */
+
+	bool		valid_trx;	/*!< Valid transaction attached */
+};
+
+/** InnoDB table columns used during table and index schema creation. */
+struct ib_col_t {
+	const char*	name;		/*!< Name of column */
+
+	ib_col_type_t	ib_col_type;	/*!< Main type of the column */
+
+	ulint		len;		/*!< Length of the column */
+
+	ib_col_attr_t	ib_col_attr;	/*!< Column attributes */
+
+};
+
+/** InnoDB index columns used during index and index schema creation. */
+struct ib_key_col_t {
+	const char*	name;		/*!< Name of column */
+
+	ulint		prefix_len;	/*!< Column index prefix len or 0 */
+};
+
+struct ib_table_def_t;
+
+/** InnoDB index schema used during index creation */
+struct ib_index_def_t {
+	mem_heap_t*	heap;		/*!< Heap used to build this and all
+					its columns in the list */
+
+	const char*	name;		/*!< Index name */
+
+	dict_table_t*	table;		/*!< Parent InnoDB table */
+
+	ib_table_def_t*	schema;		/*!< Parent table schema that owns
+					this instance */
+
+	ibool		clustered;	/*!< True if clustered index */
+
+	ibool		unique;		/*!< True if unique index */
+
+	ib_vector_t*	cols;		/*!< Vector of columns */
+
+	trx_t*		usr_trx;	/*!< User transaction covering the
+					DDL operations */
+};
+
+/** InnoDB table schema used during table creation */
+struct ib_table_def_t {
+	mem_heap_t*	heap;		/*!< Heap used to build this and all
+					its columns in the list */
+	const char*	name;		/*!< Table name */
+
+	ib_tbl_fmt_t	ib_tbl_fmt;	/*!< Row format */
+
+	ulint		page_size;	/*!< Page size */
+
+	ib_vector_t*	cols;		/*!< Vector of columns */
+
+	ib_vector_t*	indexes;	/*!< Vector of indexes */
+
+	dict_table_t*	table;		/*!< Table read from or NULL */
+};
+
+/** InnoDB tuple used for row and key operations (see ib_tuple_type_t). */
+struct ib_tuple_t {
+	mem_heap_t*		heap;	/*!< Heap used to build
+					this and for copying
+					the column values. */
+
+	ib_tuple_type_t		type;	/*!< Tuple discriminator. */
+
+	const dict_index_t*	index;	/*!< Index for tuple can be either
+					secondary or clustered index. */
+
+	dtuple_t*		ptr;	/*!< The internal tuple
+					instance */
+};
+
+/** The following counter is used to convey information to InnoDB
+about server activity: in selects it is not sensible to call
+srv_active_wake_master_thread after each fetch or search, we only do
+it every INNOBASE_WAKE_INTERVAL'th step. */
+
+#define INNOBASE_WAKE_INTERVAL	32
+
+/*****************************************************************//**
+Tell whether an InnoDB persistent cursor is positioned.
+@return	nonzero if old_stored is set and the cursor is or was positioned */
+UNIV_INLINE
+ib_bool_t
+ib_btr_cursor_is_positioned(
+/*========================*/
+	btr_pcur_t*	pcur)		/*!< in: InnoDB persistent cursor */
+{
+	const bool	was_pos = (pcur->pos_state == BTR_PCUR_WAS_POSITIONED);
+	const bool	is_pos = (pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+	return(pcur->old_stored == BTR_PCUR_OLD_STORED && (is_pos || was_pos));
+}
+
+
+/********************************************************************//**
+Open a table using the table id, if found then increment table ref count.
+@return	table instance, or NULL if not found or ibd_file_missing is set */
+static
+dict_table_t*
+ib_open_table_by_id(
+/*================*/
+	ib_id_u64_t	tid,		/*!< in: table id to lookup */
+	ib_bool_t	locked)		/*!< in: TRUE if own dict mutex */
+{
+	dict_table_t*	table;
+	table_id_t	table_id;
+
+	table_id = tid;
+
+	if (!locked) {
+		dict_mutex_enter_for_mysql();
+	}
+
+	table = dict_table_open_on_id(table_id, FALSE, DICT_TABLE_OP_NORMAL);
+	/* NOTE(review): the NULL path below seems to keep the ref count. */
+	if (table != NULL && table->ibd_file_missing) {
+		table = NULL;
+	}
+
+	if (!locked) {
+		dict_mutex_exit_for_mysql();
+	}
+
+	return(table);
+}
+
+/********************************************************************//**
+Open a table using the table name, if found then increment table ref count.
+@return	table instance, or NULL if not found or ibd_file_missing is set */
+UNIV_INTERN
+void*
+ib_open_table_by_name(
+/*==================*/
+	const char*	name)		/*!< in: table name to lookup */
+{
+	dict_table_t*	table;
+
+	table = dict_table_open_on_name(name, FALSE, FALSE,
+					DICT_ERR_IGNORE_NONE);
+	/* NOTE(review): the NULL path below seems to keep the ref count. */
+	if (table != NULL && table->ibd_file_missing) {
+		table = NULL;
+	}
+
+	return(table);
+}
+
+/********************************************************************//**
+Look a table up by name through dict_table_get_low().
+@return	table instance, or NULL if not found or ibd_file_missing is set */
+static
+dict_table_t*
+ib_lookup_table_by_name(
+/*====================*/
+	const char*	name)		/*!< in: table name to lookup */
+{
+	dict_table_t*	found = dict_table_get_low(name);
+
+	/* A table flagged ibd_file_missing is reported as not
+	found. */
+	if (found == NULL || found->ibd_file_missing) {
+		return(NULL);
+	}
+
+	return(found);
+}
+
+/********************************************************************//**
+Counts calls in a function-local static counter and, on every
+INNOBASE_WAKE_INTERVAL-th call, invokes srv_active_wake_master_thread().
+Intended for database operations that may introduce a small need for
+server utility activity, like checkpointing. */
+UNIV_INLINE
+void
+ib_wake_master_thread(void)
+/*=======================*/
+{
+	static ulint	n_calls = 0;
+
+	/* Count this call and wake the master thread on every
+	INNOBASE_WAKE_INTERVAL-th one. */
+	if ((++n_calls % INNOBASE_WAKE_INTERVAL) == 0) {
+		srv_active_wake_master_thread();
+	}
+}
+
+/*********************************************************************//**
+Sum dict_col_get_max_size() over the columns of a cluster index.
+@return	max row length */
+UNIV_INLINE
+ulint
+ib_get_max_row_len(
+/*===============*/
+	dict_index_t*	cluster)		/*!< in: cluster index */
+{
+	ulint		pos;
+	ulint		total = 0;
+	const ulint	n_cols = cluster->n_fields;
+
+	/* Visit every field of the clustered index, in index
+	order. */
+	for (pos = 0; pos < n_cols; ++pos) {
+
+		const dict_col_t*	col;
+
+		col = dict_index_get_nth_col(cluster, pos);
+
+		/* Accumulate the size that dict_col_get_max_size()
+		reports for the column behind this field. */
+		total += dict_col_get_max_size(col);
+	}
+
+	return(total);
+}
+
+/*****************************************************************//**
+Read the columns of a rec into a tuple, via a copy made in tuple->heap. */
+static
+void
+ib_read_tuple(
+/*==========*/
+	const rec_t*	rec,		/*!< in: Record to read */
+	ib_bool_t	page_format,	/*!< in: flag for rec_get_info_bits() */
+	ib_tuple_t*	tuple)		/*!< in/out: tuple to read into */
+{
+	ulint		i;
+	void*		ptr;
+	rec_t*		copy;
+	ulint		rec_meta_data;
+	ulint		n_index_fields;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets	= offsets_;
+	dtuple_t*	dtuple = tuple->ptr;
+	const dict_index_t* index = tuple->index;
+
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(
+		rec, index, offsets, ULINT_UNDEFINED, &tuple->heap);
+
+	rec_meta_data = rec_get_info_bits(rec, page_format);
+	dtuple_set_info_bits(dtuple, rec_meta_data);
+
+	/* Make a copy of the rec; the fields below are read from it. */
+	ptr = mem_heap_alloc(tuple->heap, rec_offs_size(offsets));
+	copy = rec_copy(ptr, rec, offsets);
+
+	n_index_fields = ut_min(
+		rec_offs_n_fields(offsets), dtuple_get_n_fields(dtuple));
+
+	for (i = 0; i < n_index_fields; ++i) {
+		ulint		len;
+		const byte*	data;
+		dfield_t*	dfield;
+		/* Row tuple: slot is the column no.; key tuple: field no. */
+		if (tuple->type == TPL_TYPE_ROW) {
+			const dict_col_t*	col;
+			ulint			col_no;
+			const dict_field_t*	index_field;
+
+			index_field = dict_index_get_nth_field(index, i);
+			col = dict_field_get_col(index_field);
+			col_no = dict_col_get_no(col);
+
+			dfield = dtuple_get_nth_field(dtuple, col_no);
+		} else {
+			dfield = dtuple_get_nth_field(dtuple, i);
+		}
+
+		data = rec_get_nth_field(copy, offsets, i, &len);
+
+		/* Fetch and copy any externally stored column. */
+		if (rec_offs_nth_extern(offsets, i)) {
+
+			ulint	zip_size;
+
+			zip_size = dict_table_zip_size(index->table);
+
+			data = btr_rec_copy_externally_stored_field(
+				copy, offsets, zip_size, i, &len,
+				tuple->heap);
+
+			ut_a(len != UNIV_SQL_NULL);
+		}
+
+		dfield_set_data(dfield, data, len);
+	}
+}
+
+/*****************************************************************//**
+Create an InnoDB key tuple in the given heap.
+@return	tuple instance created, or NULL (heap is then freed) */
+static
+ib_tpl_t
+ib_key_tuple_new_low(
+/*=================*/
+	const dict_index_t*	index,	/*!< in: index for which tuple
+					required */
+	ulint			n_cols,	/*!< in: no. of user defined cols */
+	mem_heap_t*		heap)	/*!< in: heap; freed on failure */
+{
+	ib_tuple_t*	tuple;
+	ulint		i;
+	ulint		n_cmp_cols;
+
+	tuple = static_cast<ib_tuple_t*>(
+			mem_heap_alloc(heap, sizeof(*tuple)));
+
+	if (tuple == NULL) {
+		mem_heap_free(heap);
+		return(NULL);
+	}
+
+	tuple->heap  = heap;
+	tuple->index = index;
+	tuple->type  = TPL_TYPE_KEY;
+
+	/* Is it a generated clustered index ? */
+	if (n_cols == 0) {
+		++n_cols;
+	}
+
+	tuple->ptr = dtuple_create(heap, n_cols);
+
+	/* Copy the types, then set every field to SQL NULL. */
+	dict_index_copy_types(tuple->ptr, index, n_cols);
+
+	for (i = 0; i < n_cols; i++) {
+
+		dfield_t*	dfield;
+
+		dfield	= dtuple_get_nth_field(tuple->ptr, i);
+		dfield_set_null(dfield);
+	}
+
+	n_cmp_cols = dict_index_get_n_ordering_defined_by_user(index);
+
+	dtuple_set_n_fields_cmp(tuple->ptr, n_cmp_cols);
+
+	return((ib_tpl_t) tuple);
+}
+
+/*****************************************************************//**
+Create an InnoDB key tuple on a newly created memory heap.
+@return	tuple instance created, or NULL */
+static
+ib_tpl_t
+ib_key_tuple_new(
+/*=============*/
+	const dict_index_t*	index,	/*!< in: index of tuple */
+	ulint			n_cols)	/*!< in: no. of user defined cols */
+{
+	mem_heap_t*	tuple_heap = mem_heap_create(64);
+
+	/* On success the new tuple keeps tuple_heap in its heap
+	member, so the heap is not released here. */
+	if (tuple_heap != NULL) {
+		return(ib_key_tuple_new_low(index, n_cols, tuple_heap));
+	}
+
+	return(NULL);
+}
+
+/*****************************************************************//**
+Create an InnoDB row tuple in the given heap.
+@return	tuple instance, or NULL (heap is then freed) */
+static
+ib_tpl_t
+ib_row_tuple_new_low(
+/*=================*/
+	const dict_index_t*	index,	/*!< in: index of tuple */
+	ulint			n_cols,	/*!< in: no. of cols in tuple */
+	mem_heap_t*		heap)	/*!< in: memory heap */
+{
+	void*		mem = mem_heap_alloc(heap, sizeof(ib_tuple_t));
+	ib_tuple_t*	row_tpl = static_cast<ib_tuple_t*>(mem);
+
+	if (row_tpl == NULL) {
+		/* Nothing was built; give the heap back. */
+		mem_heap_free(heap);
+		return(NULL);
+	}
+
+	row_tpl->type  = TPL_TYPE_ROW;
+	row_tpl->index = index;
+	row_tpl->heap  = heap;
+
+	row_tpl->ptr   = dtuple_create(heap, n_cols);
+
+	/* Field types are taken from the table of the index. */
+	dict_table_copy_types(row_tpl->ptr, index->table);
+
+	return((ib_tpl_t) row_tpl);
+}
+
+/*****************************************************************//**
+Create an InnoDB row tuple on a newly created memory heap.
+@return	tuple instance, or NULL */
+static
+ib_tpl_t
+ib_row_tuple_new(
+/*=============*/
+	const dict_index_t*	index,	/*!< in: index of tuple */
+	ulint			n_cols)	/*!< in: no. of cols in tuple */
+{
+	mem_heap_t*	tuple_heap = mem_heap_create(64);
+
+	if (tuple_heap == NULL) {
+		/* No heap, hence no tuple. */
+		return(NULL);
+	}
+
+	/* The heap is handed over to ib_row_tuple_new_low(). */
+	return(ib_row_tuple_new_low(index, n_cols, tuple_heap));
+}
+
+/*****************************************************************//**
+Start a transaction handle; set its isolation level and THD.
+@return	DB_SUCCESS (no other value is returned) */
+UNIV_INTERN
+ib_err_t
+ib_trx_start(
+/*=========*/
+	ib_trx_t	ib_trx,		/*!< in: transaction to restart */
+	ib_trx_level_t	ib_trx_level,	/*!< in: trx isolation level */
+	void*		thd)		/*!< in: THD */
+{
+	ib_err_t	err = DB_SUCCESS;
+	trx_t*		trx = (trx_t*) ib_trx;
+
+	ut_a(ib_trx_level <= IB_TRX_SERIALIZABLE);
+
+	trx_start_if_not_started(trx);
+
+	trx->isolation_level = ib_trx_level;
+
+	/* FIXME: This is a place holder, we should add an arg that comes
+	from the client. */
+	trx->mysql_thd = static_cast<THD*>(thd);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Begin a transaction: allocate a new transaction handle and start it with
+the given isolation level; asserts that ib_trx_start() reports DB_SUCCESS.
+@return	innobase txn handle */
+UNIV_INTERN
+ib_trx_t
+ib_trx_begin(
+/*=========*/
+	ib_trx_level_t	ib_trx_level)	/*!< in: trx isolation level */
+{
+	trx_t*		trx;
+	ib_err_t	err;
+
+	trx = trx_allocate_for_mysql();
+	err = ib_trx_start((ib_trx_t) trx, ib_trx_level, NULL);
+	ut_a(err == DB_SUCCESS);
+
+	return((ib_trx_t) trx);
+}
+
+/*****************************************************************//**
+Read the state field of a transaction.
+@return	trx->state cast to ib_trx_state_t */
+UNIV_INTERN
+ib_trx_state_t
+ib_trx_state(
+/*=========*/
+	ib_trx_t	ib_trx)		/*!< in: trx handle */
+{
+	const trx_t*	trx = (const trx_t*) ib_trx;
+
+	return((ib_trx_state_t) trx->state);
+}
+
+/*****************************************************************//**
+Read the start_time field of a transaction.
+@return	trx->start_time converted to ib_u64_t */
+UNIV_INTERN
+ib_u64_t
+ib_trx_get_start_time(
+/*==================*/
+	ib_trx_t	ib_trx)		/*!< in: transaction */
+{
+	const trx_t*	trx = (const trx_t*) ib_trx;
+	return((ib_u64_t) trx->start_time);
+}
+/*****************************************************************//**
+Release the resources of the transaction by calling trx_free_for_mysql().
+@return	DB_SUCCESS (the only value returned) */
+UNIV_INTERN
+ib_err_t
+ib_trx_release(
+/*===========*/
+	ib_trx_t	ib_trx)		/*!< in: trx handle, not NULL */
+{
+	trx_t*		trx = (trx_t*) ib_trx;
+
+	ut_ad(trx != NULL);
+	trx_free_for_mysql(trx);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Commit a transaction and release the handle (a transaction that was never
+started is only released). This also releases the schema latches.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_commit(
+/*==========*/
+	ib_trx_t	ib_trx)		/*!< in: trx handle */
+{
+	trx_t*		trx = (trx_t*) ib_trx;
+	ib_err_t	release_err;
+
+	if (trx->state != TRX_STATE_NOT_STARTED) {
+		trx_commit(trx);
+
+		release_err = ib_trx_release(ib_trx);
+		ut_a(release_err == DB_SUCCESS);
+
+		return(DB_SUCCESS);
+	}
+
+	/* Nothing to commit: only free the handle. */
+	return(ib_trx_release(ib_trx));
+}
+
+/*****************************************************************//**
+Roll back a transaction, release the handle and tick the master thread
+wake-up counter. This also releases the schema latches.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_trx_rollback(
+/*============*/
+	ib_trx_t	ib_trx)		/*!< in: trx handle */
+{
+	trx_t*		trx = (trx_t*) ib_trx;
+	ib_err_t	rollback_err;
+	ib_err_t	release_err;
+
+	rollback_err = static_cast<ib_err_t>(trx_rollback_for_mysql(trx));
+	/* It should always succeed */
+	ut_a(rollback_err == DB_SUCCESS);
+
+	release_err = ib_trx_release(ib_trx);
+	ut_a(release_err == DB_SUCCESS);
+
+	ib_wake_master_thread();
+
+	return(release_err);
+}
+
+/*****************************************************************//**
+Search a vector of index definitions by name (innobase_strcasecmp()).
+@return	first matching index def., or NULL */
+UNIV_INLINE
+const ib_index_def_t*
+ib_table_find_index(
+/*================*/
+	ib_vector_t*	indexes,	/*!< in: vector of indexes */
+	const char*	name)		/*!< in: index name */
+{
+	const ulint	n_defs = ib_vector_size(indexes);
+
+	for (ulint pos = 0; pos < n_defs; ++pos) {
+		const ib_index_def_t*	candidate;
+
+		candidate = (ib_index_def_t*) ib_vector_get(indexes, pos);
+
+		if (innobase_strcasecmp(name, candidate->name) == 0) {
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
+
+/*****************************************************************//**
+Build the InnoDB precise type flags from a schema column's attributes.
+@return	precise type in api format */
+UNIV_INLINE
+ulint
+ib_col_get_prtype(
+/*==============*/
+	const ib_col_t*	ib_col)		/*!< in: column definition */
+{
+	ulint		flags = 0;
+
+	if (ib_col->ib_col_attr & IB_COL_NOT_NULL) {
+		flags |= DATA_NOT_NULL;
+	}
+
+	if (ib_col->ib_col_attr & IB_COL_UNSIGNED) {
+		/* Only IB_INT columns may carry the unsigned attribute. */
+		ut_a(ib_col->ib_col_type == IB_INT);
+		flags |= DATA_UNSIGNED;
+	}
+
+	return(flags);
+}
+
+/*****************************************************************//**
+Get the InnoDB internal main type from the schema column definition.
+@return	column main type */
+UNIV_INLINE
+ulint
+ib_col_get_mtype(
+/*==============*/
+	const ib_col_t*	ib_col)		/*!< in: column definition */
+{
+	/* API type codes are expected to match the internal ones. */
+	const ulint	mtype = ib_col->ib_col_type;
+	return(mtype);
+}
+
+/*****************************************************************//**
+Find a column in the column vector by name (innobase_strcasecmp()).
+@return	first matching col. def., or NULL */
+UNIV_INLINE
+const ib_col_t*
+ib_table_find_col(
+/*==============*/
+	const ib_vector_t*	cols,	/*!< in: column list head */
+	const char*	name)		/*!< in: column name to find */
+{
+	const ulint	n_cols = ib_vector_size(cols);
+
+	for (ulint pos = 0; pos < n_cols; ++pos) {
+		const ib_col_t*	candidate;
+
+		candidate = static_cast<const ib_col_t*>(
+			ib_vector_get((ib_vector_t*) cols, pos));
+
+		if (innobase_strcasecmp(candidate->name, name) == 0) {
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
+
+/*****************************************************************//**
+Find a key column in the column list by name (innobase_strcasecmp()).
+@return	first matching col. def., or NULL */
+UNIV_INLINE
+const ib_key_col_t*
+ib_index_find_col(
+/*==============*/
+	ib_vector_t*	cols,		/*!< in: column list head */
+	const char*	name)		/*!< in: column name to find */
+{
+	const ulint	n_cols = ib_vector_size(cols);
+
+	for (ulint pos = 0; pos < n_cols; ++pos) {
+		const ib_key_col_t*	key_col;
+
+		key_col = static_cast<ib_key_col_t*>(ib_vector_get(cols, pos));
+
+		if (innobase_strcasecmp(key_col->name, name) == 0) {
+			return(key_col);
+		}
+	}
+
+	return(NULL);
+}
+
+#ifdef __WIN__
+/*****************************************************************//**
+Convert a NUL-terminated string to lower case in place, with tolower(). */
+static
+void
+ib_to_lower_case(
+/*=============*/
+	char*		ptr)		/*!< in/out: string to convert */
+{
+	for (; *ptr; ++ptr) {
+		/* tolower() is only defined for unsigned char values. */
+		*ptr = (char) tolower((unsigned char) *ptr);
+	}
+}
+#endif /* __WIN__ */
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case. This function can be called for system
+tables and they don't have a database component. For tables that don't have
+a database component, we don't normalize them to lower case on Windows.
+The assumption is that they are system tables that reside in the system
+table space. */
+static
+void
+ib_normalize_table_name(
+/*====================*/
+	char*		norm_name,	/*!< out: normalized name as a
+					null-terminated string */
+	const char*	name)		/*!< in: non-empty table name string */
+{
+	const char*	ptr = name;
+
+	/* An empty name has no last character to scan from. */
+	ut_a(*name != '\0');
+	ptr += ut_strlen(name) - 1;
+
+	/* Scan back to the separator that precedes the table name,
+	stopping at the first character at the latest. */
+	while (ptr > name && *ptr != '\\' && *ptr != '/') {
+		--ptr;
+	}
+
+	/* For system tables there is no '/' or dbname. */
+	ut_a(ptr >= name);
+
+	if (ptr > name) {
+		const char*	db_name;
+		const char*	table_name;
+
+		table_name = ptr + 1;
+
+		--ptr;
+
+		while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+			ptr--;
+		}
+
+		db_name = ptr + 1;
+
+		memcpy(norm_name, db_name,
+			ut_strlen(name) + 1 - (db_name - name));
+
+		norm_name[table_name - db_name - 1] = '/';
+#ifdef __WIN__
+		ib_to_lower_case(norm_name);
+#endif
+	} else {
+		ut_strcpy(norm_name, name);
+	}
+}
+
+/*****************************************************************//**
+Check a table name: it must be at least two characters long, contain exactly
+one '/' that is neither first nor last, and must not start with "./" or "../".
+@return	DB_SUCCESS, or DB_DATA_MISMATCH if the name is rejected */
+UNIV_INTERN
+ib_err_t
+ib_table_name_check(
+/*================*/
+	const char*	name)		/*!< in: table name to check */
+{
+	const char*	slash = NULL;
+	ulint		len = ut_strlen(name);
+
+	if (len < 2
+	    || *name == '/'
+	    || name[len - 1] == '/'
+	    || (name[0] == '.' && name[1] == '/')
+	    || (name[0] == '.' && name[1] == '.' && name[2] == '/')) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	for ( ; *name; ++name) {
+#ifdef __WIN__
+		/* Check for reserved characters in DOS filenames. */
+		switch (*name) {
+		case ':':
+		case '|':
+		case '"':
+		case '*':
+		case '<':
+		case '>':
+			return(DB_DATA_MISMATCH);
+		}
+#endif /* __WIN__ */
+		if (*name == '/') {
+			if (slash) {
+				return(DB_DATA_MISMATCH);
+			}
+			slash = name;
+		}
+	}
+
+	return(slash ? DB_SUCCESS : DB_DATA_MISMATCH);
+}
+
+
+
+/*****************************************************************//**
+Scan a vector of index definitions and return the first one that is
+flagged as clustered.
+@return	the clustered index definition, or NULL if there is none */
+UNIV_INLINE
+ib_index_def_t*
+ib_find_clustered_index(
+/*====================*/
+	ib_vector_t*	indexes)	/*!< in: index defs. to search */
+{
+	const ulint	n_defs = ib_vector_size(indexes);
+	ulint		pos = 0;
+
+	while (pos < n_defs) {
+		ib_index_def_t*	def = static_cast<ib_index_def_t*>(
+			ib_vector_get(indexes, pos));
+
+		if (def->clustered) {
+			return(def);
+		}
+
+		++pos;
+	}
+
+	return(NULL);
+}
+
+/*****************************************************************//**
+Look up a table by name and report its id. The caller must have
+acquired the dictionary mutex. *table_id is set to 0 when the table is
+not found.
+@return	DB_SUCCESS if found, DB_TABLE_NOT_FOUND otherwise */
+static
+ib_err_t
+ib_table_get_id_low(
+/*================*/
+	const char*	table_name,	/*!< in: table to find */
+	ib_id_u64_t*	table_id)	/*!< out: table id if found */
+{
+	dict_table_t*	found;
+
+	/* Report id 0 unless the lookup succeeds. */
+	*table_id = 0;
+
+	found = ib_lookup_table_by_name(table_name);
+
+	if (found == NULL) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	*table_id = (found->id);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Create an internal cursor instance. The cursor struct is allocated from
+its own heap, a separate query heap is created for it, and a prebuilt
+struct is set up for the given table and index with select_lock_type
+LOCK_NONE. If trx is not NULL its n_mysql_tables_in_use counter is
+incremented and a read view is assigned to it. index must not be NULL
+(checked with ut_a()). When either heap cannot be created *ib_crsr is
+left unmodified.
+@return	DB_SUCCESS or DB_OUT_OF_MEMORY */
+static
+ib_err_t
+ib_create_cursor(
+/*=============*/
+	ib_crsr_t*	ib_crsr,	/*!< out: InnoDB cursor */
+	dict_table_t*	table,		/*!< in: table instance */
+	dict_index_t*	index,		/*!< in: index to use */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	mem_heap_t*	heap;
+	ib_cursor_t*	cursor;
+	ib_err_t	err = DB_SUCCESS;
+
+	heap = mem_heap_create(sizeof(*cursor) * 2);
+
+	if (heap != NULL) {
+		row_prebuilt_t*	prebuilt;
+
+		cursor = static_cast<ib_cursor_t*>(
+			 mem_heap_zalloc(heap, sizeof(*cursor)));
+
+		cursor->heap = heap;
+
+		cursor->query_heap = mem_heap_create(64);
+
+		if (cursor->query_heap == NULL) {
+			mem_heap_free(heap);
+
+			return(DB_OUT_OF_MEMORY);
+		}
+
+		cursor->prebuilt = row_create_prebuilt(table, 0);
+
+		prebuilt = cursor->prebuilt;
+
+		prebuilt->trx = trx;
+
+		cursor->valid_trx = TRUE;
+
+		prebuilt->table = table;
+		prebuilt->select_lock_type = LOCK_NONE;
+		prebuilt->innodb_api = TRUE;
+
+		prebuilt->index = index;
+
+		ut_a(prebuilt->index != NULL);
+
+		if (prebuilt->trx != NULL) {
+			++prebuilt->trx->n_mysql_tables_in_use;
+
+			 prebuilt->index_usable =
+				row_merge_is_index_usable(
+					prebuilt->trx, prebuilt->index);
+
+			/* Assign a read view if the transaction does
+			not have it yet */
+
+			trx_assign_read_view(prebuilt->trx);
+		}
+
+		*ib_crsr = (ib_crsr_t) cursor;
+	} else {
+		err = DB_OUT_OF_MEMORY;
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Create an internal cursor instance on the index identified by index_id.
+An index_id of 0 selects the first index of the table; any other value
+is looked up by id while holding the dictionary mutex.
+@return	DB_SUCCESS or err code */
+static
+ib_err_t
+ib_create_cursor_with_index_id(
+/*===========================*/
+	ib_crsr_t*	ib_crsr,	/*!< out: InnoDB cursor */
+	dict_table_t*	table,		/*!< in: table instance */
+	ib_id_u64_t	index_id,	/*!< in: index id or 0 */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	dict_index_t*	chosen;
+
+	if (index_id == 0) {
+		/* No specific index requested. */
+		chosen = dict_table_get_first_index(table);
+	} else {
+		mutex_enter(&dict_sys->mutex);
+		chosen = dict_index_find_on_id_low(index_id);
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	return(ib_create_cursor(ib_crsr, table, chosen, trx));
+}
+
+/*****************************************************************//**
+Open an InnoDB table by its id and return a cursor on its first index.
+@return	DB_SUCCESS, DB_TABLE_NOT_FOUND or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_table_using_id(
+/*==========================*/
+	ib_id_u64_t	table_id,	/*!< in: table id of table to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr)	/*!< out,own: InnoDB cursor */
+{
+	dict_table_t*	table;
+
+	/* TRUE is passed as the second argument only when a transaction
+	was supplied and it holds the schema lock in exclusive mode. */
+	if (ib_trx != NULL && ib_schema_lock_is_exclusive(ib_trx)) {
+		table = ib_open_table_by_id(table_id, TRUE);
+	} else {
+		table = ib_open_table_by_id(table_id, FALSE);
+	}
+
+	if (table == NULL) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	return(ib_create_cursor_with_index_id(
+		ib_crsr, table, 0, (trx_t*) ib_trx));
+}
+
+/*****************************************************************//**
+Open an InnoDB index and return a cursor handle to it. The owning
+table is opened using the upper 32 bits of index_id as the table id.
+*ib_crsr is set to NULL unless a cursor is successfully created.
+@return	DB_SUCCESS, DB_TABLE_NOT_FOUND or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_index_using_id(
+/*==========================*/
+	ib_id_u64_t	index_id,	/*!< in: index id of index to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr)	/*!< out: InnoDB cursor */
+{
+	ib_err_t	err;
+	dict_table_t*	table;
+	ulint		table_id = (ulint) (index_id >> 32);
+
+	/* Give the out parameter a defined value on every error path,
+	as ib_cursor_open_index_using_name() does. */
+	*ib_crsr = NULL;
+
+	if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) {
+		table = ib_open_table_by_id(table_id, FALSE);
+	} else {
+		table = ib_open_table_by_id(table_id, TRUE);
+	}
+
+	if (table == NULL) {
+
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	err = ib_create_cursor_with_index_id(
+		ib_crsr, table, index_id, (trx_t*) ib_trx);
+
+	/* Inspect the cursor only if one was actually created. Testing
+	ib_crsr itself (the address of the out parameter) says nothing
+	about that: after a failed create *ib_crsr is not a cursor and
+	must not be dereferenced. */
+	if (err == DB_SUCCESS && *ib_crsr != NULL) {
+		const ib_cursor_t*	cursor;
+
+		cursor = *(ib_cursor_t**) ib_crsr;
+
+		if (cursor->prebuilt->index == NULL) {
+			ib_err_t	crsr_err;
+
+			crsr_err = ib_cursor_close(*ib_crsr);
+			ut_a(crsr_err == DB_SUCCESS);
+
+			*ib_crsr = NULL;
+		}
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Open an InnoDB secondary index cursor and return a cursor handle to it.
+The index is searched by case-insensitive name among the indexes of the
+table that ib_open_crsr is open on, and the new cursor uses the same
+transaction as ib_open_crsr. An extra reference to the table is taken
+for the new cursor; it is released again here if no cursor is returned
+because the index was not found or the cursor could not be created.
+@return	DB_SUCCESS, DB_ERROR if no index has that name, or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_index_using_name(
+/*============================*/
+	ib_crsr_t	ib_open_crsr,	/*!< in: open/active cursor */
+	const char*	index_name,	/*!< in: secondary index name */
+	ib_crsr_t*	ib_crsr,	/*!< out,own: InnoDB index cursor */
+	int*		idx_type,	/*!< out: index is cluster index */
+	ib_id_u64_t*	idx_id)		/*!< out: index id */
+{
+	dict_table_t*	table;
+	dict_index_t*	index;
+	index_id_t	index_id = 0;
+	ib_err_t	err;
+	ib_cursor_t*	open_cursor = (ib_cursor_t*) ib_open_crsr;
+
+	*idx_type = 0;
+	*idx_id = 0;
+	*ib_crsr = NULL;
+
+	/* We want to increment the ref count, so we do a redundant search. */
+	table = dict_table_open_on_id(open_cursor->prebuilt->table->id,
+				      FALSE, DICT_TABLE_OP_NORMAL);
+	ut_a(table != NULL);
+
+	/* The first index is always the cluster index. */
+	index = dict_table_get_first_index(table);
+
+	/* Traverse the user defined indexes. */
+	while (index != NULL) {
+		if (innobase_strcasecmp(index->name, index_name) == 0) {
+			index_id = index->id;
+			*idx_type = index->type;
+			*idx_id = index_id;
+			break;
+		}
+		index = UT_LIST_GET_NEXT(indexes, index);
+	}
+
+	if (!index_id) {
+		dict_table_close(table, FALSE, FALSE);
+		return(DB_ERROR);
+	}
+
+	/* index_id is non-zero here, so index is the matching index. */
+	ut_ad(index->id == index_id);
+
+	err = ib_create_cursor(
+		ib_crsr, table, index, open_cursor->prebuilt->trx);
+
+	if (err != DB_SUCCESS) {
+		/* No cursor took over the table reference acquired
+		above, so release it the same way as on the not-found
+		path. */
+		dict_table_close(table, FALSE, FALSE);
+		return(err);
+	}
+
+	if (*ib_crsr != NULL) {
+		const ib_cursor_t*	new_cursor;
+
+		new_cursor = *(ib_cursor_t**) ib_crsr;
+
+		if (new_cursor->prebuilt->index == NULL) {
+			err = ib_cursor_close(*ib_crsr);
+			ut_a(err == DB_SUCCESS);
+			*ib_crsr = NULL;
+		}
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it. The name is
+normalized with ib_normalize_table_name() before the lookup. When a
+transaction is given and it holds the schema lock exclusively the table
+is fetched with ib_lookup_table_by_name(), otherwise with
+ib_open_table_by_name(). The cursor is created on the first index of
+the table.
+NOTE(review): a table without a first index is reported as not found,
+but the table handle obtained by the lookup is not released in this
+function - confirm whether a reference is leaked on that path.
+@return	DB_SUCCESS, DB_TABLE_NOT_FOUND or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_table(
+/*=================*/
+	const char*	name,		/*!< in: table name */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr)	/*!< out,own: InnoDB cursor */
+{
+	ib_err_t	err;
+	dict_table_t*	table;
+	char*		normalized_name;
+
+	normalized_name = static_cast<char*>(mem_alloc(ut_strlen(name) + 1));
+	ib_normalize_table_name(normalized_name, name);
+
+	if (ib_trx != NULL) {
+	       if (!ib_schema_lock_is_exclusive(ib_trx)) {
+			table = (dict_table_t*)ib_open_table_by_name(
+				normalized_name);
+		} else {
+			/* NOTE: We do not acquire MySQL metadata lock */
+			table = ib_lookup_table_by_name(normalized_name);
+		}
+	} else {
+		table = (dict_table_t*)ib_open_table_by_name(normalized_name);
+	}
+
+	mem_free(normalized_name);
+	normalized_name = NULL;
+
+	/* It can happen that another thread has created the table but
+	not the cluster index or it's a broken table definition. Refuse to
+	open if that's the case. */
+	if (table != NULL && dict_table_get_first_index(table) == NULL) {
+		table = NULL;
+	}
+
+	if (table != NULL) {
+		err = ib_create_cursor_with_index_id(ib_crsr, table, 0,
+						     (trx_t*) ib_trx);
+	} else {
+		err = DB_TABLE_NOT_FOUND;
+	}
+
+	return(err);
+}
+
+/********************************************************************//**
+Release the insert, update and select query graphs of a query
+processing context and zero the whole context struct. */
+static
+void
+ib_qry_proc_free(
+/*=============*/
+	ib_qry_proc_t*	q_proc)		/*!< in, own: qproc struct */
+{
+	ib_qry_grph_t*	grph = &q_proc->grph;
+
+	que_graph_free_recursive(grph->ins);
+	que_graph_free_recursive(grph->upd);
+	que_graph_free_recursive(grph->sel);
+
+	/* Wipes the node and graph pointers alike. */
+	memset(q_proc, 0x0, sizeof(*q_proc));
+}
+
+/*****************************************************************//**
+Detach the transaction from a cursor by setting the trx pointer of its
+prebuilt struct to NULL. */
+UNIV_INTERN
+void
+ib_cursor_clear_trx(
+/*================*/
+	ib_crsr_t	ib_crsr)	/*!< in/out: InnoDB cursor */
+{
+	row_prebuilt_t*	prebuilt = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	prebuilt->trx = NULL;
+}
+
+/*****************************************************************//**
+Reset the cursor: give back one n_mysql_tables_in_use count to the
+attached transaction (if any), free the query graphs and empty the
+query heap.
+@return	DB_SUCCESS */
+UNIV_INTERN
+ib_err_t
+ib_cursor_reset(
+/*============*/
+	ib_crsr_t	ib_crsr)	/*!< in/out: InnoDB cursor */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+
+	if (cursor->valid_trx) {
+		trx_t*	trx = cursor->prebuilt->trx;
+
+		if (trx != NULL && trx->n_mysql_tables_in_use > 0) {
+			--trx->n_mysql_tables_in_use;
+		}
+	}
+
+	/* The query graph fields live in the query heap, so they have
+	to be dropped before the heap is emptied. */
+	ib_qry_proc_free(&cursor->q_proc);
+
+	mem_heap_empty(cursor->query_heap);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Update the cursor with a new transaction and reset its query state:
+the query graphs are freed and the query heap is emptied.
+@return	DB_SUCCESS */
+ib_err_t
+ib_cursor_new_trx(
+/*==============*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx)		/*!< in: transaction */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+
+	/* Hand the new transaction to the prebuilt struct. */
+	row_update_prebuilt_trx(prebuilt, (trx_t*) ib_trx);
+
+	cursor->valid_trx = TRUE;
+
+	trx_assign_read_view(prebuilt->trx);
+
+	ib_qry_proc_free(&cursor->q_proc);
+
+	mem_heap_empty(cursor->query_heap);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Commit the transaction attached to a cursor and detach it from the
+cursor, whatever the commit returned.
+@return	the result of ib_trx_commit() */
+ib_err_t
+ib_cursor_commit_trx(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx)		/*!< in: transaction */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+	ib_err_t	commit_err;
+
+	ut_ad(prebuilt->trx == (trx_t*) ib_trx);
+
+	commit_err = ib_trx_commit(ib_trx);
+
+	/* The cursor no longer has a usable transaction. */
+	cursor->valid_trx = FALSE;
+	prebuilt->trx = NULL;
+
+	return(commit_err);
+}
+
+/*****************************************************************//**
+Free a cursor: its query graphs, its prebuilt struct and both of its
+heaps. A NULL cursor is accepted and ignored. The table itself is not
+closed here.
+@return	DB_SUCCESS */
+UNIV_INTERN
+ib_err_t
+ib_cursor_close(
+/*============*/
+	ib_crsr_t	ib_crsr)	/*!< in,own: InnoDB cursor */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+
+	if (cursor == NULL) {
+		return(DB_SUCCESS);
+	}
+
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+	trx_t*		trx = prebuilt->trx;
+
+	ib_qry_proc_free(&cursor->q_proc);
+
+	/* The transaction could have been detached from the cursor. */
+	if (cursor->valid_trx && trx != NULL) {
+		if (trx->n_mysql_tables_in_use > 0) {
+			--trx->n_mysql_tables_in_use;
+		}
+	}
+
+	row_prebuilt_free(prebuilt, FALSE);
+	cursor->prebuilt = NULL;
+
+	/* cursor->heap is released last. */
+	mem_heap_free(cursor->query_heap);
+	mem_heap_free(cursor->heap);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Close the table a cursor is open on, decrementing n_ref_count. Nothing
+is done when the cursor has no prebuilt struct or no table.
+@return	DB_SUCCESS */
+UNIV_INTERN
+ib_err_t
+ib_cursor_close_table(
+/*==================*/
+	ib_crsr_t	ib_crsr)	/*!< in,own: InnoDB cursor */
+{
+	row_prebuilt_t*	prebuilt = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	if (prebuilt != NULL && prebuilt->table != NULL) {
+		dict_table_close(prebuilt->table, FALSE, FALSE);
+	}
+
+	return(DB_SUCCESS);
+}
+/**********************************************************************//**
+Run the insert query and do error handling. Each iteration points the
+query thread at the insert node and calls row_ins_step(). If the
+transaction then reports an error, the thread is stopped and
+ib_handle_errors() is consulted; the step is repeated for as long as
+ib_handle_errors() returns a true lock_wait value.
+@return	DB_SUCCESS or error code */
+UNIV_INLINE
+ib_err_t
+ib_insert_row_with_lock_retry(
+/*==========================*/
+	que_thr_t*	thr,		/*!< in: insert query graph */
+	ins_node_t*	node,		/*!< in: insert node for the query */
+	trx_savept_t*	savept)		/*!< in: savepoint to rollback to
+					in case of an error */
+{
+	trx_t*		trx;
+	ib_err_t	err;
+	ib_bool_t	lock_wait;
+
+	trx = thr_get_trx(thr);
+
+	do {
+		thr->run_node = node;
+		thr->prev_node = node;
+
+		row_ins_step(thr);
+
+		err = trx->error_state;
+
+		if (err != DB_SUCCESS) {
+			que_thr_stop_for_mysql(thr);
+
+			thr->lock_state = QUE_THR_LOCK_ROW;
+			lock_wait = ib_handle_errors(&err, trx, thr, savept);
+			thr->lock_state = QUE_THR_LOCK_NOLOCK;
+		} else {
+			lock_wait = FALSE;
+		}
+	} while (lock_wait);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Write a row. Takes a savepoint on the transaction of the insert graph,
+moves the first thread of the graph to the run state and executes the
+insert with lock retry. On success the thread is stopped, the row count
+of the table is incremented and the inserted-rows statistic is bumped.
+trx->op_info is cleared before returning in either case.
+@return	DB_SUCCESS or err code */
+static
+ib_err_t
+ib_execute_insert_query_graph(
+/*==========================*/
+	dict_table_t*	table,		/*!< in: table where to insert */
+	que_fork_t*	ins_graph,	/*!< in: query graph */
+	ins_node_t*	node)		/*!< in: insert node */
+{
+	trx_t*		trx;
+	que_thr_t*	thr;
+	trx_savept_t	savept;
+	ib_err_t	err = DB_SUCCESS;
+
+	trx = ins_graph->trx;
+
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(ins_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	err = ib_insert_row_with_lock_retry(thr, node, &savept);
+
+	if (err == DB_SUCCESS) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
+
+		dict_table_n_rows_inc(table);
+
+		srv_stats.n_rows_inserted.inc();
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*****************************************************************//**
+Create the insert node and insert query graph of a cursor if they do
+not exist yet. Both are allocated from the cursor's query heap. The
+transaction of the cursor must have been started. */
+static
+void
+ib_insert_query_graph_create(
+/*==========================*/
+	ib_cursor_t*	cursor)		/*!< in: Cursor instance */
+{
+	ib_qry_proc_t*	q_proc = &cursor->q_proc;
+	ib_qry_node_t*	node = &q_proc->node;
+	trx_t*		trx = cursor->prebuilt->trx;
+
+	ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+	if (node->ins != NULL) {
+		/* Already built by an earlier call. */
+		return;
+	}
+
+	mem_heap_t*	query_heap = cursor->query_heap;
+	dict_table_t*	ins_table = cursor->prebuilt->table;
+	ib_qry_grph_t*	graphs = &q_proc->grph;
+	dtuple_t*	new_row;
+
+	node->ins = ins_node_create(INS_DIRECT, ins_table, query_heap);
+
+	node->ins->select = NULL;
+	node->ins->values_list = NULL;
+
+	/* The row to insert has one field per table column, typed
+	after the table. */
+	new_row = dtuple_create(query_heap, dict_table_get_n_cols(ins_table));
+	dict_table_copy_types(new_row, ins_table);
+
+	ins_node_set_new_row(node->ins, new_row);
+
+	graphs->ins = static_cast<que_fork_t*>(
+		que_node_get_parent(
+			pars_complete_graph_for_exec(node->ins, trx,
+						     query_heap)));
+
+	graphs->ins->state = QUE_FORK_ACTIVE;
+}
+
+/*****************************************************************//**
+Insert a row to a table. The insert query graph of the cursor is
+created on first use. Every field of the tuple that is not a system
+column is shallow-copied into the row of the insert node; a NULL value
+in a DATA_NOT_NULL column stops the copy with DB_DATA_MISMATCH and
+nothing is inserted. srv_active_wake_master_thread() is called before
+returning in every case.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_insert_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor instance */
+	const ib_tpl_t	ib_tpl)		/*!< in: tuple to insert */
+{
+	ib_ulint_t	i;
+	ib_qry_node_t*	node;
+	ib_qry_proc_t*	q_proc;
+	ulint		n_fields;
+	dtuple_t*	dst_dtuple;
+	ib_err_t	err = DB_SUCCESS;
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	const ib_tuple_t* src_tuple = (const ib_tuple_t*) ib_tpl;
+
+	ib_insert_query_graph_create(cursor);
+
+	ut_ad(src_tuple->type == TPL_TYPE_ROW);
+
+	q_proc = &cursor->q_proc;
+	node = &q_proc->node;
+
+	node->ins->state = INS_NODE_ALLOC_ROW_ID;
+	dst_dtuple = node->ins->row;
+
+	n_fields = dtuple_get_n_fields(src_tuple->ptr);
+	ut_ad(n_fields == dtuple_get_n_fields(dst_dtuple));
+
+	/* Do a shallow copy of the data fields and check for NULL
+	constraints on columns. */
+	for (i = 0; i < n_fields; i++) {
+		ulint		mtype;
+		dfield_t*	src_field;
+		dfield_t*	dst_field;
+
+		src_field = dtuple_get_nth_field(src_tuple->ptr, i);
+
+		mtype = dtype_get_mtype(dfield_get_type(src_field));
+
+		/* Don't touch the system columns. */
+		if (mtype != DATA_SYS) {
+			ulint	prtype;
+
+			prtype = dtype_get_prtype(dfield_get_type(src_field));
+
+			if ((prtype & DATA_NOT_NULL)
+			    && dfield_is_null(src_field)) {
+
+				err = DB_DATA_MISMATCH;
+				break;
+			}
+
+			dst_field = dtuple_get_nth_field(dst_dtuple, i);
+			ut_ad(mtype
+			      == dtype_get_mtype(dfield_get_type(dst_field)));
+
+			/* Do a shallow copy. */
+			dfield_set_data(
+				dst_field, src_field->data, src_field->len);
+
+			if (dst_field->len != IB_SQL_NULL) {
+				UNIV_MEM_ASSERT_RW(dst_field->data,
+						   dst_field->len);
+			}
+		}
+	}
+
+	if (err == DB_SUCCESS) {
+		err = ib_execute_insert_query_graph(
+			src_tuple->index->table, q_proc->grph.ins, node->ins);
+	}
+
+	srv_active_wake_master_thread();
+
+	return(err);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. The update
+node is created from the cursor's query heap on first use; the update
+query graph is completed again and marked QUE_FORK_ACTIVE on every
+call. The transaction of the cursor must have been started.
+@return	update vector */
+UNIV_INLINE
+upd_t*
+ib_update_vector_create(
+/*====================*/
+	ib_cursor_t*	cursor)		/*!< in: current cursor */
+{
+	trx_t*		trx = cursor->prebuilt->trx;
+	mem_heap_t*	heap = cursor->query_heap;
+	dict_table_t*	table = cursor->prebuilt->table;
+	ib_qry_proc_t*	q_proc = &cursor->q_proc;
+	ib_qry_grph_t*	grph = &q_proc->grph;
+	ib_qry_node_t*	node = &q_proc->node;
+
+	ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+	if (node->upd == NULL) {
+		node->upd = static_cast<upd_node_t*>(
+			row_create_update_node_for_mysql(table, heap));
+	}
+
+	grph->upd = static_cast<que_fork_t*>(
+		que_node_get_parent(
+			pars_complete_graph_for_exec(node->upd, trx, heap)));
+
+	grph->upd->state = QUE_FORK_ACTIVE;
+
+	return(node->upd->update);
+}
+
+/**********************************************************************//**
+Record the new value of one changed column in an update field and set
+the field number to the column's position in the clustered index. */
+static
+void
+ib_update_col(
+/*==========*/
+	ib_cursor_t*	cursor,		/*!< in: current cursor */
+	upd_field_t*	upd_field,	/*!< in/out: update field */
+	ulint		col_no,		/*!< in: column number */
+	dfield_t*	dfield)		/*!< in: updated dfield */
+{
+	dict_table_t*	table = cursor->prebuilt->table;
+	dict_index_t*	clust_index = dict_table_get_first_index(table);
+
+	if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
+		dfield_copy_data(&upd_field->new_val, dfield);
+	} else {
+		/* The column is being set to SQL NULL. */
+		dfield_set_null(&upd_field->new_val);
+	}
+
+	upd_field->exp = NULL;
+	upd_field->orig_len = 0;
+
+	upd_field->field_no = dict_col_get_clust_pos(
+		&table->cols[col_no], clust_index);
+}
+
+/**********************************************************************//**
+Checks which fields have changed in a row and stores the new data
+to an update vector. Both tuples must be row tuples of the same table.
+System columns are skipped. A field counts as changed when its length
+differs from that of the old field, or when the old field is not NULL
+and the bytes differ. Setting a DATA_NOT_NULL column to NULL fails with
+DB_DATA_MISMATCH, in which case upd->n_fields and upd->info_bits are
+not updated.
+@return	DB_SUCCESS or err code */
+static
+ib_err_t
+ib_calc_diff(
+/*=========*/
+	ib_cursor_t*	cursor,		/*!< in: current cursor */
+	upd_t*		upd,		/*!< in/out: update vector */
+	const ib_tuple_t*old_tuple,	/*!< in: Old tuple in table */
+	const ib_tuple_t*new_tuple)	/*!< in: New tuple to update */
+{
+	ulint		i;
+	ulint		n_changed = 0;
+	ib_err_t	err = DB_SUCCESS;
+	ulint		n_fields = dtuple_get_n_fields(new_tuple->ptr);
+
+	ut_a(old_tuple->type == TPL_TYPE_ROW);
+	ut_a(new_tuple->type == TPL_TYPE_ROW);
+	ut_a(old_tuple->index->table == new_tuple->index->table);
+
+	for (i = 0; i < n_fields; ++i) {
+		ulint		mtype;
+		ulint		prtype;
+		upd_field_t*	upd_field;
+		dfield_t*	new_dfield;
+		dfield_t*	old_dfield;
+
+		new_dfield = dtuple_get_nth_field(new_tuple->ptr, i);
+		old_dfield = dtuple_get_nth_field(old_tuple->ptr, i);
+
+		mtype = dtype_get_mtype(dfield_get_type(old_dfield));
+		prtype = dtype_get_prtype(dfield_get_type(old_dfield));
+
+		/* Skip the system columns */
+		if (mtype == DATA_SYS) {
+			continue;
+
+		} else if ((prtype & DATA_NOT_NULL)
+			   && dfield_is_null(new_dfield)) {
+
+			err = DB_DATA_MISMATCH;
+			break;
+		}
+
+		if (dfield_get_len(new_dfield) != dfield_get_len(old_dfield)
+		    || (!dfield_is_null(old_dfield)
+		        && memcmp(dfield_get_data(new_dfield),
+			      dfield_get_data(old_dfield),
+			      dfield_get_len(old_dfield)) != 0)) {
+
+			upd_field = &upd->fields[n_changed];
+
+			ib_update_col(cursor, upd_field, i, new_dfield);
+
+			++n_changed;
+		}
+	}
+
+	if (err == DB_SUCCESS) {
+		upd->info_bits = 0;
+		upd->n_fields = n_changed;
+	}
+
+	return(err);
+}
+
+/**********************************************************************//**
+Run the update query and do error handling. The update step is repeated
+for as long as ib_handle_errors() asks for a retry. DB_RECORD_NOT_FOUND
+is not passed to ib_handle_errors(); it stops the thread and is
+returned as is.
+@return	DB_SUCCESS or error code */
+UNIV_INLINE
+ib_err_t
+ib_update_row_with_lock_retry(
+/*==========================*/
+	que_thr_t*	thr,		/*!< in: Update query graph */
+	upd_node_t*	node,		/*!< in: Update node for the query */
+	trx_savept_t*	savept)		/*!< in: savepoint to rollback to
+					in case of an error */
+{
+	trx_t*		trx = thr_get_trx(thr);
+	ib_err_t	err;
+	ib_bool_t	retry;
+
+	do {
+		retry = FALSE;
+
+		thr->run_node = node;
+		thr->prev_node = node;
+
+		row_upd_step(thr);
+
+		err = trx->error_state;
+
+		if (err == DB_SUCCESS) {
+			break;
+		}
+
+		que_thr_stop_for_mysql(thr);
+
+		if (err == DB_RECORD_NOT_FOUND) {
+			break;
+		}
+
+		thr->lock_state = QUE_THR_LOCK_ROW;
+
+		retry = ib_handle_errors(&err, trx, thr, savept);
+
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
+	} while (retry);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row. pcur must be a cursor on the
+clustered index; its stored position is copied to the persistent cursor
+of the update node, which must then be positioned on a record
+(BTR_PCUR_ON). On success the deleted-rows or updated-rows statistic is
+bumped and, for a delete, the row count of the table is decremented.
+DB_RECORD_NOT_FOUND is returned to the caller but cleared from
+trx->error_state. trx->op_info is cleared before returning.
+@return	DB_SUCCESS or err code */
+UNIV_INLINE
+ib_err_t
+ib_execute_update_query_graph(
+/*==========================*/
+	ib_cursor_t*	cursor,		/*!< in: Cursor instance */
+	btr_pcur_t*	pcur)		/*!< in: Btree persistent cursor */
+{
+	ib_err_t	err;
+	que_thr_t*	thr;
+	upd_node_t*	node;
+	trx_savept_t	savept;
+	trx_t*		trx = cursor->prebuilt->trx;
+	dict_table_t*	table = cursor->prebuilt->table;
+	ib_qry_proc_t*	q_proc = &cursor->q_proc;
+
+	/* The transaction must be running. */
+	ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+	node = q_proc->node.upd;
+
+	ut_a(dict_index_is_clust(pcur->btr_cur.index));
+	btr_pcur_copy_stored_position(node->pcur, pcur);
+
+	ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(q_proc->grph.upd);
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	err = ib_update_row_with_lock_retry(thr, node, &savept);
+
+	if (err == DB_SUCCESS) {
+
+		que_thr_stop_for_mysql_no_error(thr, trx);
+
+		if (node->is_delete) {
+
+			dict_table_n_rows_dec(table);
+
+			srv_stats.n_rows_deleted.inc();
+		} else {
+			srv_stats.n_rows_updated.inc();
+		}
+
+	} else if (err == DB_RECORD_NOT_FOUND) {
+		trx->error_state = DB_SUCCESS;
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*****************************************************************//**
+Update a row in a table. For a cursor on the clustered index the row is
+located through prebuilt->pcur. For any other index it is located
+through prebuilt->clust_pcur, which requires need_to_access_clustered
+to be set; otherwise DB_ERROR is returned. Both tuples must be row
+tuples. The update vector is filled from the difference between the
+old and the new tuple. srv_active_wake_master_thread() is called once
+the update has been attempted.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_update_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	const ib_tpl_t	ib_old_tpl,	/*!< in: Old tuple in table */
+	const ib_tpl_t	ib_new_tpl)	/*!< in: New tuple to update */
+{
+	upd_t*		upd;
+	ib_err_t	err;
+	btr_pcur_t*	pcur;
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+	const ib_tuple_t*old_tuple = (const ib_tuple_t*) ib_old_tpl;
+	const ib_tuple_t*new_tuple = (const ib_tuple_t*) ib_new_tpl;
+
+	if (dict_index_is_clust(prebuilt->index)) {
+		pcur = &cursor->prebuilt->pcur;
+	} else if (prebuilt->need_to_access_clustered) {
+		pcur = &cursor->prebuilt->clust_pcur;
+	} else {
+		return(DB_ERROR);
+	}
+
+	ut_a(old_tuple->type == TPL_TYPE_ROW);
+	ut_a(new_tuple->type == TPL_TYPE_ROW);
+
+	upd = ib_update_vector_create(cursor);
+
+	err = ib_calc_diff(cursor, upd, old_tuple, new_tuple);
+
+	if (err == DB_SUCCESS) {
+		/* Note that this is not a delete. */
+		cursor->q_proc.node.upd->is_delete = FALSE;
+
+		err = ib_execute_update_query_graph(cursor, pcur);
+	}
+
+	srv_active_wake_master_thread();
+
+	return(err);
+}
+
+/**********************************************************************//**
+Build the update query graph to delete a row from an index. A key tuple
+with the user defined ordering fields of the first index of the table
+is read from rec and copied into the update vector, the update node is
+marked as a delete and the update query graph is executed. The key
+tuple is deleted again before returning.
+@return	DB_SUCCESS, DB_OUT_OF_MEMORY if the key tuple cannot be
+created, or err code */
+static
+ib_err_t
+ib_delete_row(
+/*==========*/
+	ib_cursor_t*	cursor,		/*!< in: current cursor */
+	btr_pcur_t*	pcur,		/*!< in: Btree persistent cursor */
+	const rec_t*	rec)		/*!< in: record to delete */
+{
+	ulint		i;
+	upd_t*		upd;
+	ib_err_t	err;
+	ib_tuple_t*	tuple;
+	ib_tpl_t	ib_tpl;
+	ulint		n_cols;
+	upd_field_t*	upd_field;
+	ib_bool_t	page_format;
+	dict_table_t*	table = cursor->prebuilt->table;
+	dict_index_t*	index = dict_table_get_first_index(table);
+
+	n_cols = dict_index_get_n_ordering_defined_by_user(index);
+	ib_tpl = ib_key_tuple_new(index, n_cols);
+
+	if (!ib_tpl) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	tuple = (ib_tuple_t*) ib_tpl;
+
+	upd = ib_update_vector_create(cursor);
+
+	page_format = dict_table_is_comp(index->table);
+	ib_read_tuple(rec, page_format, tuple);
+
+	upd->n_fields = ib_tuple_get_n_cols(ib_tpl);
+
+	for (i = 0; i < upd->n_fields; ++i) {
+		dfield_t*	dfield;
+
+		upd_field = &upd->fields[i];
+		dfield = dtuple_get_nth_field(tuple->ptr, i);
+
+		dfield_copy_data(&upd_field->new_val, dfield);
+
+		upd_field->exp = NULL;
+
+		upd_field->orig_len = 0;
+
+		upd->info_bits = 0;
+
+		upd_field->field_no = dict_col_get_clust_pos(
+			&table->cols[i], index);
+	}
+
+	/* Note that this is a delete. */
+	cursor->q_proc.node.upd->is_delete = TRUE;
+
+	err = ib_execute_update_query_graph(cursor, pcur);
+
+	ib_tuple_delete(ib_tpl);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Delete a row in a table. A cursor on a secondary index deletes through
+prebuilt->clust_pcur and needs need_to_access_clustered to be set,
+otherwise DB_ERROR is returned. The position of the persistent cursor
+is restored inside a mini-transaction. DB_RECORD_NOT_FOUND is returned
+when the cursor is not positioned, when the position cannot be restored
+or when the record is already delete-marked.
+srv_active_wake_master_thread() is called once the delete has been
+attempted.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+	ib_crsr_t	ib_crsr)	/*!< in: InnoDB cursor instance */
+{
+	ib_err_t	err;
+	btr_pcur_t*	pcur;
+	dict_index_t*	index;
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+
+	index = dict_table_get_first_index(prebuilt->index->table);
+
+	/* Check whether this is a secondary index cursor */
+	if (index != prebuilt->index) {
+		if (prebuilt->need_to_access_clustered) {
+			pcur = &prebuilt->clust_pcur;
+		} else {
+			return(DB_ERROR);
+		}
+	} else {
+		pcur = &prebuilt->pcur;
+	}
+
+	if (ib_btr_cursor_is_positioned(pcur)) {
+		const rec_t*	rec;
+		ib_bool_t	page_format;
+		mtr_t		mtr;
+
+		page_format = dict_table_is_comp(index->table);
+
+		mtr_start(&mtr);
+
+		if (btr_pcur_restore_position(
+			BTR_SEARCH_LEAF, pcur, &mtr)) {
+
+			rec = btr_pcur_get_rec(pcur);
+		} else {
+			rec = NULL;
+		}
+
+		mtr_commit(&mtr);
+
+		if (rec && !rec_get_deleted_flag(rec, page_format)) {
+			err = ib_delete_row(cursor, pcur, rec);
+		} else {
+			err = DB_RECORD_NOT_FOUND;
+		}
+	} else {
+		err = DB_RECORD_NOT_FOUND;
+	}
+
+	srv_active_wake_master_thread();
+
+	return(err);
+}
+
+/*****************************************************************//**
+Read current row. The transaction of the cursor must have been started.
+A row tuple read through a cursor that needs to access the clustered
+index is read from prebuilt->clust_pcur, anything else from
+prebuilt->pcur. If prebuilt->innodb_api_rec is set it takes precedence
+over the record under the persistent cursor. DB_RECORD_NOT_FOUND is
+returned when the cursor is not positioned, when its position cannot be
+restored or when the record is delete-marked.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_read_row(
+/*===============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl)		/*!< out: read cols into this tuple */
+{
+	ib_err_t	err;
+	ib_tuple_t*	tuple = (ib_tuple_t*) ib_tpl;
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+
+	ut_a(cursor->prebuilt->trx->state != TRX_STATE_NOT_STARTED);
+
+	/* When searching with IB_EXACT_MATCH set, row_search_for_mysql()
+	will not position the persistent cursor but will copy the record
+	found into the row cache. It should be the only entry. */
+	if (!ib_cursor_is_positioned(ib_crsr) ) {
+		err = DB_RECORD_NOT_FOUND;
+	} else {
+		mtr_t		mtr;
+		btr_pcur_t*	pcur;
+		row_prebuilt_t*	prebuilt = cursor->prebuilt;
+
+		/* pcur is always the address of a member of prebuilt,
+		so it can never be NULL and needs no check. */
+		if (prebuilt->need_to_access_clustered
+		    && tuple->type == TPL_TYPE_ROW) {
+			pcur = &prebuilt->clust_pcur;
+		} else {
+			pcur = &prebuilt->pcur;
+		}
+
+		mtr_start(&mtr);
+
+		if (btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr)) {
+			const rec_t*	rec;
+			ib_bool_t	page_format;
+
+			page_format = dict_table_is_comp(tuple->index->table);
+			rec = btr_pcur_get_rec(pcur);
+
+			if (prebuilt->innodb_api_rec &&
+			    prebuilt->innodb_api_rec != rec) {
+				rec = prebuilt->innodb_api_rec;
+			}
+
+			if (!rec_get_deleted_flag(rec, page_format)) {
+				ib_read_tuple(rec, page_format, tuple);
+				err = DB_SUCCESS;
+			} else {
+				err = DB_RECORD_NOT_FOUND;
+			}
+
+		} else {
+			err = DB_RECORD_NOT_FOUND;
+		}
+
+		mtr_commit(&mtr);
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Position the cursor with an empty search tuple, using the given search
+mode. ib_cursor_first() and ib_cursor_last() use this to move to either
+end of the index. A temporary buffer of UNIV_PAGE_SIZE bytes is
+allocated for the search and freed afterwards.
+@return	DB_SUCCESS or err code */
+UNIV_INLINE
+ib_err_t
+ib_cursor_position(
+/*===============*/
+	ib_cursor_t*	cursor,		/*!< in: InnoDB cursor instance */
+	ib_srch_mode_t	mode)		/*!< in: Search mode */
+{
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+	unsigned char*	row_buf;
+	ib_err_t	result;
+
+	/* We want to position at one of the ends, row_search_for_mysql()
+	uses the search_tuple fields to work out what to do. */
+	dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+	row_buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE));
+
+	result = static_cast<ib_err_t>(row_search_for_mysql(
+		row_buf, mode, prebuilt, 0, 0));
+
+	mem_free(row_buf);
+
+	return(result);
+}
+
+/*****************************************************************//**
+Move cursor to the first record in the table by positioning it with
+search mode IB_CUR_G.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_first(
+/*============*/
+	ib_crsr_t	ib_crsr)	/*!< in: InnoDB cursor instance */
+{
+	return(ib_cursor_position((ib_cursor_t*) ib_crsr, IB_CUR_G));
+}
+
+/*****************************************************************//**
+Move cursor to the last record in the table by positioning it with
+search mode IB_CUR_L.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_last(
+/*===========*/
+	ib_crsr_t	ib_crsr)	/*!< in: InnoDB cursor instance */
+{
+	return(ib_cursor_position((ib_cursor_t*) ib_crsr, IB_CUR_L));
+}
+
+/*****************************************************************//**
+Move cursor to the next user record in the table. The search runs with
+an empty search tuple and the ROW_SEL_NEXT direction, using a buffer of
+UNIV_PAGE_SIZE_MAX bytes on the stack.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_next(
+/*===========*/
+	ib_crsr_t	ib_crsr)	/*!< in: InnoDB cursor instance */
+{
+	byte		buf[UNIV_PAGE_SIZE_MAX];
+	row_prebuilt_t*	prebuilt = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	/* We want to move to the next record */
+	dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+	return(static_cast<ib_err_t>(row_search_for_mysql(
+		buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT)));
+}
+
+/*****************************************************************//**
+Search for key. The key columns are shallow-copied into the cursor's
+search tuple and row_search_for_mysql() is run with the given search mode
+and the match mode stored in the cursor. At most as many fields as the
+key tuple actually holds are used.
+@return	DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_moveto(
+/*=============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl,		/*!< in: Key to search for */
+	ib_srch_mode_t	ib_srch_mode)	/*!< in: search mode */
+{
+	ulint		i;
+	ulint		n_fields;
+	ib_err_t	err = DB_SUCCESS;
+	ib_tuple_t*	tuple = (ib_tuple_t*) ib_tpl;
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	row_prebuilt_t*	prebuilt = cursor->prebuilt;
+	dtuple_t*	search_tuple = prebuilt->search_tuple;
+	unsigned char*	buf;
+
+	ut_a(tuple->type == TPL_TYPE_KEY);
+
+	n_fields = dict_index_get_n_ordering_defined_by_user(prebuilt->index);
+
+	/* The key tuple may hold fewer fields than the index has ordering
+	columns; never read past the fields it really contains. */
+	if (n_fields > dtuple_get_n_fields(tuple->ptr)) {
+		n_fields = dtuple_get_n_fields(tuple->ptr);
+	}
+
+	dtuple_set_n_fields(search_tuple, n_fields);
+	dtuple_set_n_fields_cmp(search_tuple, n_fields);
+
+	/* Do a shallow copy: the search tuple points at the key's data. */
+	for (i = 0; i < n_fields; ++i) {
+		dfield_copy(dtuple_get_nth_field(search_tuple, i),
+			    dtuple_get_nth_field(tuple->ptr, i));
+	}
+
+	ut_a(prebuilt->select_lock_type <= LOCK_NUM);
+
+	prebuilt->innodb_api_rec = NULL;
+
+	/* Scratch row buffer for row_search_for_mysql(), freed below. */
+	buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE));
+
+	err = static_cast<ib_err_t>(row_search_for_mysql(
+		buf, ib_srch_mode, prebuilt, cursor->match_mode, 0));
+
+	mem_free(buf);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Set the cursor match mode. The value is stored in the cursor and later
+passed to row_search_for_mysql() by ib_cursor_moveto(). */
+UNIV_INTERN
+void
+ib_cursor_set_match_mode(
+/*=====================*/
+	ib_crsr_t	ib_crsr,	/*!< in: Cursor instance */
+	ib_match_mode_t	match_mode)	/*!< in: ib_cursor_moveto match mode */
+{
+	((ib_cursor_t*) ib_crsr)->match_mode = match_mode;
+}
+
+/*****************************************************************//**
+Get the dfield instance for the column in the tuple.
+@return	dfield instance at position col_no of the tuple's dtuple */
+UNIV_INLINE
+dfield_t*
+ib_col_get_dfield(
+/*==============*/
+	ib_tuple_t*	tuple,		/*!< in: tuple instance */
+	ulint		col_no)		/*!< in: col no. in tuple */
+{
+	return(dtuple_get_nth_field(tuple->ptr, col_no));
+}
+
+/*****************************************************************//**
+Predicate to check whether a column type contains variable length data
+with an upper bound. True when the main type is one of DATA_VARCHAR,
+DATA_CHAR, DATA_MYSQL, DATA_VARMYSQL, DATA_FIXBINARY or DATA_BINARY and
+the declared length is greater than zero.
+@return	non-zero (carried in an ib_err_t) if capped, zero otherwise */
+UNIV_INLINE
+ib_err_t
+ib_col_is_capped(
+/*==============*/
+	const dtype_t*	dtype)		/*!< in: column type */
+{
+	switch (dtype_get_mtype(dtype)) {
+	case DATA_VARCHAR:
+	case DATA_CHAR:
+	case DATA_MYSQL:
+	case DATA_VARMYSQL:
+	case DATA_FIXBINARY:
+	case DATA_BINARY:
+		return(static_cast<ib_err_t>(dtype_get_len(dtype) > 0));
+
+	default:
+		return(static_cast<ib_err_t>(false));
+	}
+}
+
+/*****************************************************************//**
+Set a column of the tuple. Depending on the column's main type the value
+is converted to InnoDB storage format (DATA_INT, DATA_FLOAT, DATA_DOUBLE),
+padded (DATA_CHAR, DATA_MYSQL) or copied as is. For the plain byte types
+(DATA_BLOB, DATA_BINARY, DATA_DECIMAL, DATA_VARCHAR, DATA_FIXBINARY) the
+field is pointed at src without copying when need_cpy is false. New
+buffers are allocated from the tuple's heap.
+@return	DB_SUCCESS, DB_DATA_MISMATCH if the column is a system column or
+len does not match a numeric column, or DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ib_err_t
+ib_col_set_value(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	col_no,		/*!< in: column index in tuple */
+	const void*	src,		/*!< in: data value */
+	ib_ulint_t	len,		/*!< in: data value len */
+	ib_bool_t	need_cpy)	/*!< in: if need memcpy */
+{
+	const dtype_t*  dtype;
+	dfield_t*	dfield;
+	void*		dst = NULL;
+	ib_tuple_t*	tuple = (ib_tuple_t*) ib_tpl;
+	ulint		col_len;
+
+	dfield = ib_col_get_dfield(tuple, col_no);
+
+	/* User wants to set the column to NULL. */
+	if (len == IB_SQL_NULL) {
+		dfield_set_null(dfield);
+		return(DB_SUCCESS);
+	}
+
+	dtype = dfield_get_type(dfield);
+	col_len = dtype_get_len(dtype);
+
+	/* Not allowed to update system columns. */
+	if (dtype_get_mtype(dtype) == DATA_SYS) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* Start with the buffer the field currently points at; it is reused
+	below unless it is missing or too small for the new value. */
+	dst = dfield_get_data(dfield);
+
+	/* Since TEXT/CLOB also map to DATA_VARCHAR we need to make an
+	exception. Perhaps we need to set the precise type and check
+	for that. */
+	if (ib_col_is_capped(dtype)) {
+
+		/* Capped column: silently truncate to the declared length. */
+		len = ut_min(len, col_len);
+
+		/* A fresh buffer always has the full declared length.
+		NOTE(review): a reused buffer is only known to hold
+		dfield_get_len() bytes, while the DATA_CHAR and DATA_MYSQL
+		cases below write up to col_len bytes into it; it may also
+		be caller memory installed by an earlier need_cpy == false
+		call - confirm this cannot overrun. */
+		if (dst == NULL || len > dfield_get_len(dfield)) {
+			dst = mem_heap_alloc(tuple->heap, col_len);
+			ut_a(dst != NULL);
+		}
+	} else if (dst == NULL || len > dfield_get_len(dfield)) {
+		dst = mem_heap_alloc(tuple->heap, len);
+	}
+
+	if (dst == NULL) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	switch (dtype_get_mtype(dtype)) {
+	case DATA_INT: {
+
+		/* Integers must be supplied with exactly the column width. */
+		if (col_len == len) {
+			ibool		usign;
+
+			usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+			mach_write_int_type(static_cast<byte*>(dst),
+					    static_cast<const byte*>(src),
+					    len, usign);
+
+		} else {
+			return(DB_DATA_MISMATCH);
+		}
+		break;
+	}
+
+	case DATA_FLOAT:
+		/* src is read as a native float. */
+		if (len == sizeof(float)) {
+			mach_float_write(static_cast<byte*>(dst), *(float*)src);
+		} else {
+			return(DB_DATA_MISMATCH);
+		}
+		break;
+
+	case DATA_DOUBLE:
+		/* src is read as a native double. */
+		if (len == sizeof(double)) {
+			mach_double_write(static_cast<byte*>(dst),
+					  *(double*)src);
+		} else {
+			return(DB_DATA_MISMATCH);
+		}
+		break;
+
+	case DATA_SYS:
+		/* Unreachable: system columns were rejected above. */
+		ut_error;
+		break;
+
+	case DATA_CHAR: {
+		ulint	pad_char = ULINT_UNDEFINED;
+
+		pad_char = dtype_get_pad_char(
+			dtype_get_mtype(dtype),	dtype_get_prtype(dtype));
+
+		ut_a(pad_char != ULINT_UNDEFINED);
+
+		/* Fill the tail of the column with the pad character, then
+		copy the value in front of it.
+		NOTE(review): col_len - len wraps around if len > col_len,
+		which looks possible only for a column that is not capped
+		(declared length 0) - confirm. */
+		memset((byte*) dst + len,
+		       pad_char,
+		       col_len - len);
+
+		memcpy(dst, src, len);
+
+		/* The stored value always has the full declared length. */
+		len = col_len;
+		break;
+	}
+	case DATA_BLOB:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARCHAR:
+	case DATA_FIXBINARY:
+		if (need_cpy) {
+			memcpy(dst, src, len);
+		} else {
+			/* No copy: the field points directly at src, so dst
+			equals the field data in the check after the switch. */
+			dfield_set_data(dfield, src, len);
+			dst = dfield_get_data(dfield);
+		}
+		break;
+
+	case DATA_MYSQL:
+	case DATA_VARMYSQL: {
+		ulint		cset;
+		CHARSET_INFO*	cs;
+		int		error = 0;
+		ulint		true_len = len;
+
+		/* For multi byte character sets we need to
+		calculate the true length of the data. */
+		cset = dtype_get_charset_coll(
+			dtype_get_prtype(dtype));
+		cs = all_charsets[cset];
+		if (cs) {
+			/* Upper bound on characters handed to
+			well_formed_len(): declared length / mbmaxlen. */
+			uint pos = (uint)(col_len / cs->mbmaxlen);
+
+			if (len > 0 && cs->mbmaxlen > 1) {
+				true_len = (ulint)
+					cs->cset->well_formed_len(
+						cs,
+						(const char*)src,
+						(const char*)src + len,
+						pos,
+						&error);
+
+				if (true_len < len) {
+					len = true_len;
+				}
+			}
+		}
+
+		/* All invalid bytes in data need be truncated.
+		If len == 0, means all bytes of the data is invalid.
+		In this case, the data will be truncated to empty.*/
+		memcpy(dst, src, len);
+
+		/* For DATA_MYSQL, need to pad the unused
+		space with spaces. */
+		if (dtype_get_mtype(dtype) == DATA_MYSQL) {
+			ulint		n_chars;
+
+			if (len < col_len) {
+				ulint	pad_len = col_len - len;
+
+				ut_a(cs != NULL);
+				ut_a(!(pad_len % cs->mbminlen));
+
+				cs->cset->fill(cs, (char*)dst + len,
+					       pad_len,
+					       0x20 /* space */);
+			}
+
+			/* Why we should do below? See function
+			row_mysql_store_col_in_innobase_format */
+
+			ut_a(!(dtype_get_len(dtype)
+				% dtype_get_mbmaxlen(dtype)));
+
+			n_chars = dtype_get_len(dtype)
+				/ dtype_get_mbmaxlen(dtype);
+
+			/* Strip space padding. Trailing 0x20 bytes are
+			dropped, but never below n_chars bytes. */
+			while (col_len > n_chars
+				&& ((char*)dst)[col_len - 1] == 0x20) {
+				col_len--;
+			}
+
+			len = col_len;
+		}
+		break;
+	}
+
+	default:
+		ut_error;
+	}
+
+	/* If a different buffer was filled, install it together with the
+	final length; otherwise only the length needs updating. */
+	if (dst != dfield_get_data(dfield)) {
+		dfield_set_data(dfield, dst, len);
+	} else {
+		dfield_set_len(dfield, len);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Get the size of the data available in a column of the tuple. The
+internal UNIV_SQL_NULL marker is translated to IB_SQL_NULL.
+@return	bytes avail or IB_SQL_NULL */
+UNIV_INTERN
+ib_ulint_t
+ib_col_get_len(
+/*===========*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i)		/*!< in: column index in tuple */
+{
+	const dfield_t*	dfield = ib_col_get_dfield((ib_tuple_t*) ib_tpl, i);
+	const ulint	n_bytes = dfield_get_len(dfield);
+
+	if (n_bytes == UNIV_SQL_NULL) {
+		return(IB_SQL_NULL);
+	}
+
+	return(n_bytes);
+}
+
+/*****************************************************************//**
+Copy a column value from the tuple. DATA_INT, DATA_FLOAT and DATA_DOUBLE
+columns are decoded from InnoDB storage format into a native value;
+every other type is copied byte for byte, truncated to len.
+@return	bytes copied, 0 if len does not match a float/double column,
+or IB_SQL_NULL */
+UNIV_INLINE
+ib_ulint_t
+ib_col_copy_value_low(
+/*==================*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	void*		dst,		/*!< out: copied data value */
+	ib_ulint_t	len)		/*!< in: max data value len to copy */
+{
+	const void*	data;
+	const dfield_t*	dfield;
+	ulint		data_len;
+	ib_tuple_t*	tuple = (ib_tuple_t*) ib_tpl;
+
+	dfield = ib_col_get_dfield(tuple, i);
+
+	data = dfield_get_data(dfield);
+	data_len = dfield_get_len(dfield);
+
+	if (data_len != UNIV_SQL_NULL) {
+
+		const dtype_t*  dtype = dfield_get_type(dfield);
+
+		switch (dtype_get_mtype(dfield_get_type(dfield))) {
+		case DATA_INT: {
+			ibool		usign;
+			ullint		ret;
+
+			/* Integers are only read at their exact width. */
+			ut_a(data_len == len);
+
+			usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+			ret = mach_read_int_type(static_cast<const byte*>(data),
+						 data_len, usign);
+
+			/* Store the low len bytes of ret through a pointer
+			whose signedness matches the column. The bits written
+			are the same for both branches.
+			NOTE(review): any len other than 1, 2 or 4 takes the
+			8 byte store - confirm callers never pass another
+			width (e.g. 3). */
+			if (usign) {
+				if (len == 1) {
+					*(ib_u8_t*)dst = (ib_u8_t)ret;
+				} else if (len == 2) {
+					*(ib_u16_t*)dst = (ib_u16_t)ret;
+				} else if (len == 4) {
+					*(ib_u32_t*)dst = (ib_u32_t)ret;
+				} else {
+					*(ib_u64_t*)dst = (ib_u64_t)ret;
+				}
+			} else {
+				if (len == 1) {
+					*(ib_i8_t*)dst = (ib_i8_t)ret;
+				} else if (len == 2) {
+					*(ib_i16_t*)dst = (ib_i16_t)ret;
+				} else if (len == 4) {
+					*(ib_i32_t*)dst = (ib_i32_t)ret;
+				} else {
+					*(ib_i64_t*)dst = (ib_i64_t)ret;
+				}
+			}
+
+			break;
+		}
+		case DATA_FLOAT:
+			if (len == data_len) {
+				float	f;
+
+				ut_a(data_len == sizeof(f));
+				f = mach_float_read(static_cast<const byte*>(
+					data));
+				memcpy(dst, &f, sizeof(f));
+			} else {
+				/* Width mismatch: nothing is copied. */
+				data_len = 0;
+			}
+			break;
+		case DATA_DOUBLE:
+			if (len == data_len) {
+				double	d;
+
+				ut_a(data_len == sizeof(d));
+				d = mach_double_read(static_cast<const byte*>(
+					data));
+				memcpy(dst, &d, sizeof(d));
+			} else {
+				/* Width mismatch: nothing is copied. */
+				data_len = 0;
+			}
+			break;
+		default:
+			/* Raw copy, limited to the caller's buffer size. */
+			data_len = ut_min(data_len, len);
+			memcpy(dst, data, data_len);
+		}
+	} else {
+		data_len = IB_SQL_NULL;
+	}
+
+	return(data_len);
+}
+
+/*****************************************************************//**
+Copy a column value from the tuple. Exported wrapper around
+ib_col_copy_value_low().
+@return	bytes copied or IB_SQL_NULL */
+UNIV_INTERN
+ib_ulint_t
+ib_col_copy_value(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	void*		dst,		/*!< out: copied data value */
+	ib_ulint_t	len)		/*!< in: max data value len to copy */
+{
+	ib_ulint_t	n_copied;
+
+	n_copied = ib_col_copy_value_low(ib_tpl, i, dst, len);
+
+	return(n_copied);
+}
+
+/*****************************************************************//**
+Get the InnoDB column attribute from the internal column precise type.
+Only the DATA_UNSIGNED and DATA_NOT_NULL bits are translated.
+@return	precise type in api format */
+UNIV_INLINE
+ib_col_attr_t
+ib_col_get_attr(
+/*============*/
+	ulint		prtype)		/*!< in: column definition */
+{
+	ulint	bits = IB_COL_NONE;
+
+	if (prtype & DATA_UNSIGNED) {
+		bits |= IB_COL_UNSIGNED;
+	}
+
+	if (prtype & DATA_NOT_NULL) {
+		bits |= IB_COL_NOT_NULL;
+	}
+
+	return(static_cast<ib_col_attr_t>(bits));
+}
+
+/*****************************************************************//**
+Get a column name from the cursor's table. The i'th column of the table
+is looked up and its column number is resolved to a name.
+@return	name of the column */
+UNIV_INTERN
+const char*
+ib_col_get_name(
+/*============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i)		/*!< in: column index in tuple */
+{
+	dict_table_t*	table = ((ib_cursor_t*) ib_crsr)->prebuilt->table;
+	dict_col_t*	col = dict_table_get_nth_col(table, i);
+
+	return(dict_table_get_col_name(table, dict_col_get_no(col)));
+}
+
+/*****************************************************************//**
+Get an index field name from the cursor.
+@return	name of the field, or NULL if the cursor has no index or the
+field lookup yields nothing */
+UNIV_INTERN
+const char*
+ib_get_idx_field_name(
+/*==================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i)		/*!< in: column index in tuple */
+{
+	dict_index_t*	index = ((ib_cursor_t*) ib_crsr)->prebuilt->index;
+	dict_field_t*	field;
+
+	if (!index) {
+		return(NULL);
+	}
+
+	field = dict_index_get_nth_field(index, i);
+
+	if (!field) {
+		return(NULL);
+	}
+
+	return(field->name);
+}
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple. Fills in the
+type, type_len, attr and client_type members of ib_col_meta.
+@return	len of column data */
+UNIV_INLINE
+ib_ulint_t
+ib_col_get_meta_low(
+/*================*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	ib_col_meta_t*	ib_col_meta)	/*!< out: column meta data */
+{
+	const dfield_t*	dfield = ib_col_get_dfield((ib_tuple_t*) ib_tpl, i);
+	const dtype_t*	dtype = dfield_get_type(dfield);
+	const ulint	n_bytes = dfield_get_len(dfield);
+	ib_u16_t	prtype;
+
+	/* We assume 1-1 mapping between the ENUM and internal type codes. */
+	ib_col_meta->type = static_cast<ib_col_type_t>(
+		dtype_get_mtype(dtype));
+
+	ib_col_meta->type_len = dtype_get_len(dtype);
+
+	/* Only the low 16 bits of the precise type are kept. */
+	prtype = (ib_u16_t) dtype_get_prtype(dtype);
+
+	ib_col_meta->attr = ib_col_get_attr(prtype);
+	ib_col_meta->client_type = prtype & DATA_MYSQL_TYPE_MASK;
+
+	return(n_bytes);
+}
+
+/*************************************************************//**
+Check that a tuple column can be read as an integer of the given size
+and signedness. The column must be of type IB_INT, its declared length
+must equal size, and an unsigned column may not be read as signed.
+@return	DB_SUCCESS, DB_DATA_MISMATCH, or DB_UNDERFLOW if the declared
+length is IB_SQL_NULL */
+UNIV_INLINE
+ib_err_t
+ib_tuple_check_int(
+/*===============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_bool_t		usign,	/*!< in: true if unsigned */
+	ulint			size)	/*!< in: size of integer */
+{
+	ib_col_meta_t		meta;
+
+	ib_col_get_meta_low(ib_tpl, i, &meta);
+
+	if (meta.type != IB_INT) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	if (meta.type_len == IB_SQL_NULL) {
+		return(DB_UNDERFLOW);
+	}
+
+	if (meta.type_len != size) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	if (!usign && (meta.attr & IB_COL_UNSIGNED)) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Read a signed int 8 bit column from an InnoDB tuple. The column is first
+validated with ib_tuple_check_int(); the value is copied only if that
+check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i8(
+/*=============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_i8_t*		ival)	/*!< out: integer value */
+{
+	const ib_err_t	status = ib_tuple_check_int(
+		ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+	if (status != DB_SUCCESS) {
+		return(status);
+	}
+
+	ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+
+	return(status);
+}
+
+/*************************************************************//**
+Read an unsigned int 8 bit column from an InnoDB tuple. The column is
+first validated with ib_tuple_check_int(); the value is copied only if
+that check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u8(
+/*=============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_u8_t*		ival)	/*!< out: integer value */
+{
+	const ib_err_t	status = ib_tuple_check_int(
+		ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+	if (status != DB_SUCCESS) {
+		return(status);
+	}
+
+	ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+
+	return(status);
+}
+
+/*************************************************************//**
+Read a signed int 16 bit column from an InnoDB tuple. The column is first
+validated with ib_tuple_check_int(); the value is copied only if that
+check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i16(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_i16_t*		ival)	/*!< out: integer value */
+{
+	ib_err_t		err;
+
+	/* usign is an ib_bool_t, so pass the API constant IB_FALSE as
+	ib_tuple_read_i8() does. */
+	err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+	if (err == DB_SUCCESS) {
+		ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+	}
+
+	return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 16 bit column from an InnoDB tuple. The column is
+first validated with ib_tuple_check_int(); the value is copied only if
+that check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u16(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_u16_t*		ival)	/*!< out: integer value */
+{
+	const ib_err_t	status = ib_tuple_check_int(
+		ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+	if (status != DB_SUCCESS) {
+		return(status);
+	}
+
+	ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+
+	return(status);
+}
+
+/*************************************************************//**
+Read a signed int 32 bit column from an InnoDB tuple. The column is first
+validated with ib_tuple_check_int(); the value is copied only if that
+check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i32(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_i32_t*		ival)	/*!< out: integer value */
+{
+	ib_err_t		err;
+
+	/* usign is an ib_bool_t, so pass the API constant IB_FALSE as
+	ib_tuple_read_i8() does. */
+	err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+	if (err == DB_SUCCESS) {
+		ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+	}
+
+	return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 32 bit column from an InnoDB tuple. The column is
+first validated with ib_tuple_check_int(); the value is copied only if
+that check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u32(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_u32_t*		ival)	/*!< out: integer value */
+{
+	const ib_err_t	status = ib_tuple_check_int(
+		ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+	if (status != DB_SUCCESS) {
+		return(status);
+	}
+
+	ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+
+	return(status);
+}
+
+/*************************************************************//**
+Read a signed int 64 bit column from an InnoDB tuple. The column is first
+validated with ib_tuple_check_int(); the value is copied only if that
+check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i64(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_i64_t*		ival)	/*!< out: integer value */
+{
+	ib_err_t		err;
+
+	/* usign is an ib_bool_t, so pass the API constant IB_FALSE as
+	ib_tuple_read_i8() does. */
+	err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+	if (err == DB_SUCCESS) {
+		ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+	}
+
+	return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 64 bit column from an InnoDB tuple. The column is
+first validated with ib_tuple_check_int(); the value is copied only if
+that check succeeds.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u64(
+/*==============*/
+	ib_tpl_t		ib_tpl,	/*!< in: InnoDB tuple */
+	ib_ulint_t		i,	/*!< in: column number */
+	ib_u64_t*		ival)	/*!< out: integer value */
+{
+	const ib_err_t	status = ib_tuple_check_int(
+		ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+	if (status != DB_SUCCESS) {
+		return(status);
+	}
+
+	ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+
+	return(status);
+}
+
+/*****************************************************************//**
+Get a column value pointer from the tuple. No data is copied; the
+pointer refers to the buffer the field currently points at.
+@return	NULL if the column is SQL NULL, else pointer to buffer */
+UNIV_INTERN
+const void*
+ib_col_get_value(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i)		/*!< in: column index in tuple */
+{
+	const dfield_t*	dfield = ib_col_get_dfield((ib_tuple_t*) ib_tpl, i);
+	const void*	value = dfield_get_data(dfield);
+
+	if (dfield_get_len(dfield) == UNIV_SQL_NULL) {
+		return(NULL);
+	}
+
+	return(value);
+}
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple. Exported
+wrapper around ib_col_get_meta_low().
+@return	len of column data */
+UNIV_INTERN
+ib_ulint_t
+ib_col_get_meta(
+/*============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	ib_col_meta_t*	ib_col_meta)	/*!< out: column meta data */
+{
+	ib_ulint_t	n_bytes;
+
+	n_bytes = ib_col_get_meta_low(ib_tpl, i, ib_col_meta);
+
+	return(n_bytes);
+}
+
+/*****************************************************************//**
+"Clear" or reset an InnoDB tuple. The tuple's heap is emptied and a new
+tuple of the same kind, index and column count is built in that heap.
+@return	new tuple, or NULL */
+UNIV_INTERN
+ib_tpl_t
+ib_tuple_clear(
+/*============*/
+	ib_tpl_t	ib_tpl)		/*!< in,own: tuple (will be freed) */
+{
+	ib_tuple_t*		old_tuple = (ib_tuple_t*) ib_tpl;
+
+	/* Everything needed from the old tuple is read before the heap
+	is emptied; the old tuple is not touched afterwards. */
+	const ib_tuple_type_t	kind = old_tuple->type;
+	mem_heap_t*		heap = old_tuple->heap;
+	const dict_index_t*	index = old_tuple->index;
+	const ulint		n_cols = dtuple_get_n_fields(old_tuple->ptr);
+
+	mem_heap_empty(heap);
+
+	if (kind != TPL_TYPE_ROW) {
+		return(ib_key_tuple_new_low(index, n_cols, heap));
+	}
+
+	return(ib_row_tuple_new_low(index, n_cols, heap));
+}
+
+/*****************************************************************//**
+Create a new cluster key search tuple and copy into it the columns of
+the secondary index key tuple that refer to the cluster index record.
+The column data is deep-copied into the new tuple's heap.
+@return	DB_SUCCESS or error code; *ib_dst_tpl is only assigned once the
+source tuple has passed validation */
+UNIV_INTERN
+ib_err_t
+ib_tuple_get_cluster_key(
+/*=====================*/
+	ib_crsr_t	ib_crsr,	/*!< in: secondary index cursor */
+	ib_tpl_t*	ib_dst_tpl,	/*!< out,own: destination tuple */
+	const ib_tpl_t	ib_src_tpl)	/*!< in: source tuple */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	ib_tuple_t*	src_tuple = (ib_tuple_t*) ib_src_tpl;
+	ib_tuple_t*	dst_tuple;
+	dict_index_t*	clust_index;
+	ulint		n_unique;
+
+	clust_index = dict_table_get_first_index(cursor->prebuilt->table);
+
+	/* The source must be a key tuple ... */
+	if (src_tuple->type != TPL_TYPE_KEY) {
+		return(DB_ERROR);
+	}
+
+	/* ... on the same table as the open cursor ... */
+	if (src_tuple->index->table != cursor->prebuilt->table) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* ... and must not already be a cluster index tuple. */
+	if (src_tuple->index == clust_index) {
+		return(DB_ERROR);
+	}
+
+	/* Create the cluster index key search tuple. */
+	*ib_dst_tpl = ib_clust_search_tuple_create(ib_crsr);
+
+	if (!*ib_dst_tpl) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	dst_tuple = (ib_tuple_t*) *ib_dst_tpl;
+	ut_a(dst_tuple->index == clust_index);
+
+	n_unique = dict_index_get_n_unique(dst_tuple->index);
+
+	/* Deep copy: for every unique cluster index field find its
+	position in the source index and duplicate the data. */
+	for (ulint field_no = 0; field_no < n_unique; field_no++) {
+		const ulint	src_pos = dict_index_get_nth_field_pos(
+			src_tuple->index, dst_tuple->index, field_no);
+
+		ut_a(src_pos != ULINT_UNDEFINED);
+
+		dfield_t*	from = dtuple_get_nth_field(
+			src_tuple->ptr, src_pos);
+		dfield_t*	to = dtuple_get_nth_field(
+			dst_tuple->ptr, field_no);
+
+		if (dfield_is_null(from)) {
+			dfield_set_null(to);
+			continue;
+		}
+
+		UNIV_MEM_ASSERT_RW(from->data, from->len);
+
+		to->data = mem_heap_dup(
+			dst_tuple->heap, from->data, from->len);
+
+		to->len = from->len;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Copy the contents of source tuple to destination tuple. The tuples must
+be of the same type and belong to the same index. Column data is
+deep-copied into the destination tuple's heap.
+@return	DB_SUCCESS, or DB_DATA_MISMATCH if type or index differ */
+UNIV_INTERN
+ib_err_t
+ib_tuple_copy(
+/*==========*/
+	ib_tpl_t	ib_dst_tpl,	/*!< in: destination tuple */
+	const ib_tpl_t	ib_src_tpl)	/*!< in: source tuple */
+{
+	const ib_tuple_t*	src_tuple = (const ib_tuple_t*) ib_src_tpl;
+	ib_tuple_t*		dst_tuple = (ib_tuple_t*) ib_dst_tpl;
+	ulint			n_fields;
+
+	/* Make sure src and dst are not the same. */
+	ut_a(src_tuple != dst_tuple);
+
+	/* Make sure they are the same type and refer to the same index. */
+	if (src_tuple->type != dst_tuple->type) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	if (src_tuple->index != dst_tuple->index) {
+		return(DB_DATA_MISMATCH);
+	}
+
+	n_fields = dtuple_get_n_fields(src_tuple->ptr);
+	ut_ad(n_fields == dtuple_get_n_fields(dst_tuple->ptr));
+
+	/* Deep copy, field by field. */
+	for (ulint field_no = 0; field_no < n_fields; ++field_no) {
+		dfield_t*	from = dtuple_get_nth_field(
+			src_tuple->ptr, field_no);
+		dfield_t*	to = dtuple_get_nth_field(
+			dst_tuple->ptr, field_no);
+
+		if (dfield_is_null(from)) {
+			dfield_set_null(to);
+			continue;
+		}
+
+		UNIV_MEM_ASSERT_RW(from->data, from->len);
+
+		to->data = mem_heap_dup(
+			dst_tuple->heap, from->data, from->len);
+
+		to->len = from->len;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Create an InnoDB key tuple used for searching the cursor's current
+index. The tuple has dict_index_get_n_unique_in_tree() columns.
+@return	own: Tuple for current index */
+UNIV_INTERN
+ib_tpl_t
+ib_sec_search_tuple_create(
+/*=======================*/
+	ib_crsr_t	ib_crsr)	/*!< in: Cursor instance */
+{
+	dict_index_t*	index = ((ib_cursor_t*) ib_crsr)->prebuilt->index;
+	const ulint	n_cols = dict_index_get_n_unique_in_tree(index);
+
+	return(ib_key_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Create an InnoDB row tuple used for reading from the cursor's current
+index. The tuple has one column per field of that index.
+@return	own: Tuple for current index */
+UNIV_INTERN
+ib_tpl_t
+ib_sec_read_tuple_create(
+/*=====================*/
+	ib_crsr_t	ib_crsr)	/*!< in: Cursor instance */
+{
+	dict_index_t*	index = ((ib_cursor_t*) ib_crsr)->prebuilt->index;
+	const ulint	n_cols = dict_index_get_n_fields(index);
+
+	return(ib_row_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Create an InnoDB tuple used for table key operations. The key tuple is
+built on the first index of the cursor's table and has one column per
+user-defined ordering field of that index.
+@return	own: Tuple for current table */
+UNIV_INTERN
+ib_tpl_t
+ib_clust_search_tuple_create(
+/*=========================*/
+	ib_crsr_t	ib_crsr)	/*!< in: Cursor instance */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	dict_index_t*	first_index;
+
+	first_index = dict_table_get_first_index(cursor->prebuilt->table);
+
+	return(ib_key_tuple_new(
+		first_index,
+		dict_index_get_n_ordering_defined_by_user(first_index)));
+}
+
+/*****************************************************************//**
+Create an InnoDB tuple for table row operations. The row tuple is built
+on the first index of the cursor's table and has dict_table_get_n_cols()
+columns.
+@return	own: Tuple for current table */
+UNIV_INTERN
+ib_tpl_t
+ib_clust_read_tuple_create(
+/*=======================*/
+	ib_crsr_t	ib_crsr)	/*!< in: Cursor instance */
+{
+	ib_cursor_t*	cursor = (ib_cursor_t*) ib_crsr;
+	dict_index_t*	first_index;
+
+	first_index = dict_table_get_first_index(cursor->prebuilt->table);
+
+	return(ib_row_tuple_new(
+		first_index,
+		dict_table_get_n_cols(cursor->prebuilt->table)));
+}
+
+/*****************************************************************//**
+Return the number of user columns in the tuple definition. For a row
+tuple this is the user column count of the table; for any other tuple
+it is the number of user-defined ordering fields of the index.
+@return	number of user columns */
+UNIV_INTERN
+ib_ulint_t
+ib_tuple_get_n_user_cols(
+/*=====================*/
+	const ib_tpl_t	ib_tpl)		/*!< in: Tuple for current table */
+{
+	const ib_tuple_t*	tuple = (const ib_tuple_t*) ib_tpl;
+
+	if (tuple->type != TPL_TYPE_ROW) {
+		return(dict_index_get_n_ordering_defined_by_user(
+			tuple->index));
+	}
+
+	return(dict_table_get_n_user_cols(tuple->index->table));
+}
+
+/*****************************************************************//**
+Return the number of columns in the tuple definition, i.e. the field
+count of the underlying dtuple.
+@return	number of columns */
+UNIV_INTERN
+ib_ulint_t
+ib_tuple_get_n_cols(
+/*================*/
+	const ib_tpl_t	ib_tpl)		/*!< in: Tuple for table/index */
+{
+	return(dtuple_get_n_fields(((const ib_tuple_t*) ib_tpl)->ptr));
+}
+
+/*****************************************************************//**
+Destroy an InnoDB tuple by freeing its heap. Passing a NULL tuple is
+a no-op. */
+UNIV_INTERN
+void
+ib_tuple_delete(
+/*============*/
+	ib_tpl_t	ib_tpl)		/*!< in,own: Tuple instance to delete */
+{
+	if (ib_tpl) {
+		ib_tuple_t*	tuple = (ib_tuple_t*) ib_tpl;
+
+		mem_heap_free(tuple->heap);
+	}
+}
+
+/*****************************************************************//**
+Get a table id. The lookup in ib_table_get_id_low() is performed while
+holding the dictionary mutex, which this function acquires and releases.
+@return	DB_SUCCESS if found */
+UNIV_INTERN
+ib_err_t
+ib_table_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: table to find */
+	ib_id_u64_t*	table_id)	/*!< out: table id if found */
+{
+	dict_mutex_enter_for_mysql();
+
+	const ib_err_t	status = ib_table_get_id_low(table_name, table_id);
+
+	dict_mutex_exit_for_mysql();
+
+	return(status);
+}
+
+/*****************************************************************//**
+Get an index id. The returned value packs the table id into the upper
+32 bits and the index id into the lower bits. *index_id is set to 0
+when nothing is found.
+@return	DB_SUCCESS if found, DB_TABLE_NOT_FOUND if either the table or
+the index does not exist */
+UNIV_INTERN
+ib_err_t
+ib_index_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: find index for this table */
+	const char*	index_name,	/*!< in: index to find */
+	ib_id_u64_t*	index_id)	/*!< out: index id if found */
+{
+	dict_table_t*	table;
+	dict_index_t*	index;
+	char*		name_buf;
+
+	*index_id = 0;
+
+	/* The normalized name is only needed for the table lookup. */
+	name_buf = static_cast<char*>(
+		mem_alloc(ut_strlen(table_name) + 1));
+	ib_normalize_table_name(name_buf, table_name);
+
+	table = ib_lookup_table_by_name(name_buf);
+
+	mem_free(name_buf);
+	name_buf = NULL;
+
+	if (table == NULL) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	index = dict_table_get_index_on_name(table, index_name);
+
+	if (index == NULL) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	/* We only support 32 bit table and index ids. Because
+	we need to pack the table id into the index id. */
+	*index_id = (table->id);
+	*index_id <<= 32;
+	*index_id |= (index->id);
+
+	return(DB_SUCCESS);
+}
+
+/* Path separator character: backslash when __WIN__ is defined, forward
+slash otherwise. */
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR      '\\'
+#else
+#define SRV_PATH_SEPARATOR      '/'
+#endif
+
+
+/*****************************************************************//**
+Check if cursor is positioned. The answer comes from
+ib_btr_cursor_is_positioned() on the prebuilt struct's persistent cursor.
+@return	IB_TRUE if positioned */
+UNIV_INTERN
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+	const ib_crsr_t	ib_crsr)	/*!< in: InnoDB cursor instance */
+{
+	const ib_cursor_t*	cursor = (const ib_cursor_t*) ib_crsr;
+
+	return(ib_btr_cursor_is_positioned(&cursor->prebuilt->pcur));
+}
+
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode by the
+transaction, i.e. its dict_operation_lock_mode is RW_X_LATCH.
+@return	TRUE if exclusive latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+	const ib_trx_t	ib_trx)		/*!< in: transaction */
+{
+	return(((const trx_t*) ib_trx)->dict_operation_lock_mode
+	       == RW_X_LATCH);
+}
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in shared mode by the
+transaction, i.e. its dict_operation_lock_mode is RW_S_LATCH.
+@return	TRUE if shared latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_shared(
+/*=====================*/
+	const ib_trx_t	ib_trx)		/*!< in: transaction */
+{
+	return(((const trx_t*) ib_trx)->dict_operation_lock_mode
+	       == RW_S_LATCH);
+}
+
+/*****************************************************************//**
+Lock the table of an InnoDB cursor in the given mode, within the
+cursor's transaction. Delegates to ib_trx_lock_table_with_retry().
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode)	/*!< in: InnoDB lock mode */
+{
+	row_prebuilt_t*	prebuilt = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	return(ib_trx_lock_table_with_retry(
+		prebuilt->trx,
+		prebuilt->table,
+		(enum lock_mode) ib_lck_mode));
+}
+
+/*****************************************************************//**
+Set a lock on an InnoDB table using the table id. The transaction must
+already be started and only IB_LOCK_IS and IB_LOCK_IX are accepted; both
+conditions are enforced with assertions. The result is also stored in
+trx->error_state.
+@return	DB_SUCCESS or error code, DB_TABLE_NOT_FOUND if the table could
+not be opened */
+UNIV_INTERN
+ib_err_t
+ib_table_lock(
+/*==========*/
+	ib_trx_t	ib_trx,		/*!< in/out: transaction */
+	ib_id_u64_t	table_id,	/*!< in: table id */
+	ib_lck_mode_t	ib_lck_mode)	/*!< in: InnoDB lock mode */
+{
+	ib_err_t	err;
+	que_thr_t*	thr;
+	mem_heap_t*	heap;
+	dict_table_t*	table;
+	ib_qry_proc_t	q_proc;
+	trx_t*		trx = (trx_t*) ib_trx;
+
+	ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+	/* NOTE(review): the table opened here is not released anywhere in
+	this function - confirm whether ib_open_table_by_id() takes a
+	reference that has to be dropped. */
+	table = ib_open_table_by_id(table_id, FALSE);
+
+	if (table == NULL) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM));
+
+	heap = mem_heap_create(128);
+
+	/* Build a select node and query graph in the temporary heap;
+	presumably this exists only to obtain the query thread that
+	lock_table() requires. */
+	q_proc.node.sel = sel_node_create(heap);
+
+	thr = pars_complete_graph_for_exec(q_proc.node.sel, trx, heap);
+
+	q_proc.grph.sel = static_cast<que_fork_t*>(que_node_get_parent(thr));
+	q_proc.grph.sel->state = QUE_FORK_ACTIVE;
+
+	trx->op_info = "setting table lock";
+
+	ut_a(ib_lck_mode == IB_LOCK_IS || ib_lck_mode == IB_LOCK_IX);
+	err = static_cast<ib_err_t>(
+		lock_table(0, table, (enum lock_mode) ib_lck_mode, thr));
+
+	trx->error_state = err;
+
+	/* Releases everything that was allocated from heap above. */
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Decrement the locked-table count of the cursor's transaction.
+@return	DB_SUCCESS, or DB_ERROR if no table is counted as locked */
+UNIV_INTERN
+ib_err_t
+ib_cursor_unlock(
+/*=============*/
+	ib_crsr_t	ib_crsr)	/*!< in/out: InnoDB cursor */
+{
+	ib_cursor_t*	crsr = (ib_cursor_t*) ib_crsr;
+	trx_t*		trx = crsr->prebuilt->trx;
+
+	/* Nothing is counted as locked: there is nothing to take back. */
+	if (!(trx->mysql_n_tables_locked > 0)) {
+		return(DB_ERROR);
+	}
+
+	--trx->mysql_n_tables_locked;
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set the select lock mode that the cursor will use.
+@return	DB_SUCCESS or the error code of ib_cursor_lock() */
+UNIV_INTERN
+ib_err_t
+ib_cursor_set_lock_mode(
+/*====================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode)	/*!< in: InnoDB lock mode */
+{
+	ib_err_t	err = DB_SUCCESS;
+	row_prebuilt_t*	pb = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM));
+
+	/* For S and X, first take the IS resp. IX lock via ib_cursor_lock(). */
+	if (ib_lck_mode == IB_LOCK_S) {
+		err = ib_cursor_lock(ib_crsr, IB_LOCK_IS);
+	} else if (ib_lck_mode == IB_LOCK_X) {
+		err = ib_cursor_lock(ib_crsr, IB_LOCK_IX);
+	}
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	pb->select_lock_type = (enum lock_mode) ib_lck_mode;
+	ut_a(pb->trx->state != TRX_STATE_NOT_STARTED);
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Flag the cursor as needing access to the clustered index record. */
+UNIV_INTERN
+void
+ib_cursor_set_cluster_access(
+/*=========================*/
+	ib_crsr_t	ib_crsr)	/*!< in/out: InnoDB cursor */
+{
+	row_prebuilt_t*	pb = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	/* Only the flag is set here; nothing else is touched. */
+	pb->need_to_access_clustered = TRUE;
+}
+
+/*************************************************************//**
+Check and write an INT column value to an InnoDB tuple.
+@return	DB_SUCCESS, DB_DATA_MISMATCH or the ib_col_set_value() error */
+UNIV_INLINE
+ib_err_t
+ib_tuple_write_int(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	ulint		col_no,		/*!< in: column number */
+	const void*	value,		/*!< in: integer value */
+	ulint		value_len)	/*!< in: sizeof value type */
+{
+	ib_tuple_t*	tpl = (ib_tuple_t*) ib_tpl;
+	const dfield_t*	field;
+
+	ut_a(col_no < ib_tuple_get_n_cols(ib_tpl));
+
+	field = ib_col_get_dfield(tpl, col_no);
+
+	/* The column must have main type DATA_INT and the caller's value
+	must be exactly as long as the data currently in the field. */
+	if (value_len != dfield_get_len(field)
+	    || dtype_get_mtype(dfield_get_type(field)) != DATA_INT) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* The length handed on is the length declared by the type. */
+	return(ib_col_set_value(
+		ib_tpl, col_no, value,
+		dtype_get_len(dfield_get_type(field)), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_i8_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i8_t		val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_i16_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i16(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i16_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_i32_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i32_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_i64_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i64_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_u8_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u8_t		val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_u16_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u16_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_u32_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u32(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u32_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write a host-format ib_u64_t to a column; integers are stored big-endian.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u64_t	val)		/*!< in: value to write */
+{
+	const void*	src = &val;
+	return(ib_col_set_value(ib_tpl, col_no, src, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Flag the cursor as being at the start of an SQL statement. */
+UNIV_INTERN
+void
+ib_cursor_stmt_begin(
+/*=================*/
+	ib_crsr_t	ib_crsr)	/*!< in: cursor */
+{
+	row_prebuilt_t*	pb = ((ib_cursor_t*) ib_crsr)->prebuilt;
+
+	pb->sql_stat_start = TRUE;
+}
+
+/*****************************************************************//**
+Write a double value to a column of main type DATA_DOUBLE.
+@return	DB_SUCCESS, DB_DATA_MISMATCH or the ib_col_set_value() error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	double		val)		/*!< in: value to write */
+{
+	ib_tuple_t*	tpl = (ib_tuple_t*) ib_tpl;
+	const dfield_t*	field = ib_col_get_dfield(tpl, col_no);
+
+	/* Refuse any column whose main type is not DATA_DOUBLE. */
+	if (dtype_get_mtype(dfield_get_type(field)) != DATA_DOUBLE) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* The value is handed over by address together with its size. */
+	return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*************************************************************//**
+Read the value of a DATA_DOUBLE column from an InnoDB tuple.
+@return	DB_SUCCESS, or DB_DATA_MISMATCH for any other column type */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	double*		dval)		/*!< out: double value */
+{
+	ib_tuple_t*	tpl = (ib_tuple_t*) ib_tpl;
+	const dfield_t*	field = ib_col_get_dfield(tpl, col_no);
+
+	/* Only columns whose main type is DATA_DOUBLE can be read here. */
+	if (dtype_get_mtype(dfield_get_type(field)) != DATA_DOUBLE) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* Type matches: copy the column value out to the caller. Whatever
+	ib_col_copy_value_low() may return is not looked at. */
+	ib_col_copy_value_low(ib_tpl, col_no, dval, sizeof(*dval));
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Write a float value to a column of main type DATA_FLOAT.
+@return	DB_SUCCESS, DB_DATA_MISMATCH or the ib_col_set_value() error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	float		val)		/*!< in: value to write */
+{
+	ib_tuple_t*	tpl = (ib_tuple_t*) ib_tpl;
+	const dfield_t*	field = ib_col_get_dfield(tpl, col_no);
+
+	/* Refuse any column whose main type is not DATA_FLOAT. */
+	if (dtype_get_mtype(dfield_get_type(field)) != DATA_FLOAT) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* The value is handed over by address together with its size. */
+	return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*************************************************************//**
+Read the value of a DATA_FLOAT column from an InnoDB tuple.
+@return	DB_SUCCESS, or DB_DATA_MISMATCH for any other column type */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	float*		fval)		/*!< out: float value */
+{
+	ib_tuple_t*	tpl = (ib_tuple_t*) ib_tpl;
+	const dfield_t*	field = ib_col_get_dfield(tpl, col_no);
+
+	/* Only columns whose main type is DATA_FLOAT can be read here. */
+	if (dtype_get_mtype(dfield_get_type(field)) != DATA_FLOAT) {
+
+		return(DB_DATA_MISMATCH);
+	}
+
+	/* Type matches: copy the column value out to the caller. Whatever
+	ib_col_copy_value_low() may return is not looked at. */
+	ib_col_copy_value_low(ib_tpl, col_no, fval, sizeof(*fval));
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Truncate a table. Once the table lock is granted the cursor is closed and
+*ib_crsr set to NULL, even if the truncate itself fails afterwards.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+	ib_crsr_t*	ib_crsr,	/*!< in/out: cursor for table
+					to truncate */
+	ib_id_u64_t*	table_id)	/*!< out: new table id, 0 on error */
+{
+	ib_err_t        err;
+	ib_cursor_t*    cursor = *(ib_cursor_t**) ib_crsr;
+	row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+	*table_id = 0;
+
+	err = ib_cursor_lock(*ib_crsr, IB_LOCK_X);
+
+	if (err == DB_SUCCESS) {
+		trx_t*          trx;
+		dict_table_t*   table = prebuilt->table;
+
+		/* The cursor is closed next, after which 'prebuilt' must not
+		be used: keep the transaction handle in a local first. */
+		trx = prebuilt->trx;
+		err = ib_cursor_close(*ib_crsr);
+		ut_a(err == DB_SUCCESS);
+
+		*ib_crsr = NULL;
+
+		/* Temporary workaround for an assertion in
+		trx_start_for_ddl_low(): the trx has already been started. */
+		if (trx->state == TRX_STATE_ACTIVE) {
+#ifdef UNIV_DEBUG
+			trx->start_file = 0;
+#endif /* UNIV_DEBUG */
+			trx->dict_operation = TRX_DICT_OP_TABLE;
+		}
+
+		/* This function currently commits the transaction
+		on success. */
+		err = static_cast<ib_err_t>(
+			row_truncate_table_for_mysql(table, trx));
+
+		if (err == DB_SUCCESS) {
+			*table_id = (table->id);
+		}
+	}
+
+        return(err);
+}
+
+/*****************************************************************//**
+Truncate a table by name, in a transaction of its own.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_table_truncate(
+/*==============*/
+	const char*	table_name,	/*!< in: table name */
+	ib_id_u64_t*	table_id)	/*!< out: new table id */
+{
+	ib_err_t        err;
+	dict_table_t*   table;
+	ib_err_t        trunc_err;
+	ib_trx_t        ib_trx = NULL;
+	ib_crsr_t       ib_crsr = NULL;
+
+	ib_trx = ib_trx_begin(IB_TRX_SERIALIZABLE);
+	/* Open the table and a cursor on it under the dictionary mutex. */
+	dict_mutex_enter_for_mysql();
+	table = dict_table_open_on_name(table_name, TRUE, FALSE,
+					DICT_ERR_IGNORE_NONE);
+
+	if (table != NULL && dict_table_get_first_index(table)) {
+		err = ib_create_cursor_with_index_id(&ib_crsr, table, 0,
+						     (trx_t*) ib_trx);
+	} else {
+		/* NOTE(review): a non-NULL 'table' is not closed here. */
+		err = DB_TABLE_NOT_FOUND;
+	}
+
+	dict_mutex_exit_for_mysql();
+
+	if (err == DB_SUCCESS) {
+		/* A failure is carried in trunc_err and rolled back below. */
+		trunc_err = ib_cursor_truncate(&ib_crsr, table_id);
+	} else {
+		trunc_err = err;
+	}
+
+	/* Still open only if ib_cursor_truncate() could not lock the table. */
+	if (ib_crsr != NULL) {
+		err = ib_cursor_close(ib_crsr);
+		ut_a(err == DB_SUCCESS);
+	}
+
+	if (trunc_err == DB_SUCCESS) {
+		ut_a(ib_trx_state(ib_trx) == static_cast<ib_trx_state_t>(
+			TRX_STATE_NOT_STARTED));
+
+		err = ib_trx_release(ib_trx);
+		ut_a(err == DB_SUCCESS);
+	} else {
+		err = ib_trx_rollback(ib_trx);
+		ut_a(err == DB_SUCCESS);
+	}
+	return(trunc_err);
+}
+
+/*****************************************************************//**
+Hands the given MySQL thread handle to innobase_close_thd().
+@return always DB_SUCCESS */
+UNIV_INTERN
+ib_err_t
+ib_close_thd(
+/*=========*/
+	void*		thd)	/*!< in: handle to the MySQL thread of the user
+				whose resources should be free'd */
+{
+	THD*	mysql_thd = static_cast<THD*>(thd);
+	innobase_close_thd(mysql_thd);
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Return the isolation level configured through "innodb_api_trx_level".
+@return	ib_trx_level_setting, cast to the declared ib_trx_state_t type */
+UNIV_INTERN
+ib_trx_state_t
+ib_cfg_trx_level()
+/*==============*/
+{
+	return(static_cast<ib_trx_state_t>(ib_trx_level_setting));
+}
+
+/*****************************************************************//**
+Return the configured background commit interval (in seconds).
+@return	ib_bk_commit_interval converted to ib_ulint_t */
+UNIV_INTERN
+ib_ulint_t
+ib_cfg_bk_commit_interval()
+/*=======================*/
+{
+	return(static_cast<ib_ulint_t>(ib_bk_commit_interval));
+}
+
+/*****************************************************************//**
+Collect the generic API configuration switches into one status value.
+@return	bitwise OR of the IB_CFG_* values of the enabled switches */
+UNIV_INTERN
+int
+ib_cfg_get_cfg()
+/*============*/
+{
+	int	flags = 0;
+
+	/* Each enabled switch ORs its IB_CFG_* value into the result. */
+	if (ib_binlog_enabled) {
+		flags |= IB_CFG_BINLOG_ENABLED;
+	}
+	if (ib_mdl_enabled) {
+		flags |= IB_CFG_MDL_ENABLED;
+	}
+	if (ib_disable_row_lock) {
+		flags |= IB_CFG_DISABLE_ROWLOCK;
+	}
+	return(flags);
+}
diff --git a/storage/xtradb/api/api0misc.cc b/storage/xtradb/api/api0misc.cc
new file mode 100644
index 00000000000..b2370105938
--- /dev/null
+++ b/storage/xtradb/api/api0misc.cc
@@ -0,0 +1,206 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file api/api0misc.cc
+InnoDB Native API
+
+2008-08-01 Created by Sunny Bains
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#include <errno.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+
+#include "api0misc.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "dict0mem.h"
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "lock0lock.h"
+#include "ha_prototypes.h"
+#include <m_ctype.h>
+#include <mysys_err.h>
+#include <mysql/plugin.h>
+
+/*********************************************************************//**
+Sets a lock on a table, retrying the request after lock waits.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode)		/*!< in: lock mode for lock_table() */
+{
+	que_thr_t*	thr;
+	dberr_t		err;
+	mem_heap_t*	heap;
+	sel_node_t*	node;
+
+	heap = mem_heap_create(512);
+
+	trx->op_info = "setting table lock";
+
+	node = sel_node_create(heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+	thr->graph->state = QUE_FORK_ACTIVE;
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(static_cast<que_fork_t*>(
+		que_node_get_parent(thr)));
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	err = lock_table(0, table, mode, thr);
+
+	trx->error_state = err;
+
+	if (UNIV_LIKELY(err == DB_SUCCESS)) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
+	} else {
+		que_thr_stop_for_mysql(thr);
+		/* Any error but a suspended thread goes to the handler. */
+		if (err != DB_QUE_THR_SUSPENDED) {
+			ibool	was_lock_wait;
+
+			was_lock_wait = ib_handle_errors(&err, trx, thr, NULL);
+			/* The lock wait is over: issue the request again. */
+			if (was_lock_wait) {
+				goto run_again;
+			}
+		} else {
+			que_thr_t*	run_thr;
+			que_node_t*	parent;
+
+			parent = que_node_get_parent(thr);
+			run_thr = que_fork_start_command(
+				static_cast<que_fork_t*>(parent));
+
+			ut_a(run_thr == thr);
+
+			/* There was a lock wait but the thread was not
+			in a ready to run or running state. */
+			trx->error_state = DB_LOCK_WAIT;
+
+			goto run_again;
+		}
+	}
+	/* NOTE(review): presumably this also releases 'heap' -- confirm. */
+	que_graph_free(thr->graph);
+	trx->op_info = "";
+
+	return(err);
+}
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+        dberr_t*	new_err,/*!< out: possible new error encountered in
+                                lock wait, or if no new error, the value
+                                of trx->error_state at the entry of this
+                                function */
+        trx_t*          trx,    /*!< in: transaction */
+        que_thr_t*      thr,    /*!< in: query thread */
+        trx_savept_t*   savept) /*!< in: savepoint or NULL */
+{
+        dberr_t		err;
+handle_new_error:
+        err = trx->error_state;
+
+        ut_a(err != DB_SUCCESS);
+        /* Clear it so that an error set during a lock wait shows up. */
+        trx->error_state = DB_SUCCESS;
+
+        switch (err) {
+        case DB_LOCK_WAIT_TIMEOUT:
+		trx_rollback_for_mysql(trx);
+		break;
+                /* No fall through: the break above ends this case. */
+        case DB_DUPLICATE_KEY:
+        case DB_FOREIGN_DUPLICATE_KEY:
+        case DB_TOO_BIG_RECORD:
+        case DB_ROW_IS_REFERENCED:
+        case DB_NO_REFERENCED_ROW:
+        case DB_CANNOT_ADD_CONSTRAINT:
+        case DB_TOO_MANY_CONCURRENT_TRXS:
+        case DB_OUT_OF_FILE_SPACE:
+                if (savept) {
+                        /* Roll back the latest, possibly incomplete
+                        insertion or update */
+
+			trx_rollback_to_savepoint(trx, savept);
+                }
+                break;
+        case DB_LOCK_WAIT:
+		lock_wait_suspend_thread(thr);
+
+                if (trx->error_state != DB_SUCCESS) {
+                        que_thr_stop_for_mysql(thr);
+
+                        goto handle_new_error;
+                }
+
+                *new_err = err;
+
+                return(TRUE); /* Operation needs to be retried. */
+
+        case DB_DEADLOCK:
+        case DB_LOCK_TABLE_FULL:
+                /* Roll back the whole transaction; this resolution was added
+                to version 3.23.43 */
+
+                trx_rollback_for_mysql(trx);
+                break;
+
+        case DB_MUST_GET_MORE_FILE_SPACE:
+                /* Not handled here: the whole process is terminated. */
+                exit(1);
+
+        case DB_CORRUPTION:
+	case DB_FOREIGN_EXCEED_MAX_CASCADE:
+                break;
+        default:
+                ut_error;
+        }
+
+        if (trx->error_state != DB_SUCCESS) {
+                *new_err = trx->error_state;
+        } else {
+                *new_err = err;
+        }
+
+        trx->error_state = DB_SUCCESS;
+
+        return(FALSE);
+}
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.cc
index a9e3bfe479e..569dcea072a 100644
--- a/storage/xtradb/btr/btr0btr.c
+++ b/storage/xtradb/btr/btr0btr.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0btr.c
+@file btr/btr0btr.cc
 The B-tree
 
 Created 6/2/1994 Heikki Tuuri
@@ -41,6 +42,7 @@ Created 6/2/1994 Heikki Tuuri
 #include "lock0lock.h"
 #include "ibuf0ibuf.h"
 #include "trx0trx.h"
+#include "srv0mon.h"
 
 /**************************************************************//**
 Checks if the page in the cursor can be merged with given page.
@@ -148,8 +150,8 @@ btr_blob_dbg_cmp(
 	const void*	a,	/*!< in: first btr_blob_dbg_t to compare */
 	const void*	b)	/*!< in: second btr_blob_dbg_t to compare */
 {
-	const btr_blob_dbg_t*	aa	= a;
-	const btr_blob_dbg_t*	bb	= b;
+	const btr_blob_dbg_t*	aa = static_cast<const btr_blob_dbg_t*>(a);
+	const btr_blob_dbg_t*	bb = static_cast<const btr_blob_dbg_t*>(b);
 
 	ut_ad(aa != NULL);
 	ut_ad(bb != NULL);
@@ -434,7 +436,7 @@ btr_blob_dbg_op(
 		rec = page_rec_get_next_const(rec);
 	} while (!page_rec_is_supremum(rec));
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 
@@ -716,14 +718,16 @@ btr_root_fseg_validate(
 #endif /* UNIV_BTR_DEBUG */
 
 /**************************************************************//**
-Gets the root node of a tree and x-latches it.
-@return	root page, x-latched */
+Gets the root node of a tree and x- or s-latches it.
+@return	root page, x- or s-latched */
 static
 buf_block_t*
 btr_root_block_get(
 /*===============*/
-	dict_index_t*	index,	/*!< in: index tree */
-	mtr_t*		mtr)	/*!< in: mtr */
+	const dict_index_t*	index,	/*!< in: index tree */
+	ulint			mode,	/*!< in: either RW_S_LATCH
+					or RW_X_LATCH */
+	mtr_t*			mtr)	/*!< in: mtr */
 {
 	ulint		space;
 	ulint		zip_size;
@@ -734,8 +738,7 @@ btr_root_block_get(
 	zip_size = dict_table_zip_size(index->table);
 	root_page_no = dict_index_get_page(index);
 
-	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
-			      index, mtr);
+	block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr);
 
 	SRV_CORRUPT_TABLE_CHECK(block, return(0););
 
@@ -772,10 +775,162 @@ UNIV_INTERN
 page_t*
 btr_root_get(
 /*=========*/
+	const dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*			mtr)	/*!< in: mtr */
+{
+	return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH,
+						      mtr)));
+}
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return	tree height (level of the root) */
+UNIV_INTERN
+ulint
+btr_height_get(
+/*===========*/
 	dict_index_t*	index,	/*!< in: index tree */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint		height;
+	buf_block_t*	root_block;
+	/* Debug check: the index lock must be registered in this mtr. */
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_S_LOCK)
+	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+
+        /* S-latch the root page through the mini-transaction. */
+        root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
+
+        height = btr_page_get_level(buf_block_get_frame(root_block), mtr);
+
+        /* Release the S latch on the root page right away. */
+        mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX);
+#ifdef UNIV_SYNC_DEBUG
+        sync_thread_reset_level(&root_block->lock);
+#endif /* UNIV_SYNC_DEBUG */
+
+	return(height);
+}
+
+/**************************************************************//**
+Validates the offset stored in a file segment header of a B-tree root
+page and, if it is acceptable, stores the given space id in the header.
+@return	true if valid */
+static
+bool
+btr_root_fseg_adjust_on_import(
+/*===========================*/
+	fseg_header_t*	seg_header,	/*!< in/out: segment header */
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page,
+					or NULL */
+	ulint		space,		/*!< in: tablespace identifier */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
-	return(buf_block_get_frame(btr_root_block_get(index, mtr)));
+	ulint	hdr_offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+	/* Must lie in [FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END]. */
+	if (hdr_offset < FIL_PAGE_DATA
+	    || hdr_offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) {
+		return(false);
+	}
+
+	if (page_zip == NULL) {
+		mlog_write_ulint(seg_header + FSEG_HDR_SPACE,
+				 space, MLOG_4BYTES, mtr);
+	} else {
+		mach_write_to_4(seg_header + FSEG_HDR_SPACE, space);
+		page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE,
+				      4, mtr);
+	}
+	return(true);
+}
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return DB_SUCCESS, or DB_CORRUPTION if any of the checks fails */
+UNIV_INTERN
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index tree */
+{
+	dberr_t		err;
+	mtr_t		mtr;
+	page_t*		page;
+	buf_block_t*	block;
+	page_zip_des_t*	page_zip;
+	dict_table_t*	table		= index->table;
+	ulint		space_id	= dict_index_get_space(index);
+	ulint		zip_size	= dict_table_zip_size(table);
+	ulint		root_page_no	= dict_index_get_page(index);
+
+	/* Inject the failure before the mini-transaction is started, so
+	that the early return does not leave a started mtr uncommitted. */
+	DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
+			return(DB_CORRUPTION););
+	mtr_start(&mtr);
+	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+	block = btr_block_get(
+		space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr);
+
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+
+	/* Check that this is a B-tree page and both the PREV and NEXT
+	pointers are FIL_NULL, because the root page does not have any
+	siblings. */
+	if (fil_page_get_type(page) != FIL_PAGE_INDEX
+	    || fil_page_get_prev(page) != FIL_NULL
+	    || fil_page_get_next(page) != FIL_NULL) {
+
+		err = DB_CORRUPTION;
+
+	} else if (dict_index_is_clust(index)) {
+		bool	page_is_compact_format;
+
+		page_is_compact_format = page_is_comp(page) > 0;
+
+		/* Check if the page format and table format agree. */
+		if (page_is_compact_format != dict_table_is_comp(table)) {
+			err = DB_CORRUPTION;
+		} else {
+
+			/* Check that the table flags and the tablespace
+			flags match. */
+			ulint	flags = fil_space_get_flags(table->space);
+
+			if (flags
+			    && flags != dict_tf_to_fsp_flags(table->flags)) {
+
+				err = DB_CORRUPTION;
+			} else {
+				err = DB_SUCCESS;
+			}
+		}
+	} else {
+		err = DB_SUCCESS;
+	}
+
+	/* Check and adjust the file segment headers, if all OK so far. */
+	if (err == DB_SUCCESS
+	    && (!btr_root_fseg_adjust_on_import(
+			FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+			+ page, page_zip, space_id, &mtr)
+		|| !btr_root_fseg_adjust_on_import(
+			FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+			+ page, page_zip, space_id, &mtr))) {
+
+		err = DB_CORRUPTION;
+	}
+
+	mtr_commit(&mtr);
+
+	return(err);
 }
 
 /*************************************************************//**
@@ -910,8 +1065,8 @@ btr_page_create(
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		page_create_zip(block, index, level, mtr);
+	if (page_zip) {
+		page_create_zip(block, index, level, 0, mtr);
 	} else {
 		page_create(block, mtr, dict_table_is_comp(index->table));
 		/* Set the level of the new index page */
@@ -1066,8 +1221,7 @@ btr_get_size(
 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
 				MTR_MEMO_S_LOCK));
 
-	if (index->page == FIL_NULL
-	    || index->to_be_dropped
+	if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
 	    || *index->name == TEMP_INDEX_PREFIX) {
 		return(ULINT_UNDEFINED);
 	}
@@ -1165,6 +1319,15 @@ btr_page_free_low(
 	fseg_free_page(seg_header,
 		       buf_block_get_space(block),
 		       buf_block_get_page_no(block), mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }
 
 /**************************************************************//**
@@ -1211,7 +1374,7 @@ btr_node_ptr_set_child_page_no(
 
 	ut_ad(len == REC_NODE_PTR_SIZE);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_node_ptr(page_zip, rec,
 					rec_offs_data_size(offsets),
 					page_no, mtr);
@@ -1291,8 +1454,7 @@ btr_page_get_father_node_ptr_func(
 	offsets = rec_get_offsets(node_ptr, index, offsets,
 				  ULINT_UNDEFINED, &heap);
 
-	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
-			  != page_no)) {
+	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
 		rec_t*	print_rec;
 		fputs("InnoDB: Dump of the child page:\n", stderr);
 		buf_page_print(page_align(user_rec), 0,
@@ -1479,8 +1641,8 @@ btr_create(
 	/* Create a new index page on the allocated segment page */
 	page_zip = buf_block_get_page_zip(block);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		page = page_create_zip(block, index, 0, mtr);
+	if (page_zip) {
+		page = page_create_zip(block, index, 0, 0, mtr);
 	} else {
 		page = page_create(block, mtr,
 				   dict_table_is_comp(index->table));
@@ -1614,25 +1776,39 @@ btr_free_root(
 	ut_a(btr_root_fseg_validate(header, space));
 #endif /* UNIV_BTR_DEBUG */
 
-	while (!fseg_free_step(header, mtr));
+	while (!fseg_free_step(header, mtr)) {
+		/* Free the entire segment in small steps. */
+	}
 }
 #endif /* !UNIV_HOTBACKUP */
 
 /*************************************************************//**
-Reorganizes an index page. */
-static
-ibool
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
 btr_page_reorganize_low(
 /*====================*/
-	ibool		recovery,/*!< in: TRUE if called in recovery:
+	bool		recovery,/*!< in: true if called in recovery:
 				locks should not be updated, i.e.,
 				there cannot exist locks on the
 				page, and a hash index should not be
 				dropped: it cannot exist */
-	buf_block_t*	block,	/*!< in: page to be reorganized */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	ulint		z_level,/*!< in: compression level to be used
+				if dealing with compressed page */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
+	buf_block_t*	block		= page_cur_get_block(cursor);
 #ifndef UNIV_HOTBACKUP
 	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
 #endif /* !UNIV_HOTBACKUP */
@@ -1645,7 +1821,9 @@ btr_page_reorganize_low(
 	ulint		data_size2;
 	ulint		max_ins_size1;
 	ulint		max_ins_size2;
-	ibool		success		= FALSE;
+	bool		success		= false;
+	ulint		pos;
+	bool		log_compressed;
 
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 	btr_assert_not_corrupted(block, index);
@@ -1655,13 +1833,6 @@ btr_page_reorganize_low(
 	data_size1 = page_get_data_size(page);
 	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
 
-#ifndef UNIV_HOTBACKUP
-	/* Write the log record */
-	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
-				  ? MLOG_COMP_PAGE_REORGANIZE
-				  : MLOG_PAGE_REORGANIZE, 0);
-#endif /* !UNIV_HOTBACKUP */
-
 	/* Turn logging off */
 	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 
@@ -1677,7 +1848,7 @@ btr_page_reorganize_low(
 	buf_frame_copy(temp_page, page);
 
 #ifndef UNIV_HOTBACKUP
-	if (UNIV_LIKELY(!recovery)) {
+	if (!recovery) {
 		btr_search_drop_page_hash_index(block);
 	}
 
@@ -1685,6 +1856,9 @@ btr_page_reorganize_low(
 #endif /* !UNIV_HOTBACKUP */
 	btr_blob_dbg_remove(page, index, "btr_page_reorganize");
 
+	/* Save the cursor position. */
+	pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));
+
 	/* Recreate the page: note that global data on page (possible
 	segment headers, next page-field, etc.) is preserved intact */
 
@@ -1702,14 +1876,21 @@ btr_page_reorganize_low(
 		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
 		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
 		/* In crash recovery, dict_index_is_sec_or_ibuf() always
-		returns TRUE, even for clustered indexes.  max_trx_id is
+		holds, even for clustered indexes.  max_trx_id is
 		unused in clustered index pages. */
 		ut_ad(max_trx_id != 0 || recovery);
 	}
 
-	if (UNIV_LIKELY_NULL(page_zip)
-	    && UNIV_UNLIKELY
-	    (!page_zip_compress(page_zip, page, index, NULL))) {
+	/* If innodb_log_compressed_pages is ON, page reorganize should log the
+	compressed page image.*/
+	log_compressed = page_zip && page_zip_log_pages;
+
+	if (log_compressed) {
+		mtr_set_log_mode(mtr, log_mode);
+	}
+
+	if (page_zip
+	    && !page_zip_compress(page_zip, page, index, z_level, mtr)) {
 
 		/* Restore the old page and exit. */
 		btr_blob_dbg_restore(page, temp_page, index,
@@ -1739,7 +1920,7 @@ btr_page_reorganize_low(
 	}
 
 #ifndef UNIV_HOTBACKUP
-	if (UNIV_LIKELY(!recovery)) {
+	if (!recovery) {
 		/* Update the record lock bitmaps */
 		lock_move_reorganize_page(block, temp_block);
 	}
@@ -1748,10 +1929,10 @@ btr_page_reorganize_low(
 	data_size2 = page_get_data_size(page);
 	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
 
-	if (UNIV_UNLIKELY(data_size1 != data_size2)
-	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
+	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
 		buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
 		buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
 		fprintf(stderr,
 			"InnoDB: Error: page old data size %lu"
 			" new data size %lu\n"
@@ -1764,7 +1945,14 @@ btr_page_reorganize_low(
 			(unsigned long) max_ins_size2);
 		ut_ad(0);
 	} else {
-		success = TRUE;
+		success = true;
+	}
+
+	/* Restore the cursor position. */
+	if (pos > 0) {
+		cursor->rec = page_rec_get_nth(page, pos);
+	} else {
+		ut_ad(cursor->rec == page_get_infimum_rec(page));
 	}
 
 func_exit:
@@ -1778,26 +1966,92 @@ func_exit:
 	/* Restore logging mode */
 	mtr_set_log_mode(mtr, log_mode);
 
+#ifndef UNIV_HOTBACKUP
+	if (success) {
+		byte	type;
+		byte*	log_ptr;
+
+		/* Write the log record */
+		if (page_zip) {
+			ut_ad(page_is_comp(page));
+			type = MLOG_ZIP_PAGE_REORGANIZE;
+		} else if (page_is_comp(page)) {
+			type = MLOG_COMP_PAGE_REORGANIZE;
+		} else {
+			type = MLOG_PAGE_REORGANIZE;
+		}
+
+		log_ptr = log_compressed
+			? NULL
+			: mlog_open_and_write_index(
+				mtr, page, index, type,
+				page_zip ? 1 : 0);
+
+		/* For compressed pages write the compression level. */
+		if (log_ptr && page_zip) {
+			mach_write_to_1(log_ptr, z_level);
+			mlog_close(mtr, log_ptr + 1);
+		}
+	}
+#endif /* !UNIV_HOTBACKUP */
+
 	return(success);
 }
 
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+static __attribute__((nonnull))
+bool
+btr_page_reorganize_block(
+/*======================*/
+	bool		recovery,/*!< in: true if called in recovery:
+				locks should not be updated, i.e.,
+				there cannot exist locks on the
+				page, and a hash index should not be
+				dropped: it cannot exist */
+	ulint		z_level,/*!< in: compression level to be used
+				if dealing with compressed page */
+	buf_block_t*	block,	/*!< in/out: B-tree page */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	page_cur_t	cur;
+	page_cur_set_before_first(block, &cur);
+
+	return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
+}
+
 #ifndef UNIV_HOTBACKUP
 /*************************************************************//**
 Reorganizes an index page.
-IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
-page of a non-clustered index, the caller must update the insert
-buffer free bits in the same mini-transaction in such a way that the
-modification will be redo-logged.
-@return	TRUE on success, FALSE on failure */
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
 UNIV_INTERN
-ibool
+bool
 btr_page_reorganize(
 /*================*/
-	buf_block_t*	block,	/*!< in: page to be reorganized */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	return(btr_page_reorganize_low(FALSE, block, index, mtr));
+	return(btr_page_reorganize_low(false, page_zip_level,
+				       cursor, index, mtr));
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -1809,18 +2063,34 @@ byte*
 btr_parse_page_reorganize(
 /*======================*/
 	byte*		ptr,	/*!< in: buffer */
-	byte*		end_ptr __attribute__((unused)),
-				/*!< in: buffer end */
+	byte*		end_ptr,/*!< in: buffer end */
 	dict_index_t*	index,	/*!< in: record descriptor */
+	bool		compressed,/*!< in: true if compressed page */
 	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
 	mtr_t*		mtr)	/*!< in: mtr or NULL */
 {
+	ulint	level;
+
 	ut_ad(ptr && end_ptr);
 
-	/* The record is empty, except for the record initial part */
+	/* If dealing with a compressed page the record has the
+	compression level used during original compression written in
+	one byte. Otherwise record is empty. */
+	if (compressed) {
+		if (ptr == end_ptr) {
+			return(NULL);
+		}
+
+		level = mach_read_from_1(ptr);
 
-	if (UNIV_LIKELY(block != NULL)) {
-		btr_page_reorganize_low(TRUE, block, index, mtr);
+		ut_a(level <= 9);
+		++ptr;
+	} else {
+		level = page_zip_level;
+	}
+
+	if (block != NULL) {
+		btr_page_reorganize_block(true, level, block, index, mtr);
 	}
 
 	return(ptr);
@@ -1853,8 +2123,8 @@ btr_page_empty(
 	/* Recreate the page: note that global data on page (possible
 	segment headers, next page-field, etc.) is preserved intact */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
-		page_create_zip(block, index, level, mtr);
+	if (page_zip) {
+		page_create_zip(block, index, level, 0, mtr);
 	} else {
 		page_create(block, mtr, dict_table_is_comp(index->table));
 		btr_page_set_level(page, NULL, level, mtr);
@@ -1874,10 +2144,13 @@ UNIV_INTERN
 rec_t*
 btr_root_raise_and_insert(
 /*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
 				on the root page; when the function returns,
 				the cursor is positioned on the predecessor
 				of the inserted record */
+	ulint**		offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	mtr_t*		mtr)	/*!< in: mtr */
@@ -1887,7 +2160,6 @@ btr_root_raise_and_insert(
 	page_t*		new_page;
 	ulint		new_page_no;
 	rec_t*		rec;
-	mem_heap_t*	heap;
 	dtuple_t*	node_ptr;
 	ulint		level;
 	rec_t*		node_ptr_rec;
@@ -1900,7 +2172,7 @@ btr_root_raise_and_insert(
 	root = btr_cur_get_page(cursor);
 	root_block = btr_cur_get_block(cursor);
 	root_page_zip = buf_block_get_page_zip(root_block);
-	ut_ad(page_get_n_recs(root) > 0);
+	ut_ad(!page_is_empty(root));
 	index = btr_cur_get_index(cursor);
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
@@ -1947,10 +2219,9 @@ btr_root_raise_and_insert(
 #ifdef UNIV_ZIP_COPY
 	    || new_page_zip
 #endif /* UNIV_ZIP_COPY */
-	    || UNIV_UNLIKELY
-	    (!page_copy_rec_list_end(new_block, root_block,
-				     page_get_infimum_rec(root),
-				     index, mtr))) {
+	    || !page_copy_rec_list_end(new_block, root_block,
+				       page_get_infimum_rec(root),
+				       index, mtr)) {
 		ut_a(new_page_zip);
 
 		/* Copy the page byte for byte. */
@@ -1974,7 +2245,9 @@ btr_root_raise_and_insert(
 	lock_update_root_raise(new_block, root_block);
 
 	/* Create a memory heap where the node pointer is stored */
-	heap = mem_heap_create(100);
+	if (!*heap) {
+		*heap = mem_heap_create(1000);
+	}
 
 	rec = page_rec_get_next(page_get_infimum_rec(new_page));
 	new_page_no = buf_block_get_page_no(new_block);
@@ -1982,8 +2255,8 @@ btr_root_raise_and_insert(
 	/* Build the node pointer (= node key and page address) for the
 	child */
 
-	node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
-					     level);
+	node_ptr = dict_index_build_node_ptr(
+		index, rec, new_page_no, *heap, level);
 	/* The node pointer must be marked as the predefined minimum record,
 	as there is no lower alphabetical limit to records in the leftmost
 	node of a level: */
@@ -2009,15 +2282,12 @@ btr_root_raise_and_insert(
 	page_cur_set_before_first(root_block, page_cursor);
 
 	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
-					     index, 0, mtr);
+					     index, offsets, heap, 0, mtr);
 
 	/* The root page should only contain the node pointer
 	to new_page at this point.  Thus, the data should fit. */
 	ut_a(node_ptr_rec);
 
-	/* Free the memory heap */
-	mem_heap_free(heap);
-
 	/* We play safe and reset the free bits for the new page */
 
 #if 0
@@ -2033,7 +2303,8 @@ btr_root_raise_and_insert(
 			PAGE_CUR_LE, page_cursor);
 
 	/* Split the child and insert tuple */
-	return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
+	return(btr_page_split_and_insert(flags, cursor, offsets, heap,
+					 tuple, n_ext, mtr));
 }
 
 /*************************************************************//**
@@ -2105,8 +2376,7 @@ btr_page_get_split_rec_to_right(
 	the previous insert on the same page, we assume that there is a
 	pattern of sequential inserts here. */
 
-	if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
-			== insert_point)) {
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
 
 		rec_t*	next_rec;
 
@@ -2174,13 +2444,13 @@ btr_page_get_split_rec(
 	free_space  = page_get_free_space_of_empty(page_is_comp(page));
 
 	page_zip = btr_cur_get_page_zip(cursor);
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		/* Estimate the free space of an empty compressed page. */
 		ulint	free_space_zip = page_zip_empty_size(
 			cursor->index->n_fields,
 			page_zip_get_size(page_zip));
 
-		if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
+		if (free_space > (ulint) free_space_zip) {
 			free_space = (ulint) free_space_zip;
 		}
 	}
@@ -2253,7 +2523,7 @@ btr_page_get_split_rec(
 	}
 
 func_exit:
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 	return(rec);
@@ -2262,9 +2532,9 @@ func_exit:
 /*************************************************************//**
 Returns TRUE if the insert fits on the appropriate half-page with the
 chosen split_rec.
-@return	TRUE if fits */
-static
-ibool
+@return	true if fits */
+static __attribute__((nonnull(1,3,4,6), warn_unused_result))
+bool
 btr_page_insert_fits(
 /*=================*/
 	btr_cur_t*	cursor,	/*!< in: cursor at which insert
@@ -2272,11 +2542,11 @@ btr_page_insert_fits(
 	const rec_t*	split_rec,/*!< in: suggestion for first record
 				on upper half-page, or NULL if
 				tuple to be inserted should be first */
-	const ulint*	offsets,/*!< in: rec_get_offsets(
-				split_rec, cursor->index) */
+	ulint**		offsets,/*!< in: rec_get_offsets(
+				split_rec, cursor->index); out: garbage */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mem_heap_t*	heap)	/*!< in: temporary memory heap */
+	mem_heap_t**	heap)	/*!< in: temporary memory heap */
 {
 	page_t*		page;
 	ulint		insert_size;
@@ -2285,15 +2555,13 @@ btr_page_insert_fits(
 	ulint		total_n_recs;
 	const rec_t*	rec;
 	const rec_t*	end_rec;
-	ulint*		offs;
 
 	page = btr_cur_get_page(cursor);
 
-	ut_ad(!split_rec == !offsets);
-	ut_ad(!offsets
-	      || !page_is_comp(page) == !rec_offs_comp(offsets));
-	ut_ad(!offsets
-	      || rec_offs_validate(split_rec, cursor->index, offsets));
+	ut_ad(!split_rec
+	      || !page_is_comp(page) == !rec_offs_comp(*offsets));
+	ut_ad(!split_rec
+	      || rec_offs_validate(split_rec, cursor->index, *offsets));
 
 	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
 	free_space  = page_get_free_space_of_empty(page_is_comp(page));
@@ -2311,7 +2579,7 @@ btr_page_insert_fits(
 		rec = page_rec_get_next(page_get_infimum_rec(page));
 		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
 
-	} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
+	} else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {
 
 		rec = page_rec_get_next(page_get_infimum_rec(page));
 		end_rec = split_rec;
@@ -2326,19 +2594,17 @@ btr_page_insert_fits(
 		/* Ok, there will be enough available space on the
 		half page where the tuple is inserted */
 
-		return(TRUE);
+		return(true);
 	}
 
-	offs = NULL;
-
 	while (rec != end_rec) {
 		/* In this loop we calculate the amount of reserved
 		space after rec is removed from page. */
 
-		offs = rec_get_offsets(rec, cursor->index, offs,
-				       ULINT_UNDEFINED, &heap);
+		*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+					   ULINT_UNDEFINED, heap);
 
-		total_data -= rec_offs_size(offs);
+		total_data -= rec_offs_size(*offsets);
 		total_n_recs--;
 
 		if (total_data + page_dir_calc_reserved_space(total_n_recs)
@@ -2347,13 +2613,13 @@ btr_page_insert_fits(
 			/* Ok, there will be enough available space on the
 			half page where the tuple is inserted */
 
-			return(TRUE);
+			return(true);
 		}
 
 		rec = page_rec_get_next_const(rec);
 	}
 
-	return(FALSE);
+	return(false);
 }
 
 /*******************************************************//**
@@ -2363,6 +2629,7 @@ UNIV_INTERN
 void
 btr_insert_on_non_leaf_level_func(
 /*==============================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	dict_index_t*	index,	/*!< in: index */
 	ulint		level,	/*!< in: level, must be > 0 */
 	dtuple_t*	tuple,	/*!< in: the record to be inserted */
@@ -2372,8 +2639,10 @@ btr_insert_on_non_leaf_level_func(
 {
 	big_rec_t*	dummy_big_rec;
 	btr_cur_t	cursor;
-	ulint		err;
+	dberr_t		err;
 	rec_t*		rec;
+	ulint*		offsets	= NULL;
+	mem_heap_t*	heap = NULL;
 
 	ut_ad(level > 0);
 
@@ -2384,29 +2653,38 @@ btr_insert_on_non_leaf_level_func(
 	ut_ad(cursor.flag == BTR_CUR_BINARY);
 
 	err = btr_cur_optimistic_insert(
-		BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
-		| BTR_NO_UNDO_LOG_FLAG, &cursor, tuple, &rec,
-		&dummy_big_rec, 0, NULL, mtr);
+		flags
+		| BTR_NO_LOCKING_FLAG
+		| BTR_KEEP_SYS_FLAG
+		| BTR_NO_UNDO_LOG_FLAG,
+		&cursor, &offsets, &heap,
+		tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
 
 	if (err == DB_FAIL) {
-		err = btr_cur_pessimistic_insert(
-			BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
-			| BTR_NO_UNDO_LOG_FLAG,
-			&cursor, tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
+		err = btr_cur_pessimistic_insert(flags
+						 | BTR_NO_LOCKING_FLAG
+						 | BTR_KEEP_SYS_FLAG
+						 | BTR_NO_UNDO_LOG_FLAG,
+						 &cursor, &offsets, &heap,
+						 tuple, &rec,
+						 &dummy_big_rec, 0, NULL, mtr);
 		ut_a(err == DB_SUCCESS);
 	}
+	mem_heap_free(heap);
 }
 
 /**************************************************************//**
 Attaches the halves of an index page on the appropriate level in an
 index tree. */
-UNIV_INTERN
+static __attribute__((nonnull))
 void
 btr_attach_half_pages(
 /*==================*/
+	ulint		flags,		/*!< in: undo logging and
+					locking flags */
 	dict_index_t*	index,		/*!< in: the index tree */
 	buf_block_t*	block,		/*!< in/out: page to be split */
-	rec_t*		split_rec,	/*!< in: first record on upper
+	const rec_t*	split_rec,	/*!< in: first record on upper
 					half page */
 	buf_block_t*	new_block,	/*!< in/out: the new half page */
 	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
@@ -2481,7 +2759,8 @@ btr_attach_half_pages(
 	/* Insert it next to the pointer to the lower half. Note that this
 	may generate recursion leading to a split on the higher level. */
 
-	btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
+	btr_insert_on_non_leaf_level(flags, index, level + 1,
+				     node_ptr_upper, mtr);
 
 	/* Free the memory heap */
 	mem_heap_free(heap);
@@ -2533,13 +2812,13 @@ btr_attach_half_pages(
 /*************************************************************//**
 Determine if a tuple is smaller than any record on the page.
 @return TRUE if smaller */
-static
-ibool
+static __attribute__((nonnull, warn_unused_result))
+bool
 btr_page_tuple_smaller(
 /*===================*/
 	btr_cur_t*	cursor,	/*!< in: b-tree cursor */
 	const dtuple_t*	tuple,	/*!< in: tuple to consider */
-	ulint*		offsets,/*!< in/out: temporary storage */
+	ulint**		offsets,/*!< in/out: temporary storage */
 	ulint		n_uniq,	/*!< in: number of unique fields
 				in the index page records */
 	mem_heap_t**	heap)	/*!< in/out: heap for offsets */
@@ -2554,11 +2833,11 @@ btr_page_tuple_smaller(
 	page_cur_move_to_next(&pcur);
 	first_rec = page_cur_get_rec(&pcur);
 
-	offsets = rec_get_offsets(
-		first_rec, cursor->index, offsets,
+	*offsets = rec_get_offsets(
+		first_rec, cursor->index, *offsets,
 		n_uniq, heap);
 
-	return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0);
+	return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
 }
 
 /*************************************************************//**
@@ -2574,9 +2853,12 @@ UNIV_INTERN
 rec_t*
 btr_page_split_and_insert(
 /*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
 				function returns, the cursor is positioned
 				on the predecessor of the inserted record */
+	ulint**		offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	mtr_t*		mtr)	/*!< in: mtr */
@@ -2602,18 +2884,21 @@ btr_page_split_and_insert(
 	ibool		insert_left;
 	ulint		n_iterations = 0;
 	rec_t*		rec;
-	mem_heap_t*	heap;
 	ulint		n_uniq;
-	ulint*		offsets;
 
-	heap = mem_heap_create(1024);
+	if (!*heap) {
+		*heap = mem_heap_create(1024);
+	}
 	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
 func_start:
-	mem_heap_empty(heap);
-	offsets = NULL;
+	mem_heap_empty(*heap);
+	*offsets = NULL;
 
 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
 				MTR_MEMO_X_LOCK));
+	ut_ad(!dict_index_is_online_ddl(cursor->index)
+	      || (flags & BTR_CREATE_FLAG)
+	      || dict_index_is_clust(cursor->index));
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
@@ -2623,7 +2908,7 @@ func_start:
 	page_zip = buf_block_get_page_zip(block);
 
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	ut_ad(page_get_n_recs(page) >= 1);
+	ut_ad(!page_is_empty(page));
 
 	page_no = buf_block_get_page_no(block);
 
@@ -2637,9 +2922,9 @@ func_start:
 		hint_page_no = page_no + 1;
 		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
 
-		if (UNIV_UNLIKELY(split_rec == NULL)) {
+		if (split_rec == NULL) {
 			insert_left = btr_page_tuple_smaller(
-				cursor, tuple, offsets, n_uniq, &heap);
+				cursor, tuple, offsets, n_uniq, heap);
 		}
 	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
 		direction = FSP_UP;
@@ -2661,7 +2946,7 @@ func_start:
 		if (page_get_n_recs(page) > 1) {
 			split_rec = page_get_middle_rec(page);
 		} else if (btr_page_tuple_smaller(cursor, tuple,
-						  offsets, n_uniq, &heap)) {
+						  offsets, n_uniq, heap)) {
 			split_rec = page_rec_get_next(
 				page_get_infimum_rec(page));
 		} else {
@@ -2684,20 +2969,19 @@ func_start:
 	if (split_rec) {
 		first_rec = move_limit = split_rec;
 
-		offsets = rec_get_offsets(split_rec, cursor->index, offsets,
-					  n_uniq, &heap);
+		*offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
+					   n_uniq, heap);
 
-		insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
+		insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
 
-		if (UNIV_UNLIKELY(!insert_left && new_page_zip
-				  && n_iterations > 0)) {
+		if (!insert_left && new_page_zip && n_iterations > 0) {
 			/* If a compressed page has already been split,
 			avoid further splits by inserting the record
 			to an empty page. */
 			split_rec = NULL;
 			goto insert_empty;
 		}
-	} else if (UNIV_UNLIKELY(insert_left)) {
+	} else if (insert_left) {
 		ut_a(n_iterations > 0);
 		first_rec = page_rec_get_next(page_get_infimum_rec(page));
 		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
@@ -2705,8 +2989,8 @@ func_start:
 insert_empty:
 		ut_ad(!split_rec);
 		ut_ad(!insert_left);
-		buf = mem_alloc(rec_get_converted_size(cursor->index,
-						       tuple, n_ext));
+		buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index,
+							       tuple, n_ext));
 
 		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
 						      tuple, n_ext);
@@ -2715,7 +2999,7 @@ insert_empty:
 
 	/* 4. Do first the modifications in the tree structure */
 
-	btr_attach_half_pages(cursor->index, block,
+	btr_attach_half_pages(flags, cursor->index, block,
 			      first_rec, new_block, direction, mtr);
 
 	/* If the split is made on the leaf level and the insert will fit
@@ -2735,10 +3019,11 @@ insert_empty:
 
 		insert_will_fit = !new_page_zip
 			&& btr_page_insert_fits(cursor, NULL,
-						NULL, tuple, n_ext, heap);
+						offsets, tuple, n_ext, heap);
 	}
 
-	if (insert_will_fit && page_is_leaf(page)) {
+	if (insert_will_fit && page_is_leaf(page)
+	    && !dict_index_is_online_ddl(cursor->index)) {
 
 		mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
 				 MTR_MEMO_X_LOCK);
@@ -2752,9 +3037,8 @@ insert_empty:
 #ifdef UNIV_ZIP_COPY
 		    || page_zip
 #endif /* UNIV_ZIP_COPY */
-		    || UNIV_UNLIKELY
-		    (!page_move_rec_list_start(new_block, block, move_limit,
-					       cursor->index, mtr))) {
+		    || !page_move_rec_list_start(new_block, block, move_limit,
+						 cursor->index, mtr)) {
 			/* For some reason, compressing new_page failed,
 			even though it should contain fewer records than
 			the original page.  Copy the page byte for byte
@@ -2795,9 +3079,8 @@ insert_empty:
 #ifdef UNIV_ZIP_COPY
 		    || page_zip
 #endif /* UNIV_ZIP_COPY */
-		    || UNIV_UNLIKELY
-		    (!page_move_rec_list_end(new_block, block, move_limit,
-					     cursor->index, mtr))) {
+		    || !page_move_rec_list_end(new_block, block, move_limit,
+					       cursor->index, mtr)) {
 			/* For some reason, compressing new_page failed,
 			even though it should contain fewer records than
 			the original page.  Copy the page byte for byte
@@ -2833,7 +3116,7 @@ insert_empty:
 	}
 
 #ifdef UNIV_ZIP_DEBUG
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ut_a(page_zip_validate(page_zip, page, cursor->index));
 		ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
 	}
@@ -2857,8 +3140,8 @@ insert_empty:
 	page_cur_search(insert_block, cursor->index, tuple,
 			PAGE_CUR_LE, page_cursor);
 
-	rec = page_cur_tuple_insert(page_cursor, tuple,
-				    cursor->index, n_ext, mtr);
+	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+				    offsets, heap, n_ext, mtr);
 
 #ifdef UNIV_ZIP_DEBUG
 	{
@@ -2874,31 +3157,32 @@ insert_empty:
 	}
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (UNIV_LIKELY(rec != NULL)) {
+	if (rec != NULL) {
 
 		goto func_exit;
 	}
 
-	/* 8. If insert did not fit, try page reorganization */
+	/* 8. If insert did not fit, try page reorganization.
+	For compressed pages, page_cur_tuple_insert() will have
+	attempted this already. */
 
-	if (UNIV_UNLIKELY
-	    (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
+	if (page_cur_get_page_zip(page_cursor)
+	    || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {
 
 		goto insert_failed;
 	}
 
-	page_cur_search(insert_block, cursor->index, tuple,
-			PAGE_CUR_LE, page_cursor);
 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
-				    n_ext, mtr);
+				    offsets, heap, n_ext, mtr);
 
-	if (UNIV_UNLIKELY(rec == NULL)) {
+	if (rec == NULL) {
 		/* The insert did not fit on the page: loop back to the
 		start of the function for a new split */
 insert_failed:
-		/* We play safe and reset the free bits for new_page */
+		/* We play safe and reset the free bits */
 		if (!dict_index_is_clust(cursor->index)) {
 			ibuf_reset_free_bits(new_block);
+			ibuf_reset_free_bits(block);
 		}
 
 		/* fprintf(stderr, "Split second round %lu\n",
@@ -2926,11 +3210,12 @@ func_exit:
 		buf_block_get_page_no(left_block),
 		buf_block_get_page_no(right_block));
 #endif
+	MONITOR_INC(MONITOR_INDEX_SPLIT);
 
 	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
 	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
 
-	mem_heap_free(heap);
+	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
 	return(rec);
 }
 
@@ -3082,7 +3367,7 @@ btr_set_min_rec_mark(
 {
 	ulint	info_bits;
 
-	if (UNIV_LIKELY(page_rec_is_comp(rec))) {
+	if (page_rec_is_comp(rec)) {
 		info_bits = rec_get_info_bits(rec, TRUE);
 
 		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
@@ -3110,15 +3395,15 @@ btr_node_ptr_delete(
 {
 	btr_cur_t	cursor;
 	ibool		compressed;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
 	/* Delete node pointer on father page */
 	btr_page_get_father(index, block, mtr, &cursor);
 
-	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
-						mtr);
+	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor,
+						BTR_CREATE_FLAG, RB_NONE, mtr);
 	ut_a(err == DB_SUCCESS);
 
 	if (!compressed) {
@@ -3150,7 +3435,7 @@ btr_lift_page_up(
 	buf_block_t*	blocks[BTR_MAX_LEVELS];
 	ulint		n_blocks;	/*!< last used index in blocks[] */
 	ulint		i;
-	ibool		lift_father_up	= FALSE;
+	bool		lift_father_up;
 	buf_block_t*	block_orig	= block;
 
 	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
@@ -3192,7 +3477,8 @@ btr_lift_page_up(
 			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
 		}
 
-		if (n_blocks && page_level == 0) {
+		lift_father_up = (n_blocks && page_level == 0);
+		if (lift_father_up) {
 			/* The father page also should be the only on its level (not
 			root). We should lift up the father page at first.
 			Because the leaf page should be lifted up only for root page.
@@ -3201,7 +3487,6 @@ btr_lift_page_up(
 			later freeing of the page doesn't find the page allocation
 			to be freed.*/
 
-			lift_father_up = TRUE;
 			block = father_block;
 			page = buf_block_get_frame(block);
 			page_level = btr_page_get_level(page, mtr);
@@ -3229,10 +3514,9 @@ btr_lift_page_up(
 #ifdef UNIV_ZIP_COPY
 	    || father_page_zip
 #endif /* UNIV_ZIP_COPY */
-	    || UNIV_UNLIKELY
-	    (!page_copy_rec_list_end(father_block, block,
-				     page_get_infimum_rec(page),
-				     index, mtr))) {
+	    || !page_copy_rec_list_end(father_block, block,
+				       page_get_infimum_rec(page),
+				       index, mtr)) {
 		const page_zip_des_t*	page_zip
 			= buf_block_get_page_zip(block);
 		ut_a(father_page_zip);
@@ -3389,7 +3673,7 @@ btr_compress(
 
 	merge_page_zip = buf_block_get_page_zip(merge_block);
 #ifdef UNIV_ZIP_DEBUG
-	if (UNIV_LIKELY_NULL(merge_page_zip)) {
+	if (merge_page_zip) {
 		const page_zip_des_t*	page_zip
 			= buf_block_get_page_zip(block);
 		ut_a(page_zip);
@@ -3404,7 +3688,7 @@ btr_compress(
 			merge_block, block, page_get_supremum_rec(page),
 			index, mtr);
 
-		if (UNIV_UNLIKELY(!orig_pred)) {
+		if (!orig_pred) {
 			goto err_exit;
 		}
 
@@ -3422,7 +3706,7 @@ btr_compress(
 	} else {
 		rec_t*		orig_succ;
 		ibool		compressed;
-		ulint		err;
+		dberr_t		err;
 		btr_cur_t	cursor2;
 					/* father cursor pointing to node ptr
 					of the right sibling */
@@ -3433,6 +3717,7 @@ btr_compress(
 		btr_page_get_father(index, merge_block, mtr, &cursor2);
 
 		if (merge_page_zip && left_page_no == FIL_NULL) {
+
 			/* The function page_zip_compress(), which will be
 			invoked by page_copy_rec_list_end() below,
 			requires that FIL_PAGE_PREV be FIL_NULL.
@@ -3450,7 +3735,7 @@ btr_compress(
 						   page_get_infimum_rec(page),
 						   cursor->index, mtr);
 
-		if (UNIV_UNLIKELY(!orig_succ)) {
+		if (!orig_succ) {
 			ut_a(merge_page_zip);
 #ifdef UNIV_BTR_DEBUG
 			if (left_page_no == FIL_NULL) {
@@ -3467,6 +3752,7 @@ btr_compress(
 
 #ifdef UNIV_BTR_DEBUG
 		if (merge_page_zip && left_page_no == FIL_NULL) {
+
 			/* Restore FIL_PAGE_PREV in order to avoid an assertion
 			failure in btr_level_list_remove(), which will set
 			the field again to FIL_NULL.  Even though this makes
@@ -3488,6 +3774,7 @@ btr_compress(
 			offsets, right_page_no, mtr);
 
 		compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2,
+							BTR_CREATE_FLAG,
 							RB_NONE, mtr);
 		ut_a(err == DB_SUCCESS);
 
@@ -3635,17 +3922,16 @@ btr_discard_only_page_on_level(
 #endif /* UNIV_BTR_DEBUG */
 
 	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+	ut_ad(page_is_leaf(buf_block_get_frame(block)));
 
 	if (!dict_index_is_clust(index)) {
 		/* We play it safe and reset the free bits for the root */
 		ibuf_reset_free_bits(block);
 
-		if (page_is_leaf(buf_block_get_frame(block))) {
-			ut_a(max_trx_id);
-			page_set_max_trx_id(block,
-					    buf_block_get_page_zip(block),
-					    max_trx_id, mtr);
-		}
+		ut_a(max_trx_id);
+		page_set_max_trx_id(block,
+				    buf_block_get_page_zip(block),
+				    max_trx_id, mtr);
 	}
 }
 
@@ -3788,7 +4074,7 @@ btr_print_size(
 	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
 	fseg_print(seg, &mtr);
 
-	if (!(index->type & DICT_UNIVERSAL)) {
+	if (!dict_index_is_univ(index)) {
 
 		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
@@ -3883,16 +4169,16 @@ btr_print_index(
 
 	mtr_start(&mtr);
 
-	root = btr_root_block_get(index, &mtr);
+	root = btr_root_block_get(index, RW_X_LATCH, &mtr);
 
 	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 
 	mtr_commit(&mtr);
 
-	btr_validate_index(index, NULL);
+	btr_validate_index(index, 0);
 }
 #endif /* UNIV_BTR_PRINT */
 
@@ -3982,7 +4268,7 @@ btr_index_rec_validate(
 
 	page = page_align(rec);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* The insert buffer index tree can contain records from any
 		other index: we cannot check the number of fields or
 		their length */
@@ -3990,8 +4276,7 @@ btr_index_rec_validate(
 		return(TRUE);
 	}
 
-	if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
-			  != dict_table_is_comp(index->table))) {
+	if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
 		btr_index_rec_validate_report(page, rec, index);
 		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
 			(ulong) !!page_is_comp(page),
@@ -4002,8 +4287,7 @@ btr_index_rec_validate(
 
 	n = dict_index_get_n_fields(index);
 
-	if (!page_is_comp(page)
-	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
+	if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) {
 		btr_index_rec_validate_report(page, rec, index);
 		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
 			(ulong) rec_get_n_fields_old(rec), (ulong) n);
@@ -4054,14 +4338,14 @@ btr_index_rec_validate(
 				rec_print_new(stderr, rec, offsets);
 				putc('\n', stderr);
 			}
-			if (UNIV_LIKELY_NULL(heap)) {
+			if (heap) {
 				mem_heap_free(heap);
 			}
 			return(FALSE);
 		}
 	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 	return(TRUE);
@@ -4169,14 +4453,15 @@ btr_validate_report2(
 Validates index tree level.
 @return	TRUE if ok */
 static
-ibool
+bool
 btr_validate_level(
 /*===============*/
 	dict_index_t*	index,	/*!< in: index tree */
-	trx_t*		trx,	/*!< in: transaction or NULL */
+	const trx_t*	trx,	/*!< in: transaction or NULL */
 	ulint		level)	/*!< in: level number */
 {
 	ulint		space;
+	ulint		space_flags;
 	ulint		zip_size;
 	buf_block_t*	block;
 	page_t*		page;
@@ -4190,9 +4475,10 @@ btr_validate_level(
 	ulint		left_page_no;
 	page_cur_t	cursor;
 	dtuple_t*	node_ptr_tuple;
-	ibool		ret	= TRUE;
+	bool		ret	= true;
 	mtr_t		mtr;
 	mem_heap_t*	heap	= mem_heap_create(256);
+	fseg_header_t*	seg;
 	ulint*		offsets	= NULL;
 	ulint*		offsets2= NULL;
 #ifdef UNIV_ZIP_DEBUG
@@ -4203,15 +4489,39 @@ btr_validate_level(
 
 	mtr_x_lock(dict_index_get_lock(index), &mtr);
 
-	block = btr_root_block_get(index, &mtr);
+	block = btr_root_block_get(index, RW_X_LATCH, &mtr);
 	page = buf_block_get_frame(block);
+	seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 
 	space = dict_index_get_space(index);
 	zip_size = dict_table_zip_size(index->table);
 
+	fil_space_get_latch(space, &space_flags);
+
+	if (zip_size != dict_tf_get_zip_size(space_flags)) {
+
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Flags mismatch: table=%lu, tablespace=%lu",
+			(ulint) index->table->flags, (ulint) space_flags);
+
+		mtr_commit(&mtr);
+
+		return(false);
+	}
+
 	while (level != btr_page_get_level(page, &mtr)) {
 		const rec_t*	node_ptr;
 
+		if (fseg_page_is_free(seg,
+				      block->page.space, block->page.offset)) {
+
+			btr_validate_report1(index, level, block);
+
+			ib_logf(IB_LOG_LEVEL_WARN, "page is free");
+
+			ret = false;
+		}
+
 		ut_a(space == buf_block_get_space(block));
 		ut_a(space == page_get_space_id(page));
 #ifdef UNIV_ZIP_DEBUG
@@ -4232,12 +4542,13 @@ btr_validate_level(
 
 	/* Now we are on the desired level. Loop through the pages on that
 	level. */
-loop:
-	if (trx_is_interrupted(trx)) {
-		mtr_commit(&mtr);
-		mem_heap_free(heap);
-		return(ret);
+
+	if (level == 0) {
+		/* Leaf pages are managed in their own file segment. */
+		seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF;
 	}
+
+loop:
 	mem_heap_empty(heap);
 	offsets = offsets2 = NULL;
 	mtr_x_lock(dict_index_get_lock(index), &mtr);
@@ -4247,20 +4558,35 @@ loop:
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	/* Check ordering etc. of records */
+	ut_a(block->page.space == space);
+
+	if (fseg_page_is_free(seg, block->page.space, block->page.offset)) {
+
+		btr_validate_report1(index, level, block);
+
+		ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free");
+		ret = false;
+
+	} else if (btr_page_get_index_id(page) != index->id) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Page index id " IB_ID_FMT " != data dictionary "
+			"index id " IB_ID_FMT,
+			btr_page_get_index_id(page), index->id);
+
+		ret = false;
+
+	} else if (!page_validate(page, index)) {
 
-	if (!page_validate(page, index)) {
 		btr_validate_report1(index, level, block);
+		ret = false;
+
+	} else if (level == 0 && !btr_index_page_validate(block, index)) {
 
-		ret = FALSE;
-	} else if (level == 0) {
 		/* We are on level 0. Check that the records have the right
 		number of fields, and field lengths are right. */
 
-		if (!btr_index_page_validate(block, index)) {
-
-			ret = FALSE;
-		}
+		ret = false;
 	}
 
 	ut_a(btr_page_get_level(page, &mtr) == level);
@@ -4268,34 +4594,34 @@ loop:
 	right_page_no = btr_page_get_next(page, &mtr);
 	left_page_no = btr_page_get_prev(page, &mtr);
 
-	ut_a(page_get_n_recs(page) > 0 || (level == 0
-					   && page_get_page_no(page)
-					   == dict_index_get_page(index)));
+	ut_a(!page_is_empty(page)
+	     || (level == 0
+		 && page_get_page_no(page) == dict_index_get_page(index)));
 
 	if (right_page_no != FIL_NULL) {
 		const rec_t*	right_rec;
 		right_block = btr_block_get(space, zip_size, right_page_no,
 					    RW_X_LATCH, index, &mtr);
 		right_page = buf_block_get_frame(right_block);
-		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
-				  != page_get_page_no(page))) {
+		if (btr_page_get_prev(right_page, &mtr)
+		    != page_get_page_no(page)) {
+
 			btr_validate_report2(index, level, block, right_block);
 			fputs("InnoDB: broken FIL_PAGE_NEXT"
 			      " or FIL_PAGE_PREV links\n", stderr);
 			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
 			buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
 
-			ret = FALSE;
+			ret = false;
 		}
 
-		if (UNIV_UNLIKELY(page_is_comp(right_page)
-				  != page_is_comp(page))) {
+		if (page_is_comp(right_page) != page_is_comp(page)) {
 			btr_validate_report2(index, level, block, right_block);
 			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
 			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
 			buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
 
-			ret = FALSE;
+			ret = false;
 
 			goto node_ptr_fails;
 		}
@@ -4307,9 +4633,8 @@ loop:
 					  offsets, ULINT_UNDEFINED, &heap);
 		offsets2 = rec_get_offsets(right_rec, index,
 					   offsets2, ULINT_UNDEFINED, &heap);
-		if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
-					      offsets, offsets2,
-					      index) >= 0)) {
+		if (cmp_rec_rec(rec, right_rec, offsets, offsets2,
+			        index) >= 0) {
 
 			btr_validate_report2(index, level, block, right_block);
 
@@ -4329,7 +4654,7 @@ loop:
 			rec_print(stderr, rec, index);
 			putc('\n', stderr);
 
-			ret = FALSE;
+			ret = false;
 		}
 	}
 
@@ -4356,10 +4681,9 @@ loop:
 		offsets = btr_page_get_father_node_ptr(offsets, heap,
 						       &node_cur, &mtr);
 
-		if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
-		    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
-								    offsets)
-				     != buf_block_get_page_no(block))) {
+		if (node_ptr != btr_cur_get_rec(&node_cur)
+		    || btr_node_ptr_get_child_page_no(node_ptr, offsets)
+				     != buf_block_get_page_no(block)) {
 
 			btr_validate_report1(index, level, block);
 
@@ -4381,7 +4705,7 @@ loop:
 			fputs("InnoDB: record on page ", stderr);
 			rec_print_new(stderr, rec, offsets);
 			putc('\n', stderr);
-			ret = FALSE;
+			ret = false;
 
 			goto node_ptr_fails;
 		}
@@ -4411,7 +4735,7 @@ loop:
 				fputs("InnoDB: first rec ", stderr);
 				rec_print(stderr, first_rec, index);
 				putc('\n', stderr);
-				ret = FALSE;
+				ret = false;
 
 				goto node_ptr_fails;
 			}
@@ -4439,7 +4763,7 @@ loop:
 
 				if (btr_cur_get_rec(&right_node_cur)
 				    != right_node_ptr) {
-					ret = FALSE;
+					ret = false;
 					fputs("InnoDB: node pointer to"
 					      " the right page is wrong\n",
 					      stderr);
@@ -4465,7 +4789,7 @@ loop:
 				    != page_rec_get_next(
 					    page_get_infimum_rec(
 						    right_father_page))) {
-					ret = FALSE;
+					ret = false;
 					fputs("InnoDB: node pointer 2 to"
 					      " the right page is wrong\n",
 					      stderr);
@@ -4490,7 +4814,7 @@ loop:
 				if (page_get_page_no(right_father_page)
 				    != btr_page_get_next(father_page, &mtr)) {
 
-					ret = FALSE;
+					ret = false;
 					fputs("InnoDB: node pointer 3 to"
 					      " the right page is wrong\n",
 					      stderr);
@@ -4521,17 +4845,23 @@ node_ptr_fails:
 	on the next loop.  The page has already been checked. */
 	mtr_commit(&mtr);
 
-	if (right_page_no != FIL_NULL) {
+	if (trx_is_interrupted(trx)) {
+		/* On interrupt, return the current status. */
+	} else if (right_page_no != FIL_NULL) {
+
 		mtr_start(&mtr);
 
-		block = btr_block_get(space, zip_size, right_page_no,
-				      RW_X_LATCH, index, &mtr);
+		block = btr_block_get(
+			space, zip_size, right_page_no,
+			RW_X_LATCH, index, &mtr);
+
 		page = buf_block_get_frame(block);
 
 		goto loop;
 	}
 
 	mem_heap_free(heap);
+
 	return(ret);
 }
 
@@ -4539,21 +4869,26 @@ node_ptr_fails:
 Checks the consistency of an index tree.
 @return	TRUE if ok */
 UNIV_INTERN
-ibool
+bool
 btr_validate_index(
 /*===============*/
 	dict_index_t*	index,	/*!< in: index */
-	trx_t*		trx)	/*!< in: transaction or NULL */
+	const trx_t*	trx)	/*!< in: transaction or NULL */
 {
-	mtr_t	mtr;
-	page_t*	root;
-	ulint	i;
-	ulint	n;
+	/* Full-text indexes are implemented by auxiliary tables,
+	not by the B-tree */
+	if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+		return(true);
+	}
+
+	mtr_t		mtr;
 
 	mtr_start(&mtr);
+
 	mtr_x_lock(dict_index_get_lock(index), &mtr);
 
-	root = btr_root_get(index, &mtr);
+	bool	ok = true;
+	page_t*	root = btr_root_get(index, &mtr);
 
 	SRV_CORRUPT_TABLE_CHECK(root,
 	{
@@ -4561,20 +4896,19 @@ btr_validate_index(
 		return(FALSE);
 	});
 
-	n = btr_page_get_level(root, &mtr);
+	ulint	n = btr_page_get_level(root, &mtr);
 
-	for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
-		if (!btr_validate_level(index, trx, n - i)) {
+	for (ulint i = 0; i <= n; ++i) {
 
-			mtr_commit(&mtr);
-
-			return(FALSE);
+		if (!btr_validate_level(index, trx, n - i)) {
+			ok = false;
+			break;
 		}
 	}
 
 	mtr_commit(&mtr);
 
-	return(TRUE);
+	return(ok);
 }
 
 /**************************************************************//**
@@ -4625,13 +4959,25 @@ btr_can_merge_with_page(
 		goto error;
 	}
 
+	/* If compression padding tells us that merging would result in
+	a too densely packed page, i.e., one that is likely to cause a
+	compression failure, then don't merge the pages. */
+	if (zip_size && page_is_leaf(mpage)
+	    && (page_get_data_size(mpage) + data_size
+		>= dict_index_zip_pad_optimal_page_size(index))) {
+
+		goto error;
+	}
+
+
 	max_ins_size = page_get_max_insert_size(mpage, n_recs);
 
 	if (data_size > max_ins_size) {
 
 		/* We have to reorganize mpage */
 
-		if (!btr_page_reorganize(mblock, index, mtr)) {
+		if (!btr_page_reorganize_block(
+			    false, page_zip_level, mblock, index, mtr)) {
 
 			goto error;
 		}
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.cc
index 8904270197a..bd2dd1e6095 100644
--- a/storage/xtradb/btr/btr0cur.c
+++ b/storage/xtradb/btr/btr0cur.cc
@@ -2,6 +2,7 @@
 
 Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -24,7 +25,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0cur.c
+@file btr/btr0cur.cc
 The index tree cursor
 
 All changes that row operations make to a B-tree or the records
@@ -57,6 +58,7 @@ Created 10/16/1994 Heikki Tuuri
 #include "buf0lru.h"
 #include "btr0btr.h"
 #include "btr0sea.h"
+#include "row0log.h"
 #include "row0purge.h"
 #include "row0upd.h"
 #include "trx0rec.h"
@@ -69,13 +71,13 @@ Created 10/16/1994 Heikki Tuuri
 #include "zlib.h"
 
 /** Buffered B-tree operation types, introduced as part of delete buffering. */
-typedef enum btr_op_enum {
+enum btr_op_t {
 	BTR_NO_OP = 0,			/*!< Not buffered */
 	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
 	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
 	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
 	BTR_DELMARK_OP			/*!< Mark a record for deletion */
-} btr_op_t;
+};
 
 #ifdef UNIV_DEBUG
 /** If the following is set to TRUE, this module prints a lot of
@@ -134,7 +136,12 @@ can be released by page reorganize, then it is reorganized */
 /** A BLOB field reference full of zero, for use in assertions and tests.
 Initially, BLOB field references are set to zero, in
 dtuple_convert_big_rec(). */
-UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = {
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+};
 
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
@@ -453,6 +460,7 @@ btr_cur_search_to_nth_level(
 	ut_ad(dict_index_check_search_tuple(index, tuple));
 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
 	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(!(index->type & DICT_FTS));
 	ut_ad(index->page != FIL_NULL);
 
 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
@@ -464,6 +472,14 @@ btr_cur_search_to_nth_level(
 	cursor->low_match = ULINT_UNDEFINED;
 #endif
 
+	ibool	s_latch_by_caller;
+
+	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
+
+	ut_ad(!s_latch_by_caller
+	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
+				   MTR_MEMO_S_LOCK));
+
 	/* These flags are mutually exclusive, they are lumped together
 	with the latch mode for historical reasons. It's possible for
 	none of the flags to be set. */
@@ -499,11 +515,11 @@ btr_cur_search_to_nth_level(
 	estimate = latch_mode & BTR_ESTIMATE;
 
 	/* Turn the flags unrelated to the latch mode off. */
-	latch_mode &= ~(BTR_INSERT
-			| BTR_DELETE_MARK
-			| BTR_DELETE
-			| BTR_ESTIMATE
-			| BTR_IGNORE_SEC_UNIQUE);
+	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+	ut_ad(!s_latch_by_caller
+	      || latch_mode == BTR_SEARCH_LEAF
+	      || latch_mode == BTR_MODIFY_LEAF);
 
 	cursor->flag = BTR_CUR_BINARY;
 	cursor->index = index;
@@ -517,17 +533,17 @@ btr_cur_search_to_nth_level(
 
 #ifdef BTR_CUR_HASH_ADAPT
 
-#ifdef UNIV_SEARCH_PERF_STAT
+# ifdef UNIV_SEARCH_PERF_STAT
 	info->n_searches++;
-#endif
+# endif
 	if (rw_lock_get_writer(btr_search_get_latch(cursor->index)) ==
 	    RW_LOCK_NOT_LOCKED
 	    && latch_mode <= BTR_MODIFY_LEAF
 	    && info->last_hash_succ
 	    && !estimate
-#ifdef PAGE_CUR_LE_OR_EXTENDS
+# ifdef PAGE_CUR_LE_OR_EXTENDS
 	    && mode != PAGE_CUR_LE_OR_EXTENDS
-#endif /* PAGE_CUR_LE_OR_EXTENDS */
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
 	    /* If !has_search_latch, we do a dirty read of
 	    btr_search_enabled below, and btr_search_guess_on_hash()
 	    will have to check it again. */
@@ -548,7 +564,7 @@ btr_cur_search_to_nth_level(
 
 		return;
 	}
-#endif /* BTR_CUR_HASH_ADAPT */
+# endif /* BTR_CUR_HASH_ADAPT */
 #endif /* BTR_CUR_ADAPT */
 	btr_cur_n_non_sea++;
 
@@ -565,15 +581,19 @@ btr_cur_search_to_nth_level(
 
 	savepoint = mtr_set_savepoint(mtr);
 
-	if (latch_mode == BTR_MODIFY_TREE) {
+	switch (latch_mode) {
+	case BTR_MODIFY_TREE:
 		mtr_x_lock(dict_index_get_lock(index), mtr);
-
-	} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
+		break;
+	case BTR_CONT_MODIFY_TREE:
 		/* Do nothing */
 		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
 					MTR_MEMO_X_LOCK));
-	} else {
-		mtr_s_lock(dict_index_get_lock(index), mtr);
+		break;
+	default:
+		if (!s_latch_by_caller) {
+			mtr_s_lock(dict_index_get_lock(index), mtr);
+		}
 	}
 
 	page_cursor = btr_cur_get_page_cur(cursor);
@@ -754,6 +774,7 @@ retry_page_get:
 			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
 	}
 
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
 	ut_ad(index->id == btr_page_get_index_id(page));
 
 	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
@@ -778,13 +799,17 @@ retry_page_get:
 				cursor, mtr);
 		}
 
-		if (latch_mode != BTR_MODIFY_TREE
-		    && latch_mode != BTR_CONT_MODIFY_TREE) {
-
-			/* Release the tree s-latch */
-
-			mtr_release_s_latch_at_savepoint(
-				mtr, savepoint, dict_index_get_lock(index));
+		switch (latch_mode) {
+		case BTR_MODIFY_TREE:
+		case BTR_CONT_MODIFY_TREE:
+			break;
+		default:
+			if (!s_latch_by_caller) {
+				/* Release the tree s-latch */
+				mtr_release_s_latch_at_savepoint(
+					mtr, savepoint,
+					dict_index_get_lock(index));
+			}
 		}
 
 		page_mode = mode;
@@ -851,8 +876,7 @@ retry_page_get:
 		will properly check btr_search_enabled again in
 		btr_search_build_page_hash_index() before building a
 		page hash index, while holding btr_search_latch. */
-		if (UNIV_LIKELY(btr_search_enabled)) {
-
+		if (btr_search_enabled) {
 			btr_search_info_update(index, cursor);
 		}
 #endif
@@ -882,14 +906,16 @@ UNIV_INTERN
 void
 btr_cur_open_at_index_side_func(
 /*============================*/
-	ibool		from_left,	/*!< in: TRUE if open to the low end,
-					FALSE if to the high end */
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
 	dict_index_t*	index,		/*!< in: index */
 	ulint		latch_mode,	/*!< in: latch mode */
-	btr_cur_t*	cursor,		/*!< in: cursor */
+	btr_cur_t*	cursor,		/*!< in/out: cursor */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf). */
 	const char*	file,		/*!< in: file name */
 	ulint		line,		/*!< in: line where called */
-	mtr_t*		mtr)		/*!< in: mtr */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	page_cur_t*	page_cursor;
 	ulint		page_no;
@@ -906,16 +932,27 @@ btr_cur_open_at_index_side_func(
 	rec_offs_init(offsets_);
 
 	estimate = latch_mode & BTR_ESTIMATE;
-	latch_mode = latch_mode & ~BTR_ESTIMATE;
+	latch_mode &= ~BTR_ESTIMATE;
+
+	ut_ad(level != ULINT_UNDEFINED);
 
 	/* Store the position of the tree latch we push to mtr so that we
 	know how to release it when we have latched the leaf node */
 
 	savepoint = mtr_set_savepoint(mtr);
 
-	if (latch_mode == BTR_MODIFY_TREE) {
+	switch (latch_mode) {
+	case BTR_CONT_MODIFY_TREE:
+		break;
+	case BTR_MODIFY_TREE:
 		mtr_x_lock(dict_index_get_lock(index), mtr);
-	} else {
+		break;
+	case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
+	case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
+		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+					MTR_MEMO_S_LOCK));
+		break;
+	default:
 		mtr_s_lock(dict_index_get_lock(index), mtr);
 	}
 
@@ -950,6 +987,8 @@ btr_cur_open_at_index_side_func(
 			goto exit_loop;
 		});
 
+		ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
 		ut_ad(index->id == btr_page_get_index_id(page));
 
 		block->check_index_page_at_flush = TRUE;
@@ -959,26 +998,40 @@ btr_cur_open_at_index_side_func(
 
 			height = btr_page_get_level(page, mtr);
 			root_height = height;
+			ut_a(height >= level);
+		} else {
+			/* TODO: flag the index corrupted if this fails */
+			ut_ad(height == btr_page_get_level(page, mtr));
 		}
 
-		if (height == 0) {
-			btr_cur_latch_leaves(page, space, zip_size, page_no,
-					     latch_mode, cursor, mtr);
-
-			/* In versions <= 3.23.52 we had forgotten to
-			release the tree latch here. If in an index scan
-			we had to scan far to find a record visible to the
-			current transaction, that could starve others
-			waiting for the tree latch. */
-
-			if ((latch_mode != BTR_MODIFY_TREE)
-			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+		if (height == level) {
+			btr_cur_latch_leaves(
+				page, space, zip_size, page_no,
+				latch_mode & ~BTR_ALREADY_S_LATCHED,
+				cursor, mtr);
 
-				/* Release the tree s-latch */
+			if (height == 0) {
+				/* In versions <= 3.23.52 we had
+				forgotten to release the tree latch
+				here. If in an index scan we had to
+				scan far to find a record visible to
+				the current transaction, that could
+				starve others waiting for the tree
+				latch. */
+
+				switch (latch_mode) {
+				case BTR_MODIFY_TREE:
+				case BTR_CONT_MODIFY_TREE:
+				case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
+				case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
+					break;
+				default:
+					/* Release the tree s-latch */
 
-				mtr_release_s_latch_at_savepoint(
-					mtr, savepoint,
-					dict_index_get_lock(index));
+					mtr_release_s_latch_at_savepoint(
+						mtr, savepoint,
+						dict_index_get_lock(index));
+				}
 			}
 		}
 
@@ -988,7 +1041,7 @@ btr_cur_open_at_index_side_func(
 			page_cur_set_after_last(block, page_cursor);
 		}
 
-		if (height == 0) {
+		if (height == level) {
 			if (estimate) {
 				btr_cur_add_path_info(cursor, height,
 						      root_height);
@@ -1048,9 +1101,12 @@ btr_cur_open_at_rnd_pos_func(
 	ulint*		offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	if (latch_mode == BTR_MODIFY_TREE) {
+	switch (latch_mode) {
+	case BTR_MODIFY_TREE:
 		mtr_x_lock(dict_index_get_lock(index), mtr);
-	} else {
+		break;
+	default:
+		ut_ad(latch_mode != BTR_CONT_MODIFY_TREE);
 		mtr_s_lock(dict_index_get_lock(index), mtr);
 	}
 
@@ -1080,6 +1136,8 @@ btr_cur_open_at_rnd_pos_func(
 			goto exit_loop;
 		});
 
+		ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
 		ut_ad(index->id == btr_page_get_index_id(page));
 
 		if (height == ULINT_UNDEFINED) {
@@ -1124,8 +1182,14 @@ Inserts a record if there is enough space, or if enough space can
 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
 no heuristics is applied to whether it pays to use CPU time for
 reorganizing the page or not.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to inserted record if succeed, else NULL */
-static
+static __attribute__((nonnull, warn_unused_result))
 rec_t*
 btr_cur_insert_if_possible(
 /*=======================*/
@@ -1133,45 +1197,43 @@ btr_cur_insert_if_possible(
 				cursor stays valid */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
 				have been stored to tuple */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	page_cur_t*	page_cursor;
-	buf_block_t*	block;
 	rec_t*		rec;
 
 	ut_ad(dtuple_check_typed(tuple));
 
-	block = btr_cur_get_block(cursor);
-
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
 	page_cursor = btr_cur_get_page_cur(cursor);
 
 	/* Now, try the insert */
-	rec = page_cur_tuple_insert(page_cursor, tuple,
-				    cursor->index, n_ext, mtr);
+	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+				    offsets, heap, n_ext, mtr);
 
-	if (UNIV_UNLIKELY(!rec)) {
-		/* If record did not fit, reorganize */
-
-		if (btr_page_reorganize(block, cursor->index, mtr)) {
-
-			page_cur_search(block, cursor->index, tuple,
-					PAGE_CUR_LE, page_cursor);
-
-			rec = page_cur_tuple_insert(page_cursor, tuple,
-						    cursor->index, n_ext, mtr);
-		}
+	/* If the record did not fit, reorganize.
+	For compressed pages, page_cur_tuple_insert()
+	attempted this already. */
+	if (!rec && !page_cur_get_page_zip(page_cursor)
+	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+		rec = page_cur_tuple_insert(
+			page_cursor, tuple, cursor->index,
+			offsets, heap, n_ext, mtr);
 	}
 
+	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
 	return(rec);
 }
 
 /*************************************************************//**
 For an insert, checks the locks and does the undo logging if desired.
 @return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
-UNIV_INLINE
-ulint
+UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
 btr_cur_ins_lock_and_undo(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags: if
@@ -1186,7 +1248,7 @@ btr_cur_ins_lock_and_undo(
 				successor record */
 {
 	dict_index_t*	index;
-	ulint		err;
+	dberr_t		err;
 	rec_t*		rec;
 	roll_ptr_t	roll_ptr;
 
@@ -1201,33 +1263,35 @@ btr_cur_ins_lock_and_undo(
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 
+	ut_ad(!dict_index_is_online_ddl(index)
+	      || dict_index_is_clust(index)
+	      || (flags & BTR_CREATE_FLAG));
+
 	err = lock_rec_insert_check_and_lock(flags, rec,
 					     btr_cur_get_block(cursor),
 					     index, thr, mtr, inherit);
 
-	if (err != DB_SUCCESS) {
+	if (err != DB_SUCCESS
+	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
 
 		return(err);
 	}
 
-	if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
-
-		err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
-						    thr, index, entry,
-						    NULL, 0, NULL,
-						    &roll_ptr);
-		if (err != DB_SUCCESS) {
+	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
+					    thr, index, entry,
+					    NULL, 0, NULL, NULL,
+					    &roll_ptr);
+	if (err != DB_SUCCESS) {
 
-			return(err);
-		}
+		return(err);
+	}
 
-		/* Now we can fill in the roll ptr field in entry */
+	/* Now we can fill in the roll ptr field in entry */
 
-		if (!(flags & BTR_KEEP_SYS_FLAG)) {
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
 
-			row_upd_index_entry_sys_field(entry, index,
-						      DATA_ROLL_PTR, roll_ptr);
-		}
+		row_upd_index_entry_sys_field(entry, index,
+					      DATA_ROLL_PTR, roll_ptr);
 	}
 
 	return(DB_SUCCESS);
@@ -1240,14 +1304,13 @@ static
 void
 btr_cur_trx_report(
 /*===============*/
-	trx_t*			trx,	/*!< in: transaction */
+	trx_id_t		trx_id,	/*!< in: transaction id */
 	const dict_index_t*	index,	/*!< in: index */
 	const char*		op)	/*!< in: operation */
 {
-	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
-		(ullint) trx->id);
+	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id);
 	fputs(op, stderr);
-	dict_index_name_print(stderr, trx, index);
+	dict_index_name_print(stderr, NULL, index);
 	putc('\n', stderr);
 }
 #endif /* UNIV_DEBUG */
@@ -1260,7 +1323,7 @@ one record on the page, the insert will always succeed; this is to
 prevent trying to split a page with just one record.
 @return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_optimistic_insert(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags: if not
@@ -1268,6 +1331,8 @@ btr_cur_optimistic_insert(
 				specified */
 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
 				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	dtuple_t*	entry,	/*!< in/out: entry to insert */
 	rec_t**		rec,	/*!< out: pointer to inserted record if
 				succeed */
@@ -1276,10 +1341,11 @@ btr_cur_optimistic_insert(
 				NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	que_thr_t*	thr,	/*!< in: query thread or NULL */
-	mtr_t*		mtr)	/*!< in: mtr; if this function returns
-				DB_SUCCESS on a leaf page of a secondary
-				index in a compressed tablespace, the
-				mtr must be committed before latching
+	mtr_t*		mtr)	/*!< in/out: mini-transaction;
+				if this function returns DB_SUCCESS on
+				a leaf page of a secondary index in a
+				compressed tablespace, the caller must
+				mtr_commit(mtr) before latching
 				any further pages */
 {
 	big_rec_t*	big_rec_vec	= NULL;
@@ -1287,14 +1353,13 @@ btr_cur_optimistic_insert(
 	page_cur_t*	page_cursor;
 	buf_block_t*	block;
 	page_t*		page;
-	ulint		max_size;
-	rec_t*		dummy_rec;
+	rec_t*		dummy;
 	ibool		leaf;
 	ibool		reorg;
 	ibool		inherit;
 	ulint		zip_size;
 	ulint		rec_size;
-	ulint		err;
+	dberr_t		err;
 
 	*big_rec = NULL;
 
@@ -1304,6 +1369,14 @@ btr_cur_optimistic_insert(
 
 	page = buf_block_get_frame(block);
 	index = cursor->index;
+
+	ut_ad((thr && thr_get_trx(thr)->fake_changes)
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!dict_index_is_online_ddl(index)
+	      || dict_index_is_clust(index)
+	      || (flags & BTR_CREATE_FLAG));
+	ut_ad(dtuple_check_typed(entry));
+
 	zip_size = buf_block_get_zip_size(block);
 #ifdef UNIV_DEBUG_VALGRIND
 	if (zip_size) {
@@ -1312,19 +1385,16 @@ btr_cur_optimistic_insert(
 	}
 #endif /* UNIV_DEBUG_VALGRIND */
 
-	if (!dtuple_check_typed_no_assert(entry)) {
-		fputs("InnoDB: Error in a tuple to insert into ", stderr);
-		dict_index_name_print(stderr, thr_get_trx(thr), index);
-	}
 #ifdef UNIV_DEBUG
 	if (btr_cur_print_record_ops && thr) {
-		btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
+		btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert ");
 		dtuple_print(stderr, entry);
 	}
 #endif /* UNIV_DEBUG */
 
-	ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	max_size = page_get_max_insert_size_after_reorganize(page, 1);
+	ut_ad((thr && thr_get_trx(thr)->fake_changes)
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
 	leaf = page_is_leaf(page);
 
 	/* Calculate the record size when entry is converted to a record */
@@ -1345,7 +1415,7 @@ btr_cur_optimistic_insert(
 		rec_size = rec_get_converted_size(index, entry, n_ext);
 	}
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		/* Estimate the free space of an empty compressed page.
 		Subtract one byte for the encoded heap_no in the
 		modification log. */
@@ -1390,16 +1460,13 @@ too_big:
 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
 				      goto fail);
 
-	/* If there have been many consecutive inserts, and we are on the leaf
-	level, check if we have to split the page to reserve enough free space
-	for future updates of records. */
-
-	if (dict_index_is_clust(index)
-	    && (page_get_n_recs(page) >= 2)
-	    && UNIV_LIKELY(leaf)
-	    && (dict_index_get_space_reserve() + rec_size > max_size)
-	    && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
-		|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
+	if (leaf && zip_size
+	    && (page_get_data_size(page) + rec_size
+		>= dict_index_zip_pad_optimal_page_size(index))) {
+		/* If compression padding tells us that the insertion
+		would leave the page too densely packed, i.e. likely
+		to cause a compression failure, then do not attempt an
+		optimistic insertion. */
 fail:
 		err = DB_FAIL;
 fail_err:
@@ -1411,11 +1478,30 @@ fail_err:
 		return(err);
 	}
 
-	if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
-			  || max_size < rec_size)
-	    && UNIV_LIKELY(page_get_n_recs(page) > 1)
-	    && page_get_max_insert_size(page, 1) < rec_size) {
+	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
+
+	if (page_has_garbage(page)) {
+		if ((max_size < rec_size
+		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
+		    && page_get_n_recs(page) > 1
+		    && page_get_max_insert_size(page, 1) < rec_size) {
 
+			goto fail;
+		}
+	} else if (max_size < rec_size) {
+		goto fail;
+	}
+
+	/* If there have been many consecutive inserts to the
+	clustered index leaf page of an uncompressed table, check if
+	we have to split the page to reserve enough free space for
+	future updates of records. */
+
+	if (leaf && !zip_size && dict_index_is_clust(index)
+	    && page_get_n_recs(page) >= 2
+	    && dict_index_get_space_reserve() + rec_size > max_size
+	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
+		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {
 		goto fail;
 	}
 
@@ -1441,44 +1527,37 @@ fail_err:
 	{
 		const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
-					     n_ext, mtr);
+					     offsets, heap, n_ext, mtr);
 		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+	}
 
-		if (UNIV_UNLIKELY(reorg)) {
-			ut_a(zip_size);
-			/* It's possible for rec to be NULL if the
-			page is compressed.  This is because a
-			reorganized page may become incompressible. */
-			if (!*rec) {
-				goto fail;
-			}
+	if (*rec) {
+	} else if (zip_size) {
+		/* Reset the IBUF_BITMAP_FREE bits, because
+		page_cur_tuple_insert() will have attempted page
+		reorganize before failing. */
+		if (leaf && !dict_index_is_clust(index)) {
+			ibuf_reset_free_bits(block);
 		}
-	}
 
-	if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
-		/* If the record did not fit, reorganize */
-		if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
-			ut_a(zip_size);
+		goto fail;
+	} else {
+		ut_ad(!reorg);
 
+		/* If the record did not fit, reorganize */
+		if (!btr_page_reorganize(page_cursor, index, mtr)) {
+			ut_ad(0);
 			goto fail;
 		}
 
-		ut_ad(zip_size
-		      || page_get_max_insert_size(page, 1) == max_size);
+		ut_ad(page_get_max_insert_size(page, 1) == max_size);
 
 		reorg = TRUE;
 
-		page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);
-
 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
-					     n_ext, mtr);
+					     offsets, heap, n_ext, mtr);
 
 		if (UNIV_UNLIKELY(!*rec)) {
-			if (UNIV_LIKELY(zip_size != 0)) {
-
-				goto fail;
-			}
-
 			fputs("InnoDB: Error: cannot insert tuple ", stderr);
 			dtuple_print(stderr, entry);
 			fputs(" into ", stderr);
@@ -1502,12 +1581,6 @@ fail_err:
 		lock_update_insert(block, *rec);
 	}
 
-#if 0
-	fprintf(stderr, "Insert into page %lu, max ins size %lu,"
-		" rec %lu ind type %lu\n",
-		buf_block_get_page_no(block), max_size,
-		rec_size + PAGE_DIR_SLOT_SIZE, index->type);
-#endif
 	if (leaf && !dict_index_is_clust(index)) {
 		/* Update the free bits of the B-tree page in the
 		insert buffer bitmap. */
@@ -1546,7 +1619,7 @@ made on the leaf level, to avoid deadlocks, mtr must also own x-latches
 to brothers of page, if those brothers exist.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_pessimistic_insert(
 /*=======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags: if not
@@ -1557,6 +1630,9 @@ btr_cur_pessimistic_insert(
 				insertion will certainly succeed */
 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
 				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
 	dtuple_t*	entry,	/*!< in/out: entry to insert */
 	rec_t**		rec,	/*!< out: pointer to inserted record if
 				succeed */
@@ -1565,17 +1641,15 @@ btr_cur_pessimistic_insert(
 				NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	que_thr_t*	thr,	/*!< in: query thread or NULL */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	dict_index_t*	index		= cursor->index;
 	ulint		zip_size	= dict_table_zip_size(index->table);
 	big_rec_t*	big_rec_vec	= NULL;
-	mem_heap_t*	heap		= NULL;
-	ulint		err;
+	dberr_t		err;
 	ibool		dummy_inh;
 	ibool		success;
-	ulint		n_extents	= 0;
-	ulint		n_reserved;
+	ulint		n_reserved	= 0;
 
 	ut_ad(dtuple_check_typed(entry));
 
@@ -1586,6 +1660,9 @@ btr_cur_pessimistic_insert(
 				MTR_MEMO_X_LOCK));
 	ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, btr_cur_get_block(cursor),
 				MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!dict_index_is_online_ddl(index)
+	      || dict_index_is_clust(index)
+	      || (flags & BTR_CREATE_FLAG));
 
 	cursor->flag = BTR_CUR_BINARY;
 
@@ -1607,7 +1684,7 @@ btr_cur_pessimistic_insert(
 		of the index tree, so that the insert will not fail because
 		of lack of space */
 
-		n_extents = cursor->tree_height / 16 + 3;
+		ulint	n_extents = cursor->tree_height / 16 + 3;
 
 		success = fsp_reserve_free_extents(&n_reserved, index->space,
 						   n_extents, FSP_NORMAL, mtr);
@@ -1618,7 +1695,7 @@ btr_cur_pessimistic_insert(
 
 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
 				   dict_table_is_comp(index->table),
-				   dict_index_get_n_fields(index),
+				   dtuple_get_n_fields(entry),
 				   zip_size)) {
 		/* The record is so big that we have to store some fields
 		externally on separate database pages */
@@ -1634,7 +1711,7 @@ btr_cur_pessimistic_insert(
 
 		if (big_rec_vec == NULL) {
 
-			if (n_extents > 0) {
+			if (n_reserved > 0) {
 				fil_space_release_free_extents(index->space,
 							       n_reserved);
 			}
@@ -1644,7 +1721,7 @@ btr_cur_pessimistic_insert(
 
 	if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
 		/* skip CHANGE, LOG */
-		if (n_extents > 0) {
+		if (n_reserved > 0) {
 			fil_space_release_free_extents(index->space,
 						       n_reserved);
 		}
@@ -1656,13 +1733,11 @@ btr_cur_pessimistic_insert(
 	    == buf_block_get_page_no(btr_cur_get_block(cursor))) {
 
 		/* The page is the root page */
-		*rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
+		*rec = btr_root_raise_and_insert(
+			flags, cursor, offsets, heap, entry, n_ext, mtr);
 	} else {
-		*rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
-	}
-
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
+		*rec = btr_page_split_and_insert(
+			flags, cursor, offsets, heap, entry, n_ext, mtr);
 	}
 
 	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
@@ -1675,7 +1750,7 @@ btr_cur_pessimistic_insert(
 		lock_update_insert(btr_cur_get_block(cursor), *rec);
 	}
 
-	if (n_extents > 0) {
+	if (n_reserved > 0) {
 		fil_space_release_free_extents(index->space, n_reserved);
 	}
 
@@ -1689,26 +1764,28 @@ btr_cur_pessimistic_insert(
 /*************************************************************//**
 For an update, checks the locks and does the undo logging.
 @return	DB_SUCCESS, DB_WAIT_LOCK, or error number */
-UNIV_INLINE
-ulint
+UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7)))
+dberr_t
 btr_cur_upd_lock_and_undo(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
+	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
 	const upd_t*	update,	/*!< in: update vector */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
-	que_thr_t*	thr,	/*!< in: query thread */
+	que_thr_t*	thr,	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
 {
 	dict_index_t*	index;
-	rec_t*		rec;
-	ulint		err;
+	const rec_t*	rec;
+	dberr_t		err;
 
-	ut_ad(cursor && update && thr && roll_ptr);
+	ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG));
 
-	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
 		/* skip LOCK, UNDO */
 		return(DB_SUCCESS);
 	}
@@ -1716,7 +1793,12 @@ btr_cur_upd_lock_and_undo(
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
 	if (!dict_index_is_clust(index)) {
+		ut_ad(dict_index_is_online_ddl(index)
+		      == !!(flags & BTR_CREATE_FLAG));
+
 		/* We do undo logging only when we update a clustered index
 		record */
 		return(lock_sec_rec_modify_check_and_lock(
@@ -1727,50 +1809,39 @@ btr_cur_upd_lock_and_undo(
 	/* Check if we have to wait for a lock: enqueue an explicit lock
 	request if yes */
 
-	err = DB_SUCCESS;
-
 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
-		mem_heap_t*	heap		= NULL;
-		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-		rec_offs_init(offsets_);
-
 		err = lock_clust_rec_modify_check_and_lock(
 			flags, btr_cur_get_block(cursor), rec, index,
-			rec_get_offsets(rec, index, offsets_,
-					ULINT_UNDEFINED, &heap), thr);
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
+			offsets, thr);
 		if (err != DB_SUCCESS) {
-
 			return(err);
 		}
 	}
 
 	/* Append the info about the update in the undo log */
 
-	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
-					    index, NULL, update,
-					    cmpl_info, rec, roll_ptr);
-	return(err);
+	return(trx_undo_report_row_operation(
+		       flags, TRX_UNDO_MODIFY_OP, thr,
+		       index, NULL, update,
+		       cmpl_info, rec, offsets, roll_ptr));
 }
 
 /***********************************************************//**
 Writes a redo log record of updating a record in-place. */
-UNIV_INLINE
+UNIV_INLINE __attribute__((nonnull))
 void
 btr_cur_update_in_place_log(
 /*========================*/
 	ulint		flags,		/*!< in: flags */
-	rec_t*		rec,		/*!< in: record */
-	dict_index_t*	index,		/*!< in: index where cursor positioned */
+	const rec_t*	rec,		/*!< in: record */
+	dict_index_t*	index,		/*!< in: index of the record */
 	const upd_t*	update,		/*!< in: update vector */
-	trx_t*		trx,		/*!< in: transaction */
+	trx_id_t	trx_id,		/*!< in: transaction id */
 	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	byte*	log_ptr;
-	page_t*	page	= page_align(rec);
+	byte*		log_ptr;
+	const page_t*	page	= page_align(rec);
 	ut_ad(flags < 256);
 	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
 
@@ -1795,8 +1866,8 @@ btr_cur_update_in_place_log(
 	mach_write_to_1(log_ptr, flags);
 	log_ptr++;
 
-	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
-						mtr);
+	log_ptr = row_upd_write_sys_vals_to_log(
+		index, trx_id, roll_ptr, log_ptr, mtr);
 	mach_write_to_2(log_ptr, page_offset(rec));
 	log_ptr += 2;
 
@@ -1886,49 +1957,69 @@ func_exit:
 /*************************************************************//**
 See if there is enough place in the page modification log to log
 an update-in-place.
-@return	TRUE if enough place */
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval	true if there is enough space
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
 UNIV_INTERN
-ibool
-btr_cur_update_alloc_zip(
-/*=====================*/
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	buf_block_t*	block,	/*!< in/out: buffer page */
-	dict_index_t*	index,	/*!< in: the index corresponding to the block */
+	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
+	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+	ulint*		offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
 	ulint		length,	/*!< in: size needed */
-	ibool		create,	/*!< in: TRUE=delete-and-insert,
-				FALSE=update-in-place */
-	mtr_t*		mtr,	/*!< in: mini-transaction */
+	bool		create,	/*!< in: true=delete-and-insert,
+				false=update-in-place */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	trx_t*		trx)	/*!< in: NULL or transaction */
 {
-	ut_a(page_zip == buf_block_get_page_zip(block));
+	const page_t*	page = page_cur_get_page(cursor);
+
+	ut_ad(page_zip == page_cur_get_page_zip(cursor));
 	ut_ad(page_zip);
 	ut_ad(!dict_index_is_ibuf(index));
+	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
 
 	if (page_zip_available(page_zip, dict_index_is_clust(index),
 			       length, create)) {
-		return(TRUE);
+		return(true);
 	}
 
-	if (!page_zip->m_nonempty) {
+	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
 		/* The page has been freshly compressed, so
-		recompressing it will not help. */
-		return(FALSE);
+		reorganizing it will not help. */
+		return(false);
+	}
+
+	if (create && page_is_leaf(page)
+	    && (length + page_get_data_size(page)
+		>= dict_index_zip_pad_optimal_page_size(index))) {
+		return(false);
 	}
 
 	if (UNIV_UNLIKELY(trx && trx->fake_changes)) {
-	    /* Don't call page_zip_compress_write_log_no_data as that has
-	    assert which would fail. Assume there won't be a compression
-	    failure. */
+		/* Don't call page_zip_compress_write_log_no_data as that has
+		assert which would fail. Assume there won't be a compression
+		failure. */
 
-	    return TRUE;
+		return(true);
 	}
 
-	if (!page_zip_compress(page_zip, buf_block_get_frame(block),
-			       index, mtr)) {
-		/* Unable to compress the page */
-		return(FALSE);
+	if (!btr_page_reorganize(cursor, index, mtr)) {
+		goto out_of_space;
 	}
 
+	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
+
 	/* After recompressing a page, we must make sure that the free
 	bits in the insert buffer bitmap will not exceed the free
 	space on the page.  Because this function will not attempt
@@ -1936,103 +2027,118 @@ btr_cur_update_alloc_zip(
 	safe to reset the free bits if page_zip_available() fails
 	again, below.  The free bits can safely be reset in a separate
 	mini-transaction.  If page_zip_available() succeeds below, we
-	can be sure that the page_zip_compress() above did not reduce
+	can be sure that the btr_page_reorganize() above did not reduce
 	the free space available on the page. */
 
-	if (!page_zip_available(page_zip, dict_index_is_clust(index),
-				length, create)) {
-		/* Out of space: reset the free bits. */
-		if (!dict_index_is_clust(index)
-		    && page_is_leaf(buf_block_get_frame(block))) {
-			ibuf_reset_free_bits(block);
-		}
-		return(FALSE);
+	if (page_zip_available(page_zip, dict_index_is_clust(index),
+			       length, create)) {
+		return(true);
+	}
+
+out_of_space:
+	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+	/* Out of space: reset the free bits. */
+	if (!dict_index_is_clust(index) && page_is_leaf(page)) {
+		ibuf_reset_free_bits(page_cur_get_block(cursor));
 	}
 
-	return(TRUE);
+	return(false);
 }
 
 /*************************************************************//**
 Updates a record when the update causes no size changes in its fields.
 We assume here that the ordering fields of the record do not change.
-@return	DB_SUCCESS or error number */
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_update_in_place(
 /*====================*/
 	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
 				cursor stays valid and positioned on the
 				same record */
+	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
 	const upd_t*	update,	/*!< in: update vector */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr; must be committed before
-				latching any further pages */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
 {
 	dict_index_t*	index;
 	buf_block_t*	block;
 	page_zip_des_t*	page_zip;
-	ulint		err;
+	dberr_t		err;
 	rec_t*		rec;
 	roll_ptr_t	roll_ptr	= 0;
-	trx_t*		trx;
 	ulint		was_delete_marked;
 	ibool		is_hashed;
-	mem_heap_t*	heap		= NULL;
-	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-	ulint*		offsets		= offsets_;
-	rec_offs_init(offsets_);
+	trx_t*		trx;
 
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
+	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
 	/* The insert buffer tree should never be updated in place. */
 	ut_ad(!dict_index_is_ibuf(index));
+	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+	      || dict_index_is_clust(index));
+	ut_ad(thr_get_trx(thr)->id == trx_id
+	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
+	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+	ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX);
+	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
 
-	trx = thr_get_trx(thr);
-	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
 #ifdef UNIV_DEBUG
-	if (btr_cur_print_record_ops && thr) {
-		btr_cur_trx_report(trx, index, "update ");
+	if (btr_cur_print_record_ops) {
+		btr_cur_trx_report(trx_id, index, "update ");
 		rec_print_new(stderr, rec, offsets);
 	}
 #endif /* UNIV_DEBUG */
 
 	block = btr_cur_get_block(cursor);
 	page_zip = buf_block_get_page_zip(block);
+	trx = thr_get_trx(thr);
 
 	/* Check that enough space is available on the compressed page. */
-	if (page_zip
-	    && !btr_cur_update_alloc_zip(page_zip, block, index,
-					 rec_offs_size(offsets), FALSE, mtr,
-					 trx)) {
-		return(DB_ZIP_OVERFLOW);
+	if (page_zip) {
+		if (!btr_cur_update_alloc_zip(
+			    page_zip, btr_cur_get_page_cur(cursor),
+			    index, offsets, rec_offs_size(offsets),
+			    false, mtr, trx)) {
+			return(DB_ZIP_OVERFLOW);
+		}
+
+		rec = btr_cur_get_rec(cursor);
 	}
 
 	/* Do lock checking and undo logging */
-	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
+					update, cmpl_info,
 					thr, mtr, &roll_ptr);
 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-		return(err);
+		/* We may need to update the IBUF_BITMAP_FREE
+		bits after a reorganize that was done in
+		btr_cur_update_alloc_zip(). */
+		goto func_exit;
 	}
 
 	if (UNIV_UNLIKELY(trx->fake_changes)) {
 		/* skip CHANGE, LOG */
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
 		return(err); /* == DB_SUCCESS */
 	}
 
 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_rec_sys_fields(rec, NULL,
-				       index, offsets, trx, roll_ptr);
+		row_upd_rec_sys_fields(rec, NULL, index, offsets,
+				       thr_get_trx(thr), roll_ptr);
 	}
 
 	was_delete_marked = rec_get_deleted_flag(
@@ -2066,18 +2172,12 @@ btr_cur_update_in_place(
 		rw_lock_x_unlock(btr_search_get_latch(cursor->index));
 	}
 
-	if (page_zip && !dict_index_is_clust(index)
-	    && page_is_leaf(buf_block_get_frame(block))) {
-		/* Update the free bits in the insert buffer. */
-		ibuf_update_free_bits_zip(block, mtr);
-	}
-
 	btr_cur_update_in_place_log(flags, rec, index, update,
-				    trx, roll_ptr, mtr);
+				    trx_id, roll_ptr, mtr);
 
 	if (was_delete_marked
-	    && !rec_get_deleted_flag(rec, page_is_comp(
-					     buf_block_get_frame(block)))) {
+	    && !rec_get_deleted_flag(
+		    rec, page_is_comp(buf_block_get_frame(block)))) {
 		/* The new updated record owns its possible externally
 		stored fields */
 
@@ -2085,10 +2185,18 @@ btr_cur_update_in_place(
 					     rec, index, offsets, mtr);
 	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
+	ut_ad(err == DB_SUCCESS);
+
+func_exit:
+	if (page_zip
+	    && !(flags & BTR_KEEP_IBUF_BITMAP)
+	    && !dict_index_is_clust(index)
+	    && page_is_leaf(buf_block_get_frame(block))) {
+		/* Update the free bits in the insert buffer. */
+		ibuf_update_free_bits_zip(block, mtr);
 	}
-	return(DB_SUCCESS);
+
+	return(err);
 }
 
 /*************************************************************//**
@@ -2097,28 +2205,37 @@ holds an x-latch on the page. The operation does not succeed if there is too
 little space on the page or if the update would result in too empty a page,
 so that tree compression is recommended. We assume here that the ordering
 fields of the record do not change.
-@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
-DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
-there is not enough space left on the compressed page */
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_optimistic_update(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
 				cursor stays valid and positioned on the
 				same record */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
 	const upd_t*	update,	/*!< in: update vector; this must also
 				contain trx id and roll ptr fields */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
-	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr; must be committed before
-				latching any further pages */
+	que_thr_t*	thr,	/*!< in: query thread, or NULL if
+				appropriate flags are set */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
 {
 	dict_index_t*	index;
 	page_cur_t*	page_cursor;
-	ulint		err;
+	dberr_t		err;
 	buf_block_t*	block;
 	page_t*		page;
 	page_zip_des_t*	page_zip;
@@ -2128,52 +2245,58 @@ btr_cur_optimistic_update(
 	ulint		old_rec_size;
 	dtuple_t*	new_entry;
 	roll_ptr_t	roll_ptr;
-	trx_t*		trx;
-	mem_heap_t*	heap;
 	ulint		i;
 	ulint		n_ext;
-	ulint*		offsets;
 
 	block = btr_cur_get_block(cursor);
 	page = buf_block_get_frame(block);
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
-	ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(thr_get_trx(thr)->fake_changes
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 	/* The insert buffer tree should never be updated in place. */
 	ut_ad(!dict_index_is_ibuf(index));
-
-	heap = mem_heap_create(1024);
-	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
-#ifdef UNIV_BLOB_NULL_DEBUG
-	ut_a(!rec_offs_any_null_extern(rec, offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+	      || dict_index_is_clust(index));
+	ut_ad(thr_get_trx(thr)->id == trx_id
+	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
+	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(btr_page_get_index_id(page) == index->id);
+
+	*offsets = rec_get_offsets(rec, index, *offsets,
+				   ULINT_UNDEFINED, heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+	ut_a(!rec_offs_any_null_extern(rec, *offsets)
+	     || trx_is_recv(thr_get_trx(thr)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 #ifdef UNIV_DEBUG
-	if (btr_cur_print_record_ops && thr) {
-		btr_cur_trx_report(thr_get_trx(thr), index, "update ");
-		rec_print_new(stderr, rec, offsets);
+	if (btr_cur_print_record_ops) {
+		btr_cur_trx_report(trx_id, index, "update ");
+		rec_print_new(stderr, rec, *offsets);
 	}
 #endif /* UNIV_DEBUG */
 
-	if (!row_upd_changes_field_size_or_external(index, offsets, update)) {
+	if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
 
 		/* The simplest and the most common case: the update does not
 		change the size of any field and none of the updated fields is
 		externally stored in rec or update, and there is enough space
 		on the compressed page to log the update. */
 
-		mem_heap_free(heap);
-		return(btr_cur_update_in_place(flags, cursor, update,
-					       cmpl_info, thr, mtr));
+		return(btr_cur_update_in_place(
+			       flags, cursor, *offsets, update,
+			       cmpl_info, thr, trx_id, mtr));
 	}
 
-	if (rec_offs_any_extern(offsets)) {
+	if (rec_offs_any_extern(*offsets)) {
 any_extern:
 		/* Externally stored fields are treated in pessimistic
 		update */
 
-		mem_heap_free(heap);
 		return(DB_OVERFLOW);
 	}
 
@@ -2186,8 +2309,14 @@ any_extern:
 
 	page_cursor = btr_cur_get_page_cur(cursor);
 
-	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
-					   &n_ext, heap);
+	if (!*heap) {
+		*heap = mem_heap_create(
+			rec_offs_size(*offsets)
+			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
+	}
+
+	new_entry = row_rec_to_index_entry(rec, index, *offsets,
+					   &n_ext, *heap);
 	/* We checked above that there are no externally stored fields. */
 	ut_a(!n_ext);
 
@@ -2195,8 +2324,8 @@ any_extern:
 	corresponding to new_entry is latched in mtr.
 	Thus the following call is safe. */
 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
-						     FALSE, heap);
-	old_rec_size = rec_offs_size(offsets);
+						     FALSE, *heap);
+	old_rec_size = rec_offs_size(*offsets);
 	new_rec_size = rec_get_converted_size(index, new_entry, 0);
 
 	page_zip = buf_block_get_page_zip(block);
@@ -2204,30 +2333,36 @@ any_extern:
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (page_zip
-	    && !btr_cur_update_alloc_zip(page_zip, block, index,
-					 new_rec_size, TRUE, mtr,
-					 thr_get_trx(thr))) {
-		err = DB_ZIP_OVERFLOW;
-		goto err_exit;
+	if (page_zip) {
+		if (!btr_cur_update_alloc_zip(
+			    page_zip, page_cursor, index, *offsets,
+			    new_rec_size, true, mtr, thr_get_trx(thr))) {
+			return(DB_ZIP_OVERFLOW);
+		}
+
+		rec = page_cur_get_rec(page_cursor);
 	}
 
 	if (UNIV_UNLIKELY(new_rec_size
 			  >= (page_get_free_space_of_empty(page_is_comp(page))
 			      / 2))) {
-
+		/* We may need to update the IBUF_BITMAP_FREE
+		bits after a reorganize that was done in
+		btr_cur_update_alloc_zip(). */
 		err = DB_OVERFLOW;
-		goto err_exit;
+		goto func_exit;
 	}
 
 	if (UNIV_UNLIKELY(page_get_data_size(page)
 			  - old_rec_size + new_rec_size
 			  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
+		/* We may need to update the IBUF_BITMAP_FREE
+		bits after a reorganize that was done in
+		btr_cur_update_alloc_zip(). */
 
 		/* The page would become too empty */
-
 		err = DB_UNDERFLOW;
-		goto err_exit;
+		goto func_exit;
 	}
 
 	/* We do not attempt to reorganize if the page is compressed.
@@ -2241,25 +2376,33 @@ any_extern:
 	       && (max_size >= new_rec_size))
 	      || (page_get_n_recs(page) <= 1))) {
 
+		/* We may need to update the IBUF_BITMAP_FREE
+		bits after a reorganize that was done in
+		btr_cur_update_alloc_zip(). */
+
 		/* There was not enough space, or it did not pay to
 		reorganize: for simplicity, we decide what to do assuming a
 		reorganization is needed, though it might not be necessary */
 
 		err = DB_OVERFLOW;
-		goto err_exit;
+		goto func_exit;
 	}
 
 	/* Do lock checking and undo logging */
-	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+					update, cmpl_info,
 					thr, mtr, &roll_ptr);
 	if (err != DB_SUCCESS) {
-
-		goto err_exit;
+		/* We may need to update the IBUF_BITMAP_FREE
+		bits after a reorganize that was done in
+		btr_cur_update_alloc_zip(). */
+		goto func_exit;
 	}
 
-	if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
 		/* skip CHANGE, LOG */
-		goto err_exit; /* == DB_SUCCESS */
+		ut_ad(err == DB_SUCCESS);
+		return(DB_SUCCESS);
 	}
 
 	/* Ok, we may do the replacement. Store on the page infimum the
@@ -2270,44 +2413,38 @@ any_extern:
 
 	btr_search_update_hash_on_delete(cursor);
 
-	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
-	invokes rec_offs_make_valid() to point to the copied record that
-	the fields of new_entry point to.  We have to undo it here. */
-	ut_ad(rec_offs_validate(NULL, index, offsets));
-	rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);
-
-	page_cur_delete_rec(page_cursor, index, offsets, mtr);
+	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
 
 	page_cur_move_to_prev(page_cursor);
 
-	trx = thr_get_trx(thr);
-
 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
 					      roll_ptr);
 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
-					      trx->id);
+					      trx_id);
 	}
 
 	/* There are no externally stored columns in new_entry */
-	rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
+	rec = btr_cur_insert_if_possible(
+		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
 	ut_a(rec); /* <- We calculated above the insert would fit */
 
-	if (page_zip && !dict_index_is_clust(index)
-	    && page_is_leaf(page)) {
-		/* Update the free bits in the insert buffer. */
-		ibuf_update_free_bits_zip(block, mtr);
-	}
-
 	/* Restore the old explicit lock state on the record */
 
 	lock_rec_restore_from_page_infimum(block, rec, block);
 
 	page_cur_move_to_next(page_cursor);
+	ut_ad(err == DB_SUCCESS);
+
+func_exit:
+	if (page_zip
+	    && !(flags & BTR_KEEP_IBUF_BITMAP)
+	    && !dict_index_is_clust(index)
+	    && page_is_leaf(page)) {
+		/* Update the free bits in the insert buffer. */
+		ibuf_update_free_bits_zip(block, mtr);
+	}
 
-	err = DB_SUCCESS;
-err_exit:
-	mem_heap_free(heap);
 	return(err);
 }
 
@@ -2367,7 +2504,7 @@ own x-latches to brothers of page, if those brothers exist. We assume
 here that the ordering fields of the record do not change.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_pessimistic_update(
 /*=======================*/
 	ulint		flags,	/*!< in: undo logging, locking, and rollback
@@ -2375,7 +2512,13 @@ btr_cur_pessimistic_update(
 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
 				cursor may become invalid if *big_rec == NULL
 				|| !(flags & BTR_KEEP_POS_FLAG) */
-	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	mem_heap_t*	entry_heap,
+				/*!< in/out: memory heap for allocating
+				big_rec and the index tuple */
 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
 				be stored externally by the caller, or NULL */
 	const upd_t*	update,	/*!< in: update vector; this is allowed also
@@ -2383,9 +2526,11 @@ btr_cur_pessimistic_update(
 				the values in update vector have no effect */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
-	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr; must be committed before
-				latching any further pages */
+	que_thr_t*	thr,	/*!< in: query thread, or NULL if
+				appropriate flags are set */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
+				committed before latching any further pages */
 {
 	big_rec_t*	big_rec_vec	= NULL;
 	big_rec_t*	dummy_big_rec;
@@ -2395,65 +2540,84 @@ btr_cur_pessimistic_update(
 	page_zip_des_t*	page_zip;
 	rec_t*		rec;
 	page_cur_t*	page_cursor;
-	dtuple_t*	new_entry;
-	ulint		err;
-	ulint		optim_err;
+	dberr_t		err;
+	dberr_t		optim_err;
 	roll_ptr_t	roll_ptr;
-	trx_t*		trx;
 	ibool		was_first;
-	ulint		n_extents	= 0;
-	ulint		n_reserved;
+	ulint		n_reserved	= 0;
 	ulint		n_ext;
-	ulint*		offsets		= NULL;
+	trx_t*		trx;
 
+	*offsets = NULL;
 	*big_rec = NULL;
 
 	block = btr_cur_get_block(cursor);
 	page = buf_block_get_frame(block);
 	page_zip = buf_block_get_page_zip(block);
-	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 
-	ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, dict_index_get_lock(index),
-				MTR_MEMO_X_LOCK));
-	ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(thr_get_trx(thr)->fake_changes
+	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
+				   MTR_MEMO_X_LOCK));
+	ut_ad(thr_get_trx(thr)->fake_changes
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 	/* The insert buffer tree should never be updated in place. */
 	ut_ad(!dict_index_is_ibuf(index));
-
-	optim_err = btr_cur_optimistic_update(flags, cursor, update,
-					      cmpl_info, thr, mtr);
-
-	switch (optim_err) {
+	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+	      || dict_index_is_clust(index));
+	ut_ad(thr_get_trx(thr)->id == trx_id
+	      || (flags & ~BTR_KEEP_POS_FLAG)
+	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+
+	err = optim_err = btr_cur_optimistic_update(
+		flags | BTR_KEEP_IBUF_BITMAP,
+		cursor, offsets, offsets_heap, update,
+		cmpl_info, thr, trx_id, mtr);
+
+	switch (err) {
+	case DB_ZIP_OVERFLOW:
 	case DB_UNDERFLOW:
 	case DB_OVERFLOW:
-	case DB_ZIP_OVERFLOW:
 		break;
 	default:
-		return(optim_err);
+	err_exit:
+		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
+		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
+		already reset by btr_cur_update_alloc_zip() if the
+		page was recompressed. */
+		if (page_zip
+		    && optim_err != DB_ZIP_OVERFLOW
+		    && !dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			ibuf_update_free_bits_zip(block, mtr);
+		}
+
+		return(err);
 	}
 
 	/* Do lock checking and undo logging */
-	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+					update, cmpl_info,
 					thr, mtr, &roll_ptr);
 	if (err != DB_SUCCESS) {
-
-		return(err);
+		goto err_exit;
 	}
 
 	if (optim_err == DB_OVERFLOW) {
 		ulint	reserve_flag;
+		ulint	n_extents;
 
 		/* First reserve enough free space for the file segments
 		of the index tree, so that the update will not fail because
 		of lack of space */
-
 		if (UNIV_UNLIKELY(cursor->tree_height == ULINT_UNDEFINED)) {
 			/* When the tree height is uninitialized due to fake
 			changes, reserve some hardcoded number of extents.  */
-			ut_a(thr && thr_get_trx(thr)->fake_changes);
+			ut_a(thr_get_trx(thr)->fake_changes);
 			n_extents = 3;
 		}
 		else {
@@ -2468,24 +2632,18 @@ btr_cur_pessimistic_update(
 
 		if (!fsp_reserve_free_extents(&n_reserved, index->space,
 					      n_extents, reserve_flag, mtr)) {
-			return(DB_OUT_OF_FILE_SPACE);
+			err = DB_OUT_OF_FILE_SPACE;
+			goto err_exit;
 		}
 	}
 
-	if (!*heap) {
-		*heap = mem_heap_create(1024);
-	}
-	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
+	rec = btr_cur_get_rec(cursor);
 
-	trx = thr_get_trx(thr);
+	*offsets = rec_get_offsets(
+		rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
 
-	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
-					   &n_ext, *heap);
-	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
-	invokes rec_offs_make_valid() to point to the copied record that
-	the fields of new_entry point to.  We have to undo it here. */
-	ut_ad(rec_offs_validate(NULL, index, offsets));
-	rec_offs_make_valid(rec, index, offsets);
+	dtuple_t*	new_entry = row_rec_to_index_entry(
+		rec, index, *offsets, &n_ext, entry_heap);
 
 	/* The page containing the clustered index record
 	corresponding to new_entry is latched in mtr.  If the
@@ -2494,15 +2652,18 @@ btr_cur_pessimistic_update(
 	purge would also have removed the clustered index record
 	itself.  Thus the following call is safe. */
 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
-						     FALSE, *heap);
+						     FALSE, entry_heap);
+
+	trx = thr_get_trx(thr);
+
 	if (!(flags & BTR_KEEP_SYS_FLAG) && UNIV_LIKELY(!trx->fake_changes)) {
 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
 					      roll_ptr);
 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
-					      trx->id);
+					      trx_id);
 	}
 
-	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
+	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
 		/* We are in a transaction rollback undoing a row
 		update: we must free possible externally stored fields
 		which got new values in the update, if they are not
@@ -2516,16 +2677,17 @@ btr_cur_pessimistic_update(
 		ut_ad(!(trx->fake_changes));
 
 		btr_rec_free_updated_extern_fields(
-			index, rec, page_zip, offsets, update,
-			trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
+			index, rec, page_zip, *offsets, update,
+			trx_is_recv(thr_get_trx(thr))
+			? RB_RECOVERY : RB_NORMAL, mtr);
 	}
 
 	/* We have to set appropriate extern storage bits in the new
 	record to be inserted: we have to remember which fields were such */
 
 	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
-	ut_ad(rec_offs_validate(rec, index, offsets));
-	n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+	n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
 
 	if (page_zip) {
 		ut_ad(page_is_comp(page));
@@ -2544,8 +2706,21 @@ make_external:
 		big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
 
+			/* We cannot goto return_after_reservations,
+			because we may need to update the
+			IBUF_BITMAP_FREE bits, which was suppressed by
+			BTR_KEEP_IBUF_BITMAP. */
+#ifdef UNIV_ZIP_DEBUG
+			ut_a(!page_zip
+			     || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+			if (n_reserved > 0) {
+				fil_space_release_free_extents(
+					index->space, n_reserved);
+			}
+
 			err = DB_TOO_BIG_RECORD;
-			goto return_after_reservations;
+			goto err_exit;
 		}
 
 		ut_ad(page_is_leaf(page));
@@ -2577,11 +2752,12 @@ make_external:
 #endif /* UNIV_ZIP_DEBUG */
 	page_cursor = btr_cur_get_page_cur(cursor);
 
-	page_cur_delete_rec(page_cursor, index, offsets, mtr);
+	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
 
 	page_cur_move_to_prev(page_cursor);
 
-	rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
+	rec = btr_cur_insert_if_possible(cursor, new_entry,
+					 offsets, offsets_heap, n_ext, mtr);
 
 	if (rec) {
 		page_cursor->rec = rec;
@@ -2589,24 +2765,26 @@ make_external:
 		lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
 						   rec, block);
 
-		offsets = rec_get_offsets(rec, index, offsets,
-					  ULINT_UNDEFINED, heap);
-
-		if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
 			/* The new inserted record owns its possible externally
 			stored fields */
-			btr_cur_unmark_extern_fields(page_zip,
-						     rec, index, offsets, mtr);
+			btr_cur_unmark_extern_fields(
+				page_zip, rec, index, *offsets, mtr);
 		}
 
-		btr_cur_compress_if_useful(
-			cursor,
-			big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG),
-			mtr);
+		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
 
-		if (page_zip && !dict_index_is_clust(index)
-		    && page_is_leaf(page)) {
-			/* Update the free bits in the insert buffer. */
+		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
+			if (adjust) {
+				rec_offs_make_valid(
+					page_cursor->rec, index, *offsets);
+			}
+		} else if (page_zip &&
+			   !dict_index_is_clust(index)
+			   && page_is_leaf(page)) {
+			/* Update the free bits in the insert buffer.
+			This is the same block which was skipped by
+			BTR_KEEP_IBUF_BITMAP. */
 			ibuf_update_free_bits_zip(block, mtr);
 		}
 
@@ -2620,9 +2798,10 @@ make_external:
 		btr_cur_insert_if_possible() to return FALSE. */
 		ut_a(page_zip || optim_err != DB_UNDERFLOW);
 
-		/* Out of space: reset the free bits. */
-		if (!dict_index_is_clust(index)
-		    && page_is_leaf(page)) {
+		/* Out of space: reset the free bits.
+		This is the same block which was skipped by
+		BTR_KEEP_IBUF_BITMAP. */
+		if (!dict_index_is_clust(index) && page_is_leaf(page)) {
 			ibuf_reset_free_bits(block);
 		}
 	}
@@ -2654,11 +2833,13 @@ make_external:
 	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
 					 | BTR_NO_LOCKING_FLAG
 					 | BTR_KEEP_SYS_FLAG,
-					 cursor, new_entry, &rec,
+					 cursor, offsets, offsets_heap,
+					 new_entry, &rec,
 					 &dummy_big_rec, n_ext, NULL, mtr);
 	ut_a(rec);
 	ut_a(err == DB_SUCCESS);
 	ut_a(dummy_big_rec == NULL);
+	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
 	page_cursor->rec = rec;
 
 	if (dict_index_is_sec_or_ibuf(index)) {
@@ -2671,10 +2852,10 @@ make_external:
 
 		page_update_max_trx_id(rec_block,
 				       buf_block_get_page_zip(rec_block),
-				       trx->id, mtr);
+				       trx_id, mtr);
 	}
 
-	if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
 		/* The new inserted record owns its possible externally
 		stored fields */
 		buf_block_t*	rec_block = btr_cur_get_block(cursor);
@@ -2685,10 +2866,8 @@ make_external:
 #endif /* UNIV_ZIP_DEBUG */
 		page_zip = buf_block_get_page_zip(rec_block);
 
-		offsets = rec_get_offsets(rec, index, offsets,
-					  ULINT_UNDEFINED, heap);
 		btr_cur_unmark_extern_fields(page_zip,
-					     rec, index, offsets, mtr);
+					     rec, index, *offsets, mtr);
 	}
 
 	lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
@@ -2709,7 +2888,7 @@ return_after_reservations:
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (n_extents > 0) {
+	if (n_reserved > 0) {
 		fil_space_release_free_extents(index->space, n_reserved);
 	}
 
@@ -2727,17 +2906,13 @@ UNIV_INLINE
 void
 btr_cur_del_mark_set_clust_rec_log(
 /*===============================*/
-	ulint		flags,	/*!< in: flags */
 	rec_t*		rec,	/*!< in: record */
 	dict_index_t*	index,	/*!< in: index of the record */
-	ibool		val,	/*!< in: value to set */
-	trx_t*		trx,	/*!< in: deleting transaction */
+	trx_id_t	trx_id,	/*!< in: transaction id */
 	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	byte*	log_ptr;
-	ut_ad(flags < 256);
-	ut_ad(val <= 1);
 
 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
 
@@ -2753,13 +2928,11 @@ btr_cur_del_mark_set_clust_rec_log(
 		return;
 	}
 
-	mach_write_to_1(log_ptr, flags);
-	log_ptr++;
-	mach_write_to_1(log_ptr, val);
-	log_ptr++;
+	*log_ptr++ = 0;
+	*log_ptr++ = 1;
 
-	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
-						mtr);
+	log_ptr = row_upd_write_sys_vals_to_log(
+		index, trx_id, roll_ptr, log_ptr, mtr);
 	mach_write_to_2(log_ptr, page_offset(rec));
 	log_ptr += 2;
 
@@ -2856,20 +3029,18 @@ of the deleting transaction, and in the roll ptr field pointer to the
 undo log record created.
 @return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_del_mark_set_clust_rec(
 /*===========================*/
-	ulint		flags,	/*!< in: undo logging and locking flags */
 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
 	rec_t*		rec,	/*!< in/out: record */
 	dict_index_t*	index,	/*!< in: clustered index of the record */
 	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
-	ibool		val,	/*!< in: value to set */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	roll_ptr_t	roll_ptr;
-	ulint		err;
+	dberr_t		err;
 	page_zip_des_t*	page_zip;
 	trx_t*		trx;
 
@@ -2881,7 +3052,7 @@ btr_cur_del_mark_set_clust_rec(
 
 #ifdef UNIV_DEBUG
 	if (btr_cur_print_record_ops && thr) {
-		btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
+		btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark ");
 		rec_print_new(stderr, rec, offsets);
 	}
 #endif /* UNIV_DEBUG */
@@ -2889,12 +3060,12 @@ btr_cur_del_mark_set_clust_rec(
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
 
-	if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
 		/* skip LOCK, UNDO, CHANGE, LOG */
 		return(DB_SUCCESS);
 	}
 
-	err = lock_clust_rec_modify_check_and_lock(flags, block,
+	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
 						   rec, index, offsets, thr);
 
 	if (err != DB_SUCCESS) {
@@ -2902,8 +3073,8 @@ btr_cur_del_mark_set_clust_rec(
 		return(err);
 	}
 
-	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
-					    index, NULL, NULL, 0, rec,
+	err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr,
+					    index, NULL, NULL, 0, rec, offsets,
 					    &roll_ptr);
 	if (err != DB_SUCCESS) {
 
@@ -2916,17 +3087,21 @@ btr_cur_del_mark_set_clust_rec(
 
 	page_zip = buf_block_get_page_zip(block);
 
-	btr_blob_dbg_set_deleted_flag(rec, index, offsets, val);
-	btr_rec_set_deleted_flag(rec, page_zip, val);
+	btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE);
+	btr_rec_set_deleted_flag(rec, page_zip, TRUE);
 
 	trx = thr_get_trx(thr);
 
-	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_rec_sys_fields(rec, page_zip,
-				       index, offsets, trx, roll_ptr);
+	if (dict_index_is_online_ddl(index)) {
+		row_log_table_delete(
+			rec, index, offsets, false,
+			trx_read_trx_id(row_get_trx_id_offset(index, offsets)
+					+ rec));
 	}
 
-	btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
+	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
+
+	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
 					   roll_ptr, mtr);
 
 	return(err);
@@ -3015,20 +3190,20 @@ btr_cur_parse_del_mark_set_sec_rec(
 Sets a secondary index record delete mark to TRUE or FALSE.
 @return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_del_mark_set_sec_rec(
 /*=========================*/
 	ulint		flags,	/*!< in: locking flag */
 	btr_cur_t*	cursor,	/*!< in: cursor */
 	ibool		val,	/*!< in: value to set */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	buf_block_t*	block;
 	rec_t*		rec;
-	ulint		err;
+	dberr_t		err;
 
-	if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
 		/* skip LOCK, CHANGE, LOG */
 		return(DB_SUCCESS);
 	}
@@ -3038,7 +3213,7 @@ btr_cur_del_mark_set_sec_rec(
 
 #ifdef UNIV_DEBUG
 	if (btr_cur_print_record_ops && thr) {
-		btr_cur_trx_report(thr_get_trx(thr), cursor->index,
+		btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index,
 				   "del mark ");
 		rec_print(stderr, rec, cursor->index);
 	}
@@ -3128,12 +3303,15 @@ positioned, but no latch on the whole tree.
 @return	TRUE if success, i.e., the page did not become too empty */
 UNIV_INTERN
 ibool
-btr_cur_optimistic_delete(
-/*======================*/
+btr_cur_optimistic_delete_func(
+/*===========================*/
 	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
 				delete; cursor stays valid: if deletion
 				succeeds, on function exit it points to the
 				successor of the deleted record */
+#ifdef UNIV_DEBUG
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+#endif /* UNIV_DEBUG */
 	mtr_t*		mtr)	/*!< in: mtr; if this function returns
 				TRUE on a leaf page of a secondary
 				index, the mtr must be committed
@@ -3147,6 +3325,7 @@ btr_cur_optimistic_delete(
 	ibool		no_compress_needed;
 	rec_offs_init(offsets_);
 
+	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
 	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
 				MTR_MEMO_PAGE_X_FIX));
 	/* This is intended only for leaf page deletions */
@@ -3156,6 +3335,9 @@ btr_cur_optimistic_delete(
 	SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
 
 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
+	ut_ad(!dict_index_is_online_ddl(cursor->index)
+	      || dict_index_is_clust(cursor->index)
+	      || (flags & BTR_CREATE_FLAG));
 
 	rec = btr_cur_get_rec(cursor);
 	offsets = rec_get_offsets(rec, cursor->index, offsets,
@@ -3169,38 +3351,42 @@ btr_cur_optimistic_delete(
 
 		page_t*		page	= buf_block_get_frame(block);
 		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
-		ulint		max_ins	= 0;
 
 		lock_update_delete(block, rec);
 
 		btr_search_update_hash_on_delete(cursor);
 
-		if (!page_zip) {
-			max_ins = page_get_max_insert_size_after_reorganize(
-				page, 1);
-		}
+		if (page_zip) {
 #ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip
-		     || page_zip_validate(page_zip, page, cursor->index));
+			ut_a(page_zip_validate(page_zip, page, cursor->index));
 #endif /* UNIV_ZIP_DEBUG */
-		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
-				    cursor->index, offsets, mtr);
+			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+					    cursor->index, offsets, mtr);
 #ifdef UNIV_ZIP_DEBUG
-		ut_a(!page_zip
-		     || page_zip_validate(page_zip, page, cursor->index));
+			ut_a(page_zip_validate(page_zip, page, cursor->index));
 #endif /* UNIV_ZIP_DEBUG */
 
-		if (dict_index_is_clust(cursor->index)
-		    || dict_index_is_ibuf(cursor->index)
-		    || !page_is_leaf(page)) {
-			/* The insert buffer does not handle
-			inserts to clustered indexes, to
-			non-leaf pages of secondary index B-trees,
-			or to the insert buffer. */
-		} else if (page_zip) {
-			ibuf_update_free_bits_zip(block, mtr);
+			/* On compressed pages, the IBUF_BITMAP_FREE
+			space is not affected by deleting (purging)
+			records, because it is defined as the minimum
+			of space available *without* reorganize, and
+			space available in the modification log. */
 		} else {
-			ibuf_update_free_bits_low(block, max_ins, mtr);
+			const ulint	max_ins
+				= page_get_max_insert_size_after_reorganize(
+					page, 1);
+
+			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+					    cursor->index, offsets, mtr);
+
+			/* The change buffer does not handle inserts
+			into non-leaf pages, into clustered indexes,
+			or into the change buffer. */
+			if (page_is_leaf(page)
+			    && !dict_index_is_clust(cursor->index)
+			    && !dict_index_is_ibuf(cursor->index)) {
+				ibuf_update_free_bits_low(block, max_ins, mtr);
+			}
 		}
 	}
 
@@ -3223,7 +3409,7 @@ UNIV_INTERN
 ibool
 btr_cur_pessimistic_delete(
 /*=======================*/
-	ulint*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
 				the latter may occur because we may have
 				to update node pointers on upper levels,
 				and in the case of variable length keys
@@ -3236,6 +3422,7 @@ btr_cur_pessimistic_delete(
 				if compression does not occur, the cursor
 				stays valid: it points to successor of
 				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
 	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
@@ -3244,9 +3431,7 @@ btr_cur_pessimistic_delete(
 	page_zip_des_t*	page_zip;
 	dict_index_t*	index;
 	rec_t*		rec;
-	dtuple_t*	node_ptr;
-	ulint		n_extents	= 0;
-	ulint		n_reserved;
+	ulint		n_reserved	= 0;
 	ibool		success;
 	ibool		ret		= FALSE;
 	ulint		level;
@@ -3257,6 +3442,10 @@ btr_cur_pessimistic_delete(
 	page = buf_block_get_frame(block);
 	index = btr_cur_get_index(cursor);
 
+	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+	ut_ad(!dict_index_is_online_ddl(index)
+	      || dict_index_is_clust(index)
+	      || (flags & BTR_CREATE_FLAG));
 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
@@ -3267,7 +3456,7 @@ btr_cur_pessimistic_delete(
 
 		ut_a(cursor->tree_height != ULINT_UNDEFINED);
 
-		n_extents = cursor->tree_height / 32 + 1;
+		ulint	n_extents = cursor->tree_height / 32 + 1;
 
 		success = fsp_reserve_free_extents(&n_reserved,
 						   index->space,
@@ -3307,13 +3496,15 @@ btr_cur_pessimistic_delete(
 
 		btr_discard_page(cursor, mtr);
 
-		*err = DB_SUCCESS;
 		ret = TRUE;
 
 		goto return_after_reservations;
 	}
 
-	lock_update_delete(block, rec);
+	if (flags == 0) {
+		lock_update_delete(block, rec);
+	}
+
 	level = btr_page_get_level(page, mtr);
 
 	if (level > 0
@@ -3342,12 +3533,12 @@ btr_cur_pessimistic_delete(
 
 			btr_node_ptr_delete(index, block, mtr);
 
-			node_ptr = dict_index_build_node_ptr(
+			dtuple_t*	node_ptr = dict_index_build_node_ptr(
 				index, next_rec, buf_block_get_page_no(block),
 				heap, level);
 
-			btr_insert_on_non_leaf_level(index,
-						     level + 1, node_ptr, mtr);
+			btr_insert_on_non_leaf_level(
+				flags, index, level + 1, node_ptr, mtr);
 		}
 	}
 
@@ -3360,16 +3551,16 @@ btr_cur_pessimistic_delete(
 
 	ut_ad(btr_check_node_ptr(index, block, mtr));
 
+return_after_reservations:
 	*err = DB_SUCCESS;
 
-return_after_reservations:
 	mem_heap_free(heap);
 
 	if (ret == FALSE) {
 		ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
 	}
 
-	if (n_extents > 0) {
+	if (n_reserved > 0) {
 		fil_space_release_free_extents(index->space, n_reserved);
 	}
 
@@ -3389,8 +3580,8 @@ btr_cur_add_path_info(
 	ulint		root_height)	/*!< in: root node height in tree */
 {
 	btr_path_t*	slot;
-	rec_t*		rec;
-	page_t*		page;
+	const rec_t*	rec;
+	const page_t*	page;
 
 	ut_a(cursor->path_arr);
 
@@ -3602,6 +3793,9 @@ btr_estimate_n_rows_in_range(
 	ibool		is_n_rows_exact;
 	ulint		i;
 	mtr_t		mtr;
+	ib_int64_t	table_n_rows;
+
+	table_n_rows = dict_table_get_n_rows(index->table);
 
 	mtr_start(&mtr);
 
@@ -3614,9 +3808,9 @@ btr_estimate_n_rows_in_range(
 					    &cursor, 0,
 					    __FILE__, __LINE__, &mtr);
 	} else {
-		btr_cur_open_at_index_side(TRUE, index,
+		btr_cur_open_at_index_side(true, index,
 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
-					   &cursor, &mtr);
+					   &cursor, 0, &mtr);
 	}
 
 	mtr_commit(&mtr);
@@ -3632,9 +3826,9 @@ btr_estimate_n_rows_in_range(
 					    &cursor, 0,
 					    __FILE__, __LINE__, &mtr);
 	} else {
-		btr_cur_open_at_index_side(FALSE, index,
+		btr_cur_open_at_index_side(false, index,
 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
-					   &cursor, &mtr);
+					   &cursor, 0, &mtr);
 	}
 
 	mtr_commit(&mtr);
@@ -3672,16 +3866,15 @@ btr_estimate_n_rows_in_range(
 			to over 1 / 2 of the estimated rows in the whole
 			table */
 
-			if (n_rows > index->table->stat_n_rows / 2
-			    && !is_n_rows_exact) {
+			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
 
-				n_rows = index->table->stat_n_rows / 2;
+				n_rows = table_n_rows / 2;
 
 				/* If there are just 0 or 1 rows in the table,
 				then we estimate all rows are in the range */
 
 				if (n_rows == 0) {
-					n_rows = index->table->stat_n_rows;
+					n_rows = table_n_rows;
 				}
 			}
 
@@ -3741,9 +3934,9 @@ btr_estimate_n_rows_in_range(
 
 /*******************************************************************//**
 Record the number of non_null key values in a given index for
-each n-column prefix of the index where n < dict_index_get_n_unique(index).
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
 The estimates are eventually stored in the array:
-index->stat_n_non_null_key_vals. */
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
 static
 void
 btr_record_not_null_field_in_rec(
@@ -3754,7 +3947,7 @@ btr_record_not_null_field_in_rec(
 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
 					its size could be for all fields or
 					that of "n_unique" */
-	ib_int64_t*	n_not_null)	/*!< in/out: array to record number of
+	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
 					not null rows for n-column prefix */
 {
 	ulint	i;
@@ -3776,10 +3969,12 @@ btr_record_not_null_field_in_rec(
 
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
-each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals.
-If innodb_stats_method is "nulls_ignored", we also record the number of
-non-null values for each prefix and store the estimates in
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
 array index->stat_n_non_null_key_vals. */
 UNIV_INTERN
 void
@@ -3793,8 +3988,8 @@ btr_estimate_number_of_different_key_vals(
 	ulint		n_cols;
 	ulint		matched_fields;
 	ulint		matched_bytes;
-	ib_int64_t*	n_diff;
-	ib_int64_t*	n_not_null;
+	ib_uint64_t*	n_diff;
+	ib_uint64_t*	n_not_null;
 	ibool		stats_null_not_equal;
 	ullint		n_sample_pages; /* number of pages to sample */
 	ulint		not_empty_flag	= 0;
@@ -3810,12 +4005,13 @@ btr_estimate_number_of_different_key_vals(
 	n_cols = dict_index_get_n_unique(index);
 
 	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
-			       * (n_cols + 1)
+			       * n_cols
 			       + dict_index_get_n_fields(index)
 			       * (sizeof *offsets_rec
 				  + sizeof *offsets_next_rec));
 
-	n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+	n_diff = (ib_uint64_t*) mem_heap_zalloc(
+		heap, n_cols * sizeof(ib_int64_t));
 
 	n_not_null = NULL;
 
@@ -3824,8 +4020,8 @@ btr_estimate_number_of_different_key_vals(
 	considered equal (by setting stats_null_not_equal value) */
 	switch (srv_innodb_stats_method) {
 	case SRV_STATS_NULLS_IGNORED:
-		n_not_null = mem_heap_zalloc(heap, (n_cols + 1)
-					     * sizeof *n_not_null);
+		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+			heap, n_cols * sizeof *n_not_null);
 		/* fall through */
 
 	case SRV_STATS_NULLS_UNEQUAL:
@@ -3844,14 +4040,14 @@ btr_estimate_number_of_different_key_vals(
 
 	/* It makes no sense to test more pages than are contained
 	in the index, thus we lower the number if it is too high */
-	if (srv_stats_sample_pages > index->stat_index_size) {
+	if (srv_stats_transient_sample_pages > index->stat_index_size) {
 		if (index->stat_index_size > 0) {
 			n_sample_pages = index->stat_index_size;
 		} else {
 			n_sample_pages = 1;
 		}
 	} else {
-		n_sample_pages = srv_stats_sample_pages;
+		n_sample_pages = srv_stats_transient_sample_pages;
 	}
 
 	/* We sample some pages in the index to get an estimate */
@@ -3878,7 +4074,7 @@ btr_estimate_number_of_different_key_vals(
 			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
 						      ULINT_UNDEFINED, &heap);
 
-			if (n_not_null) {
+			if (n_not_null != NULL) {
 				btr_record_not_null_field_in_rec(
 					n_cols, offsets_rec, n_not_null);
 			}
@@ -3906,14 +4102,14 @@ btr_estimate_number_of_different_key_vals(
 					       &matched_fields,
 					       &matched_bytes);
 
-			for (j = matched_fields + 1; j <= n_cols; j++) {
+			for (j = matched_fields; j < n_cols; j++) {
 				/* We add one if this index record has
 				a different prefix from the previous */
 
 				n_diff[j]++;
 			}
 
-			if (n_not_null) {
+			if (n_not_null != NULL) {
 				btr_record_not_null_field_in_rec(
 					n_cols, offsets_next_rec, n_not_null);
 			}
@@ -3948,7 +4144,7 @@ btr_estimate_number_of_different_key_vals(
 			if (btr_page_get_prev(page, &mtr) != FIL_NULL
 			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
 
-				n_diff[n_cols]++;
+				n_diff[n_cols - 1]++;
 			}
 		}
 
@@ -3964,11 +4160,11 @@ exit_loop:
 	also the pages used for external storage of fields (those pages are
 	included in index->stat_n_leaf_pages) */
 
-	for (j = 0; j <= n_cols; j++) {
+	for (j = 0; j < n_cols; j++) {
 		index->stat_n_diff_key_vals[j]
 			= BTR_TABLE_STATS_FROM_SAMPLE(
 				n_diff[j], index, n_sample_pages,
-				total_external_size, not_empty_flag); 
+				total_external_size, not_empty_flag);
 
 		/* If the tree is small, smaller than
 		10 * n_sample_pages + total_external_size, then
@@ -3988,11 +4184,13 @@ exit_loop:
 
 		index->stat_n_diff_key_vals[j] += add_on;
 
+		index->stat_n_sample_sizes[j] = n_sample_pages;
+
 		/* Update the stat_n_non_null_key_vals[] with our
 		sampled result. stat_n_non_null_key_vals[] is created
 		and initialized to zero in dict_index_add_to_cache(),
 		along with stat_n_diff_key_vals[] array */
-		if (n_not_null != NULL && (j < n_cols)) {
+		if (n_not_null != NULL) {
 			index->stat_n_non_null_key_vals[j] =
 				 BTR_TABLE_STATS_FROM_SAMPLE(
 					n_not_null[j], index, n_sample_pages,
@@ -4246,10 +4444,11 @@ btr_push_update_extern_fields(
 				will have to be copied. */
 				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
 
-				data = dfield_get_data(field);
+				data = (byte*) dfield_get_data(field);
 				len = dfield_get_len(field);
 
-				buf = mem_heap_alloc(heap, uf->orig_len);
+				buf = (byte*) mem_heap_alloc(heap,
+							     uf->orig_len);
 				/* Copy the locally stored prefix. */
 				memcpy(buf, data,
 				       uf->orig_len
@@ -4307,15 +4506,13 @@ btr_blob_free(
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 	ulint		space	= buf_block_get_space(block);
 	ulint		page_no	= buf_block_get_page_no(block);
-	ibool           have_LRU_mutex = FALSE;
+	bool		freed	= false;
 
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
 	mtr_commit(mtr);
 
-	//buf_pool_mutex_enter(buf_pool);
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	have_LRU_mutex = TRUE;
 	mutex_enter(&block->mutex);
 
 	/* Only free the block if it is still allocated to
@@ -4326,23 +4523,25 @@ btr_blob_free(
 	    && buf_block_get_space(block) == space
 	    && buf_block_get_page_no(block) == page_no) {
 
-		if (!buf_LRU_free_block(&block->page, all, &have_LRU_mutex)
-		    && all && block->page.zip.data
-		    /* Now, buf_LRU_free_block() may release mutex temporarily */
+		freed = buf_LRU_free_page(&block->page, all);
+
+		if (!freed && all && block->page.zip.data
+		    /* Now, buf_LRU_free_page() may release mutexes
+		    temporarily */
 		    && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
 		    && buf_block_get_space(block) == space
 		    && buf_block_get_page_no(block) == page_no) {
+
 			/* Attempt to deallocate the uncompressed page
 			if the whole block cannot be deallocted. */
-
-			buf_LRU_free_block(&block->page, FALSE, &have_LRU_mutex);
+			freed = buf_LRU_free_page(&block->page, false);
 		}
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
-	if (have_LRU_mutex) {
+	if (!freed) {
 		mutex_exit(&buf_pool->LRU_list_mutex);
 	}
+
 	mutex_exit(&block->mutex);
 }
 
@@ -4353,7 +4552,7 @@ The fields are stored on pages allocated from leaf node
 file segment of the index tree.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 UNIV_INTERN
-enum db_err
+dberr_t
 btr_store_big_rec_extern_fields(
 /*============================*/
 	dict_index_t*	index,		/*!< in: index of rec; the index tree
@@ -4387,7 +4586,7 @@ btr_store_big_rec_extern_fields(
 	z_stream	c_stream;
 	buf_block_t**	freed_pages	= NULL;
 	ulint		n_freed_pages	= 0;
-	enum db_err	error		= DB_SUCCESS;
+	dberr_t		error		= DB_SUCCESS;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(rec_offs_any_extern(offsets));
@@ -4418,7 +4617,7 @@ btr_store_big_rec_extern_fields(
 		heap = mem_heap_create(250000);
 		page_zip_set_alloc(&c_stream, heap);
 
-		err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+		err = deflateInit2(&c_stream, page_zip_level,
 				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
 		ut_a(err == Z_OK);
 	}
@@ -4433,10 +4632,11 @@ btr_store_big_rec_extern_fields(
 					* sizeof *freed_pages);
 			}
 
-			freed_pages = mem_heap_alloc(
-				heap,
-				btr_mtr->n_freed_pages
-				* sizeof *freed_pages);
+			freed_pages = static_cast<buf_block_t**>(
+				mem_heap_alloc(
+					heap,
+					btr_mtr->n_freed_pages
+					* sizeof *freed_pages));
 			n_freed_pages = 0;
 		}
 
@@ -4502,7 +4702,8 @@ btr_store_big_rec_extern_fields(
 			int	err = deflateReset(&c_stream);
 			ut_a(err == Z_OK);
 
-			c_stream.next_in = (void*) big_rec_vec->fields[i].data;
+			c_stream.next_in = (Bytef*)
+				big_rec_vec->fields[i].data;
 			c_stream.avail_in = extern_len;
 		}
 
@@ -4567,6 +4768,8 @@ alloc_another:
 						page_no, MLOG_4BYTES, &mtr);
 				}
 
+			} else if (dict_index_is_online_ddl(index)) {
+				row_log_table_blob_alloc(index, page_no);
 			}
 
 			if (page_zip) {
@@ -4858,8 +5061,7 @@ btr_check_blob_fil_page_type(
 		ulint	flags = fil_space_get_flags(space_id);
 
 #ifndef UNIV_DEBUG /* Improve debug test coverage */
-		if (UNIV_LIKELY
-		    ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
 			/* Old versions of InnoDB did not initialize
 			FIL_PAGE_TYPE on BLOB pages.  Do not print
 			anything about the type mismatch when reading
@@ -4910,13 +5112,17 @@ btr_free_externally_stored_field(
 					X-latch to the index tree */
 {
 	page_t*		page;
-	ulint		space_id;
+	const ulint	space_id	= mach_read_from_4(
+		field_ref + BTR_EXTERN_SPACE_ID);
+	const ulint	start_page	= mach_read_from_4(
+		field_ref + BTR_EXTERN_PAGE_NO);
 	ulint		rec_zip_size = dict_table_zip_size(index->table);
 	ulint		ext_zip_size;
 	ulint		page_no;
 	ulint		next_page_no;
 	mtr_t		mtr;
 
+	ut_ad(dict_index_is_clust(index));
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
 	ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
@@ -4933,7 +5139,7 @@ btr_free_externally_stored_field(
 		return;
 	}
 
-	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+	ut_ad(space_id == index->space);
 
 	if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
 		ext_zip_size = fil_space_get_zip_size(space_id);
@@ -4963,8 +5169,7 @@ btr_free_externally_stored_field(
 
 		btr_blob_dbg_t	b;
 
-		b.blob_page_no = mach_read_from_4(
-			field_ref + BTR_EXTERN_PAGE_NO);
+		b.blob_page_no = start_page;
 
 		if (rec) {
 			/* Remove the reference from the record to the
@@ -5019,6 +5224,10 @@ btr_free_externally_stored_field(
 			return;
 		}
 
+		if (page_no == start_page && dict_index_is_online_ddl(index)) {
+			row_log_table_blob_free(index, start_page);
+		}
+
 		ext_block = buf_page_get(space_id, ext_zip_size, page_no,
 					 RW_X_LATCH, &mtr);
 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
@@ -5038,7 +5247,7 @@ btr_free_externally_stored_field(
 
 			btr_page_free_low(index, ext_block, 0, &mtr);
 
-			if (page_zip) {
+			if (page_zip != NULL) {
 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
 						next_page_no);
 				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
@@ -5259,8 +5468,8 @@ btr_copy_zblob_prefix(
 	page_zip_set_alloc(&d_stream, heap);
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 	ut_ad(space_id);
 
 	err = inflateInit(&d_stream);
@@ -5295,6 +5504,7 @@ btr_copy_zblob_prefix(
 				" page %lu space %lu\n",
 				(ulong) fil_page_get_type(bpage->zip.data),
 				(ulong) page_no, (ulong) space_id);
+			ut_ad(0);
 			goto end_of_blob;
 		}
 
@@ -5401,7 +5611,7 @@ btr_copy_externally_stored_field_prefix_low(
 		return(0);
 	}
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		return(btr_copy_zblob_prefix(buf, len, zip_size,
 					     space_id, page_no, offset));
 	} else {
@@ -5473,7 +5683,7 @@ btr_copy_externally_stored_field_prefix(
 Copies an externally stored field of a record to mem heap.  The
 clustered index record must be protected by a lock or a page latch.
 @return	the whole field copied to heap */
-static
+UNIV_INTERN
 byte*
 btr_copy_externally_stored_field(
 /*=============================*/
@@ -5508,7 +5718,7 @@ btr_copy_externally_stored_field(
 
 	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
 
-	buf = mem_heap_alloc(heap, local_len + extern_len);
+	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
 
 	memcpy(buf, data, local_len);
 	*len = local_len
diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.cc
index 3929c4a9c2d..49002ca9ab6 100644
--- a/storage/xtradb/btr/btr0pcur.c
+++ b/storage/xtradb/btr/btr0pcur.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0pcur.c
+@file btr/btr0pcur.cc
 The index tree persistent cursor
 
 Created 2/23/1996 Heikki Tuuri
@@ -43,7 +43,7 @@ btr_pcur_create_for_mysql(void)
 {
 	btr_pcur_t*	pcur;
 
-	pcur = mem_alloc(sizeof(btr_pcur_t));
+	pcur = (btr_pcur_t*) mem_alloc(sizeof(btr_pcur_t));
 
 	pcur->btr_cur.index = NULL;
 	btr_pcur_init(pcur);
@@ -130,7 +130,7 @@ btr_pcur_store_position(
 	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 	ut_a(cursor->latch_mode != BTR_NO_LATCHES);
 
-	if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
+	if (page_is_empty(page)) {
 		/* It must be an empty index tree; NOTE that in this case
 		we do not store the modify_clock, but always do a search
 		if we restore the cursor position */
@@ -195,7 +195,8 @@ btr_pcur_copy_stored_position(
 
 	if (pcur_donate->old_rec_buf) {
 
-		pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size);
+		pcur_receive->old_rec_buf = (byte*)
+			mem_alloc(pcur_donate->buf_size);
 
 		ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
 			  pcur_donate->buf_size);
@@ -263,7 +264,8 @@ btr_pcur_restore_position_func(
 
 		btr_cur_open_at_index_side(
 			cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
-			index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr);
+			index, latch_mode,
+			btr_pcur_get_btr_cur(cursor), 0, mtr);
 
 		cursor->latch_mode = latch_mode;
 		cursor->pos_state = BTR_PCUR_IS_POSITIONED;
@@ -597,8 +599,8 @@ btr_pcur_open_on_user_rec_func(
 	ulint		line,		/*!< in: line where called */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
-			   file, line, mtr);
+	btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
+			  file, line, mtr);
 
 	if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
 
diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.cc
index 9fe6fa54b97..eec3c0b29aa 100644
--- a/storage/xtradb/btr/btr0sea.c
+++ b/storage/xtradb/btr/btr0sea.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file btr/btr0sea.c
+@file btr/btr0sea.cc
 The index tree adaptive search
 
 Created 2/17/1996 Heikki Tuuri
@@ -47,12 +47,8 @@ Created 2/17/1996 Heikki Tuuri
 Protected by btr_search_latch. */
 UNIV_INTERN char		btr_search_enabled	= TRUE;
 
-UNIV_INTERN ulint		btr_search_index_num	= 1;
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register btr_search_enabled_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	btr_search_enabled_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
+/** Number of adaptive hash index partitions */
+UNIV_INTERN ulint		btr_search_index_num;
 
 /** A dummy variable to fool the compiler */
 UNIV_INTERN ulint		btr_search_this_is_zero = 0;
@@ -76,7 +72,7 @@ NOTE: They do not protect values of non-ordering fields within a record from
 being updated in-place! We can use fact (1) to perform unique searches to
 indexes. */
 
-UNIV_INTERN rw_lock_t*		btr_search_latch_arr;
+UNIV_INTERN prio_rw_lock_t*	btr_search_latch_arr;
 
 /** padding to prevent other memory update hotspots from residing on
 the same memory cache line */
@@ -129,7 +125,7 @@ will not guarantee success. */
 static
 void
 btr_search_check_free_space_in_heap(
-/*=====================================*/
+/*================================*/
 	dict_index_t*	index)
 {
 	hash_table_t*	table;
@@ -171,39 +167,43 @@ btr_search_sys_create(
 /*==================*/
 	ulint	hash_size)	/*!< in: hash index hash table size */
 {
-	ulint i;
-	/* We allocate the search latch from dynamic memory:
-	see above at the global variable definition */
-
-	//btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t));
-
-	//rw_lock_create(btr_search_latch_key, &btr_search_latch,
-	//	       SYNC_SEARCH_SYS);
+	ulint	i;
 
 	/* PS bug lp:1018264 - Multiple hash index partitions causes overly
-	   large hash index: When multiple adaptive hash index partitions are
-	   specified, _each_ partition was being created with hash_size which
-	   should be 1/64 of the total size of all buffer pools which is
-	   incorrect and can cause overly high memory usage. hash_size
-	   should be representing the _total_ size of all partitions, not the
-	   individual size of each partition. */
+	large hash index: When multiple adaptive hash index partitions are
+	specified, _each_ partition was being created with hash_size which
+	should be 1/64 of the total size of all buffer pools which is
+	incorrect and can cause overly high memory usage. hash_size
+	should be representing the _total_ size of all partitions, not the
+	individual size of each partition. */
 	hash_size /= btr_search_index_num;
 
-	btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
+	/* We allocate the search latch from dynamic memory:
+	see above at the global variable definition */
 
 	/* btr_search_index_num is constrained to machine word size for
 	historical reasons. This limitation can be easily removed later. */
 
-	btr_search_latch_arr = mem_alloc(sizeof(rw_lock_t) *
-					 btr_search_index_num);
-	btr_search_sys->hash_tables = mem_alloc(sizeof(hash_table_t *) *
-						btr_search_index_num);
+	btr_search_latch_arr = (prio_rw_lock_t *)
+		mem_alloc(sizeof(prio_rw_lock_t) * btr_search_index_num);
+
+	btr_search_sys = (btr_search_sys_t*)
+		mem_alloc(sizeof(btr_search_sys_t));
+
+	btr_search_sys->hash_tables = (hash_table_t **)
+		mem_alloc(sizeof(hash_table_t *) * btr_search_index_num);
+
 	for (i = 0; i < btr_search_index_num; i++) {
 
 		rw_lock_create(btr_search_latch_key,
 				&btr_search_latch_arr[i], SYNC_SEARCH_SYS);
 
-		btr_search_sys->hash_tables[i] = ha_create(hash_size, 0, 0);
+		btr_search_sys->hash_tables[i]
+			= ha_create(hash_size, 0, MEM_HEAP_FOR_BTR_SEARCH, 0);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+		btr_search_sys->hash_tables[i]->adaptive = TRUE;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 	}
 }
 
@@ -214,25 +214,51 @@ void
 btr_search_sys_free(void)
 /*=====================*/
 {
-	ulint i;
+	ulint	i;
 
 	for (i = 0; i < btr_search_index_num; i++) {
+
+		rw_lock_free(&btr_search_latch_arr[i]);
+
 		mem_heap_free(btr_search_sys->hash_tables[i]->heap);
+
 		hash_table_free(btr_search_sys->hash_tables[i]);
 
-		rw_lock_free(&btr_search_latch_arr[i]);
 	}
-	mem_free(btr_search_sys->hash_tables);
+
 	mem_free(btr_search_latch_arr);
+	btr_search_latch_arr = NULL;
+
+	mem_free(btr_search_sys->hash_tables);
 
-	//rw_lock_free(&btr_search_latch);
-	//mem_free(btr_search_latch_temp);
-	//btr_search_latch_temp = NULL;
 	mem_free(btr_search_sys);
 	btr_search_sys = NULL;
 }
 
 /********************************************************************//**
+Set index->ref_count = 0 on all indexes of a table. */
+static
+void
+btr_search_disable_ref_count(
+/*=========================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_index_t*	index;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	for (index = dict_table_get_first_index(table); index;
+	     index = dict_table_get_next_index(index)) {
+
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(btr_search_get_latch(index),
+				  RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+		index->search_info->ref_count = 0;
+	}
+}
+
+/********************************************************************//**
 Disable the adaptive hash search system and empty the index. */
 UNIV_INTERN
 void
@@ -240,7 +266,7 @@ btr_search_disable(void)
 /*====================*/
 {
 	dict_table_t*	table;
-	ulint i;
+	ulint		i;
 
 	mutex_enter(&dict_sys->mutex);
 	btr_search_x_lock_all();
@@ -252,13 +278,13 @@ btr_search_disable(void)
 	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table;
 	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-		dict_index_t*	index;
+		btr_search_disable_ref_count(table);
+	}
 
-		for (index = dict_table_get_first_index(table); index;
-		     index = dict_table_get_next_index(index)) {
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-			index->search_info->ref_count = 0;
-		}
+		btr_search_disable_ref_count(table);
 	}
 
 	mutex_exit(&dict_sys->mutex);
@@ -300,7 +326,7 @@ btr_search_info_create(
 {
 	btr_search_t*	info;
 
-	info = mem_heap_alloc(heap, sizeof(btr_search_t));
+	info = (btr_search_t*) mem_heap_alloc(heap, sizeof(btr_search_t));
 
 #ifdef UNIV_DEBUG
 	info->magic_n = BTR_SEARCH_MAGIC_N;
@@ -368,17 +394,15 @@ btr_search_info_update_hash(
 	btr_search_t*	info,	/*!< in/out: search info */
 	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
 {
-	dict_index_t*	index;
+	dict_index_t*	index = cursor->index;
 	ulint		n_unique;
 	int		cmp;
 
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_SHARED));
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_EX));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	index = cursor->index;
-
 	if (dict_index_is_ibuf(index)) {
 		/* So many deletes are performed on an insert buffer tree
 		that we do not consider a hash index useful on it: */
@@ -492,8 +516,10 @@ btr_search_update_block_hash_info(
 				/*!< in: cursor */
 {
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_SHARED));
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_EX));
+	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index),
+			   RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index),
+			   RW_LOCK_EX));
 	ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED)
 	      || rw_lock_own(&block->lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
@@ -577,7 +603,8 @@ btr_search_update_hash_ref(
 
 	ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_EX));
+	ut_ad(rw_lock_own(btr_search_get_latch(cursor->index),
+			  RW_LOCK_EX));
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
 	      || rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
@@ -624,6 +651,8 @@ btr_search_update_hash_ref(
 
 		ha_insert_for_fold(btr_search_get_hash_table(cursor->index),
 				   fold, block, rec);
+
+		MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 	}
 }
 
@@ -642,8 +671,10 @@ btr_search_info_update_slow(
 	ulint*		params2;
 
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_SHARED));
-	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), RW_LOCK_EX));
+	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index),
+			   RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index),
+			   RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
 	block = btr_cur_get_block(cursor);
@@ -687,7 +718,7 @@ btr_search_info_update_slow(
 		inside the called function. It might be that the compiler
 		would optimize the call just to pass pointers to block. */
 
-		params = mem_alloc(3 * sizeof(ulint));
+		params = (ulint*) mem_alloc(3 * sizeof(ulint));
 		params[0] = block->n_fields;
 		params[1] = block->n_bytes;
 		params[2] = block->left_side;
@@ -930,7 +961,8 @@ btr_search_guess_on_hash(
 	ut_ad(rw_lock_get_writer(btr_search_get_latch(index)) != RW_LOCK_EX);
 	ut_ad(rw_lock_get_reader_count(btr_search_get_latch(index)) > 0);
 
-	rec = ha_search_and_get_data(btr_search_get_hash_table(index), fold);
+	rec = (rec_t*) ha_search_and_get_data(
+		btr_search_get_hash_table(index), fold);
 
 	if (UNIV_UNLIKELY(!rec)) {
 		goto failure_unlock;
@@ -1071,7 +1103,11 @@ btr_search_drop_page_hash_index(
 	buf_block_t*	block)	/*!< in: block containing index page,
 				s- or x-latched, or an index page
 				for which we know that
-				block->buf_fix_count == 0 */
+				block->buf_fix_count == 0 or it is an
+				index page which has already been
+				removed from the buf_pool->page_hash
+				i.e.: it is in state
+				BUF_BLOCK_REMOVE_HASH */
 {
 	hash_table_t*		table;
 	ulint			n_fields;
@@ -1088,6 +1124,7 @@ btr_search_drop_page_hash_index(
 	mem_heap_t*		heap;
 	const dict_index_t*	index;
 	ulint*			offsets;
+	btr_search_t*		info;
 
 retry:
 	/* Do a dirty check on block->index, return if the block is not in the
@@ -1100,6 +1137,10 @@ retry:
 		return;
 	}
 
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 	rw_lock_s_lock(btr_search_get_latch(index));
 
 	if (UNIV_UNLIKELY(index != block->index)) {
@@ -1109,18 +1150,39 @@ retry:
 		goto retry;
 	}
 
+	ut_a(!dict_index_is_ibuf(index));
+#ifdef UNIV_DEBUG
+	switch (dict_index_get_online_status(index)) {
+	case ONLINE_INDEX_CREATION:
+		/* The index is being created (bulk loaded). */
+	case ONLINE_INDEX_COMPLETE:
+		/* The index has been published. */
+	case ONLINE_INDEX_ABORTED:
+		/* Either the index creation was aborted due to an
+		error observed by InnoDB (in which case there should
+		not be any adaptive hash index entries), or it was
+		completed and then flagged aborted in
+		rollback_inplace_alter_table(). */
+		break;
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		/* The index should have been dropped from the tablespace
+		already, and the adaptive hash index entries should have
+		been dropped as well. */
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
+
 	table = btr_search_get_hash_table(index);
 
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
 	      || rw_lock_own(&(block->lock), RW_LOCK_EX)
-	      || (block->page.buf_fix_count == 0));
+	      || block->page.buf_fix_count == 0
+	      || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
 #endif /* UNIV_SYNC_DEBUG */
 
 	n_fields = block->curr_n_fields;
 	n_bytes = block->curr_n_bytes;
-	ut_a(index == block->index);
-	ut_a(!dict_index_is_ibuf(index));
 
 	/* NOTE: The fields of block must not be accessed after
 	releasing btr_search_latch, as the index page might only
@@ -1136,7 +1198,7 @@ retry:
 	/* Calculate and cache fold values into an array for fast deletion
 	from the hash index */
 
-	folds = mem_alloc(n_recs * sizeof(ulint));
+	folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
 
 	n_cached = 0;
 
@@ -1204,11 +1266,15 @@ next_rec:
 		ha_remove_all_nodes_to_page(table, folds[i], page);
 	}
 
-	ut_a(index->search_info->ref_count > 0);
-	index->search_info->ref_count--;
+	info = btr_search_get_info(block->index);
+	ut_a(info->ref_count > 0);
+	info->ref_count--;
 
 	block->index = NULL;
 
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
+	MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
+
 cleanup:
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	if (UNIV_UNLIKELY(block->n_pointers)) {
@@ -1233,184 +1299,6 @@ cleanup:
 	mem_free(folds);
 }
 
-/************************************************************************
-Drops a page hash index based on index */
-UNIV_INTERN
-void
-btr_search_drop_page_hash_index_on_index(
-/*=====================================*/
-	dict_index_t*	index)		/* in: record descriptor */
-{
-
-	hash_table_t*	table;
-	buf_block_t*	block;
-	ulint		n_fields;
-	ulint		n_bytes;
-	const page_t*		page;
-	const rec_t*		rec;
-	ulint		fold;
-	ulint		prev_fold;
-	index_id_t	index_id;
-	ulint		n_cached;
-	ulint		n_recs;
-	ulint*		folds;
-	ulint		i, j;
-	mem_heap_t*	heap	= NULL;
-	ulint*		offsets;
-	ibool		released_search_latch;
-
-	rw_lock_s_lock(btr_search_get_latch(index));
-
-	table = btr_search_get_hash_table(index);
-
-	for (j = 0; j < srv_buf_pool_instances; j++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(j);
-
-		do {
-			buf_chunk_t*	chunks	= buf_pool->chunks;
-			buf_chunk_t*	chunk	= chunks + buf_pool->n_chunks;
-
-			released_search_latch = FALSE;
-
-			while (--chunk >= chunks) {
-				block	= chunk->blocks;
-				i	= chunk->size;
-
-retry:
-				for (; i--; block++) {
-					if (buf_block_get_state(block)
-					    != BUF_BLOCK_FILE_PAGE
-					    || block->index != index
-					    || !block->index) {
-						continue;
-					}
-
-					page = block->frame;
-
-					/* from btr_search_drop_page_hash_index() */
-					n_fields = block->curr_n_fields;
-					n_bytes = block->curr_n_bytes;
-
-
-					/* keeping latch order */
-					rw_lock_s_unlock(
-						btr_search_get_latch(index));
-					released_search_latch = TRUE;
-					rw_lock_x_lock(&block->lock);
-
-
-					ut_a(n_fields + n_bytes > 0);
-
-					n_recs = page_get_n_recs(page);
-
-					/* Calculate and cache fold values into an array for fast deletion
-					from the hash index */
-
-					folds = mem_alloc(n_recs * sizeof(ulint));
-
-					n_cached = 0;
-
-					rec = page_get_infimum_rec(page);
-					rec = page_rec_get_next_low(rec, page_is_comp(page));
-
-					index_id = btr_page_get_index_id(page);
-	
-					ut_a(index_id == index->id);
-
-					prev_fold = 0;
-
-					offsets = NULL;
-
-					while (!page_rec_is_supremum(rec)) {
-						offsets = rec_get_offsets(rec, index, offsets,
-									n_fields + (n_bytes > 0), &heap);
-						ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
-						fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
-
-						if (fold == prev_fold && prev_fold != 0) {
-
-							goto next_rec;
-						}
-
-						/* Remove all hash nodes pointing to this page from the
-						hash chain */
-
-						folds[n_cached] = fold;
-						n_cached++;
-next_rec:
-						rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
-						prev_fold = fold;
-					}
-
-					if (UNIV_LIKELY_NULL(heap)) {
-						mem_heap_empty(heap);
-					}
-
-					rw_lock_x_lock(
-						btr_search_get_latch(index));
-
-					if (UNIV_UNLIKELY(!block->index)) {
-						goto cleanup;
-					}
-
-					ut_a(block->index == index);
-
-					if (UNIV_UNLIKELY(block->curr_n_fields != n_fields)
-					    || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) {
-						rw_lock_x_unlock(
-							btr_search_get_latch(index));
-						rw_lock_x_unlock(&block->lock);
-
-						mem_free(folds);
-
-						rw_lock_s_lock(
-							btr_search_get_latch(index));
-						goto retry;
-					}
-
-					for (i = 0; i < n_cached; i++) {
-
-						ha_remove_all_nodes_to_page(table, folds[i], page);
-					}
-
-					ut_a(index->search_info->ref_count > 0);
-					index->search_info->ref_count--;
-
-					block->index = NULL;
-
-cleanup:
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-					if (UNIV_UNLIKELY(block->n_pointers)) {
-						/* Corruption */
-						ut_print_timestamp(stderr);
-						fprintf(stderr,
-"InnoDB: The adaptive hash index is corrupted. After dropping\n"
-"InnoDB: the hash index to a page of %s, %lu hash nodes still remain.\n",
-							index->name, (ulong) block->n_pointers);
-					}
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-					rw_lock_x_unlock(
-						btr_search_get_latch(index));
-					rw_lock_x_unlock(&block->lock);
-
-					mem_free(folds);
-
-					rw_lock_s_lock(
-						btr_search_get_latch(index));
-				}
-			}
-		} while (released_search_latch);
-	}
-
-	rw_lock_s_unlock(btr_search_get_latch(index));
-
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
-}
-
 /********************************************************************//**
 Drops a possible page hash index when a page is evicted from the buffer pool
 or freed in a file segment. */
@@ -1483,20 +1371,25 @@ btr_search_build_page_hash_index(
 	ut_ad(index);
 	ut_a(!dict_index_is_ibuf(index));
 
-	table = btr_search_get_hash_table(index);
-	page = buf_block_get_frame(block);
-
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(btr_search_get_latch(index->id), RW_LOCK_EX));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX));
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
 	      || rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
 	rw_lock_s_lock(btr_search_get_latch(index));
 
+	if (!btr_search_enabled) {
+		rw_lock_s_unlock(btr_search_get_latch(index));
+		return;
+	}
+
+	table = btr_search_get_hash_table(index);
+	page = buf_block_get_frame(block);
+
 	if (block->index && ((block->curr_n_fields != n_fields)
-				 || (block->curr_n_bytes != n_bytes)
-				 || (block->curr_left_side != left_side))) {
+			     || (block->curr_n_bytes != n_bytes)
+			     || (block->curr_left_side != left_side))) {
 
 		rw_lock_s_unlock(btr_search_get_latch(index));
 
@@ -1528,8 +1421,8 @@ btr_search_build_page_hash_index(
 	/* Calculate and cache fold values and corresponding records into
 	an array for fast insertion to the hash index */
 
-	folds = mem_alloc(n_recs * sizeof(ulint));
-	recs = mem_alloc(n_recs * sizeof(rec_t*));
+	folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
+	recs = (rec_t**) mem_alloc(n_recs * sizeof(rec_t*));
 
 	n_cached = 0;
 
@@ -1631,6 +1524,8 @@ btr_search_build_page_hash_index(
 		ha_insert_for_fold(table, folds[i], block, recs[i]);
 	}
 
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
+	MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
 exit_func:
 	rw_lock_x_unlock(btr_search_get_latch(index));
 
@@ -1760,7 +1655,12 @@ btr_search_update_hash_on_delete(
 	if (block->index) {
 		ut_a(block->index == index);
 
-		ha_search_and_delete_if_found(table, fold, rec);
+		if (ha_search_and_delete_if_found(table, fold, rec)) {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+		} else {
+			MONITOR_INC(
+				MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
+		}
 	}
 
 	rw_lock_x_unlock(btr_search_get_latch(cursor->index));
@@ -1816,8 +1716,11 @@ btr_search_update_hash_node_on_insert(
 
 		table = btr_search_get_hash_table(cursor->index);
 
-		ha_search_and_update_if_found(table, cursor->fold, rec,
-					      block, page_rec_get_next(rec));
+		if (ha_search_and_update_if_found(
+			table, cursor->fold, rec, block,
+			page_rec_get_next(rec))) {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+		}
 
 func_exit:
 		rw_lock_x_unlock(btr_search_get_latch(cursor->index));
@@ -1842,9 +1745,9 @@ btr_search_update_hash_on_insert(
 	hash_table_t*	table;
 	buf_block_t*	block;
 	dict_index_t*	index;
-	rec_t*		rec;
-	rec_t*		ins_rec;
-	rec_t*		next_rec;
+	const rec_t*	rec;
+	const rec_t*	ins_rec;
+	const rec_t*	next_rec;
 	ulint		fold;
 	ulint		ins_fold;
 	ulint		next_fold = 0; /* remove warning (??? bug ???) */
@@ -1857,12 +1760,6 @@ btr_search_update_hash_on_insert(
 	ulint*		offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	table = btr_search_get_hash_table(cursor->index);
-
-	btr_search_check_free_space_in_heap(cursor->index);
-
-	rec = btr_cur_get_rec(cursor);
-
 	block = btr_cur_get_block(cursor);
 
 #ifdef UNIV_SYNC_DEBUG
@@ -1876,6 +1773,12 @@ btr_search_update_hash_on_insert(
 		return;
 	}
 
+	btr_search_check_free_space_in_heap(cursor->index);
+
+	table = btr_search_get_hash_table(cursor->index);
+
+	rec = btr_cur_get_rec(cursor);
+
 	ut_a(index == cursor->index);
 	ut_a(!dict_index_is_ibuf(index));
 
@@ -1883,8 +1786,8 @@ btr_search_update_hash_on_insert(
 	n_bytes = block->curr_n_bytes;
 	left_side = block->curr_left_side;
 
-	ins_rec = page_rec_get_next(rec);
-	next_rec = page_rec_get_next(ins_rec);
+	ins_rec = page_rec_get_next_const(rec);
+	next_rec = page_rec_get_next_const(ins_rec);
 
 	offsets = rec_get_offsets(ins_rec, index, offsets,
 				  ULINT_UNDEFINED, &heap);
@@ -1996,17 +1899,18 @@ function_exit:
 
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /********************************************************************//**
-Validates the search system.
+Validates one hash table in the search system.
 @return	TRUE if ok */
-UNIV_INTERN
+static
 ibool
-btr_search_validate(void)
-/*=====================*/
+btr_search_validate_one_table(
+/*==========================*/
+	ulint	t)
 {
 	ha_node_t*	node;
 	ulint		n_page_dumps	= 0;
 	ibool		ok		= TRUE;
-	ulint		i,j;
+	ulint		i;
 	ulint		cell_count;
 	mem_heap_t*	heap		= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
@@ -2018,29 +1922,24 @@ btr_search_validate(void)
 
 	rec_offs_init(offsets_);
 
-	btr_search_x_lock_all();
-	buf_pool_page_hash_x_lock_all();
-
-	for (j = 0; j < btr_search_index_num; j++) {
-
-	cell_count = hash_get_n_cells(btr_search_sys->hash_tables[j]);
+	cell_count = hash_get_n_cells(btr_search_sys->hash_tables[t]);
 
 	for (i = 0; i < cell_count; i++) {
 		/* We release btr_search_latch every once in a while to
 		give other queries a chance to run. */
 		if ((i != 0) && ((i % chunk_size) == 0)) {
-			buf_pool_page_hash_x_unlock_all();
 			btr_search_x_unlock_all();
 			os_thread_yield();
 			btr_search_x_lock_all();
-			buf_pool_page_hash_x_lock_all();
 		}
 
-		node = hash_get_nth_cell(btr_search_sys->hash_tables[j], i)->node;
+		node = (ha_node_t*)
+			hash_get_nth_cell(btr_search_sys->hash_tables[t],
+					  i)->node;
 
 		for (; node != NULL; node = node->next) {
-			const buf_block_t*	block
-				= buf_block_align(node->data);
+			buf_block_t*	block
+				= buf_block_align((byte*) node->data);
 			const buf_block_t*	hash_block;
 			buf_pool_t*		buf_pool;
 			index_id_t		page_index_id;
@@ -2080,6 +1979,8 @@ btr_search_validate(void)
 				     == BUF_BLOCK_REMOVE_HASH);
 			}
 
+			mutex_enter(&block->mutex);
+
 			ut_a(!dict_index_is_ibuf(block->index));
 
 			page_index_id = btr_page_get_index_id(block->frame);
@@ -2135,6 +2036,8 @@ btr_search_validate(void)
 					n_page_dumps++;
 				}
 			}
+
+			mutex_exit(&block->mutex);
 		}
 	}
 
@@ -2144,26 +2047,47 @@ btr_search_validate(void)
 		/* We release btr_search_latch every once in a while to
 		give other queries a chance to run. */
 		if (i != 0) {
-			buf_pool_page_hash_x_unlock_all();
 			btr_search_x_unlock_all();
 			os_thread_yield();
 			btr_search_x_lock_all();
-			buf_pool_page_hash_x_lock_all();
 		}
 
-		if (!ha_validate(btr_search_sys->hash_tables[j], i, end_index)) {
+		if (!ha_validate(btr_search_sys->hash_tables[t], i,
+				 end_index)) {
 			ok = FALSE;
 		}
 	}
 
-	} /*for (j = 0; j < btr_search_index_num; j++)*/
-
-	buf_pool_page_hash_x_unlock_all();
-	btr_search_x_unlock_all();
 	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
 	}
 
 	return(ok);
 }
+
+/********************************************************************//**
+Validates the search system, i.e. each of the btr_search_index_num tables.
+@return	TRUE if all tables are ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void)
+/*=====================*/
+{
+	ulint	i;
+	ibool	ok	= TRUE;
+
+	btr_search_x_lock_all();
+
+	/* Keep going after a failure so that every table gets checked. */
+	for (i = 0; i < btr_search_index_num; i++) {
+		if (!btr_search_validate_one_table(i)) {
+			ok = FALSE;
+		}
+	}
+
+	btr_search_x_unlock_all();
+
+	return(ok);
+}
+
 #endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
deleted file mode 100644
index 439be08b01f..00000000000
--- a/storage/xtradb/buf/buf0buddy.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file buf/buf0buddy.c
-Binary buddy allocator for compressed pages
-
-Created December 2006 by Marko Makela
-*******************************************************/
-
-#define THIS_MODULE
-#include "buf0buddy.h"
-#ifdef UNIV_NONINL
-# include "buf0buddy.ic"
-#endif
-#undef THIS_MODULE
-#include "buf0buf.h"
-#include "buf0lru.h"
-#include "buf0flu.h"
-#include "page0zip.h"
-
-/**********************************************************************//**
-Get the offset of the buddy of a compressed page frame.
-@return	the buddy relative of page */
-UNIV_INLINE
-byte*
-buf_buddy_get(
-/*==========*/
-	byte*	page,	/*!< in: compressed page */
-	ulint	size)	/*!< in: page size in bytes */
-{
-	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= BUF_BUDDY_LOW);
-	ut_ad(size < BUF_BUDDY_HIGH);
-	ut_ad(!ut_align_offset(page, size));
-
-	if (((ulint) page) & size) {
-		return(page - size);
-	} else {
-		return(page + size);
-	}
-}
-
-/** Validate a given zip_free list. */
-#define BUF_BUDDY_LIST_VALIDATE(b, i)				\
-	UT_LIST_VALIDATE(zip_list, buf_page_t,			\
-			 b->zip_free[i],			\
-			 ut_ad(buf_page_get_state(		\
-				       ut_list_node_313)	\
-			       == BUF_BLOCK_ZIP_FREE))
-
-/**********************************************************************//**
-Add a block to the head of the appropriate buddy free list. */
-UNIV_INLINE
-void
-buf_buddy_add_to_free(
-/*==================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in,own: block to be freed */
-	ulint		i)		/*!< in: index of
-					buf_pool->zip_free[] */
-{
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
-	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-	ut_ad(buf_pool->zip_free[i].start != bpage);
-	UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage);
-}
-
-/**********************************************************************//**
-Remove a block from the appropriate buddy free list. */
-UNIV_INLINE
-void
-buf_buddy_remove_from_free(
-/*=======================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in: block to be removed */
-	ulint		i)		/*!< in: index of
-					buf_pool->zip_free[] */
-{
-#ifdef UNIV_DEBUG
-	buf_page_t*	prev = UT_LIST_GET_PREV(zip_list, bpage);
-	buf_page_t*	next = UT_LIST_GET_NEXT(zip_list, bpage);
-
-	ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
-	ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
-#endif /* UNIV_DEBUG */
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
-	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-	UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage);
-}
-
-/**********************************************************************//**
-Try to allocate a block from buf_pool->zip_free[].
-@return	allocated block, or NULL if buf_pool->zip_free[] was empty */
-static
-void*
-buf_buddy_alloc_zip(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		i)		/*!< in: index of buf_pool->zip_free[] */
-{
-	buf_page_t*	bpage;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
-	ut_a(i < BUF_BUDDY_SIZES);
-	ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-
-	ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
-
-	bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
-
-	if (bpage) {
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-
-		buf_buddy_remove_from_free(buf_pool, bpage, i);
-	} else if (i + 1 < BUF_BUDDY_SIZES) {
-		/* Attempt to split. */
-		bpage = buf_buddy_alloc_zip(buf_pool, i + 1);
-
-		if (bpage) {
-			buf_page_t*	buddy = (buf_page_t*)
-				(((char*) bpage) + (BUF_BUDDY_LOW << i));
-
-			ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
-			ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
-			buddy->state = BUF_BLOCK_ZIP_FREE;
-			buf_buddy_add_to_free(buf_pool, buddy, i);
-		}
-	}
-
-	if (bpage) {
-		ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i));
-		UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
-	}
-
-	return(bpage);
-}
-
-/**********************************************************************//**
-Deallocate a buffer frame of UNIV_PAGE_SIZE. */
-static
-void
-buf_buddy_block_free(
-/*=================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: buffer frame to deallocate */
-	ibool		have_page_hash_mutex)
-{
-	const ulint	fold	= BUF_POOL_ZIP_FOLD_PTR(buf);
-	buf_page_t*	bpage;
-	buf_block_t*	block;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
-
-	mutex_enter(&buf_pool->zip_hash_mutex);
-
-	HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
-		    ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
-			  && bpage->in_zip_hash && !bpage->in_page_hash),
-		    ((buf_block_t*) bpage)->frame == buf);
-	ut_a(bpage);
-	ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
-	ut_ad(!bpage->in_page_hash);
-	ut_ad(bpage->in_zip_hash);
-	ut_d(bpage->in_zip_hash = FALSE);
-	HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
-
-	mutex_exit(&buf_pool->zip_hash_mutex);
-
-	ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
-	UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
-
-	block = (buf_block_t*) bpage;
-	mutex_enter(&block->mutex);
-	buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
-	mutex_exit(&block->mutex);
-
-	ut_ad(buf_pool->buddy_n_frames > 0);
-	ut_d(buf_pool->buddy_n_frames--);
-}
-
-/**********************************************************************//**
-Allocate a buffer block to the buddy allocator. */
-static
-void
-buf_buddy_block_register(
-/*=====================*/
-	buf_block_t*	block)	/*!< in: buffer frame to allocate */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_block(block);
-	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
-
-	buf_block_set_state(block, BUF_BLOCK_MEMORY);
-
-	ut_a(block->frame);
-	ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
-
-	ut_ad(!block->page.in_page_hash);
-	ut_ad(!block->page.in_zip_hash);
-	ut_d(block->page.in_zip_hash = TRUE);
-
-	mutex_enter(&buf_pool->zip_hash_mutex);
-	HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
-	mutex_exit(&buf_pool->zip_hash_mutex);
-
-	ut_d(buf_pool->buddy_n_frames++);
-}
-
-/**********************************************************************//**
-Allocate a block from a bigger object.
-@return	allocated block */
-static
-void*
-buf_buddy_alloc_from(
-/*=================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: a block that is free to use */
-	ulint		i,		/*!< in: index of
-					buf_pool->zip_free[] */
-	ulint		j)		/*!< in: size of buf as an index
-					of buf_pool->zip_free[] */
-{
-	ulint	offs	= BUF_BUDDY_LOW << j;
-	ut_ad(j <= BUF_BUDDY_SIZES);
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-	ut_ad(j >= i);
-	ut_ad(!ut_align_offset(buf, offs));
-
-	/* Add the unused parts of the block to the free lists. */
-	while (j > i) {
-		buf_page_t*	bpage;
-
-		offs >>= 1;
-		j--;
-
-		bpage = (buf_page_t*) ((byte*) buf + offs);
-		ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
-		bpage->state = BUF_BLOCK_ZIP_FREE;
-		ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
-		buf_buddy_add_to_free(buf_pool, bpage, j);
-	}
-
-	return(buf);
-}
-
-/**********************************************************************//**
-Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool_mutex may be released and reacquired.
-@return	allocated block, never NULL */
-UNIV_INTERN
-void*
-buf_buddy_alloc_low(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	ulint		i,		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
-	ibool*		lru,		/*!< in: pointer to a variable that
-					will be assigned TRUE if storage was
-					allocated from the LRU list and
-					buf_pool->mutex was temporarily
-					released */
-	ibool		have_page_hash_mutex)
-{
-	buf_block_t*	block;
-
-	ut_ad(lru);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-
-	if (i < BUF_BUDDY_SIZES) {
-		/* Try to allocate from the buddy system. */
-		mutex_enter(&buf_pool->zip_free_mutex);
-		block = buf_buddy_alloc_zip(buf_pool, i);
-
-		if (block) {
-			goto func_exit;
-		}
-		mutex_exit(&buf_pool->zip_free_mutex);
-	}
-
-	/* Try allocating from the buf_pool->free list. */
-	block = buf_LRU_get_free_only(buf_pool);
-
-	if (block) {
-
-		goto alloc_big;
-	}
-
-	/* Try replacing an uncompressed page in the buffer pool. */
-	//buf_pool_mutex_exit(buf_pool);
-	mutex_exit(&buf_pool->LRU_list_mutex);
-	if (have_page_hash_mutex) {
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	}
-	block = buf_LRU_get_free_block(buf_pool);
-	*lru = TRUE;
-	//buf_pool_mutex_enter(buf_pool);
-	mutex_enter(&buf_pool->LRU_list_mutex);
-	if (have_page_hash_mutex) {
-		rw_lock_x_lock(&buf_pool->page_hash_latch);
-	}
-
-alloc_big:
-	buf_buddy_block_register(block);
-
-	mutex_enter(&buf_pool->zip_free_mutex);
-	block = buf_buddy_alloc_from(
-		buf_pool, block->frame, i, BUF_BUDDY_SIZES);
-
-func_exit:
-	buf_pool->buddy_stat[i].used++;
-	mutex_exit(&buf_pool->zip_free_mutex);
-
-	return(block);
-}
-
-/**********************************************************************//**
-Try to relocate a block.
-@return	TRUE if relocated */
-static
-ibool
-buf_buddy_relocate(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		src,		/*!< in: block to relocate */
-	void*		dst,		/*!< in: free block to relocate to */
-	ulint		i,		/*!< in: index of
-					buf_pool->zip_free[] */
-	ibool		have_page_hash_mutex)
-{
-	buf_page_t*	bpage;
-	const ulint	size	= BUF_BUDDY_LOW << i;
-	mutex_t*	mutex;
-	ulint		space;
-	ulint		page_no;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(!ut_align_offset(src, size));
-	ut_ad(!ut_align_offset(dst, size));
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-	UNIV_MEM_ASSERT_W(dst, size);
-
-	if (!have_page_hash_mutex) {
-		mutex_exit(&buf_pool->zip_free_mutex);
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		rw_lock_x_lock(&buf_pool->page_hash_latch);
-	}
-
-	/* We assume that all memory from buf_buddy_alloc()
-	is used for compressed page frames. */
-
-	/* We look inside the allocated objects returned by
-	buf_buddy_alloc() and assume that each block is a compressed
-	page that contains a valid space_id and page_no in the page
-	header. Should the fields be invalid, we will be unable to
-	relocate the block. */
-
-	/* The src block may be split into smaller blocks,
-	some of which may be free.  Thus, the
-	mach_read_from_4() calls below may attempt to read
-	from free memory.  The memory is "owned" by the buddy
-	allocator (and it has been allocated from the buffer
-	pool), so there is nothing wrong about this.  The
-	mach_read_from_4() calls here will only trigger bogus
-	Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
-	space	= mach_read_from_4((const byte *) src
-				   + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-	page_no	= mach_read_from_4((const byte *) src
-				   + FIL_PAGE_OFFSET);
-	/* Suppress Valgrind warnings about conditional jump
-	on uninitialized value. */
-	UNIV_MEM_VALID(&space, sizeof space);
-	UNIV_MEM_VALID(&page_no, sizeof page_no);
-	bpage = buf_page_hash_get(buf_pool, space, page_no);
-
-	if (!bpage || bpage->zip.data != src) {
-		/* The block has probably been freshly
-		allocated by buf_LRU_get_free_block() but not
-		added to buf_pool->page_hash yet.  Obviously,
-		it cannot be relocated. */
-
-		if (!have_page_hash_mutex) {
-			mutex_enter(&buf_pool->zip_free_mutex);
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		}
-		return(FALSE);
-	}
-
-	if (page_zip_get_size(&bpage->zip) != size) {
-		/* The block is of different size.  We would
-		have to relocate all blocks covered by src.
-		For the sake of simplicity, give up. */
-		ut_ad(page_zip_get_size(&bpage->zip) < size);
-
-		if (!have_page_hash_mutex) {
-			mutex_enter(&buf_pool->zip_free_mutex);
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		}
-		return(FALSE);
-	}
-
-	/* To keep latch order */
-	if (have_page_hash_mutex)
-		mutex_exit(&buf_pool->zip_free_mutex);
-
-	/* The block must have been allocated, but it may
-	contain uninitialized data. */
-	UNIV_MEM_ASSERT_W(src, size);
-
-	mutex = buf_page_get_mutex_enter(bpage);
-
-	mutex_enter(&buf_pool->zip_free_mutex);
-
-	if (mutex && buf_page_can_relocate(bpage)) {
-		/* Relocate the compressed page. */
-		ullint	usec	= ut_time_us(NULL);
-		ut_a(bpage->zip.data == src);
-		memcpy(dst, src, size);
-		bpage->zip.data = dst;
-		mutex_exit(mutex);
-		UNIV_MEM_INVALID(src, size);
-		{
-			buf_buddy_stat_t*	buddy_stat
-				= &buf_pool->buddy_stat[i];
-			buddy_stat->relocated++;
-			buddy_stat->relocated_usec
-				+= ut_time_us(NULL) - usec;
-		}
-
-		if (!have_page_hash_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		}
-		return(TRUE);
-	}
-
-	if (!have_page_hash_mutex) {
-		mutex_exit(&buf_pool->LRU_list_mutex);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	}
-
-	if (mutex) {
-		mutex_exit(mutex);
-	}
-	return(FALSE);
-}
-
-/**********************************************************************//**
-Deallocate a block. */
-UNIV_INTERN
-void
-buf_buddy_free_low(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: block to be freed, must not be
-					pointed to by the buffer pool */
-	ulint		i,		/*!< in: index of buf_pool->zip_free[],
-					or BUF_BUDDY_SIZES */
-	ibool		have_page_hash_mutex)
-{
-	buf_page_t*	bpage;
-	buf_page_t*	buddy;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
-	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(i <= BUF_BUDDY_SIZES);
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-	ut_ad(buf_pool->buddy_stat[i].used > 0);
-
-	buf_pool->buddy_stat[i].used--;
-recombine:
-	UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
-	((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE;
-
-	if (i == BUF_BUDDY_SIZES) {
-		mutex_exit(&buf_pool->zip_free_mutex);
-		buf_buddy_block_free(buf_pool, buf, have_page_hash_mutex);
-		mutex_enter(&buf_pool->zip_free_mutex);
-		return;
-	}
-
-	ut_ad(i < BUF_BUDDY_SIZES);
-	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
-	ut_ad(!buf_pool_contains_zip(buf_pool, buf));
-
-	/* Do not recombine blocks if there are few free blocks.
-	We may waste up to 15360*max_len bytes to free blocks
-	(1024 + 2048 + 4096 + 8192 = 15360) */
-	if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
-		goto func_exit;
-	}
-
-	/* Try to combine adjacent blocks. */
-	buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
-
-#ifndef UNIV_DEBUG_VALGRIND
-	/* When Valgrind instrumentation is not enabled, we can read
-	buddy->state to quickly determine that a block is not free.
-	When the block is not free, buddy->state belongs to a compressed
-	page frame that may be flagged uninitialized in our Valgrind
-	instrumentation.  */
-
-	if (buddy->state != BUF_BLOCK_ZIP_FREE) {
-
-		goto buddy_nonfree;
-	}
-#endif /* !UNIV_DEBUG_VALGRIND */
-
-	for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
-		ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-
-		if (bpage == buddy) {
-			/* The buddy is free: recombine */
-			buf_buddy_remove_from_free(buf_pool, bpage, i);
-buddy_is_free:
-			ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
-			ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
-			i++;
-			buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
-
-			goto recombine;
-		}
-
-		ut_a(bpage != buf);
-		UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i);
-		bpage = UT_LIST_GET_NEXT(zip_list, bpage);
-	}
-
-#ifndef UNIV_DEBUG_VALGRIND
-buddy_nonfree:
-#endif /* !UNIV_DEBUG_VALGRIND */
-
-	ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
-
-	/* The buddy is not free. Is there a free block of this size? */
-	bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
-
-	if (bpage) {
-
-		/* Remove the block from the free list, because a successful
-		buf_buddy_relocate() will overwrite bpage->list. */
-		buf_buddy_remove_from_free(buf_pool, bpage, i);
-
-		/* Try to relocate the buddy of buf to the free block. */
-		if (buf_buddy_relocate(buf_pool, buddy, bpage, i, have_page_hash_mutex)) {
-
-			buddy->state = BUF_BLOCK_ZIP_FREE;
-			goto buddy_is_free;
-		}
-
-		buf_buddy_add_to_free(buf_pool, bpage, i);
-	}
-
-func_exit:
-	/* Free the block to the buddy list. */
-	bpage = buf;
-
-	/* Fill large blocks with a constant pattern. */
-	ut_d(memset(bpage, i, BUF_BUDDY_LOW << i));
-	UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i);
-	bpage->state = BUF_BLOCK_ZIP_FREE;
-	buf_buddy_add_to_free(buf_pool, bpage, i);
-}
diff --git a/storage/xtradb/buf/buf0buddy.cc b/storage/xtradb/buf/buf0buddy.cc
new file mode 100644
index 00000000000..3f8f339a81a
--- /dev/null
+++ b/storage/xtradb/buf/buf0buddy.cc
@@ -0,0 +1,726 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "buf0buddy.h"
+#ifdef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+#undef THIS_MODULE
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in the following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_LOG_SPACE_FIRST_ID.
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+  -- The above is true because we look at these fields when the
+     corresponding buddy block is free which implies that:
+     * The block we are looking at must have an address aligned at
+       the same size that its free buddy has. For example, if we have
+       a free block of 8K then its buddy's address must be aligned at
+       8K as well.
+     * It is possible that the block we are looking at may have been
+       further divided into smaller sized blocks but its starting
+       address must still remain the start of a page frame i.e.: it
+       cannot be middle of a block. For example, if we have a free
+       block of size 8K then its buddy may be divided into blocks
+       of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+       the starting address of first 1K compressed page.
+     * What is important to note is that for any given block, the
+       buddy's address cannot be in the middle of a larger block i.e.:
+       in above example, our 8K block cannot have a buddy whose address
+       is aligned on 8K but it is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE	(SRV_LOG_SPACE_FIRST_ID)
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE	(0XFFFFFFFF)
+
+#if BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE
+# error "BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE"
+#endif
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+	BUF_BUDDY_STATE_FREE,	/*!< The buddy is completely free */
+	BUF_BUDDY_STATE_USED,	/*!< The buddy is currently in use */
+	BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+				are in use */
+};
+
+#ifdef UNIV_DEBUG_VALGRIND
+/**********************************************************************//**
+Flag as invalid the memory that we won't access while the block is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+	buf_buddy_free_t*	buf,	/*!< in: block to invalidate */
+	ulint			i)	/*!< in: index of zip_free[] */
+{
+	const size_t	size	= BUF_BUDDY_LOW << i;
+	ut_ad(i <= BUF_BUDDY_SIZES);
+
+	UNIV_MEM_ASSERT_W(buf, size);
+	UNIV_MEM_INVALID(buf, size);
+}
+#else /* UNIV_DEBUG_VALGRIND */
+# define buf_buddy_mem_invalid(buf, i) ut_ad((i) <= BUF_BUDDY_SIZES)
+#endif /* UNIV_DEBUG_VALGRIND */
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return	true if BUF_BUDDY_STAMP_FREE is stored at BUF_BUDDY_STAMP_OFFSET */
+UNIV_INLINE __attribute__((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+	const buf_buddy_free_t*	buf)	/*!< in: block to check */
+{
+	return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+	       == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free and records its zip_free[] index in the stamp. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+	buf_buddy_free_t*	buf,	/*!< in/out: block to stamp */
+	ulint			i)	/*!< in: index of zip_free[] */
+{
+	ut_d(memset(buf, i, BUF_BUDDY_LOW << i));
+	buf_buddy_mem_invalid(buf, i);
+	mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+			BUF_BUDDY_STAMP_FREE);
+	buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out]	buf	block to stamp
+@param[in]	i	index of zip_free[] */
+#define buf_buddy_stamp_nonfree(buf, i) do {				\
+	buf_buddy_mem_invalid(buf, i);					\
+	memset((buf)->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);	\
+} while (0)
+#if BUF_BUDDY_STAMP_NONFREE != 0xffffffff
+# error "BUF_BUDDY_STAMP_NONFREE != 0xffffffff"
+#endif
+
+/**********************************************************************//**
+Get the address of the buddy of a compressed page frame.
+@return	address of the buddy, size bytes below or above page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+	byte*	page,	/*!< in: compressed page */
+	ulint	size)	/*!< in: page size in bytes */
+{
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= BUF_BUDDY_LOW);
+	ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size < BUF_BUDDY_HIGH);
+	ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE);
+	ut_ad(!ut_align_offset(page, size));
+
+	if (((ulint) page) & size) {
+		return(page - size);
+	} else {
+		return(page + size);
+	}
+}
+
+/** Functor that checks one element while validating a zip_free list. */
+struct	CheckZipFree {
+	ulint	i;
+	CheckZipFree(ulint i) : i (i) {}
+
+	void	operator()(const buf_buddy_free_t* elem) const
+	{
+		ut_a(buf_buddy_stamp_is_free(elem));
+		ut_a(elem->stamp.size <= i);
+	}
+};
+
+#define BUF_BUDDY_LIST_VALIDATE(bp, i)				\
+	UT_LIST_VALIDATE(list, buf_buddy_free_t,		\
+			 bp->zip_free[i], CheckZipFree(i))
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Debug function to validate, by a linear scan, that a buffer is indeed on
+the buf_pool->zip_free[i] list.
+@return true if buf was found on buf_pool->zip_free[i] */
+UNIV_INLINE
+bool
+buf_buddy_check_free(
+/*=================*/
+	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
+	const buf_buddy_free_t*	buf,	/*!< in: block to check */
+	ulint			i)	/*!< in: index of buf_pool->zip_free[] */
+{
+	const ulint	size	= BUF_BUDDY_LOW << i;
+
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_ad(!ut_align_offset(buf, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	buf_buddy_free_t* itr;
+
+	for (itr = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+	     itr && itr != buf;
+	     itr = UT_LIST_GET_NEXT(list, itr)) {
+	}
+
+	return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks the stamp of a buf to tell if it is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static  __attribute__((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+	buf_buddy_free_t*	buf,	/*!< in: block to check */
+	ulint			i)	/*!< in: index of
+					buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	ut_ad(!ut_align_offset(buf, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+	/* We assume that all memory from buf_buddy_alloc()
+	is used for compressed page frames. */
+
+	/* We look inside the allocated objects returned by
+	buf_buddy_alloc() and assume that each block is a compressed
+	page that contains one of the following in space_id.
+	* BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+	* BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+	not initialized yet or
+	* A valid space_id of a compressed tablespace
+
+	The call below attempts to read from free memory.  The memory
+	is "owned" by the buddy allocator (and it has been allocated
+	from the buffer pool), so there is nothing wrong about this. */
+	if (!buf_buddy_stamp_is_free(buf)) {
+		return(BUF_BUDDY_STATE_USED);
+	}
+
+	/* A block may be free but a fragment of it may still be in use.
+	To guard against that we write the free block size in terms of
+	zip_free index at start of stamped block. Note that we can
+	safely rely on this value only if the buf is free. */
+	ut_ad(buf->stamp.size <= i);
+	return(buf->stamp.size == i
+	       ? BUF_BUDDY_STATE_FREE
+	       : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/**********************************************************************//**
+Stamp a block free and add it to the head of buf_pool->zip_free[i]. */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(
+/*==================*/
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
+	buf_buddy_free_t*	buf,		/*!< in,own: block to be freed */
+	ulint			i)		/*!< in: index of
+						buf_pool->zip_free[] */
+{
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_ad(buf_pool->zip_free[i].start != buf);
+
+	buf_buddy_stamp_free(buf, i);
+	UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], buf);
+	ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+}
+
+/**********************************************************************//**
+Remove a block from buf_pool->zip_free[i] and stamp it nonfree. */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(
+/*=======================*/
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
+	buf_buddy_free_t*	buf,		/*!< in,own: block to be removed */
+	ulint			i)		/*!< in: index of
+						buf_pool->zip_free[] */
+{
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_ad(buf_buddy_check_free(buf_pool, buf, i));
+
+	UT_LIST_REMOVE(list, buf_pool->zip_free[i], buf);
+	buf_buddy_stamp_nonfree(buf, i);
+}
+
+/**********************************************************************//**
+Try to allocate a block from buf_pool->zip_free[].
+@return	allocated block, or NULL if buf_pool->zip_free[] was empty */
+static
+buf_buddy_free_t*
+buf_buddy_alloc_zip(
+/*================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		i)		/*!< in: index of buf_pool->zip_free[] */
+{
+	buf_buddy_free_t*	buf;
+
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_a(i < BUF_BUDDY_SIZES);
+	ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+	buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+
+	if (buf) {
+		buf_buddy_remove_from_free(buf_pool, buf, i);
+	} else if (i + 1 < BUF_BUDDY_SIZES) {
+		/* Attempt to split. */
+		buf = buf_buddy_alloc_zip(buf_pool, i + 1);
+
+		if (buf) {
+			buf_buddy_free_t* buddy =
+				reinterpret_cast<buf_buddy_free_t*>(
+					buf->stamp.bytes
+					+ (BUF_BUDDY_LOW << i));
+
+			ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+			buf_buddy_add_to_free(buf_pool, buddy, i);
+		}
+	}
+
+	if (buf) {
+		/* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+		UNIV_MEM_TRASH(buf, ~i, BUF_BUDDY_STAMP_OFFSET);
+		UNIV_MEM_TRASH(BUF_BUDDY_STAMP_OFFSET + 4
+			       + buf->stamp.bytes, ~i,
+			       (BUF_BUDDY_LOW << i)
+			       - (BUF_BUDDY_STAMP_OFFSET + 4));
+		ut_ad(mach_read_from_4(buf->stamp.bytes
+				       + BUF_BUDDY_STAMP_OFFSET)
+		      == BUF_BUDDY_STAMP_NONFREE);
+	}
+
+	return(buf);
+}
+
+/**********************************************************************//**
+Deallocate a buffer frame of UNIV_PAGE_SIZE. */
+static
+void
+buf_buddy_block_free(
+/*=================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf)		/*!< in: buffer frame to deallocate */
+{
+	const ulint	fold	= BUF_POOL_ZIP_FOLD_PTR(buf);
+	buf_page_t*	bpage;
+	buf_block_t*	block;
+
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
+
+	mutex_enter(&buf_pool->zip_hash_mutex);
+
+	HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
+		    ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
+			  && bpage->in_zip_hash && !bpage->in_page_hash),
+		    ((buf_block_t*) bpage)->frame == buf);
+	ut_a(bpage);
+	ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
+	ut_ad(!bpage->in_page_hash);
+	ut_ad(bpage->in_zip_hash);
+	ut_d(bpage->in_zip_hash = FALSE);
+	HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+
+	mutex_exit(&buf_pool->zip_hash_mutex);
+
+	ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
+	UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
+
+	block = (buf_block_t*) bpage;
+	mutex_enter(&block->mutex);
+	buf_LRU_block_free_non_file_page(block);
+	mutex_exit(&block->mutex);
+
+	ut_ad(buf_pool->buddy_n_frames > 0);
+	ut_d(buf_pool->buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+	buf_block_t*	block)	/*!< in: buffer frame to allocate */
+{
+	buf_pool_t*	buf_pool = buf_pool_from_block(block);
+	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	ut_a(block->frame);
+	ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
+
+	ut_ad(!block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_d(block->page.in_zip_hash = TRUE);
+
+	mutex_enter(&buf_pool->zip_hash_mutex);
+	HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+	mutex_exit(&buf_pool->zip_hash_mutex);
+
+	ut_d(buf_pool->buddy_n_frames++);
+}
+
+/**********************************************************************//**
+Allocate a block from a bigger object.
+@return	allocated block */
+static
+void*
+buf_buddy_alloc_from(
+/*=================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf,		/*!< in: a block that is free to use */
+	ulint		i,		/*!< in: index of
+					buf_pool->zip_free[] */
+	ulint		j)		/*!< in: size of buf as an index
+					of buf_pool->zip_free[] */
+{
+	ulint	offs	= BUF_BUDDY_LOW << j;
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_ad(j <= BUF_BUDDY_SIZES);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+	ut_ad(j >= i);
+	ut_ad(!ut_align_offset(buf, offs));
+
+	/* Add the unused parts of the block to the free lists. */
+	while (j > i) {
+		buf_buddy_free_t*	zip_buf;
+
+		offs >>= 1;
+		j--;
+
+		zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+			reinterpret_cast<byte*>(buf) + offs);
+		buf_buddy_add_to_free(buf_pool, zip_buf, j);
+	}
+
+	buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+	return(buf);
+}
+
+/**********************************************************************//**
+Allocate a block.  The thread calling this function must hold
+buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any
+block->mutex.  The buf_pool->LRU_list_mutex may be released and reacquired.
+@return	allocated block, never NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	ulint		i,		/*!< in: index of buf_pool->zip_free[],
+					or BUF_BUDDY_SIZES */
+	ibool*		lru)		/*!< in: pointer to a variable that
+					will be assigned TRUE if storage was
+					allocated from the LRU list and
+					buf_pool->LRU_list_mutex was
+					temporarily released */
+{
+	buf_block_t*	block;
+
+	ut_ad(lru);
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	if (i < BUF_BUDDY_SIZES) {
+		/* Try to allocate from the buddy system. */
+		mutex_enter(&buf_pool->zip_free_mutex);
+		block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i);
+
+		if (block) {
+			goto func_exit;
+		}
+		mutex_exit(&buf_pool->zip_free_mutex);
+	}
+
+	/* Try allocating from the buf_pool->free list. */
+	block = buf_LRU_get_free_only(buf_pool);
+
+	if (block) {
+
+		goto alloc_big;
+	}
+
+	/* Try replacing an uncompressed page in the buffer pool. */
+	mutex_exit(&buf_pool->LRU_list_mutex);
+	block = buf_LRU_get_free_block(buf_pool);
+	*lru = TRUE;
+	mutex_enter(&buf_pool->LRU_list_mutex);
+
+alloc_big:
+	buf_buddy_block_register(block);
+
+	mutex_enter(&buf_pool->zip_free_mutex);
+	block = (buf_block_t*) buf_buddy_alloc_from(
+		buf_pool, block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+	buf_pool->buddy_stat[i].used++;
+	mutex_exit(&buf_pool->zip_free_mutex);
+
+	return(block);
+}
+
+/**********************************************************************//**
+Try to relocate a block.  The caller must hold zip_free_mutex, and this
+function will release and lock it again.
+@return	true if relocated */
+static
+bool
+buf_buddy_relocate(
+/*===============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		src,		/*!< in: block to relocate */
+	void*		dst,		/*!< in: free block to relocate to */
+	ulint		i)		/*!< in: index of
+					buf_pool->zip_free[] */
+{
+	buf_page_t*	bpage;
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	ib_mutex_t*	mutex;
+	ulint		space;
+	ulint		offset;
+	prio_rw_lock_t*	hash_lock;
+
+	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	ut_ad(!ut_align_offset(src, size));
+	ut_ad(!ut_align_offset(dst, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+	UNIV_MEM_ASSERT_W(dst, size);
+
+	space	= mach_read_from_4((const byte*) src
+				   + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	offset	= mach_read_from_4((const byte*) src
+				   + FIL_PAGE_OFFSET);
+
+	/* Suppress Valgrind warnings about conditional jump
+	on uninitialized value. */
+	UNIV_MEM_VALID(&space, sizeof space);
+	UNIV_MEM_VALID(&offset, sizeof offset);
+
+	ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+	mutex_exit(&buf_pool->zip_free_mutex);
+	/* Lock page hash to prevent a relocation for the target page */
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock);
+
+	if (!bpage || bpage->zip.data != src) {
+		/* The block has probably been freshly
+		allocated by buf_LRU_get_free_block() but not
+		added to buf_pool->page_hash yet.  Obviously,
+		it cannot be relocated. */
+
+		if (bpage) {
+			rw_lock_s_unlock(hash_lock);
+		}
+		mutex_enter(&buf_pool->zip_free_mutex);
+		return(false);
+	}
+
+	if (page_zip_get_size(&bpage->zip) != size) {
+		/* The block is of different size.  We would
+		have to relocate all blocks covered by src.
+		For the sake of simplicity, give up. */
+		ut_ad(page_zip_get_size(&bpage->zip) < size);
+
+		rw_lock_s_unlock(hash_lock);
+		mutex_enter(&buf_pool->zip_free_mutex);
+		return(false);
+	}
+
+	/* The block must have been allocated, but it may
+	contain uninitialized data. */
+	UNIV_MEM_ASSERT_W(src, size);
+
+	mutex = buf_page_get_mutex(bpage);
+
+	mutex_enter(mutex);
+
+	rw_lock_s_unlock(hash_lock);
+
+	mutex_enter(&buf_pool->zip_free_mutex);
+
+	if (buf_page_can_relocate(bpage)) {
+		/* Relocate the compressed page. */
+		ullint	usec	= ut_time_us(NULL);
+		ut_a(bpage->zip.data == src);
+		memcpy(dst, src, size);
+		bpage->zip.data = (page_zip_t*) dst;
+		mutex_exit(mutex);
+		buf_buddy_mem_invalid(
+			reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+		buf_buddy_stat_t*	buddy_stat = &buf_pool->buddy_stat[i];
+		buddy_stat->relocated++;
+		buddy_stat->relocated_usec += ut_time_us(NULL) - usec;
+		return(true);
+	}
+
+	mutex_exit(mutex);
+	return(false);
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	void*		buf,		/*!< in: block to be freed, must not be
+					pointed to by the buffer pool */
+	ulint		i)		/*!< in: index of buf_pool->zip_free[],
+					or BUF_BUDDY_SIZES */
+{
+	buf_buddy_free_t*	buddy;
+
+	ut_ad(!mutex_own(&buf_pool->zip_mutex));
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	mutex_enter(&buf_pool->zip_free_mutex);
+
+	ut_ad(buf_pool->buddy_stat[i].used > 0);
+	buf_pool->buddy_stat[i].used--;
+recombine:
+	UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
+
+	if (i == BUF_BUDDY_SIZES) {
+		mutex_exit(&buf_pool->zip_free_mutex);
+		buf_buddy_block_free(buf_pool, buf);
+		return;
+	}
+
+	ut_ad(i < BUF_BUDDY_SIZES);
+	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+	ut_ad(!buf_pool_contains_zip(buf_pool, buf));
+
+	/* Do not recombine blocks if there are few free blocks.
+	We may waste up to 15360*max_len bytes to free blocks
+	(1024 + 2048 + 4096 + 8192 = 15360) */
+	if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
+		goto func_exit;
+	}
+
+	/* Try to combine adjacent blocks. */
+	buddy = reinterpret_cast<buf_buddy_free_t*>(
+		buf_buddy_get(reinterpret_cast<byte*>(buf),
+			      BUF_BUDDY_LOW << i));
+
+	switch (buf_buddy_is_free(buddy, i)) {
+	case BUF_BUDDY_STATE_FREE:
+		/* The buddy is free: recombine */
+		buf_buddy_remove_from_free(buf_pool, buddy, i);
+buddy_is_free:
+		ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+		i++;
+		buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+		goto recombine;
+
+	case BUF_BUDDY_STATE_USED:
+		ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+		/* The buddy is not free. Is there a free block of
+		this size? */
+		if (buf_buddy_free_t* zip_buf =
+			UT_LIST_GET_FIRST(buf_pool->zip_free[i])) {
+
+			/* Remove the block from the free list, because
+			a successful buf_buddy_relocate() will overwrite
+			zip_buf->list. */
+			buf_buddy_remove_from_free(buf_pool, zip_buf, i);
+
+			/* Try to relocate the buddy of buf to the free
+			block. */
+			if (buf_buddy_relocate(buf_pool, buddy, zip_buf, i)) {
+
+				goto buddy_is_free;
+			}
+
+			buf_buddy_add_to_free(buf_pool, zip_buf, i);
+		}
+
+		break;
+	case BUF_BUDDY_STATE_PARTIALLY_USED:
+		/* Some sub-blocks in the buddy are still in use.
+		Relocation will fail. No need to try. */
+		break;
+	}
+
+func_exit:
+	/* Free the block to the buddy list. */
+	buf_buddy_add_to_free(buf_pool,
+			      reinterpret_cast<buf_buddy_free_t*>(buf),
+			      i);
+	mutex_exit(&buf_pool->zip_free_mutex);
+}
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.cc
index f06fd4abfb1..d4b170028d9 100644
--- a/storage/xtradb/buf/buf0buf.c
+++ b/storage/xtradb/buf/buf0buf.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0buf.c
+@file buf/buf0buf.cc
 The database buffer buf_pool
 
 Created 11/5/1995 Heikki Tuuri
@@ -51,6 +51,8 @@ Created 11/5/1995 Heikki Tuuri
 #include "dict0dict.h"
 #include "log0recv.h"
 #include "page0zip.h"
+#include "srv0mon.h"
+#include "buf0checksum.h"
 #include "trx0trx.h"
 #include "srv0start.h"
 
@@ -69,7 +71,8 @@ _increment_page_get_statistics(buf_block_t* block, trx_t* trx)
 	ut_ad(trx && trx->take_stats);
 
 	if (!trx->distinct_page_access_hash) {
-		trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+		trx->distinct_page_access_hash
+			= static_cast<byte *>(mem_alloc(DPAH_SIZE));
 		memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
 	}
 
@@ -116,24 +119,9 @@ in the file along with the file page, resides in the control block.
 
 		Buffer pool struct
 		------------------
-The buffer buf_pool contains a single mutex which protects all the
+The buffer buf_pool contains several mutexes which protect all the
 control data structures of the buf_pool. The content of a buffer frame is
 protected by a separate read-write lock in its control block, though.
-These locks can be locked and unlocked without owning the buf_pool->mutex.
-The OS events in the buf_pool struct can be waited for without owning the
-buf_pool->mutex.
-
-The buf_pool->mutex is a hot-spot in main memory, causing a lot of
-memory bus traffic on multiprocessor systems when processors
-alternately access the mutex. On our Pentium, the mutex is accessed
-maybe every 10 microseconds. We gave up the solution to have mutexes
-for each control block, for instance, because it seemed to be
-complicated.
-
-A solution to reduce mutex contention of the buf_pool->mutex is to
-create a separate mutex for the page hash table. On Pentium,
-accessing the hash table takes 2 microseconds, about half
-of the total buf_pool->mutex hold time.
 
 		Control blocks
 		--------------
@@ -217,7 +205,7 @@ uncompressed pages are accessible via buf_block_t objects that are
 reachable via buf_pool->chunks[].
 
 The chains of free memory blocks (buf_pool->zip_free[]) are used by
-the buddy allocator (buf0buddy.c) to keep track of currently unused
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
 memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2.  These
 blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
@@ -298,7 +286,6 @@ UNIV_INTERN ibool		buf_debug_prints = FALSE;
 #ifdef UNIV_PFS_RWLOCK
 /* Keys to register buffer block related rwlocks and mutexes with
 performance schema */
-UNIV_INTERN mysql_pfs_key_t	buf_pool_page_hash_key;
 UNIV_INTERN mysql_pfs_key_t	buf_block_lock_key;
 # ifdef UNIV_SYNC_DEBUG
 UNIV_INTERN mysql_pfs_key_t	buf_block_debug_latch_key;
@@ -309,6 +296,7 @@ UNIV_INTERN mysql_pfs_key_t	buf_block_debug_latch_key;
 UNIV_INTERN mysql_pfs_key_t	buffer_block_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_zip_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	buf_pool_flush_state_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_LRU_list_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_free_list_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	buf_pool_zip_free_mutex_key;
@@ -334,19 +322,26 @@ be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
 # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
 
+/** Macro to determine whether the read or write counter is used depending
+on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter)		\
+	((io_type == BUF_IO_READ)			\
+	 ? (counter##_READ)				\
+	 : (counter##_WRITTEN))
+
 /********************************************************************//**
 Gets the smallest oldest_modification lsn for any page in the pool. Returns
 zero if all modified pages have been flushed to disk.
 @return oldest modification in pool, zero if none */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 buf_pool_get_oldest_modification(void)
 /*==================================*/
 {
 	ulint		i;
 	buf_page_t*	bpage;
-	ib_uint64_t	lsn = 0;
-	ib_uint64_t	oldest_lsn = 0;
+	lsn_t		lsn = 0;
+	lsn_t		oldest_lsn = 0;
 
 	/* When we traverse all the flush lists we don't want another
 	thread to add a dirty page to any flush list. */
@@ -403,6 +398,7 @@ buf_get_total_list_len(
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
+
 		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
 		*free_len += UT_LIST_GET_LEN(buf_pool->free);
 		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
@@ -418,11 +414,10 @@ buf_get_total_list_size_in_bytes(
 	buf_pools_list_size_t*	buf_pools_list_size)	/*!< out: list sizes
 							in all buffer pools */
 {
-	ulint			i;
 	ut_ad(buf_pools_list_size);
 	memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
 
-	for (i = 0; i < srv_buf_pool_instances; i++) {
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
@@ -500,97 +495,24 @@ buf_block_alloc(
 #endif /* !UNIV_HOTBACKUP */
 
 /********************************************************************//**
-Calculates a page checksum which is stored to the page when it is written
-to a file. Note that we must be careful to calculate the same value on
-32-bit and 64-bit architectures.
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum(
-/*=======================*/
-	const byte*	page)	/*!< in: buffer page */
-{
-	ulint checksum;
-
-	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
-	..._ARCH_LOG_NO, are written outside the buffer pool to the first
-	pages of data files, we have to skip them in the page checksum
-	calculation.
-	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
-	checksum is stored, and also the last 8 bytes of page because
-	there we store the old formula checksum. */
-
-	checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
-				  FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
-		+ ut_fold_binary(page + FIL_PAGE_DATA,
-				 UNIV_PAGE_SIZE - FIL_PAGE_DATA
-				 - FIL_PAGE_END_LSN_OLD_CHKSUM);
-	checksum = checksum & 0xFFFFFFFFUL;
-
-	return(checksum);
-}
-
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum_32(
-/*==========================*/
-	const byte*	page)	/*!< in: buffer page */
-{
-	ulint checksum;
-
-	checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
-				  FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
-		+ ut_fold_binary(page + FIL_PAGE_DATA,
-				 FIL_PAGE_DATA_ALIGN_32 - FIL_PAGE_DATA)
-		+ ut_fold_binary_32(page + FIL_PAGE_DATA_ALIGN_32,
-				    UNIV_PAGE_SIZE - FIL_PAGE_DATA_ALIGN_32
-				    - FIL_PAGE_END_LSN_OLD_CHKSUM);
-
-	checksum = checksum & 0xFFFFFFFFUL;
-
-	return(checksum);
-}
-
-/********************************************************************//**
-In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
-looked at the first few bytes of the page. This calculates that old
-checksum.
-NOTE: we must first store the new formula checksum to
-FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
-because this takes that field as an input!
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_old_checksum(
-/*=======================*/
-	const byte*	page)	/*!< in: buffer page */
-{
-	ulint checksum;
-
-	checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
-
-	checksum = checksum & 0xFFFFFFFFUL;
-
-	return(checksum);
-}
-
-/********************************************************************//**
 Checks if a page is corrupt.
 @return	TRUE if corrupted */
 UNIV_INTERN
 ibool
 buf_page_is_corrupted(
 /*==================*/
-	ibool		check_lsn,	/*!< in: TRUE if we need to check
+	bool		check_lsn,	/*!< in: true if we need to check
 					and complain about the LSN */
 	const byte*	read_buf,	/*!< in: a database page */
 	ulint		zip_size)	/*!< in: size of compressed page;
 					0 for uncompressed pages */
 {
-	ulint		checksum_field;
-	ulint		old_checksum_field;
+	ulint		checksum_field1;
+	ulint		checksum_field2;
+	ibool		crc32_inited = FALSE;
+	ib_uint32_t	crc32 = ULINT32_UNDEFINED;
 
-	if (UNIV_LIKELY(!zip_size)
+	if (!zip_size
 	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
 		      read_buf + UNIV_PAGE_SIZE
 		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
@@ -603,93 +525,205 @@ buf_page_is_corrupted(
 
 #ifndef UNIV_HOTBACKUP
 	if (check_lsn && recv_lsn_checks_on) {
-		ib_uint64_t	current_lsn;
+		lsn_t	current_lsn;
+
+		/* Since we are going to reset the page LSN during the import
+		phase it makes no sense to spam the log with error messages. */
 
 		if (log_peek_lsn(&current_lsn)
-		    && UNIV_UNLIKELY
-		    (current_lsn
-		     < mach_read_from_8(read_buf + FIL_PAGE_LSN))) {
+		    && current_lsn
+		    < mach_read_from_8(read_buf + FIL_PAGE_LSN)) {
 			ut_print_timestamp(stderr);
 
 			fprintf(stderr,
-				"  InnoDB: Error: page %lu log sequence number"
-				" %llu\n"
+				" InnoDB: Error: page %lu log sequence number"
+				" " LSN_PF "\n"
 				"InnoDB: is in the future! Current system "
-				"log sequence number %llu.\n"
+				"log sequence number " LSN_PF ".\n"
 				"InnoDB: Your database may be corrupt or "
 				"you may have copied the InnoDB\n"
 				"InnoDB: tablespace but not the InnoDB "
 				"log files. See\n"
-				"InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+				"InnoDB: " REFMAN
+				"forcing-innodb-recovery.html\n"
 				"InnoDB: for more information.\n",
-				(ulong) mach_read_from_4(read_buf
-							 + FIL_PAGE_OFFSET),
-				mach_read_from_8(read_buf + FIL_PAGE_LSN),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_OFFSET),
+				(lsn_t) mach_read_from_8(
+					read_buf + FIL_PAGE_LSN),
 				current_lsn);
 		}
 	}
 #endif
 
-	/* If we use checksums validation, make additional check before
-	returning TRUE to ensure that the checksum is not equal to
-	BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
-	disabled. Otherwise, skip checksum calculation and return FALSE */
+	/* Check whether the checksum fields have correct values */
+
+	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
+		return(FALSE);
+	}
 
-	if (UNIV_LIKELY(srv_use_checksums)) {
-		checksum_field = mach_read_from_4(read_buf
-						  + FIL_PAGE_SPACE_OR_CHKSUM);
+	if (zip_size) {
+		return(!page_zip_verify_checksum(read_buf, zip_size));
+	}
 
-		if (UNIV_UNLIKELY(zip_size)) {
-			return(checksum_field != BUF_NO_CHECKSUM_MAGIC
-			       && checksum_field
-			       != page_zip_calc_checksum(read_buf, zip_size));
-		}
+	checksum_field1 = mach_read_from_4(
+		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
 
-		old_checksum_field = mach_read_from_4(
-			read_buf + UNIV_PAGE_SIZE
-			- FIL_PAGE_END_LSN_OLD_CHKSUM);
+	checksum_field2 = mach_read_from_4(
+		read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
 
-		/* There are 2 valid formulas for old_checksum_field:
+	/* declare empty pages non-corrupted */
+	if (checksum_field1 == 0 && checksum_field2 == 0
+	    && mach_read_from_4(read_buf + FIL_PAGE_LSN) == 0) {
+		/* make sure that the page is really empty */
+		ut_d(for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
+		     ut_a(read_buf[i] == 0); });
+
+		return(FALSE);
+	}
+
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+		crc32 = buf_calc_page_crc32(read_buf);
+
+		return(checksum_field1 != crc32 || checksum_field2 != crc32);
+
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+
+		return(checksum_field1
+		       != buf_calc_page_new_checksum(read_buf)
+		       || checksum_field2
+		       != buf_calc_page_old_checksum(read_buf));
+
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+
+		return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC
+		       || checksum_field2 != BUF_NO_CHECKSUM_MAGIC);
+
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		/* There are 3 valid formulas for
+		checksum_field2 (old checksum field):
 
 		1. Very old versions of InnoDB only stored 8 byte lsn to the
 		start and the end of the page.
 
-		2. Newer InnoDB versions store the old formula checksum
-		there. */
+		2. InnoDB versions before MySQL 5.6.3 store the old formula
+		checksum (buf_calc_page_old_checksum()).
 
-		if (old_checksum_field != mach_read_from_4(read_buf
-							   + FIL_PAGE_LSN)
-		    && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
-		    && old_checksum_field
-		    != buf_calc_page_old_checksum(read_buf)) {
+		3. InnoDB versions 5.6.3 and newer with
+		innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. */
 
-			return(TRUE);
+		/* since innodb_checksum_algorithm is not strict_* allow
+		any of the algos to match for the old field */
+
+		if (checksum_field2
+		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the values
+			that are fast to check. First check the algorithm
+			selected for writing checksums because we assume
+			that the chance of it matching is higher. */
+
+			if (srv_checksum_algorithm
+			    == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+				crc32 = buf_calc_page_crc32(read_buf);
+				crc32_inited = TRUE;
+
+				if (checksum_field2 != crc32
+				    && checksum_field2
+				    != buf_calc_page_old_checksum(read_buf)) {
+
+					return(TRUE);
+				}
+			} else {
+				ut_ad(srv_checksum_algorithm
+				     == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+				if (checksum_field2
+				    != buf_calc_page_old_checksum(read_buf)) {
+
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = TRUE;
+
+					if (checksum_field2 != crc32) {
+						return(TRUE);
+					}
+				}
+			}
 		}
 
+		/* old field is fine, check the new field */
+
 		/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
 		(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
 
-		if (!srv_fast_checksum
-		    && checksum_field != 0
-		    && checksum_field != BUF_NO_CHECKSUM_MAGIC
-		    && checksum_field
-		    != buf_calc_page_new_checksum(read_buf)) {
+		if (checksum_field1 != 0
+		    && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
 
-			return(TRUE);
+			/* The checksum does not match any of the values
+			that are fast to check. First check the algorithm
+			selected for writing checksums because we assume
+			that the chance of it matching is higher. */
+
+			if (srv_checksum_algorithm
+			    == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+				if (!crc32_inited) {
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = TRUE;
+				}
+
+				if (checksum_field1 != crc32
+				    && checksum_field1
+				    != buf_calc_page_new_checksum(read_buf)) {
+
+					return(TRUE);
+				}
+			} else {
+				ut_ad(srv_checksum_algorithm
+				     == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+				if (checksum_field1
+				    != buf_calc_page_new_checksum(read_buf)) {
+
+					if (!crc32_inited) {
+						crc32 = buf_calc_page_crc32(
+							read_buf);
+						crc32_inited = TRUE;
+					}
+
+					if (checksum_field1 != crc32) {
+						return(TRUE);
+					}
+				}
+			}
 		}
 
-		if (srv_fast_checksum
-		    && checksum_field != 0
-		    && checksum_field != BUF_NO_CHECKSUM_MAGIC
-		    && checksum_field
-		    != buf_calc_page_new_checksum_32(read_buf)
-		    && checksum_field
-		    != buf_calc_page_new_checksum(read_buf)) {
+		/* If CRC32 is stored in at least one of the fields, then the
+		other field must also be CRC32 */
+		if (crc32_inited
+		    && ((checksum_field1 == crc32
+			 && checksum_field2 != crc32)
+			|| (checksum_field1 != crc32
+			    && checksum_field2 == crc32))) {
 
 			return(TRUE);
 		}
+
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		/* should have returned FALSE earlier */
+		ut_error;
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
 	}
 
+	DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
+
 	return(FALSE);
 }
 
@@ -710,10 +744,14 @@ buf_page_print(
 #ifndef UNIV_HOTBACKUP
 	dict_index_t*	index;
 #endif /* !UNIV_HOTBACKUP */
-	ulint		checksum;
-	ulint		checksum_32;
-	ulint		old_checksum;
-	ulint		size	= zip_size;
+	ulint		size = zip_size;
+
+	if (!read_buf) {
+		fprintf(stderr,
+			" InnoDB: Not dumping page as (in memory) pointer "
+			"is NULL\n");
+		return;
+	}
 
 	if (!size) {
 		size = UNIV_PAGE_SIZE;
@@ -722,7 +760,7 @@ buf_page_print(
 	if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+			" InnoDB: Page dump in ascii and hex (%lu bytes):\n",
 			(ulong) size);
 		ut_print_buf(stderr, read_buf, size);
 		fputs("\nInnoDB: End of page dump\n", stderr);
@@ -730,104 +768,80 @@ buf_page_print(
 
 	if (zip_size) {
 		/* Print compressed page. */
-
-		switch (fil_page_get_type(read_buf)) {
-		case FIL_PAGE_TYPE_ZBLOB:
-		case FIL_PAGE_TYPE_ZBLOB2:
-			checksum = srv_use_checksums
-				? page_zip_calc_checksum(read_buf, zip_size)
-				: BUF_NO_CHECKSUM_MAGIC;
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Compressed BLOB page"
-				" checksum %lu, stored %lu\n"
-				"InnoDB: Page lsn %lu %lu\n"
-				"InnoDB: Page number (if stored"
-				" to page already) %lu,\n"
-				"InnoDB: space id (if stored"
-				" to page already) %lu\n",
-				(ulong) checksum,
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_LSN),
-				(ulong) mach_read_from_4(
-					read_buf + (FIL_PAGE_LSN + 4)),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_OFFSET),
-				(ulong) mach_read_from_4(
-					read_buf
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
-			return;
-		default:
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: unknown page type %lu,"
-				" assuming FIL_PAGE_INDEX\n",
-				fil_page_get_type(read_buf));
-			/* fall through */
-		case FIL_PAGE_INDEX:
-			checksum = srv_use_checksums
-				? page_zip_calc_checksum(read_buf, zip_size)
-				: BUF_NO_CHECKSUM_MAGIC;
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Compressed page checksum %lu,"
-				" stored %lu\n"
-				"InnoDB: Page lsn %lu %lu\n"
-				"InnoDB: Page number (if stored"
-				" to page already) %lu,\n"
-				"InnoDB: space id (if stored"
-				" to page already) %lu\n",
-				(ulong) checksum,
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_LSN),
-				(ulong) mach_read_from_4(
-					read_buf + (FIL_PAGE_LSN + 4)),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_OFFSET),
-				(ulong) mach_read_from_4(
-					read_buf
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
-			return;
-		case FIL_PAGE_TYPE_XDES:
-			/* This is an uncompressed page. */
-			break;
-		}
-	}
-
-	checksum = srv_use_checksums
-		? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-	checksum_32 = srv_use_checksums
-		? buf_calc_page_new_checksum_32(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-	old_checksum = srv_use_checksums
-		? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		"  InnoDB: Page checksum %lu (32bit_calc: %lu), prior-to-4.0.14-form"
-		" checksum %lu\n"
-		"InnoDB: stored checksum %lu, prior-to-4.0.14-form"
-		" stored checksum %lu\n"
-		"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
-		" at page end %lu\n"
-		"InnoDB: Page number (if stored to page already) %lu,\n"
-		"InnoDB: space id (if created with >= MySQL-4.1.1"
-		" and stored already) %lu\n",
-		(ulong) checksum, (ulong) checksum_32, (ulong) old_checksum,
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Compressed page type (" ULINTPF "); "
+			"stored checksum in field1 " ULINTPF "; "
+			"calculated checksums for field1: "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF "; "
+			"page LSN " LSN_PF "; "
+			"page number (if stored to page already) " ULINTPF "; "
+			"space id (if stored to page already) " ULINTPF "\n",
+			fil_page_get_type(read_buf),
+			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_CRC32),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_INNODB),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_NONE),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_NONE),
+			mach_read_from_8(read_buf + FIL_PAGE_LSN),
+			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+			mach_read_from_4(read_buf
+					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: uncompressed page, "
+			"stored checksum in field1 " ULINTPF ", "
+			"calculated checksums for field1: "
+			"%s " UINT32PF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+
+			"stored checksum in field2 " ULINTPF ", "
+			"calculated checksums for field2: "
+			"%s " UINT32PF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+
+			"page LSN " ULINTPF " " ULINTPF ", "
+			"low 4 bytes of LSN at page end " ULINTPF ", "
+			"page number (if stored to page already) " ULINTPF ", "
+			"space id (if created with >= MySQL-4.1.1 "
+			"and stored already) %lu\n",
+			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_calc_page_crc32(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_calc_page_new_checksum(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+			BUF_NO_CHECKSUM_MAGIC,
+
+			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
 					 - FIL_PAGE_END_LSN_OLD_CHKSUM),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
-		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_calc_page_crc32(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_calc_page_old_checksum(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+			BUF_NO_CHECKSUM_MAGIC,
+
+			mach_read_from_4(read_buf + FIL_PAGE_LSN),
+			mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
 					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
-		(ulong) mach_read_from_4(read_buf
+			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+			mach_read_from_4(read_buf
 					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+	}
 
 #ifndef UNIV_HOTBACKUP
 	if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
@@ -929,7 +943,7 @@ pfs_register_buffer_block(
 				 PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
 
 	for (i = 0; i < num_to_register; i++) {
-		mutex_t*	mutex;
+		ib_mutex_t*	mutex;
 		rw_lock_t*	rwlock;
 
 #  ifdef UNIV_PFS_MUTEX
@@ -972,7 +986,7 @@ buf_block_init(
 	buf_block_t*	block,		/*!< in: pointer to control block */
 	byte*		frame)		/*!< in: pointer to buffer frame */
 {
-	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
+	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
 
 	block->frame = frame;
 
@@ -995,13 +1009,9 @@ buf_block_init(
 	block->page.in_zip_hash = FALSE;
 	block->page.in_flush_list = FALSE;
 	block->page.in_free_list = FALSE;
-#endif /* UNIV_DEBUG */
-	block->page.flush_list.prev = NULL;
-	block->page.flush_list.next = NULL;
-	block->page.zip_list.prev = NULL;
-	block->page.zip_list.next = NULL;
 	block->page.in_LRU_list = FALSE;
 	block->in_unzip_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	block->n_pointers = 0;
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -1033,7 +1043,6 @@ buf_block_init(
 #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
 
 	ut_ad(rw_lock_validate(&(block->lock)));
-
 }
 
 /********************************************************************//**
@@ -1071,14 +1080,14 @@ buf_chunk_init(
 
 	/* Allocate the block descriptors from
 	the start of the memory block. */
-	chunk->blocks = chunk->mem;
+	chunk->blocks = (buf_block_t*) chunk->mem;
 
 	/* Align a pointer to the first frame.  Note that when
 	os_large_page_size is smaller than UNIV_PAGE_SIZE,
 	we may allocate one fewer block than requested.  When
 	it is bigger, we may allocate more blocks than requested. */
 
-	frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
+	frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
 	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
 		- (frame != chunk->mem);
 
@@ -1110,11 +1119,9 @@ buf_chunk_init(
 		UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
 
 		/* Add the block to the free list */
-		mutex_enter(&buf_pool->free_list_mutex);
-		UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page));
+		UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
 
 		ut_d(block->page.in_free_list = TRUE);
-		mutex_exit(&buf_pool->free_list_mutex);
 		ut_ad(buf_pool_from_block(block) == buf_pool);
 
 		block++;
@@ -1169,8 +1176,6 @@ buf_pool_contains_zip(
 	buf_chunk_t*	chunk = buf_pool->chunks;
 
 	ut_ad(buf_pool);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->zip_free_mutex));
 	for (n = buf_pool->n_chunks; n--; chunk++) {
 
 		buf_block_t* block = buf_chunk_contains_zip(chunk, data);
@@ -1202,7 +1207,7 @@ buf_chunk_not_freed(
 		ibool	ready;
 
 		switch (buf_block_get_state(block)) {
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_ZIP_PAGE:
 		case BUF_BLOCK_ZIP_DIRTY:
 			/* The uncompressed buffer pool should never
@@ -1221,8 +1226,9 @@ buf_chunk_not_freed(
 			ready = buf_flush_ready_for_replace(&block->page);
 			mutex_exit(&block->mutex);
 
-			if (block->page.is_corrupt) {
-				/* corrupt page may remain, it can be skipped */
+			if (UNIV_UNLIKELY(block->page.is_corrupt)) {
+				/* corrupt page may remain, it can be
+				skipped */
 				break;
 			}
 
@@ -1248,8 +1254,6 @@ buf_pool_set_sizes(void)
 	ulint	i;
 	ulint	curr_size = 0;
 
-	buf_pool_mutex_enter_all();
-
 	for (i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
@@ -1259,8 +1263,6 @@ buf_pool_set_sizes(void)
 
 	srv_buf_pool_curr_size = curr_size;
 	srv_buf_pool_old_size = srv_buf_pool_size;
-
-	buf_pool_mutex_exit_all();
 }
 
 /********************************************************************//**
@@ -1280,12 +1282,8 @@ buf_pool_init_instance(
 
 	/* 1. Initialize general fields
 	------------------------------- */
-	mutex_create(buf_pool_mutex_key,
-		     &buf_pool->mutex, SYNC_BUF_POOL);
 	mutex_create(buf_pool_LRU_list_mutex_key,
 		     &buf_pool->LRU_list_mutex, SYNC_BUF_LRU_LIST);
-	rw_lock_create(buf_pool_page_hash_key,
-		       &buf_pool->page_hash_latch, SYNC_BUF_PAGE_HASH);
 	mutex_create(buf_pool_free_list_mutex_key,
 		     &buf_pool->free_list_mutex, SYNC_BUF_FREE_LIST);
 	mutex_create(buf_pool_zip_free_mutex_key,
@@ -1294,14 +1292,14 @@ buf_pool_init_instance(
 		     &buf_pool->zip_hash_mutex, SYNC_BUF_ZIP_HASH);
 	mutex_create(buf_pool_zip_mutex_key,
 		     &buf_pool->zip_mutex, SYNC_BUF_BLOCK);
-
-	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
-	buf_pool_mutex_enter(buf_pool);
+	mutex_create(buf_pool_flush_state_mutex_key,
+		     &buf_pool->flush_state_mutex, SYNC_BUF_FLUSH_STATE);
 
 	if (buf_pool_size > 0) {
 		buf_pool->n_chunks = 1;
-		buf_pool->chunks = chunk = mem_zalloc(sizeof *chunk);
+
+		buf_pool->chunks = chunk =
+			(buf_chunk_t*) mem_zalloc(sizeof *chunk);
 
 		UT_LIST_INIT(buf_pool->free);
 
@@ -1309,19 +1307,28 @@ buf_pool_init_instance(
 			mem_free(chunk);
 			mem_free(buf_pool);
 
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-			buf_pool_mutex_exit(buf_pool);
-
 			return(DB_ERROR);
 		}
 
 		buf_pool->instance_no = instance_no;
 		buf_pool->old_pool_size = buf_pool_size;
 		buf_pool->curr_size = chunk->size;
+		buf_pool->read_ahead_area
+			= ut_min(64, ut_2_power_up(buf_pool->curr_size / 32));
 		buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
 
-		buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+		/* Number of locks protecting page_hash must be a
+		power of two */
+		srv_n_page_hash_locks =
+				 ut_2_power_up(srv_n_page_hash_locks);
+		ut_a(srv_n_page_hash_locks != 0);
+		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
+
+		buf_pool->page_hash = ha_create(2 * buf_pool->curr_size,
+						srv_n_page_hash_locks,
+						MEM_HEAP_FOR_PAGE_HASH,
+						SYNC_BUF_PAGE_HASH);
+
 		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
 
 		buf_pool->last_printout_time = ut_time();
@@ -1333,17 +1340,15 @@ buf_pool_init_instance(
 		     SYNC_BUF_FLUSH_LIST);
 
 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
-		buf_pool->no_flush[i] = os_event_create(NULL);
+		buf_pool->no_flush[i] = os_event_create();
 	}
 
-	/* 3. Initialize LRU fields
-	--------------------------- */
+	buf_pool->watch = (buf_page_t*) mem_zalloc(
+		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
 
 	/* All fields are initialized by mem_zalloc(). */
 
-	mutex_exit(&buf_pool->LRU_list_mutex);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	buf_pool_mutex_exit(buf_pool);
+	buf_pool->try_LRU_scan = TRUE;
 
 	return(DB_SUCCESS);
 }
@@ -1380,6 +1385,9 @@ buf_pool_free_instance(
 		bpage = prev_bpage;
 	}
 
+	mem_free(buf_pool->watch);
+	buf_pool->watch = NULL;
+
 	chunks = buf_pool->chunks;
 	chunk = chunks + buf_pool->n_chunks;
 
@@ -1388,6 +1396,7 @@ buf_pool_free_instance(
 	}
 
 	mem_free(buf_pool->chunks);
+	ha_clear(buf_pool->page_hash);
 	hash_table_free(buf_pool->page_hash);
 	hash_table_free(buf_pool->zip_hash);
 }
@@ -1396,7 +1405,7 @@ buf_pool_free_instance(
 Creates the buffer pool.
 @return	DB_SUCCESS if success, DB_ERROR if not enough memory or error */
 UNIV_INTERN
-ulint
+dberr_t
 buf_pool_init(
 /*==========*/
 	ulint	total_size,	/*!< in: size of the total pool in bytes */
@@ -1410,10 +1419,8 @@ buf_pool_init(
 	ut_ad(n_instances <= MAX_BUFFER_POOLS);
 	ut_ad(n_instances == srv_buf_pool_instances);
 
-	/* We create an extra buffer pool instance, this instance is used
-	for flushing the flush lists, to keep track of n_flush for all
-	the buffer pools and also used as a waiting object during flushing. */
-	buf_pool_ptr = mem_zalloc(n_instances * sizeof *buf_pool_ptr);
+	buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+		n_instances * sizeof *buf_pool_ptr);
 
 	for (i = 0; i < n_instances; i++) {
 		buf_pool_t*	ptr	= &buf_pool_ptr[i];
@@ -1464,11 +1471,7 @@ buf_pool_clear_hash_index(void)
 	ulint	p;
 
 #ifdef UNIV_SYNC_DEBUG
-	ulint	j;
-
-	for (j = 0; j < btr_search_index_num; j++) {
-		ut_ad(rw_lock_own(&btr_search_latch_arr[j], RW_LOCK_EX));
-	}
+	ut_ad(btr_search_own_all(RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(!btr_search_enabled);
 
@@ -1519,23 +1522,25 @@ buf_relocate(
 	ulint		fold;
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	fold = buf_page_address_fold(bpage->space, bpage->offset);
+
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
+	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
 	ut_a(bpage->buf_fix_count == 0);
 	ut_ad(bpage->in_LRU_list);
 	ut_ad(!bpage->in_zip_hash);
 	ut_ad(bpage->in_page_hash);
-	ut_ad(bpage == buf_page_hash_get(buf_pool,
-			       		 bpage->space, bpage->offset));
+	ut_ad(bpage == buf_page_hash_get_low(buf_pool,
+					     bpage->space,
+					     bpage->offset,
+					     fold));
+
 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 #ifdef UNIV_DEBUG
 	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_FILE_PAGE:
@@ -1550,7 +1555,7 @@ buf_relocate(
 
 	memcpy(dpage, bpage, sizeof *dpage);
 
-	bpage->in_LRU_list = FALSE;
+	ut_d(bpage->in_LRU_list = FALSE);
 	ut_d(bpage->in_page_hash = FALSE);
 
 	/* relocate buf_pool->LRU */
@@ -1580,12 +1585,10 @@ buf_relocate(
 #endif /* UNIV_LRU_DEBUG */
 	}
 
-	ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
-			      ut_ad(ut_list_node_313->in_LRU_list)));
+        ut_d(UT_LIST_VALIDATE(
+		LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
 
 	/* relocate buf_pool->page_hash */
-	fold = buf_page_address_fold(bpage->space, bpage->offset);
-
 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
 	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
 }
@@ -1600,6 +1603,8 @@ buf_pool_watch_is_sentinel(
 	buf_pool_t*		buf_pool,	/*!< buffer pool instance */
 	const buf_page_t*	bpage)		/*!< in: block */
 {
+	/* We must also own the appropriate hash lock. */
+	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
 	ut_ad(buf_page_in_file(bpage));
 
 	if (bpage < &buf_pool->watch[0]
@@ -1620,8 +1625,9 @@ buf_pool_watch_is_sentinel(
 }
 
 /****************************************************************//**
-Add watch for the given page to be read in. Caller must have the buffer pool
-mutex reserved.
+Add watch for the given page to be read in. Caller must have
+appropriate hash_lock for the bpage. This function may release the
+hash_lock and reacquire it.
 @return NULL if watch set, block if the page is in the buffer pool */
 UNIV_INTERN
 buf_page_t*
@@ -1634,32 +1640,53 @@ buf_pool_watch_set(
 	buf_page_t*	bpage;
 	ulint		i;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
-	mutex_t*	block_mutex;
+	prio_rw_lock_t*	hash_lock;
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 
 	if (UNIV_LIKELY_NULL(bpage)) {
-
-		block_mutex = buf_page_get_mutex_enter(bpage);
-		ut_a(block_mutex);
-
+page_found:
 		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
 			/* The page was loaded meanwhile. */
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
 			return(bpage);
 		}
 		/* Add to an existing watch. */
+		mutex_enter(&buf_pool->zip_mutex);
 		bpage->buf_fix_count++;
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		mutex_exit(block_mutex);
+		mutex_exit(&buf_pool->zip_mutex);
 		return(NULL);
 	}
 
-	/* buf_pool->watch is protected by zip_mutex for now */
-	mutex_enter(&buf_pool->zip_mutex);
+	/* From this point this function becomes fairly heavy in terms
+	of latching. We acquire all the hash_locks. They are needed
+	because we don't want to read any stale information in
+	buf_pool->watch[]. However, it is not in the critical code path
+	as this function will be called only by the purge thread. */
+
+
+	/* To obey latching order first release the hash_lock. */
+	rw_lock_x_unlock(hash_lock);
+
+	hash_lock_x_all(buf_pool->page_hash);
+
+	/* We have to recheck that the page
+	was not loaded or a watch set by some other
+	purge thread. This is because of the small
+	time window between when we release the
+	hash_lock to acquire all the hash locks above. */
+
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+	if (UNIV_LIKELY_NULL(bpage)) {
+		hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
+		goto page_found;
+	}
+
 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
 		bpage = &buf_pool->watch[i];
 
@@ -1674,21 +1701,25 @@ buf_pool_watch_set(
 			ut_ad(!bpage->in_page_hash);
 			ut_ad(bpage->buf_fix_count == 0);
 
-			/* bpage is pointing to buf_pool->watch[],
-			which is protected by buf_pool->mutex.
-			Normally, buf_page_t objects are protected by
-			buf_block_t::mutex or buf_pool->zip_mutex or both. */
+			mutex_enter(&buf_pool->zip_mutex);
 
 			bpage->state = BUF_BLOCK_ZIP_PAGE;
 			bpage->space = space;
 			bpage->offset = offset;
 			bpage->buf_fix_count = 1;
-			bpage->buf_pool_index = buf_pool_index(buf_pool);
+
+			mutex_exit(&buf_pool->zip_mutex);
+
 			ut_d(bpage->in_page_hash = TRUE);
 			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
 				    fold, bpage);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-			mutex_exit(&buf_pool->zip_mutex);
+
+			/* Once the sentinel is in the page_hash we can
+			safely release all locks except just the
+			relevant hash_lock */
+			hash_unlock_x_all_but(buf_pool->page_hash,
+						hash_lock);
+
 			return(NULL);
 		case BUF_BLOCK_ZIP_PAGE:
 			ut_ad(bpage->in_page_hash);
@@ -1706,8 +1737,6 @@ buf_pool_watch_set(
 	ut_error;
 
 	/* Fix compiler warning */
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	mutex_exit(&buf_pool->zip_mutex);
 	return(NULL);
 }
 
@@ -1725,11 +1754,14 @@ buf_pool_watch_remove(
 					space, offset) */
 	buf_page_t*	watch)		/*!< in/out: sentinel for watch */
 {
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
-	ut_ad(mutex_own(&buf_pool->zip_mutex)); /* for now */
+	/* We must also own the appropriate hash_bucket mutex. */
+	prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(buf_page_get_state(watch) == BUF_BLOCK_ZIP_PAGE);
+	ut_ad(buf_own_zip_mutex_for_page(watch));
 
 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
 	ut_d(watch->in_page_hash = FALSE);
@@ -1750,32 +1782,34 @@ buf_pool_watch_unset(
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ulint		fold = buf_page_address_fold(space, offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
+							     fold);
+
+	rw_lock_x_lock(hash_lock);
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
 	ut_a(bpage);
 
 	if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) {
-		mutex_t* mutex = buf_page_get_mutex_enter(bpage);
+		ib_mutex_t* mutex = buf_page_get_mutex(bpage);
 
+		mutex_enter(mutex);
 		ut_a(bpage->buf_fix_count > 0);
 		bpage->buf_fix_count--;
 		mutex_exit(mutex);
 	} else {
-		mutex_enter(&buf_pool->zip_mutex);
 		ut_a(bpage->buf_fix_count > 0);
 
+		mutex_enter(&buf_pool->zip_mutex);
 		if (UNIV_LIKELY(!--bpage->buf_fix_count)) {
 			buf_pool_watch_remove(buf_pool, fold, bpage);
 		}
 		mutex_exit(&buf_pool->zip_mutex);
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
+	rw_lock_x_unlock(hash_lock);
 }
 
 /****************************************************************//**
@@ -1794,17 +1828,17 @@ buf_pool_watch_occurred(
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ulint		fold	= buf_page_address_fold(space, offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
+							     fold);
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
+	rw_lock_s_lock(hash_lock);
 
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
 	ut_a(bpage);
 	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
+	rw_lock_s_unlock(hash_lock);
 
 	return(ret);
 }
@@ -1821,7 +1855,6 @@ buf_page_make_young(
 {
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
@@ -1829,7 +1862,6 @@ buf_page_make_young(
 
 	buf_LRU_make_block_young(bpage);
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
 }
 
@@ -1844,10 +1876,6 @@ buf_page_make_young_if_needed(
 	buf_page_t*	bpage)		/*!< in/out: buffer block of a
 					file page */
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
 	ut_a(buf_page_in_file(bpage));
 
 	if (buf_page_peek_if_too_old(bpage)) {
@@ -1868,18 +1896,12 @@ buf_reset_check_index_page_at_flush(
 	buf_block_t*	block;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
-
 	block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
 
 	if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
 		block->check_index_page_at_flush = FALSE;
 	}
-
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
 }
 
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
@@ -1898,22 +1920,22 @@ buf_page_set_file_page_was_freed(
 {
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	prio_rw_lock_t*	hash_lock;
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
-
-	bpage = buf_page_hash_get(buf_pool, space, offset);
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+					   &hash_lock);
 
 	if (bpage) {
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+		mutex_enter(block_mutex);
+		rw_lock_s_unlock(hash_lock);
 		/* bpage->file_page_was_freed can already hold
 		when this code is invoked from dict_drop_index_tree() */
 		bpage->file_page_was_freed = TRUE;
+		mutex_exit(block_mutex);
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
 	return(bpage);
 }
 
@@ -1932,25 +1954,66 @@ buf_page_reset_file_page_was_freed(
 {
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	prio_rw_lock_t*	hash_lock;
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
-
-	bpage = buf_page_hash_get(buf_pool, space, offset);
-
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+					   &hash_lock);
 	if (bpage) {
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+		mutex_enter(block_mutex);
+		rw_lock_s_unlock(hash_lock);
 		bpage->file_page_was_freed = FALSE;
+		mutex_exit(block_mutex);
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
 	return(bpage);
 }
 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 
 /********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called.
+Returns nothing: the outcome is not reported back to the caller. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+	ulint		space,	/*!< in: space id */
+	ulint		offset)	/*!< in: page number */
+{
+	buf_page_t*	bpage;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+
+	/* Discarding the uncompressed frame needs
+	buf_pool->LRU_list_mutex. The page_hash lock resides below it in
+	the sync ordering, so it must have been released before we get
+	here; the block may have left page_hash meanwhile, hence the fresh
+	lookup below. NOTE(review): on success we return without exiting
+	LRU_list_mutex; presumably buf_LRU_free_page() released it --
+	confirm against its definition. */
+	mutex_enter(&buf_pool->LRU_list_mutex);
+
+	bpage = buf_page_hash_get(buf_pool, space, offset);
+
+	if (bpage) {
+
+		ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+		mutex_enter(block_mutex);
+
+		if (buf_LRU_free_page(bpage, false)) {
+
+			mutex_exit(block_mutex);
+			return;
+		}
+		mutex_exit(block_mutex);
+	}
+
+	mutex_exit(&buf_pool->LRU_list_mutex);
+}
+
+/********************************************************************//**
 Get read access to a compressed page (usually of type
 FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
 The page must be released with buf_page_release_zip().
@@ -1968,7 +2031,9 @@ buf_page_get_zip(
 	ulint		offset)	/*!< in: page number */
 {
 	buf_page_t*	bpage;
-	mutex_t*	block_mutex;
+	ib_mutex_t*	block_mutex;
+	prio_rw_lock_t*	hash_lock;
+	ibool		discard_attempted = FALSE;
 	ibool		must_read;
 	trx_t*		trx = NULL;
 	ulint		sec;
@@ -1976,7 +2041,6 @@ buf_page_get_zip(
 	ib_uint64_t	start_time;
 	ib_uint64_t	finish_time;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
-	ibool have_LRU_mutex = FALSE;
 
 	if (UNIV_UNLIKELY(innobase_get_slow_log())) {
 		trx = innobase_get_trx();
@@ -1984,10 +2048,12 @@ buf_page_get_zip(
 	buf_pool->stat.n_page_gets++;
 
 	for (;;) {
-		//buf_pool_mutex_enter(buf_pool);
 lookup:
-		rw_lock_s_lock(&buf_pool->page_hash_latch);
-		bpage = buf_page_hash_get(buf_pool, space, offset);
+
+		/* The following call will also grab the page_hash
+		mutex if the page is found. */
+		bpage = buf_page_hash_get_s_locked(buf_pool, space,
+						offset, &hash_lock);
 		if (bpage) {
 			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 			break;
@@ -1995,9 +2061,7 @@ lookup:
 
 		/* Page not in buf_pool: needs to be read from file */
 
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
+		ut_ad(!hash_lock);
 		buf_read_page(space, zip_size, offset, trx);
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -2005,88 +2069,52 @@ lookup:
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 	}
 
-	if (UNIV_UNLIKELY(!bpage->zip.data)) {
+	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+
+	if (!bpage->zip.data) {
 		/* There is no compressed page. */
 err_exit:
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
+		rw_lock_s_unlock(hash_lock);
 		return(NULL);
 	}
 
 	if (UNIV_UNLIKELY(bpage->is_corrupt && srv_pass_corrupt_table <= 1)) {
 
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
+		rw_lock_s_unlock(hash_lock);
 
 		return(NULL);
 	}
 
-	block_mutex = buf_page_get_mutex_enter(bpage);
-
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 
 	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
 	case BUF_BLOCK_REMOVE_HASH:
-	case BUF_BLOCK_ZIP_FREE:
-		if (block_mutex)
-			mutex_exit(block_mutex);
 		break;
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
-		ut_a(block_mutex == &buf_pool->zip_mutex);
+		block_mutex = &buf_pool->zip_mutex;
+		mutex_enter(block_mutex);
 		bpage->buf_fix_count++;
 		goto got_block;
 	case BUF_BLOCK_FILE_PAGE:
-	{
-		ut_a(block_mutex == &((buf_block_t*) bpage)->mutex);
-
-		/* release mutex to obey to latch-order */
-		mutex_exit(block_mutex);
-
-		/* get LRU_list_mutex for buf_LRU_free_block() */
-		if (!have_LRU_mutex) {
-			mutex_enter(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = TRUE;
-		}
-
-		mutex_enter(block_mutex);
-
-		if (UNIV_UNLIKELY(bpage->space != space
-				  || bpage->offset != offset
-				  || !bpage->in_LRU_list
-				  || !bpage->zip.data)) {
-			/* someone should interrupt, retry */
-			if (have_LRU_mutex) {
-				mutex_exit(&buf_pool->LRU_list_mutex);
-				have_LRU_mutex = FALSE;
-			}
-			mutex_exit(block_mutex);
-			goto lookup;
-		}
-
 		/* Discard the uncompressed page frame if possible. */
-		if (buf_LRU_free_block(bpage, FALSE, &have_LRU_mutex)) {
-			if (have_LRU_mutex) {
-				mutex_exit(&buf_pool->LRU_list_mutex);
-				have_LRU_mutex = FALSE;
-			}
-			mutex_exit(block_mutex);
+		if (!discard_attempted) {
+			rw_lock_s_unlock(hash_lock);
+			buf_block_try_discard_uncompressed(space,
+							   offset);
+			discard_attempted = TRUE;
 			goto lookup;
 		}
 
-		if (have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = FALSE;
-		}
-
+		block_mutex = &((buf_block_t*) bpage)->mutex;
+		mutex_enter(block_mutex);
 		buf_block_buf_fix_inc((buf_block_t*) bpage,
 				      __FILE__, __LINE__);
 		goto got_block;
-	  }
 	}
 
 	ut_error;
@@ -2095,7 +2123,10 @@ err_exit:
 got_block:
 	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
 
-	//buf_pool_mutex_exit(buf_pool);
+	rw_lock_s_unlock(hash_lock);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+	ut_a(!bpage->file_page_was_freed);
+#endif
 
 	buf_page_set_accessed(bpage);
 
@@ -2103,10 +2134,6 @@ got_block:
 
 	buf_page_make_young_if_needed(bpage);
 
-#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(!bpage->file_page_was_freed);
-#endif
-
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
 	ut_a(bpage->buf_fix_count > 0);
@@ -2181,26 +2208,28 @@ buf_zip_decompress(
 	buf_block_t*	block,	/*!< in/out: block */
 	ibool		check)	/*!< in: TRUE=verify the page checksum */
 {
-	const byte*	frame		= block->page.zip.data;
-	ulint		stamp_checksum	= mach_read_from_4(
-		frame + FIL_PAGE_SPACE_OR_CHKSUM);
+	const byte*	frame = block->page.zip.data;
+	ulint		size = page_zip_get_size(&block->page.zip);
 
 	ut_ad(buf_block_get_zip_size(block));
 	ut_a(buf_block_get_space(block) != 0);
 
-	if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) {
-		ulint	calc_checksum	= page_zip_calc_checksum(
-			frame, page_zip_get_size(&block->page.zip));
+	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
 
-		if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: compressed page checksum mismatch"
-				" (space %u page %u): %lu != %lu\n",
-				block->page.space, block->page.offset,
-				stamp_checksum, calc_checksum);
-			return(FALSE);
-		}
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: compressed page checksum mismatch"
+			" (space %u page %u): stored: %lu, crc32: %lu "
+			"innodb: %lu, none: %lu\n",
+			block->page.space, block->page.offset,
+			mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_CRC32),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_INNODB),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_NONE));
+		return(FALSE);
 	}
 
 	switch (fil_page_get_type(frame)) {
@@ -2277,12 +2306,13 @@ buf_block_align_instance(
 			ut_ad(block->frame == page_align(ptr));
 #ifdef UNIV_DEBUG
 			/* A thread that updates these fields must
-			hold buf_pool->mutex and block->mutex.  Acquire
+			hold one of the buf_pool mutexes, depending on the
+			page state, and block->mutex.  Acquire
 			only the latter. */
 			mutex_enter(&block->mutex);
 
 			switch (buf_block_get_state(block)) {
-			case BUF_BLOCK_ZIP_FREE:
+			case BUF_BLOCK_POOL_WATCH:
 			case BUF_BLOCK_ZIP_PAGE:
 			case BUF_BLOCK_ZIP_DIRTY:
 				/* These types should only be used in
@@ -2373,8 +2403,8 @@ buf_pointer_is_block_field_instance(
 	/* TODO: protect buf_pool->chunks with a mutex (it will
 	currently remain constant after buf_pool_init()) */
 	while (chunk < echunk) {
-		if (ptr >= (void *)chunk->blocks
-		    && ptr < (void *)(chunk->blocks + chunk->size)) {
+		if (ptr >= (void*) chunk->blocks
+		    && ptr < (void*) (chunk->blocks + chunk->size)) {
 
 			return(TRUE);
 		}
@@ -2421,15 +2451,35 @@ buf_block_is_uncompressed(
 	const buf_block_t*	block)		/*!< in: pointer to block,
 						not dereferenced */
 {
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-
 	if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
 		/* The pointer should be aligned. */
 		return(FALSE);
 	}
 
-	return(buf_pointer_is_block_field_instance(buf_pool, (void *)block));
+	return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Check whether the forced-flush debug probe is enabled.
+@return true if ib_buf_force_flush or srv_ibuf_disable_background_merge */
+static
+bool
+buf_debug_execute_is_force_flush()
+/*==============================*/
+{
+	DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
+
+	/* This is used during quiesce testing: we want to ensure maximum
+	buffering by the change buffer. */
+
+	if (srv_ibuf_disable_background_merge) {
+		return(true);
+	}
+
+	return(false);
 }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 /********************************************************************//**
 This is the general function used to get access to a database page.
@@ -2456,15 +2506,15 @@ buf_page_get_gen(
 	unsigned	access_time;
 	ulint		fix_type;
 	ibool		must_read;
+	prio_rw_lock_t*	hash_lock;
+	ib_mutex_t*	block_mutex;
 	ulint		retries = 0;
-	mutex_t*	block_mutex = NULL;
 	trx_t*		trx = NULL;
 	ulint		sec;
 	ulint		ms;
 	ib_uint64_t	start_time;
 	ib_uint64_t	finish_time;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
-	ibool           have_LRU_mutex = FALSE;
 
 	ut_ad(mtr);
 	ut_ad(mtr->state == MTR_ACTIVE);
@@ -2498,48 +2548,36 @@ buf_page_get_gen(
 	}
 	buf_pool->stat.n_page_gets++;
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 loop:
 	block = guess;
-	//buf_pool_mutex_enter(buf_pool);
 
+	rw_lock_s_lock(hash_lock);
 	if (block) {
-		block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
-
 		/* If the guess is a compressed page descriptor that
 		has been allocated by buf_page_alloc_descriptor(),
 		it may have been freed by buf_relocate(). */
 
-		if (!block_mutex) {
-			block = guess = NULL;
-		} else if (!buf_block_is_uncompressed(buf_pool, block)
+		if (!buf_block_is_uncompressed(buf_pool, block)
 		    || offset != block->page.offset
 		    || space != block->page.space
 		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
 
-			mutex_exit(block_mutex);
-
+			/* Our guess was bogus or things have changed
+			since. */
 			block = guess = NULL;
 		} else {
 			ut_ad(!block->page.in_zip_hash);
-			ut_ad(block->page.in_page_hash);
 		}
 	}
 
 	if (block == NULL) {
-		rw_lock_s_lock(&buf_pool->page_hash_latch);
 		block = (buf_block_t*) buf_page_hash_get_low(
 			buf_pool, space, offset, fold);
-		if (block) {
-
-			block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
-			ut_a(block_mutex);
-		}
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
 	}
 
-loop2:
-	if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
-		mutex_exit(block_mutex);
+	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+		rw_lock_s_unlock(hash_lock);
 		block = NULL;
 	}
 
@@ -2547,33 +2585,35 @@ loop2:
 		/* Page not in buf_pool: needs to be read from file */
 
 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+			rw_lock_x_lock(hash_lock);
 			block = (buf_block_t*) buf_pool_watch_set(
 				space, offset, fold);
 
 			if (UNIV_LIKELY_NULL(block)) {
-				block_mutex = buf_page_get_mutex((buf_page_t*)block);
-				ut_a(block_mutex);
-				ut_ad(mutex_own(block_mutex));
+				/* We can release hash_lock after we
+				acquire block_mutex to make sure that
+				no state change takes place. */
+				block_mutex = buf_page_get_mutex(&block->page);
+				mutex_enter(block_mutex);
+
+				/* Now safe to release the page_hash lock */
+				rw_lock_x_unlock(hash_lock);
 				goto got_block;
 			}
-		}
 
-		//buf_pool_mutex_exit(buf_pool);
+			rw_lock_x_unlock(hash_lock);
+		}
 
 		if (mode == BUF_GET_IF_IN_POOL
 		    || mode == BUF_PEEK_IF_IN_POOL
 		    || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-
+#ifdef UNIV_SYNC_DEBUG
+			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 			return(NULL);
 		}
 
-		/* We should not hold LRU mutex below when trying
-		to read the page */
-		if (have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = FALSE;
-		}
-
 		if (buf_read_page(space, zip_size, offset, trx)) {
 			buf_read_ahead_random(space, zip_size, offset,
 					      ibuf_inside(mtr), trx);
@@ -2611,8 +2651,18 @@ loop2:
 		goto loop;
 	}
 
+
+	/* We can release hash_lock after we acquire block_mutex to
+	make sure that no state change takes place. */
+	block_mutex = buf_page_get_mutex(&block->page);
+	mutex_enter(block_mutex);
+
+	/* Now safe to release the page_hash lock */
+	rw_lock_s_unlock(hash_lock);
+
 got_block:
 	ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+	ut_ad(mutex_own(block_mutex));
 
 	must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
 
@@ -2623,7 +2673,6 @@ got_block:
 		but we cannot wait around for the read to
 		complete. */
 null_exit:
-		//buf_pool_mutex_exit(buf_pool);
 		mutex_exit(block_mutex);
 
 		return(NULL);
@@ -2633,24 +2682,19 @@ null_exit:
 			  srv_pass_corrupt_table <= 1)) {
 
 		mutex_exit(block_mutex);
+
 		return(NULL);
 	}
 
 	switch (buf_block_get_state(block)) {
 		buf_page_t*	bpage;
-		ibool		success;
 
 	case BUF_BLOCK_FILE_PAGE:
-		if (block_mutex == &buf_pool->zip_mutex) {
-			/* it is wrong mutex... */
-			mutex_exit(block_mutex);
-			goto loop;
-		}
+		ut_ad(block_mutex != &buf_pool->zip_mutex);
 		break;
 
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
-		ut_ad(block_mutex == &buf_pool->zip_mutex);
 		if (mode == BUF_PEEK_IF_IN_POOL) {
 			/* This mode is only used for dropping an
 			adaptive hash index.  There cannot be an
@@ -2660,20 +2704,17 @@ null_exit:
 		}
 
 		bpage = &block->page;
-		/* Protect bpage->buf_fix_count. */
-		//mutex_enter(&buf_pool->zip_mutex);
+		ut_ad(block_mutex == &buf_pool->zip_mutex);
 
 		if (bpage->buf_fix_count
 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
 			/* This condition often occurs when the buffer
 			is not buffer-fixed, but I/O-fixed by
 			buf_page_init_for_read(). */
-			//mutex_exit(&buf_pool->zip_mutex);
+			mutex_exit(&buf_pool->zip_mutex);
 wait_until_unfixed:
 			/* The block is buffer-fixed or I/O-fixed.
 			Try again later. */
-			//buf_pool_mutex_exit(buf_pool);
-			mutex_exit(block_mutex);
 			os_thread_sleep(WAIT_FOR_READ);
 
 			goto loop;
@@ -2685,30 +2726,22 @@ wait_until_unfixed:
 		bpage->buf_fix_count++;
 
 		/* Allocate an uncompressed page. */
-		//buf_pool_mutex_exit(buf_pool);
-		//mutex_exit(&buf_pool->zip_mutex);
-		mutex_exit(block_mutex);
-
+		mutex_exit(&buf_pool->zip_mutex);
 		block = buf_LRU_get_free_block(buf_pool);
 		ut_a(block);
-		block_mutex = &block->mutex;
 
-		//buf_pool_mutex_enter(buf_pool);
-		if (!have_LRU_mutex) {
-			mutex_enter(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = TRUE;
-		}
+		mutex_enter(&buf_pool->LRU_list_mutex);
+
+		rw_lock_x_lock(hash_lock);
+		/* Buffer-fixing prevents the page_hash from changing. */
+		ut_ad(bpage == buf_page_hash_get_low(
+			      buf_pool, space, offset, fold));
 
-		rw_lock_x_lock(&buf_pool->page_hash_latch);
 		mutex_enter(&block->mutex);
 		mutex_enter(&buf_pool->zip_mutex);
-		/* Buffer-fixing prevents the page_hash from changing. */
-		ut_ad(bpage == buf_page_hash_get_low(buf_pool,
-						     space, offset, fold));
 
-		if (UNIV_UNLIKELY
-		    (--bpage->buf_fix_count
-		     || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
+		if (--bpage->buf_fix_count
+		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
 
 			mutex_exit(&buf_pool->zip_mutex);
 			/* The block was buffer-fixed or I/O-fixed while
@@ -2717,15 +2750,11 @@ wait_until_unfixed:
 			This should be extremely unlikely, for example,
 			if buf_page_get_zip() was invoked. */
 
-			buf_LRU_block_free_non_file_page(block, TRUE);
-			//mutex_exit(&block->mutex);
-
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
-			if (have_LRU_mutex) {
-				mutex_exit(&buf_pool->LRU_list_mutex);
-				have_LRU_mutex = FALSE;
-			}
+			buf_LRU_block_free_non_file_page(block);
+			mutex_exit(&buf_pool->LRU_list_mutex);
+			/* zip_mutex was already released above. */
+			rw_lock_x_unlock(hash_lock);
+			mutex_exit(&block->mutex);
 
 			goto wait_until_unfixed;
 		}
@@ -2734,19 +2763,16 @@ wait_until_unfixed:
 		and uncompress it. */
 
 		buf_relocate(bpage, &block->page);
-
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
 		buf_block_init_low(block);
 		block->lock_hash_val = lock_rec_hash(space, offset);
 
 		UNIV_MEM_DESC(&block->page.zip.data,
-			      page_zip_get_size(&block->page.zip), block);
+			      page_zip_get_size(&block->page.zip));
 
 		if (buf_page_get_state(&block->page)
 		    == BUF_BLOCK_ZIP_PAGE) {
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-			UT_LIST_REMOVE(zip_list, buf_pool->zip_clean,
+			UT_LIST_REMOVE(list, buf_pool->zip_clean,
 				       &block->page);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 			ut_ad(!block->page.in_flush_list);
@@ -2764,10 +2790,7 @@ wait_until_unfixed:
 		/* Insert at the front of unzip_LRU list */
 		buf_unzip_LRU_add_block(block, FALSE);
 
-		if (have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = FALSE;
-		}
+		mutex_exit(&buf_pool->LRU_list_mutex);
 
 		block->page.buf_fix_count = 1;
 		buf_block_set_io_fix(block, BUF_IO_READ);
@@ -2775,21 +2798,23 @@ wait_until_unfixed:
 
 		UNIV_MEM_INVALID(bpage, sizeof *bpage);
 
-		access_time = buf_page_is_accessed(&block->page);
+		rw_lock_x_unlock(hash_lock);
 
-		mutex_exit(block_mutex);
-		mutex_exit(&buf_pool->zip_mutex);
+		os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1);
 
-		buf_pool_mutex_enter(buf_pool);
-		buf_pool->n_pend_unzip++;
-		buf_pool_mutex_exit(buf_pool);
+		access_time = buf_page_is_accessed(&block->page);
+		mutex_exit(&block->mutex);
+		mutex_exit(&buf_pool->zip_mutex);
 
 		buf_page_free_descriptor(bpage);
 
 		/* Decompress the page while not holding
-		buf_pool->mutex or block->mutex. */
-		success = buf_zip_decompress(block, srv_use_checksums);
-		ut_a(success);
+		any buf_pool mutex or block->mutex. */
+
+		/* Page checksum verification is already done when
+		the page is read from disk. Hence page checksum
+		verification is not necessary when decompressing the page. */
+		ut_a(buf_zip_decompress(block, FALSE));
 
 		if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
 			if (access_time) {
@@ -2803,20 +2828,15 @@ wait_until_unfixed:
 		}
 
 		/* Unfix and unlatch the block. */
-		//buf_pool_mutex_enter(buf_pool);
-		block_mutex = &block->mutex;
-		mutex_enter(block_mutex);
+		mutex_enter(&block->mutex);
 		block->page.buf_fix_count--;
 		buf_block_set_io_fix(block, BUF_IO_NONE);
-
-		buf_pool_mutex_enter(buf_pool);
-		buf_pool->n_pend_unzip--;
-		buf_pool_mutex_exit(buf_pool);
+		os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
 		rw_lock_x_unlock(&block->lock);
 
 		break;
 
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
@@ -2825,9 +2845,13 @@ wait_until_unfixed:
 		break;
 	}
 
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 
-	//mutex_enter(&block->mutex);
 #if UNIV_WORD_SIZE == 4
 	/* On 32-bit systems, there is no padding in buf_page_t.  On
 	other systems, Valgrind could complain about uninitialized pad
@@ -2835,64 +2859,65 @@ wait_until_unfixed:
 	UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
 #endif
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+
 	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
-	    && ibuf_debug) {
+	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {
 		/* Try to evict the block from the buffer pool, to use the
 		insert buffer (change buffer) as much as possible. */
-		ulint	page_no	= buf_block_get_page_no(block);
 
-		if (buf_LRU_free_block(&block->page, TRUE, &have_LRU_mutex)) {
-			mutex_exit(block_mutex);
+		/* To obey the latching order, release the
+		block->mutex before acquiring buf_pool->LRU_list_mutex. Protect
+		the block from changes by temporarily buffer-fixing it
+		for the time we are not holding block->mutex. */
+
+		buf_block_buf_fix_inc(block, file, line);
+		mutex_exit(&block->mutex);
+		mutex_enter(&buf_pool->LRU_list_mutex);
+		mutex_enter(&block->mutex);
+		buf_block_buf_fix_dec(block);
+
+		if (buf_LRU_free_page(&block->page, true)) {
+			mutex_exit(&block->mutex);
+			rw_lock_x_lock(hash_lock);
+
 			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
 				/* Set the watch, as it would have
 				been set if the page were not in the
 				buffer pool in the first place. */
 				block = (buf_block_t*) buf_pool_watch_set(
 					space, offset, fold);
+			} else {
+				block = (buf_block_t*) buf_page_hash_get_low(
+					buf_pool, space, offset, fold);
+			}
 
-				if (UNIV_LIKELY_NULL(block)) {
-					block_mutex = buf_page_get_mutex((buf_page_t*)block);
-					ut_a(block_mutex);
-					ut_ad(mutex_own(block_mutex));
+			rw_lock_x_unlock(hash_lock);
 
-					/* The page entered the buffer
-					pool for some reason. Try to
-					evict it again. */
-					goto got_block;
-				}
+			if (UNIV_LIKELY_NULL(block)) {
+				/* Either the page has been read in or
+				a watch was set on that in the window
+				where we released the buf_pool::mutex
+				and before we acquire the hash_lock
+				above. Try again. */
+				guess = block;
+				goto loop;
 			}
-			//buf_pool_mutex_exit(buf_pool);
+
 			fprintf(stderr,
 				"innodb_change_buffering_debug evict %u %u\n",
 				(unsigned) space, (unsigned) offset);
 			return(NULL);
-		} else if (UNIV_UNLIKELY(buf_block_get_state(block)
-					 != BUF_BLOCK_FILE_PAGE
-				|| (buf_block_get_page_no(block) != page_no)
-				|| (buf_block_get_space(block) != space))) {
-
-				/* buf_LRU_free_block temporarily releases the
-				block mutex, and now block points to something
-				else. */
-				mutex_exit(block_mutex);
-				block = NULL;
-				goto loop2;
-
 		} else {
-			/* We should not hold LRU mutex below when trying
-			to flush page */
-			if (have_LRU_mutex) {
-				mutex_exit(&buf_pool->LRU_list_mutex);
-				have_LRU_mutex = FALSE;
-			}
 
-			if (buf_flush_page_try(buf_pool, block)) {
-				fprintf(stderr,
-					"innodb_change_buffering_debug flush %u %u\n",
-					(unsigned) space, (unsigned) offset);
-				guess = block;
-				goto loop;
-			}
+			mutex_exit(&buf_pool->LRU_list_mutex);
+		}
+
+		if (buf_flush_page_try(buf_pool, block)) {
+			fprintf(stderr,
+				"innodb_change_buffering_debug flush %u %u\n",
+				(unsigned) space, (unsigned) offset);
+			guess = block;
+			goto loop;
 		}
 
 		/* Failed to evict the page; change it directly */
@@ -2904,7 +2929,6 @@ wait_until_unfixed:
 	ut_a(mode == BUF_GET_POSSIBLY_FREED
 	     || !block->page.file_page_was_freed);
 #endif
-
 	/* Check if this is the first access to the page */
 	access_time = buf_page_is_accessed(&block->page);
 
@@ -2989,6 +3013,11 @@ wait_until_unfixed:
 	ut_a(ibuf_count_get(buf_block_get_space(block),
 			    buf_block_get_page_no(block)) == 0);
 #endif
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 	if (UNIV_UNLIKELY(trx && trx->take_stats)) {
 		_increment_page_get_statistics(block, trx);
 	}
@@ -3006,8 +3035,7 @@ buf_page_optimistic_get(
 /*====================*/
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
 	buf_block_t*	block,	/*!< in: guessed buffer block */
-	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
-				..._GUESS_ON_CLOCK */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr)	/*!< in: mini-transaction */
@@ -3090,7 +3118,9 @@ buf_page_optimistic_get(
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(block->page.file_page_was_freed == FALSE);
+	mutex_enter(&block->mutex);
+	ut_a(!block->page.file_page_was_freed);
+	mutex_exit(&block->mutex);
 #endif
 	if (UNIV_UNLIKELY(innobase_get_slow_log())) {
 		trx = innobase_get_trx();
@@ -3201,7 +3231,18 @@ buf_page_get_known_nowait(
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(mode == BUF_KEEP_OLD || !block->page.file_page_was_freed);
+	if (mode != BUF_KEEP_OLD) {
+		/* If mode == BUF_KEEP_OLD, we are executing an I/O
+		completion routine.  Avoid a bogus assertion failure
+		when ibuf_merge_or_delete_for_page() is processing a
+		page that was just freed due to DROP INDEX, or
+		deleting a record from SYS_INDEXES. This check will be
+		skipped in recv_recover_page() as well. */
+
+		mutex_enter(&block->mutex);
+		ut_a(!block->page.file_page_was_freed);
+		mutex_exit(&block->mutex);
+	}
 #endif
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -3226,7 +3267,7 @@ buf_page_get_known_nowait(
 /*******************************************************************//**
 Given a tablespace id and page number tries to get that page. If the
 page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the kernel mutex.
+Suitable for using when holding the lock_sys_t::mutex.
 @return	pointer to a page or NULL */
 UNIV_INTERN
 const buf_block_t*
@@ -3242,25 +3283,25 @@ buf_page_try_get_func(
 	ibool		success;
 	ulint		fix_type;
 	buf_pool_t*	buf_pool = buf_pool_get(space_id, page_no);
+	prio_rw_lock_t*	hash_lock;
 
 	ut_ad(mtr);
 	ut_ad(mtr->state == MTR_ACTIVE);
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
-	block = buf_block_hash_get(buf_pool, space_id, page_no);
+	block = buf_block_hash_get_s_locked(buf_pool, space_id,
+					    page_no, &hash_lock);
 
 	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
+		if (block) {
+			rw_lock_s_unlock(hash_lock);
+		}
 		return(NULL);
 	}
 
 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
 
 	mutex_enter(&block->mutex);
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
+	rw_lock_s_unlock(hash_lock);
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
@@ -3299,7 +3340,9 @@ buf_page_try_get_func(
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(block->page.file_page_was_freed == FALSE);
+	mutex_enter(&block->mutex);
+	ut_a(!block->page.file_page_was_freed);
+	mutex_exit(&block->mutex);
 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
@@ -3352,13 +3395,15 @@ buf_page_init(
 	buf_page_t*	hash_page;
 
 	ut_ad(buf_pool == buf_pool_get(space, offset));
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
+
 	ut_ad(mutex_own(&(block->mutex)));
 	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
 
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
+			  RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
 	/* Set the state of the block */
 	buf_block_set_file_page(block, space, offset);
 
@@ -3384,14 +3429,17 @@ buf_page_init(
 	if (UNIV_LIKELY(!hash_page)) {
 	} else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
 		/* Preserve the reference count. */
-		ulint	buf_fix_count;
 
 		mutex_enter(&buf_pool->zip_mutex);
-		buf_fix_count = hash_page->buf_fix_count;
+
+		ulint	buf_fix_count = hash_page->buf_fix_count;
+
 		ut_a(buf_fix_count > 0);
 		block->page.buf_fix_count += buf_fix_count;
 		buf_pool_watch_remove(buf_pool, fold, hash_page);
+
 		mutex_exit(&buf_pool->zip_mutex);
+
 	} else {
 		fprintf(stderr,
 			"InnoDB: Error: page %lu %lu already found"
@@ -3401,8 +3449,6 @@ buf_page_init(
 			(const void*) hash_page, (const void*) block);
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 		mutex_exit(&block->mutex);
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
 		buf_print();
 		buf_LRU_print();
 		buf_validate();
@@ -3435,7 +3481,7 @@ UNIV_INTERN
 buf_page_t*
 buf_page_init_for_read(
 /*===================*/
-	ulint*		err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
 	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size, or 0 */
@@ -3449,6 +3495,7 @@ buf_page_init_for_read(
 	buf_block_t*	block;
 	buf_page_t*	bpage	= NULL;
 	buf_page_t*	watch_page;
+	prio_rw_lock_t*	hash_lock;
 	mtr_t		mtr;
 	ulint		fold;
 	ibool		lru	= FALSE;
@@ -3477,8 +3524,7 @@ buf_page_init_for_read(
 		ut_ad(mode == BUF_READ_ANY_PAGE);
 	}
 
-	if (zip_size && UNIV_LIKELY(!unzip)
-	    && UNIV_LIKELY(!recv_recovery_is_on())) {
+	if (zip_size && !unzip && !recv_recovery_is_on()) {
 		block = NULL;
 	} else {
 		block = buf_LRU_get_free_block(buf_pool);
@@ -3487,29 +3533,24 @@ buf_page_init_for_read(
 	}
 
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
+	rw_lock_x_lock(hash_lock);
 
 	watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
-
 	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
 		/* The page is already in the buffer pool. */
 		watch_page = NULL;
 err_exit:
+		mutex_exit(&buf_pool->LRU_list_mutex);
+		rw_lock_x_unlock(hash_lock);
 		if (block) {
 			mutex_enter(&block->mutex);
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-			buf_LRU_block_free_non_file_page(block, FALSE);
+			buf_LRU_block_free_non_file_page(block);
 			mutex_exit(&block->mutex);
 		}
-		else {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		}
 
 		bpage = NULL;
 		goto func_exit;
@@ -3526,16 +3567,18 @@ err_exit:
 
 	if (block) {
 		bpage = &block->page;
+
 		mutex_enter(&block->mutex);
 
 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
 
 		buf_page_init(buf_pool, space, offset, fold, zip_size, block);
-
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
+		rw_lock_x_unlock(hash_lock);
 
 		/* The block must be put to the LRU list, to the old blocks */
 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+		mutex_exit(&buf_pool->LRU_list_mutex);
+		lru = TRUE;
 
 		/* We set a pass-type x-lock on the frame because then
 		the same thread which called for the read operation
@@ -3549,19 +3592,20 @@ err_exit:
 		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
 		buf_page_set_io_fix(bpage, BUF_IO_READ);
 
-		if (UNIV_UNLIKELY(zip_size)) {
-			/* buf_pool->mutex may be released and
+		if (zip_size) {
+			/* buf_pool->LRU_list_mutex may be released and
 			reacquired by buf_buddy_alloc().  Thus, we
 			must release block->mutex in order not to
 			break the latching order in the reacquisition
-			of buf_pool->mutex.  We also must defer this
+			of buf_pool->LRU_list_mutex.  We also must defer this
 			operation until after the block descriptor has
 			been added to buf_pool->LRU and
 			buf_pool->page_hash. */
 			mutex_exit(&block->mutex);
-			data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE);
+			mutex_enter(&buf_pool->LRU_list_mutex);
+			data = buf_buddy_alloc(buf_pool, zip_size, &lru);
 			mutex_enter(&block->mutex);
-			block->page.zip.data = data;
+			block->page.zip.data = (page_zip_t*) data;
 
 			/* To maintain the invariant
 			block->in_unzip_LRU_list
@@ -3570,35 +3614,39 @@ err_exit:
 			after block->page.zip.data is set. */
 			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
 			buf_unzip_LRU_add_block(block, TRUE);
+			mutex_exit(&buf_pool->LRU_list_mutex);
 		}
 
-		mutex_exit(&buf_pool->LRU_list_mutex);
 		mutex_exit(&block->mutex);
 	} else {
+		rw_lock_x_unlock(hash_lock);
+
 		/* The compressed page must be allocated before the
 		control block (bpage), in order to avoid the
 		invocation of buf_buddy_relocate_block() on
 		uninitialized data. */
-		data = buf_buddy_alloc(buf_pool, zip_size, &lru, TRUE);
+		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+
+		rw_lock_x_lock(hash_lock);
 
 		/* If buf_buddy_alloc() allocated storage from the LRU list,
-		it released and reacquired buf_pool->mutex.  Thus, we must
-		check the page_hash again, as it may have been modified. */
+		it released and reacquired buf_pool->LRU_list_mutex.  Thus, we
+		must check the page_hash again, as it may have been
+		modified. */
 		if (UNIV_UNLIKELY(lru)) {
 
 			watch_page = buf_page_hash_get_low(
 				buf_pool, space, offset, fold);
 
-			if (watch_page
+			if (UNIV_UNLIKELY(watch_page
 			    && !buf_pool_watch_is_sentinel(buf_pool,
-				   			   watch_page)) {
+							   watch_page))) {
 
 				/* The block was added by some other thread. */
-				watch_page = NULL;
-				buf_buddy_free(buf_pool, data, zip_size, TRUE);
-
 				mutex_exit(&buf_pool->LRU_list_mutex);
-				rw_lock_x_unlock(&buf_pool->page_hash_latch);
+				rw_lock_x_unlock(hash_lock);
+				watch_page = NULL;
+				buf_buddy_free(buf_pool, data, zip_size);
 
 				bpage = NULL;
 				goto func_exit;
@@ -3612,11 +3660,11 @@ err_exit:
 
 		page_zip_des_init(&bpage->zip);
 		page_zip_set_size(&bpage->zip, zip_size);
-		bpage->zip.data = data;
+		bpage->zip.data = (page_zip_t*) data;
 
 		mutex_enter(&buf_pool->zip_mutex);
 		UNIV_MEM_DESC(bpage->zip.data,
-			      page_zip_get_size(&bpage->zip), bpage);
+			      page_zip_get_size(&bpage->zip));
 
 		buf_page_init_low(bpage);
 
@@ -3629,15 +3677,17 @@ err_exit:
 		bpage->in_zip_hash = FALSE;
 		bpage->in_flush_list = FALSE;
 		bpage->in_free_list = FALSE;
-#endif /* UNIV_DEBUG */
 		bpage->in_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
 
 		ut_d(bpage->in_page_hash = TRUE);
 
 		if (UNIV_LIKELY_NULL(watch_page)) {
+
 			/* Preserve the reference count. */
 			ulint	buf_fix_count = watch_page->buf_fix_count;
 			ut_a(buf_fix_count > 0);
+			ut_ad(buf_own_zip_mutex_for_page(bpage));
 			bpage->buf_fix_count += buf_fix_count;
 			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
 			buf_pool_watch_remove(buf_pool, fold, watch_page);
@@ -3646,15 +3696,14 @@ err_exit:
 		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
 			    bpage);
 
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
+		rw_lock_x_unlock(hash_lock);
 
-		/* The block must be put to the LRU list, to the old blocks
+		/* The block must be put to the LRU list, to the old blocks.
 		The zip_size is already set into the page zip */
 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 		buf_LRU_insert_zip_clean(bpage);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
 		mutex_exit(&buf_pool->LRU_list_mutex);
 
 		buf_page_set_io_fix(bpage, BUF_IO_READ);
@@ -3662,17 +3711,20 @@ err_exit:
 		mutex_exit(&buf_pool->zip_mutex);
 	}
 
-	buf_pool_mutex_enter(buf_pool);
-	buf_pool->n_pend_reads++;
-	buf_pool_mutex_exit(buf_pool);
+	os_atomic_increment_ulint(&buf_pool->n_pend_reads, 1);
 func_exit:
-	//buf_pool_mutex_exit(buf_pool);
 
 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
 
 		ibuf_mtr_commit(&mtr);
 	}
 
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_ad(!bpage || buf_page_in_file(bpage));
 	return(bpage);
 }
@@ -3697,7 +3749,8 @@ buf_page_create(
 	buf_block_t*	block;
 	ulint		fold;
 	buf_block_t*	free_block	= NULL;
-	buf_pool_t*	buf_pool 	= buf_pool_get(space, offset);
+	buf_pool_t*	buf_pool	= buf_pool_get(space, offset);
+	prio_rw_lock_t*	hash_lock;
 
 	ut_ad(mtr);
 	ut_ad(mtr->state == MTR_ACTIVE);
@@ -3706,11 +3759,11 @@ buf_page_create(
 	free_block = buf_LRU_get_free_block(buf_pool);
 
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
+	rw_lock_x_lock(hash_lock);
 
 	block = (buf_block_t*) buf_page_hash_get_low(
 		buf_pool, space, offset, fold);
@@ -3726,9 +3779,8 @@ buf_page_create(
 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 
 		/* Page can be found in buf_pool */
-		//buf_pool_mutex_exit(buf_pool);
+		rw_lock_x_unlock(hash_lock);
 		mutex_exit(&buf_pool->LRU_list_mutex);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
 
 		buf_block_free(free_block);
 
@@ -3749,8 +3801,9 @@ buf_page_create(
 
 	mutex_enter(&block->mutex);
 
-	buf_page_init(buf_pool, space, offset, fold, zip_size,block);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
+	buf_page_init(buf_pool, space, offset, fold, zip_size, block);
+
+	rw_lock_x_unlock(hash_lock);
 
 	/* The block must be put to the LRU list */
 	buf_LRU_add_block(&block->page, FALSE);
@@ -3763,21 +3816,22 @@ buf_page_create(
 		ibool	lru;
 
 		/* Prevent race conditions during buf_buddy_alloc(),
-		which may release and reacquire buf_pool->mutex,
+		which may release and reacquire buf_pool->LRU_list_mutex,
 		by IO-fixing and X-latching the block. */
 
 		buf_page_set_io_fix(&block->page, BUF_IO_READ);
 		rw_lock_x_lock(&block->lock);
+
 		mutex_exit(&block->mutex);
-		/* buf_pool->mutex may be released and reacquired by
+		/* buf_pool->LRU_list_mutex may be released and reacquired by
 		buf_buddy_alloc().  Thus, we must release block->mutex
 		in order not to break the latching order in
-		the reacquisition of buf_pool->mutex.  We also must
+		the reacquisition of buf_pool->LRU_list_mutex.  We also must
 		defer this operation until after the block descriptor
 		has been added to buf_pool->LRU and buf_pool->page_hash. */
-		data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE);
+		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
 		mutex_enter(&block->mutex);
-		block->page.zip.data = data;
+		block->page.zip.data = (page_zip_t*) data;
 
 		/* To maintain the invariant
 		block->in_unzip_LRU_list
@@ -3804,9 +3858,6 @@ buf_page_create(
 
 	ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool, FALSE);
-
 	frame = block->frame;
 
 	memset(frame + FIL_PAGE_PREV, 0xff, 4);
@@ -3832,6 +3883,114 @@ buf_page_create(
 }
 
 /********************************************************************//**
+Monitors buffer page read/write activity: increments the counter matching
+the page type and I/O direction, but only if the MONITOR_MODULE_BUF_PAGE
+(module_buf_page) module is enabled. */
+static
+void
+buf_page_monitor(
+/*=============*/
+	const buf_page_t*	bpage,	/*!< in: page; caller holds its mutex */
+	enum buf_io_fix		io_type)/*!< in: BUF_IO_READ or BUF_IO_WRITE */
+{
+	const byte*	frame;
+	monitor_id_t	counter;
+
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	/* If the counter module is not turned on, just return */
+	if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
+		return;
+	}
+
+	ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+	frame = bpage->zip.data
+		? bpage->zip.data
+		: ((buf_block_t*) bpage)->frame;
+
+	switch (fil_page_get_type(frame)) {
+		ulint	level;	/* 0 selects the leaf-page counters */
+
+	case FIL_PAGE_INDEX:
+		level = btr_page_get_level_low(frame);
+
+		/* Insert buffer index pages are counted separately */
+		if (btr_page_get_index_id(frame)
+		    == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					io_type,
+					MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+			}
+		} else {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+			}
+		}
+		break;
+
+        case FIL_PAGE_UNDO_LOG:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+		break;
+
+        case FIL_PAGE_INODE:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+		break;
+
+        case FIL_PAGE_IBUF_FREE_LIST:
+		counter = MONITOR_RW_COUNTER(io_type,
+					     MONITOR_IBUF_FREELIST_PAGE);
+		break;
+
+        case FIL_PAGE_IBUF_BITMAP:
+		counter = MONITOR_RW_COUNTER(io_type,
+					     MONITOR_IBUF_BITMAP_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_SYS:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_TRX_SYS:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_FSP_HDR:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_XDES:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_BLOB:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_ZBLOB:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_ZBLOB2:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+		break;
+
+	default:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+	}
+
+	MONITOR_INC_NOCHECK(counter);
+}
+
+/********************************************************************//**
 Mark a table with the specified space pointed by bpage->space corrupted.
 Also remove the bpage from LRU list.
 @return TRUE if successful */
@@ -3846,12 +4005,14 @@ buf_mark_space_corrupt(
 					== BUF_BLOCK_FILE_PAGE);
 	ulint		space = bpage->space;
 	ibool		ret = TRUE;
+	const ulint	fold = buf_page_address_fold(bpage->space,
+						     bpage->offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
 	/* First unfix and release lock on the bpage */
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
+	rw_lock_x_lock(hash_lock);
 	mutex_enter(buf_page_get_mutex(bpage));
 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
 	ut_ad(bpage->buf_fix_count == 0);
@@ -3869,18 +4030,14 @@ buf_mark_space_corrupt(
 	if (dict_set_corrupted_by_space(space)) {
 		buf_LRU_free_one_page(bpage);
 	} else {
+		mutex_exit(buf_page_get_mutex(bpage));
 		ret = FALSE;
 	}
 
-	buf_pool_mutex_enter(buf_pool);
-	ut_ad(buf_pool->n_pend_reads > 0);
-	buf_pool->n_pend_reads--;
-	buf_pool_mutex_exit(buf_pool);
-
-	mutex_exit(buf_page_get_mutex(bpage));
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
+
+	ut_ad(buf_pool->n_pend_reads > 0);
+	os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
 
 	return(ret);
 }
@@ -3888,9 +4045,9 @@ buf_mark_space_corrupt(
 /********************************************************************//**
 Completes an asynchronous read or write request of a file page to or from
 the buffer pool.
-@return TRUE if successful */
+@return true if successful */
 UNIV_INTERN
-ibool
+bool
 buf_page_io_complete(
 /*=================*/
 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
@@ -3899,8 +4056,7 @@ buf_page_io_complete(
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 	const ibool	uncompressed = (buf_page_get_state(bpage)
 					== BUF_BLOCK_FILE_PAGE);
-	ibool		have_LRU_mutex = FALSE;
-	mutex_t*	block_mutex;
+	bool		have_LRU_mutex = false;
 
 	ut_a(buf_page_in_file(bpage));
 
@@ -3920,15 +4076,16 @@ buf_page_io_complete(
 
 		if (buf_page_get_zip_size(bpage)) {
 			frame = bpage->zip.data;
-			buf_pool->n_pend_unzip++;
+			os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1);
 			if (uncompressed
 			    && !buf_zip_decompress((buf_block_t*) bpage,
 						   FALSE)) {
 
-				buf_pool->n_pend_unzip--;
+				os_atomic_decrement_ulint(
+					&buf_pool->n_pend_unzip, 1);
 				goto corrupt;
 			}
-			buf_pool->n_pend_unzip--;
+			os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
 		} else {
 			ut_a(uncompressed);
 			frame = ((buf_block_t*) bpage)->frame;
@@ -3941,9 +4098,8 @@ buf_page_io_complete(
 		read_space_id = mach_read_from_4(
 			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 
-		if ((bpage->space == TRX_SYS_SPACE
-		     || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
-		    && trx_doublewrite_page_inside(bpage->offset)) {
+		if (bpage->space == TRX_SYS_SPACE
+		    && buf_dblwr_page_inside(bpage->offset)) {
 
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
@@ -3977,8 +4133,20 @@ buf_page_io_complete(
 		/* From version 3.23.38 up we store the page checksum
 		to the 4 first bytes of the page end lsn field */
 
-		if (buf_page_is_corrupted(TRUE, frame,
+		if (buf_page_is_corrupted(true, frame,
 					  buf_page_get_zip_size(bpage))) {
+
+			/* Not a real corruption if it was triggered by
+			error injection */
+			DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+				if (bpage->space > TRX_SYS_SPACE
+				    && buf_mark_space_corrupt(bpage)) {
+					ib_logf(IB_LOG_LEVEL_INFO,
+						"Simulated page corruption");
+					return(true);
+				}
+				goto page_not_corrupt;
+				;);
 corrupt:
 			fprintf(stderr,
 				"InnoDB: Database page corruption on disk"
@@ -4016,7 +4184,7 @@ corrupt:
 			      REFMAN "forcing-innodb-recovery.html\n"
 			      "InnoDB: about forcing recovery.\n", stderr);
 
-			if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
+			if (srv_pass_corrupt_table && bpage->space != 0
 			    && bpage->space < SRV_LOG_SPACE_FIRST_ID) {
 				trx_t*	trx;
 
@@ -4039,7 +4207,7 @@ corrupt:
 				table as corrupted instead of crashing server */
 				if (bpage->space > TRX_SYS_SPACE
 				    && buf_mark_space_corrupt(bpage)) {
-					return(FALSE);
+					return(false);
 				} else {
 					fputs("InnoDB: Ending processing"
 					      " because of"
@@ -4051,6 +4219,9 @@ corrupt:
 		}
 		} /**/
 
+		DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+				page_not_corrupt:  bpage = bpage; );
+
 		if (recv_recovery_is_on()) {
 			/* Pages must be uncompressed for crash recovery. */
 			ut_a(uncompressed);
@@ -4088,14 +4259,17 @@ corrupt:
 		buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY ||
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 		buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU)) {
+
+		have_LRU_mutex = TRUE; /* optimistic */
 	}
 retry_mutex:
-	if (!have_LRU_mutex) {
+	if (have_LRU_mutex) {
 		mutex_enter(&buf_pool->LRU_list_mutex);
-		have_LRU_mutex = TRUE;
 	}
-	block_mutex = buf_page_get_mutex_enter(bpage);
-	ut_a(block_mutex);
+
+	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+	mutex_enter(block_mutex);
+
 	if (io_type == BUF_IO_WRITE
 			  && (
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -4108,7 +4282,6 @@ retry_mutex:
 		mutex_exit(block_mutex);
 		goto retry_mutex;
 	}
-	buf_pool_mutex_enter(buf_pool);
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
 	if (io_type == BUF_IO_WRITE || uncompressed) {
@@ -4123,23 +4296,20 @@ retry_mutex:
 	removes the newest lock debug record, without checking the thread
 	id. */
 
-	buf_page_set_io_fix(bpage, BUF_IO_NONE);
-
 	switch (io_type) {
 	case BUF_IO_READ:
+
+		buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
 		/* NOTE that the call to ibuf may have moved the ownership of
 		the x-latch to this OS thread: do not let this confuse you in
 		debugging! */
 
-		if (have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = FALSE;
-		}
-
-		ut_a(!have_LRU_mutex);
 		ut_ad(buf_pool->n_pend_reads > 0);
-		buf_pool->n_pend_reads--;
-		buf_pool->stat.n_pages_read++;
+		os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
+		os_atomic_increment_ulint(&buf_pool->stat.n_pages_read, 1);
+
+		ut_ad(!have_LRU_mutex);
 
 		if (uncompressed) {
 			rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
@@ -4154,9 +4324,10 @@ retry_mutex:
 
 		buf_flush_write_complete(bpage);
 
+		os_atomic_increment_ulint(&buf_pool->stat.n_pages_written, 1);
+
 		if (have_LRU_mutex) {
 			mutex_exit(&buf_pool->LRU_list_mutex);
-			have_LRU_mutex = FALSE;
 		}
 
 		if (uncompressed) {
@@ -4164,14 +4335,14 @@ retry_mutex:
 					     BUF_IO_WRITE);
 		}
 
-		buf_pool->stat.n_pages_written++;
-
 		break;
 
 	default:
 		ut_error;
 	}
 
+	buf_page_monitor(bpage, io_type);
+
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr, "Has %s page space %lu page no %lu\n",
@@ -4181,40 +4352,9 @@ retry_mutex:
 	}
 #endif /* UNIV_DEBUG */
 
-	buf_pool_mutex_exit(buf_pool);
 	mutex_exit(block_mutex);
 
-	return(TRUE);
-}
-
-/********************************************************************//**
-*/
-UNIV_INTERN
-buf_block_t*
-buf_page_from_array(
-/*================*/
-	buf_pool_t*	buf_pool,
-	ulint		n_block)
-{
-	ulint		n_chunks, offset;
-	buf_chunk_t*	chunk;
-
-	ut_a(n_block < buf_pool->curr_size);
-
-	chunk = buf_pool->chunks;
-	offset = n_block;
-
-	for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
-		if (offset < chunk->size) {
-			return(&chunk->blocks[offset]);
-		}
-
-		offset -= chunk->size;
-	}
-
-	ut_error;
-
-	return(NULL);
+	return(true);
 }
 
 /*********************************************************************//**
@@ -4231,17 +4371,17 @@ buf_all_freed_instance(
 
 	ut_ad(buf_pool);
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
-
 	chunk = buf_pool->chunks;
 
 	for (i = buf_pool->n_chunks; i--; chunk++) {
 
+		mutex_enter(&buf_pool->LRU_list_mutex);
+
 		const buf_block_t* block = buf_chunk_not_freed(chunk);
 
+		mutex_exit(&buf_pool->LRU_list_mutex);
+
 		if (UNIV_LIKELY_NULL(block)) {
 			fprintf(stderr,
 				"Page %lu %lu still fixed or dirty\n",
@@ -4251,10 +4391,6 @@ buf_all_freed_instance(
 		}
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
-	mutex_exit(&buf_pool->LRU_list_mutex);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
 	return(TRUE);
 }
 
@@ -4266,10 +4402,11 @@ buf_pool_invalidate_instance(
 /*=========================*/
 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
-	ibool		freed;
-	enum buf_flush	i;
+	ulint		i;
 
-	buf_pool_mutex_enter(buf_pool);
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
+	mutex_enter(&buf_pool->flush_state_mutex);
 
 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
 
@@ -4285,23 +4422,20 @@ buf_pool_invalidate_instance(
 		pool invalidation to proceed we must ensure there is NO
 		write activity happening. */
 		if (buf_pool->n_flush[i] > 0) {
-			buf_pool_mutex_exit(buf_pool);
-			buf_flush_wait_batch_end(buf_pool, i);
-			buf_pool_mutex_enter(buf_pool);
+			buf_flush_t	type = static_cast<buf_flush_t>(i);
+
+			mutex_exit(&buf_pool->flush_state_mutex);
+			buf_flush_wait_batch_end(buf_pool, type);
+			mutex_enter(&buf_pool->flush_state_mutex);
 		}
 	}
-
-	buf_pool_mutex_exit(buf_pool);
+	mutex_exit(&buf_pool->flush_state_mutex);
 
 	ut_ad(buf_all_freed_instance(buf_pool));
 
-	freed = TRUE;
-
-	while (freed) {
-		freed = buf_LRU_search_and_free_block(buf_pool, 100);
+	while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
 	}
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
@@ -4311,13 +4445,11 @@ buf_pool_invalidate_instance(
 	buf_pool->freed_page_clock = 0;
 	buf_pool->LRU_old = NULL;
 	buf_pool->LRU_old_len = 0;
-	buf_pool->LRU_flush_ended = 0;
+
+	mutex_exit(&buf_pool->LRU_list_mutex);
 
 	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
 	buf_refresh_io_stats(buf_pool);
-
-	//buf_pool_mutex_exit(buf_pool);
-	mutex_exit(&buf_pool->LRU_list_mutex);
 }
 
 /*********************************************************************//**
@@ -4349,21 +4481,25 @@ buf_pool_validate_instance(
 	buf_page_t*	b;
 	buf_chunk_t*	chunk;
 	ulint		i;
-	ulint		n_single_flush	= 0;
 	ulint		n_lru_flush	= 0;
+	ulint		n_page_flush	= 0;
 	ulint		n_list_flush	= 0;
 	ulint		n_lru		= 0;
 	ulint		n_flush		= 0;
 	ulint		n_free		= 0;
 	ulint		n_zip		= 0;
+	ulint		fold		= 0;
+	ulint		space		= 0;
+	ulint		offset		= 0;
 
 	ut_ad(buf_pool);
 
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
-	/* for keep the new latch order, it cannot validate correctly... */
+	hash_lock_x_all(buf_pool->page_hash);
+	mutex_enter(&buf_pool->zip_mutex);
+	mutex_enter(&buf_pool->free_list_mutex);
+	mutex_enter(&buf_pool->flush_state_mutex);
 
 	chunk = buf_pool->chunks;
 
@@ -4376,10 +4512,8 @@ buf_pool_validate_instance(
 
 		for (j = chunk->size; j--; block++) {
 
-			mutex_enter(&block->mutex);
-
 			switch (buf_block_get_state(block)) {
-			case BUF_BLOCK_ZIP_FREE:
+			case BUF_BLOCK_POOL_WATCH:
 			case BUF_BLOCK_ZIP_PAGE:
 			case BUF_BLOCK_ZIP_DIRTY:
 				/* These should only occur on
@@ -4388,22 +4522,26 @@ buf_pool_validate_instance(
 				break;
 
 			case BUF_BLOCK_FILE_PAGE:
-				ut_a(buf_page_hash_get(buf_pool,
-						       buf_block_get_space(
-							       block),
-						       buf_block_get_page_no(
-							       block))
+
+				space = buf_block_get_space(block);
+				offset = buf_block_get_page_no(block);
+				fold = buf_page_address_fold(space, offset);
+				ut_a(buf_page_hash_get_low(buf_pool,
+							   space,
+							   offset,
+							   fold)
 				     == &block->page);
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
-				ut_a(buf_page_get_io_fix(&block->page)
+				ut_a(buf_page_get_io_fix_unlocked(&block->page)
 				     == BUF_IO_READ
 				     || !ibuf_count_get(buf_block_get_space(
 								block),
 							buf_block_get_page_no(
 								block)));
 #endif
-				switch (buf_page_get_io_fix(&block->page)) {
+				switch (buf_page_get_io_fix_unlocked(
+						&block->page)) {
 				case BUF_IO_NONE:
 					break;
 
@@ -4411,16 +4549,8 @@ buf_pool_validate_instance(
 					switch (buf_page_get_flush_type(
 							&block->page)) {
 					case BUF_FLUSH_LRU:
-						n_lru_flush++;
-						ut_a(rw_lock_is_locked(
-							     &block->lock,
-							     RW_LOCK_SHARED));
-						break;
-					case BUF_FLUSH_LIST:
-						n_list_flush++;
-						break;
 					case BUF_FLUSH_SINGLE_PAGE:
-						n_single_flush++;
+					case BUF_FLUSH_LIST:
 						break;
 					default:
 						ut_error;
@@ -4451,17 +4581,13 @@ buf_pool_validate_instance(
 				/* do nothing */
 				break;
 			}
-
-			mutex_exit(&block->mutex);
 		}
 	}
 
-	mutex_enter(&buf_pool->zip_mutex);
-
 	/* Check clean compressed-only blocks. */
 
 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
-	     b = UT_LIST_GET_NEXT(zip_list, b)) {
+	     b = UT_LIST_GET_NEXT(list, b)) {
 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
 		switch (buf_page_get_io_fix(b)) {
 		case BUF_IO_NONE:
@@ -4469,7 +4595,7 @@ buf_pool_validate_instance(
 			/* All clean blocks should be I/O-unfixed. */
 			break;
 		case BUF_IO_READ:
-			/* In buf_LRU_free_block(), we temporarily set
+			/* In buf_LRU_free_page(), we temporarily set
 			b->io_fix = BUF_IO_READ for a newly allocated
 			control block in order to prevent
 			buf_page_get_gen() from decompressing the block. */
@@ -4483,8 +4609,9 @@ buf_pool_validate_instance(
 		we have acquired buf_pool->zip_mutex above which acts
 		as the 'block->mutex' for these bpages. */
 		ut_a(!b->oldest_modification);
-		ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
-
+		fold = buf_page_address_fold(b->space, b->offset);
+		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+					   fold) == b);
 		n_lru++;
 		n_zip++;
 	}
@@ -4493,7 +4620,7 @@ buf_pool_validate_instance(
 
 	buf_flush_list_mutex_enter(buf_pool);
 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
-	     b = UT_LIST_GET_NEXT(flush_list, b)) {
+	     b = UT_LIST_GET_NEXT(list, b)) {
 		ut_ad(b->in_flush_list);
 		ut_a(b->oldest_modification);
 		n_flush++;
@@ -4502,7 +4629,9 @@ buf_pool_validate_instance(
 		case BUF_BLOCK_ZIP_DIRTY:
 			n_lru++;
 			n_zip++;
-			switch (buf_page_get_io_fix(b)) {
+			/* fallthrough */
+		case BUF_BLOCK_FILE_PAGE:
+			switch (buf_page_get_io_fix_unlocked(b)) {
 			case BUF_IO_NONE:
 			case BUF_IO_READ:
 			case BUF_IO_PIN:
@@ -4512,22 +4641,21 @@ buf_pool_validate_instance(
 				case BUF_FLUSH_LRU:
 					n_lru_flush++;
 					break;
+				case BUF_FLUSH_SINGLE_PAGE:
+					n_page_flush++;
+					break;
 				case BUF_FLUSH_LIST:
 					n_list_flush++;
 					break;
-				case BUF_FLUSH_SINGLE_PAGE:
-					n_single_flush++;
-					break;
 				default:
 					ut_error;
 				}
 				break;
+			default:
+				ut_error;
 			}
 			break;
-		case BUF_BLOCK_FILE_PAGE:
-			/* uncompressed page */
-			break;
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_ZIP_PAGE:
 		case BUF_BLOCK_NOT_USED:
 		case BUF_BLOCK_READY_FOR_USE:
@@ -4536,11 +4664,14 @@ buf_pool_validate_instance(
 			ut_error;
 			break;
 		}
-		ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
+		fold = buf_page_address_fold(b->space, b->offset);
+		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+					   fold) == b);
 	}
 
 	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
 
+	hash_unlock_x_all(buf_pool->page_hash);
 	buf_flush_list_mutex_exit(buf_pool);
 
 	mutex_exit(&buf_pool->zip_mutex);
@@ -4553,8 +4684,9 @@ buf_pool_validate_instance(
 	}
 
 	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
-	/* because of latching order with block->mutex, we cannot get needed mutexes before that */
-/*
+
+	mutex_exit(&buf_pool->LRU_list_mutex);
+
 	if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
 		fprintf(stderr, "Free list len %lu, free blocks %lu\n",
 			(ulong) UT_LIST_GET_LEN(buf_pool->free),
@@ -4562,14 +4694,13 @@ buf_pool_validate_instance(
 		ut_error;
 	}
 
-	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
+	mutex_exit(&buf_pool->free_list_mutex);
+
 	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
 	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
-*/
+	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
 
-	//buf_pool_mutex_exit(buf_pool);
-	mutex_exit(&buf_pool->LRU_list_mutex);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
+	mutex_exit(&buf_pool->flush_state_mutex);
 
 	ut_a(buf_LRU_validate());
 	ut_a(buf_flush_validate(buf_pool));
@@ -4622,14 +4753,13 @@ buf_print_instance(
 
 	size = buf_pool->curr_size;
 
-	index_ids = mem_alloc(size * sizeof *index_ids);
-	counts = mem_alloc(sizeof(ulint) * size);
-
-	//buf_pool_mutex_enter(buf_pool);
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-	mutex_enter(&buf_pool->LRU_list_mutex);
-	mutex_enter(&buf_pool->free_list_mutex);
-	buf_flush_list_mutex_enter(buf_pool);
+	index_ids = static_cast<index_id_t*>(
+		mem_alloc(size * sizeof *index_ids));
+
+	counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
+
+	/* Dirty reads below */
 
 	fprintf(stderr,
 		"buf_pool size %lu\n"
@@ -4656,12 +4786,12 @@ buf_print_instance(
 		(ulong) buf_pool->stat.n_pages_created,
 		(ulong) buf_pool->stat.n_pages_written);
 
-	buf_flush_list_mutex_exit(buf_pool);
-
 	/* Count the number of blocks belonging to each index in the buffer */
 
 	n_found = 0;
 
+	mutex_enter(&buf_pool->LRU_list_mutex);
+
 	chunk = buf_pool->chunks;
 
 	for (i = buf_pool->n_chunks; i--; chunk++) {
@@ -4697,9 +4827,7 @@ buf_print_instance(
 		}
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
-	mutex_exit(&buf_pool->free_list_mutex);
 
 	for (i = 0; i < n_found; i++) {
 		index = dict_index_get_if_in_cache(index_ids[i]);
@@ -4756,7 +4884,8 @@ buf_get_latched_pages_number_instance(
 	buf_chunk_t*	chunk;
 	ulint		fixed_pages_number = 0;
 
-	//buf_pool_mutex_enter(buf_pool);
+	/* The LRU list mutex is enough to protect the required fields below */
+	mutex_enter(&buf_pool->LRU_list_mutex);
 
 	chunk = buf_pool->chunks;
 
@@ -4773,24 +4902,23 @@ buf_get_latched_pages_number_instance(
 				continue;
 			}
 
-			mutex_enter(&block->mutex);
-
 			if (block->page.buf_fix_count != 0
-			    || buf_page_get_io_fix(&block->page)
+			    || buf_page_get_io_fix_unlocked(&block->page)
 			    != BUF_IO_NONE) {
 				fixed_pages_number++;
 			}
 
-			mutex_exit(&block->mutex);
 		}
 	}
 
+	mutex_exit(&buf_pool->LRU_list_mutex);
+
 	mutex_enter(&buf_pool->zip_mutex);
 
 	/* Traverse the lists of clean and dirty compressed-only blocks. */
 
 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
-	     b = UT_LIST_GET_NEXT(zip_list, b)) {
+	     b = UT_LIST_GET_NEXT(list, b)) {
 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
 		ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
 
@@ -4802,7 +4930,7 @@ buf_get_latched_pages_number_instance(
 
 	buf_flush_list_mutex_enter(buf_pool);
 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
-	     b = UT_LIST_GET_NEXT(flush_list, b)) {
+	     b = UT_LIST_GET_NEXT(list, b)) {
 		ut_ad(b->in_flush_list);
 
 		switch (buf_page_get_state(b)) {
@@ -4814,13 +4942,16 @@ buf_get_latched_pages_number_instance(
 			break;
 		case BUF_BLOCK_FILE_PAGE:
 			/* uncompressed page */
+		case BUF_BLOCK_REMOVE_HASH:
+			/* We hold flush list but not LRU list mutex here.
+			Thus encountering BUF_BLOCK_REMOVE_HASH pages is
+			possible.  */
 			break;
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_ZIP_PAGE:
 		case BUF_BLOCK_NOT_USED:
 		case BUF_BLOCK_READY_FOR_USE:
 		case BUF_BLOCK_MEMORY:
-		case BUF_BLOCK_REMOVE_HASH:
 			ut_error;
 			break;
 		}
@@ -4828,7 +4959,6 @@ buf_get_latched_pages_number_instance(
 
 	buf_flush_list_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->zip_mutex);
-	//buf_pool_mutex_exit(buf_pool);
 
 	return(fixed_pages_number);
 }
@@ -4859,26 +4989,18 @@ buf_get_latched_pages_number(void)
 #endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
-Returns the number of pending buf pool ios.
-@return	number of pending I/O operations */
+Returns the number of pending buf pool read ios.
+@return	number of pending read I/O operations */
 UNIV_INTERN
 ulint
-buf_get_n_pending_ios(void)
-/*=======================*/
+buf_get_n_pending_read_ios(void)
+/*============================*/
 {
 	ulint	i;
 	ulint	pend_ios = 0;
 
 	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		pend_ios +=
-			buf_pool->n_pend_reads
-			+ buf_pool->n_flush[BUF_FLUSH_LRU]
-			+ buf_pool->n_flush[BUF_FLUSH_LIST]
-			+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		pend_ios += buf_pool_from_array(i)->n_pend_reads;
 	}
 
 	return(pend_ios);
@@ -4936,8 +5058,6 @@ buf_stats_aggregate_pool_info(
 	total_info->n_pend_reads += pool_info->n_pend_reads;
 	total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
 	total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
-	total_info->n_pending_flush_single_page +=
-		 pool_info->n_pending_flush_single_page;
 	total_info->n_pages_made_young += pool_info->n_pages_made_young;
 	total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
 	total_info->n_pages_read += pool_info->n_pages_read;
@@ -4985,12 +5105,7 @@ buf_stats_get_pool_info(
 
 	/* Find appropriate pool_info to store stats for this buffer pool */
 	pool_info = &all_pool_info[pool_id];
-
 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-	mutex_enter(&buf_pool->LRU_list_mutex);
-	mutex_enter(&buf_pool->free_list_mutex);
-	buf_pool_mutex_enter(buf_pool);
-	buf_flush_list_mutex_enter(buf_pool);
 
 	pool_info->pool_unique_id = pool_id;
 
@@ -5010,6 +5125,8 @@ buf_stats_get_pool_info(
 
 	pool_info->n_pend_reads = buf_pool->n_pend_reads;
 
+	mutex_enter(&buf_pool->flush_state_mutex);
+
 	pool_info->n_pending_flush_lru =
 		 (buf_pool->n_flush[BUF_FLUSH_LRU]
 		  + buf_pool->init_flush[BUF_FLUSH_LRU]);
@@ -5019,9 +5136,10 @@ buf_stats_get_pool_info(
 		  + buf_pool->init_flush[BUF_FLUSH_LIST]);
 
 	pool_info->n_pending_flush_single_page =
-		 buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+		  + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
 
-	buf_flush_list_mutex_exit(buf_pool);
+	mutex_exit(&buf_pool->flush_state_mutex);
 
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time,
@@ -5104,9 +5222,6 @@ buf_stats_get_pool_info(
 	pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
 
 	buf_refresh_io_stats(buf_pool);
-	mutex_exit(&buf_pool->LRU_list_mutex);
-	mutex_exit(&buf_pool->free_list_mutex);
-	buf_pool_mutex_exit(buf_pool);
 }
 
 /*********************************************************************//**
@@ -5122,7 +5237,7 @@ buf_print_io_instance(
 
 	fprintf(file,
 		"Buffer pool size        %lu\n"
-		"Buffer pool size, bytes %lu\n"
+		"Buffer pool size, bytes " ULINTPF "\n"
 		"Free buffers            %lu\n"
 		"Database pages          %lu\n"
 		"Old database pages      %lu\n"
@@ -5212,8 +5327,10 @@ buf_print_io(
 		pool_info_total = &pool_info[srv_buf_pool_instances];
 	} else {
 		ut_a(srv_buf_pool_instances == 1);
-		pool_info_total = pool_info = (buf_pool_info_t*) mem_zalloc(
-			sizeof *pool_info)
+
+		pool_info_total = pool_info =
+			static_cast<buf_pool_info_t*>(
+				mem_zalloc(sizeof *pool_info));
 	}
 
 	for (i = 0; i < srv_buf_pool_instances; i++) {
@@ -5271,9 +5388,7 @@ void
 buf_refresh_io_stats_all(void)
 /*==========================*/
 {
-	ulint		i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
@@ -5290,9 +5405,7 @@ ibool
 buf_all_freed(void)
 /*===============*/
 {
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
@@ -5300,7 +5413,7 @@ buf_all_freed(void)
 		if (!buf_all_freed_instance(buf_pool)) {
 			return(FALSE);
 		}
- 	}
+	}
 
 	return(TRUE);
 }
@@ -5311,27 +5424,27 @@ pool.
 @return	number of pending i/o */
 UNIV_INTERN
 ulint
-buf_pool_check_num_pending_io(void)
-/*===============================*/
+buf_pool_check_no_pending_io(void)
+/*==============================*/
 {
 	ulint		i;
 	ulint		pending_io = 0;
 
-	buf_pool_mutex_enter_all();
-
 	for (i = 0; i < srv_buf_pool_instances; i++) {
-		const buf_pool_t*	buf_pool;
+		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
 
-		pending_io += buf_pool->n_pend_reads
-			      + buf_pool->n_flush[BUF_FLUSH_LRU]
-			      + buf_pool->n_flush[BUF_FLUSH_LIST]
-			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		pending_io += buf_pool->n_pend_reads;
 
-	}
+		mutex_enter(&buf_pool->flush_state_mutex);
 
-	buf_pool_mutex_exit_all();
+		pending_io += buf_pool->n_flush[BUF_FLUSH_LRU];
+		pending_io += buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		pending_io += buf_pool->n_flush[BUF_FLUSH_LIST];
+
+		mutex_exit(&buf_pool->flush_state_mutex);
+	}
 
 	return(pending_io);
 }
@@ -5348,12 +5461,10 @@ buf_get_free_list_len(void)
 {
 	ulint	len;
 
-	//buf_pool_mutex_enter(buf_pool);
 	mutex_enter(&buf_pool->free_list_mutex);
 
 	len = UT_LIST_GET_LEN(buf_pool->free);
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->free_list_mutex);
 
 	return(len);
@@ -5382,7 +5493,7 @@ buf_page_init_for_backup_restore(
 
 	/* We assume that block->page.data has been allocated
 	with zip_size == UNIV_PAGE_SIZE. */
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 	ut_ad(ut_is_2pow(zip_size));
 	page_zip_set_size(&block->page.zip, zip_size);
 	if (zip_size) {
diff --git a/storage/xtradb/buf/buf0checksum.cc b/storage/xtradb/buf/buf0checksum.cc
new file mode 100644
index 00000000000..ec79bbe6be9
--- /dev/null
+++ b/storage/xtradb/buf/buf0checksum.cc
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "fil0fil.h" /* FIL_* */
+#include "ut0crc32.h" /* ut_crc32() */
+#include "ut0rnd.h" /* ut_fold_binary() */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srv0srv.h" /* SRV_CHECKSUM_* */
+#include "buf0types.h"
+
+/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we
+use srv_checksum_algorithm_t here then we get a compiler error:
+ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to
+  'long unsigned int*' in initialization */
+UNIV_INTERN ulong	srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ib_uint32_t	checksum;
+
+	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+	to the first pages of data files, we have to skip them in the page
+	checksum calculation.
+	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+	checksum is stored, and also the last 8 bytes of page because
+	there we store the old formula checksum. */
+
+	checksum = ut_crc32(page + FIL_PAGE_OFFSET,
+			    FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+		^ ut_crc32(page + FIL_PAGE_DATA,
+			   UNIV_PAGE_SIZE - FIL_PAGE_DATA
+			   - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	return(checksum);
+}
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint checksum;
+
+	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+	to the first pages of data files, we have to skip them in the page
+	checksum calculation.
+	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+	checksum is stored, and also the last 8 bytes of page because
+	there we store the old formula checksum. */
+
+	checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+				  FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+		+ ut_fold_binary(page + FIL_PAGE_DATA,
+				 UNIV_PAGE_SIZE - FIL_PAGE_DATA
+				 - FIL_PAGE_END_LSN_OLD_CHKSUM);
+	checksum = checksum & 0xFFFFFFFFUL;
+
+	return(checksum);
+}
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint checksum;
+
+	checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+	checksum = checksum & 0xFFFFFFFFUL;
+
+	return(checksum);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return	algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+	srv_checksum_algorithm_t	algo)	/*!< in: algorithm */
+{
+	switch (algo) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		return("crc32");
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		return("innodb");
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return("none");
+	}
+
+	ut_error;
+	return(NULL);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc
new file mode 100644
index 00000000000..506a5b177ba
--- /dev/null
+++ b/storage/xtradb/buf/buf0dblwr.cc
@@ -0,0 +1,1136 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+
+#ifndef UNIV_HOTBACKUP
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	buf_dblwr_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** The doublewrite buffer */
+UNIV_INTERN buf_dblwr_t*	buf_dblwr = NULL;
+
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool	buf_dblwr_being_created = FALSE;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+	ulint	page_no)	/*!< in: page number */
+{
+	if (buf_dblwr == NULL) {
+
+		return(FALSE);
+	}
+
+	if (page_no >= buf_dblwr->block1
+	    && page_no < buf_dblwr->block1
+	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		return(TRUE);
+	}
+
+	if (page_no >= buf_dblwr->block2
+	    && page_no < buf_dblwr->block2
+	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
+doublewrite buffer within it.
+@return	pointer to the doublewrite buffer within the filespace header
+page. */
+UNIV_INLINE
+byte*
+buf_dblwr_get(
+/*==========*/
+	mtr_t*	mtr)	/*!< in/out: MTR to hold the page latch */
+{
+	buf_block_t*	block;
+
+	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+			     RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+}
+
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written to the dblwr buffer on disk. */
+UNIV_INLINE
+void
+buf_dblwr_sync_datafiles()
+/*======================*/
+{
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system */
+	os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted to
+	the OS */
+	os_aio_wait_until_no_pending_writes();
+
+	/* Now we flush the data to disk (for example, with fsync) */
+	fil_flush_file_spaces(FIL_TABLESPACE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at a database start. */
+static
+void
+buf_dblwr_init(
+/*===========*/
+	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
+				header on trx sys page */
+{
+	ulint	buf_size;
+
+	buf_dblwr = static_cast<buf_dblwr_t*>(
+		mem_zalloc(sizeof(buf_dblwr_t)));
+
+	/* There are two blocks of same size in the doublewrite
+	buffer. */
+	buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+	/* There must be at least one buffer for single page writes
+	and one buffer for batch writes. */
+	ut_a(srv_doublewrite_batch_size > 0
+	     && srv_doublewrite_batch_size < buf_size);
+
+	mutex_create(buf_dblwr_mutex_key,
+		     &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
+
+	buf_dblwr->b_event = os_event_create();
+	buf_dblwr->s_event = os_event_create();
+	buf_dblwr->first_free = 0;
+	buf_dblwr->s_reserved = 0;
+	buf_dblwr->b_reserved = 0;
+
+	buf_dblwr->block1 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+	buf_dblwr->block2 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+	buf_dblwr->in_use = static_cast<bool*>(
+		mem_zalloc(buf_size * sizeof(bool)));
+
+	buf_dblwr->write_buf_unaligned = static_cast<byte*>(
+		ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
+
+	buf_dblwr->write_buf = static_cast<byte*>(
+		ut_align(buf_dblwr->write_buf_unaligned,
+			 UNIV_PAGE_SIZE));
+
+	buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
+		mem_zalloc(buf_size * sizeof(void*)));
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void)
+/*==================*/
+{
+	buf_block_t*	block2;
+	buf_block_t*	new_block;
+	byte*	doublewrite;
+	byte*	fseg_header;
+	ulint	page_no;
+	ulint	prev_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	if (buf_dblwr) {
+		/* Already inited */
+
+		return;
+	}
+
+start_again:
+	mtr_start(&mtr);
+	buf_dblwr_being_created = TRUE;
+
+	doublewrite = buf_dblwr_get(&mtr);
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has already been created:
+		just read in some numbers */
+
+		buf_dblwr_init(doublewrite);
+
+		mtr_commit(&mtr);
+		buf_dblwr_being_created = FALSE;
+		return;
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Doublewrite buffer not found: creating new");
+
+	if (buf_pool_get_curr_size()
+	    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+		+ FSP_EXTENT_SIZE / 2 + 100)
+	       * UNIV_PAGE_SIZE)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot create doublewrite buffer: you must "
+			"increase your buffer pool size. Cannot continue "
+			"operation.");
+
+		exit(EXIT_FAILURE);
+	}
+
+	block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+			     TRX_SYS_DOUBLEWRITE
+			     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+	if (block2 == NULL) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot create doublewrite buffer: you must "
+			"increase your tablespace size. "
+			"Cannot continue operation.");
+
+		/* We exit without committing the mtr to prevent
+		its modifications to the database getting to disk */
+
+		exit(EXIT_FAILURE);
+	}
+
+	/* fseg_create acquires a second latch on the page, therefore we
+	must declare it; block2 is known to be non-NULL at this point. */
+
+	buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+	fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
+	prev_page_no = 0;
+
+	for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+		     + FSP_EXTENT_SIZE / 2; i++) {
+		new_block = fseg_alloc_free_page(
+			fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+		if (new_block == NULL) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Cannot create doublewrite buffer: you must "
+				"increase your tablespace size. "
+				"Cannot continue operation.");
+
+			exit(EXIT_FAILURE);
+		}
+
+		/* We read the allocated pages to the buffer pool;
+		when they are written to disk in a flush, the space
+		id and page number fields are also written to the
+		pages. When we at database startup read pages
+		from the doublewrite buffer, we know that if the
+		space id and page number in them are the same as
+		the page position in the tablespace, then the page
+		has not been written to in doublewrite. */
+
+		ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+		page_no = buf_block_get_page_no(new_block);
+
+		if (i == FSP_EXTENT_SIZE / 2) {
+			ut_a(page_no == FSP_EXTENT_SIZE);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+					 page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_REPEAT
+					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+					 page_no, MLOG_4BYTES, &mtr);
+
+		} else if (i == FSP_EXTENT_SIZE / 2
+			   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+			ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+					 page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_REPEAT
+					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+					 page_no, MLOG_4BYTES, &mtr);
+
+		} else if (i > FSP_EXTENT_SIZE / 2) {
+			ut_a(page_no == prev_page_no + 1);
+		}
+
+		if (((i + 1) & 15) == 0) {
+			/* rw_locks can only be recursively x-locked
+			2048 times. (on 32 bit platforms,
+			(lint) 0 - (X_LOCK_DECR * 2049)
+			is no longer a negative number, and thus
+			lock_word becomes like a shared lock).
+			For 4k page size this loop will
+			lock the fseg header too many times. Since
+			this code is not done while any other threads
+			are active, restart the MTR occasionally. */
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+			doublewrite = buf_dblwr_get(&mtr);
+			fseg_header = doublewrite
+				      + TRX_SYS_DOUBLEWRITE_FSEG;
+		}
+
+		prev_page_no = page_no;
+	}
+
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+			 MLOG_4BYTES, &mtr);
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+			 + TRX_SYS_DOUBLEWRITE_REPEAT,
+			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+			 MLOG_4BYTES, &mtr);
+
+	mlog_write_ulint(doublewrite
+			 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+			 MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	/* Flush the modified pages to disk and make a checkpoint */
+	log_make_checkpoint_at(LSN_MAX, TRUE);
+
+	/* Remove doublewrite pages from LRU */
+	buf_pool_invalidate();
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
+
+	goto start_again;
+}
+
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+buf_dblwr_init_or_restore_pages(
+/*============================*/
+	ibool	restore_corrupt_pages)	/*!< in: TRUE=restore pages */
+{
+	byte*	buf;
+	byte*	read_buf;
+	byte*	unaligned_read_buf;
+	ulint	block1;
+	ulint	block2;
+	byte*	page;
+	ibool	reset_space_ids = FALSE;
+	byte*	doublewrite;
+	ulint	space_id;
+	ulint	page_no;
+	ulint	i;
+
+	/* We do the file i/o past the buffer pool */
+
+	unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+	read_buf = static_cast<byte*>(
+		ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
+
+	/* Read the trx sys header to check if we are using the doublewrite
+	buffer */
+
+	fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
+	       UNIV_PAGE_SIZE, read_buf, NULL);
+	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has been created */
+
+		buf_dblwr_init(doublewrite);
+
+		block1 = buf_dblwr->block1;
+		block2 = buf_dblwr->block2;
+
+		buf = buf_dblwr->write_buf;
+	} else {
+		goto leave_func;
+	}
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+		/* We are upgrading from a version < 4.1.x to a version where
+		multiple tablespaces are supported. We must reset the space id
+		field in the pages in the doublewrite buffer because starting
+		from this version the space id is stored to
+		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+		reset_space_ids = TRUE;
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Resetting space id's in the doublewrite buffer");
+	}
+
+	/* Read the pages from the doublewrite buffer to memory */
+
+	fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf, NULL);
+	fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       NULL);
+	/* Check if any of these pages is half-written in data files, in the
+	intended position */
+
+	page = buf;
+
+	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+		ulint source_page_no;
+		page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+		if (reset_space_ids) {
+
+			space_id = 0;
+			mach_write_to_4(page
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+			/* We do not need to calculate new checksums for the
+			pages because the field .._SPACE_ID does not affect
+			them. Write the page back to where we read it from. */
+
+			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				source_page_no = block1 + i;
+			} else {
+				source_page_no = block2
+					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+			}
+
+			fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0,
+			       UNIV_PAGE_SIZE, page, NULL);
+		} else {
+
+			space_id = mach_read_from_4(
+				page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+		}
+
+		if (!restore_corrupt_pages) {
+			/* The database was shut down gracefully: no need to
+			restore pages */
+
+		} else if (!fil_tablespace_exists_in_mem(space_id)) {
+			/* Maybe we have dropped the single-table tablespace
+			and this page once belonged to it: do nothing */
+
+		} else if (!fil_check_adress_in_tablespace(space_id,
+							   page_no)) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"A page in the doublewrite buffer is not "
+				"within space bounds; space id %lu "
+				"page number %lu, page %lu in "
+				"doublewrite buf.",
+				(ulong) space_id, (ulong) page_no, (ulong) i);
+
+		} else if (space_id == TRX_SYS_SPACE
+			   && ((page_no >= block1
+				&& page_no
+				< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+			       || (page_no >= block2
+				   && page_no
+				   < (block2
+				      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+			/* It is an unwritten doublewrite buffer page:
+			do nothing */
+		} else {
+			ulint	zip_size = fil_space_get_zip_size(space_id);
+
+			/* Read in the actual page from the file */
+			fil_io(OS_FILE_READ, true, space_id, zip_size,
+			       page_no, 0,
+			       zip_size ? zip_size : UNIV_PAGE_SIZE,
+			       read_buf, NULL);
+
+			/* Check if the page is corrupt */
+
+			if (buf_page_is_corrupted(true, read_buf, zip_size)) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: database page"
+					" corruption or a failed\n"
+					"InnoDB: file read of"
+					" space %lu page %lu.\n"
+					"InnoDB: Trying to recover it from"
+					" the doublewrite buffer.\n",
+					(ulong) space_id, (ulong) page_no);
+
+				if (buf_page_is_corrupted(true,
+							  page, zip_size)) {
+					fprintf(stderr,
+						"InnoDB: Dump of the page:\n");
+					buf_page_print(
+						read_buf, zip_size,
+						BUF_PAGE_PRINT_NO_CRASH);
+					fprintf(stderr,
+						"InnoDB: Dump of"
+						" corresponding page"
+						" in doublewrite buffer:\n");
+					buf_page_print(
+						page, zip_size,
+						BUF_PAGE_PRINT_NO_CRASH);
+
+					fprintf(stderr,
+						"InnoDB: Also the page in the"
+						" doublewrite buffer"
+						" is corrupt.\n"
+						"InnoDB: Cannot continue"
+						" operation.\n"
+						"InnoDB: You can try to"
+						" recover the database"
+						" with the my.cnf\n"
+						"InnoDB: option:\n"
+						"InnoDB:"
+						" innodb_force_recovery=6\n");
+					ut_error;
+				}
+
+				/* Write the good page from the
+				doublewrite buffer to the intended
+				position */
+
+				fil_io(OS_FILE_WRITE, true, space_id,
+				       zip_size, page_no, 0,
+				       zip_size ? zip_size : UNIV_PAGE_SIZE,
+				       page, NULL);
+
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"Recovered the page from"
+					" the doublewrite buffer.");
+			}
+		}
+
+		page += UNIV_PAGE_SIZE;
+	}
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+	ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Frees doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void)
+/*================*/
+{
+	/* Free the double write data structures. */
+	ut_a(buf_dblwr != NULL);
+	ut_ad(buf_dblwr->s_reserved == 0);
+	ut_ad(buf_dblwr->b_reserved == 0);
+
+	os_event_free(buf_dblwr->b_event);
+	os_event_free(buf_dblwr->s_event);
+	ut_free(buf_dblwr->write_buf_unaligned);
+	buf_dblwr->write_buf_unaligned = NULL;
+
+	mem_free(buf_dblwr->buf_block_arr);
+	buf_dblwr->buf_block_arr = NULL;
+
+	mem_free(buf_dblwr->in_use);
+	buf_dblwr->in_use = NULL;
+
+	mutex_free(&buf_dblwr->mutex);
+	mem_free(buf_dblwr);
+	buf_dblwr = NULL;
+}
+
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
+	buf_flush_t		flush_type)/*!< in: flush type */
+{
+	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+		return;
+	}
+
+	switch (flush_type) {
+	case BUF_FLUSH_LIST:
+	case BUF_FLUSH_LRU:
+		mutex_enter(&buf_dblwr->mutex);
+
+		ut_ad(buf_dblwr->batch_running);
+		ut_ad(buf_dblwr->b_reserved > 0);
+		ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
+
+		buf_dblwr->b_reserved--;
+
+		if (buf_dblwr->b_reserved == 0) {
+			mutex_exit(&buf_dblwr->mutex);
+			/* This will finish the batch. Sync data files
+			to the disk. */
+			fil_flush_file_spaces(FIL_TABLESPACE);
+			mutex_enter(&buf_dblwr->mutex);
+
+			/* We can now reuse the doublewrite memory buffer: */
+			buf_dblwr->first_free = 0;
+			buf_dblwr->batch_running = false;
+			os_event_set(buf_dblwr->b_event);
+		}
+
+		mutex_exit(&buf_dblwr->mutex);
+		break;
+	case BUF_FLUSH_SINGLE_PAGE:
+		{
+			const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+			ulint i;
+			mutex_enter(&buf_dblwr->mutex);
+			for (i = srv_doublewrite_batch_size; i < size; ++i) {
+				if (buf_dblwr->buf_block_arr[i] == bpage) {
+					buf_dblwr->s_reserved--;
+					buf_dblwr->buf_block_arr[i] = NULL;
+					buf_dblwr->in_use[i] = false;
+					break;
+				}
+			}
+
+			/* The block we are looking for must exist as a
+			reserved block. */
+			ut_a(i < size);
+		}
+		os_event_set(buf_dblwr->s_event);
+		mutex_exit(&buf_dblwr->mutex);
+		break;
+	case BUF_FLUSH_N_TYPES:
+		ut_error;
+	}
+}
+
+/********************************************************************//**
+Check the LSN values on the page. */
+static
+void
+buf_dblwr_check_page_lsn(
+/*=====================*/
+	const page_t*	page)		/*!< in: page to check */
+{
+	if (memcmp(page + (FIL_PAGE_LSN + 4),
+		   page + (UNIV_PAGE_SIZE
+			   - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+		   4)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: ERROR: The page to be written"
+			" seems corrupt!\n"
+			"InnoDB: The low 4 bytes of LSN fields do not match "
+			"(" ULINTPF " != " ULINTPF ")!"
+			" Noticed in the buffer pool.\n",
+			mach_read_from_4(
+				page + FIL_PAGE_LSN + 4),
+			mach_read_from_4(
+				page + UNIV_PAGE_SIZE
+				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+	}
+}
+
+/********************************************************************//**
+Asserts when a corrupt block is found during writing out data to the
+disk. */
+static
+void
+buf_dblwr_assert_on_corrupt_block(
+/*==============================*/
+	const buf_block_t*	block)	/*!< in: block to check */
+{
+	buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Apparent corruption of an"
+		" index page n:o %lu in space %lu\n"
+		"InnoDB: to be written to data file."
+		" We intentionally crash server\n"
+		"InnoDB: to prevent corrupt data"
+		" from ending up in data\n"
+		"InnoDB: files.\n",
+		(ulong) buf_block_get_page_no(block),
+		(ulong) buf_block_get_space(block));
+
+	ut_error;
+}
+
+/********************************************************************//**
+Check the LSN values on the page with which this block is associated.
+Also validate the page if the option is set. */
+static
+void
+buf_dblwr_check_block(
+/*==================*/
+	const buf_block_t*	block)	/*!< in: block to check */
+{
+	if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+	    || block->page.zip.data) {
+		/* No simple validate for compressed pages exists. */
+		return;
+	}
+
+	buf_dblwr_check_page_lsn(block->frame);
+
+	if (!block->check_index_page_at_flush) {
+		return;
+	}
+
+	if (page_is_comp(block->frame)) {
+		if (!page_simple_validate_new(block->frame)) {
+			buf_dblwr_assert_on_corrupt_block(block);
+		}
+	} else if (!page_simple_validate_old(block->frame)) {
+
+		buf_dblwr_assert_on_corrupt_block(block);
+	}
+}
+
+/********************************************************************//**
+Writes a page that has already been written to the doublewrite buffer
+to the datafile. It is the job of the caller to sync the datafile. */
+static
+void
+buf_dblwr_write_block_to_datafile(
+/*==============================*/
+	const buf_page_t*	bpage,	/*!< in: page to write */
+	bool			sync)	/*!< in: true if sync IO
+					is requested */
+{
+	ut_a(bpage);
+	ut_a(buf_page_in_file(bpage));
+
+	const ulint flags = sync
+		? OS_FILE_WRITE
+		: OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
+
+	if (bpage->zip.data) {
+		fil_io(flags, sync, buf_page_get_space(bpage),
+		       buf_page_get_zip_size(bpage),
+		       buf_page_get_page_no(bpage), 0,
+		       buf_page_get_zip_size(bpage),
+		       (void*) bpage->zip.data,
+		       (void*) bpage);
+
+		return;
+	}
+
+
+	const buf_block_t* block = (buf_block_t*) bpage;
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	buf_dblwr_check_page_lsn(block->frame);
+
+	fil_io(flags, sync, buf_block_get_space(block), 0,
+	       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+	       (void*) block->frame, (void*) block);
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void)
+/*=================================*/
+{
+	byte*		write_buf;
+	ulint		first_free;
+	ulint		len;
+
+	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+		/* Sync the writes to the disk. */
+		buf_dblwr_sync_datafiles();
+		return;
+	}
+
+try_again:
+	mutex_enter(&buf_dblwr->mutex);
+
+	/* Write first to doublewrite buffer blocks. We use synchronous
+	aio and thus know that file write has been completed when the
+	control returns. */
+
+	if (buf_dblwr->first_free == 0) {
+
+		mutex_exit(&buf_dblwr->mutex);
+
+		return;
+	}
+
+	if (buf_dblwr->batch_running) {
+		/* Another thread is running the batch right now. Wait
+		for it to finish. */
+		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
+		mutex_exit(&buf_dblwr->mutex);
+
+		os_event_wait_low(buf_dblwr->b_event, sig_count);
+		goto try_again;
+	}
+
+	ut_a(!buf_dblwr->batch_running);
+	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+
+	/* Disallow anyone else to post to doublewrite buffer or to
+	start another batch of flushing. */
+	buf_dblwr->batch_running = true;
+	first_free = buf_dblwr->first_free;
+
+	/* Now safe to release the mutex. Note that though no other
+	thread is allowed to post to the doublewrite batch flushing
+	but any threads working on single page flushes are allowed
+	to proceed. */
+	mutex_exit(&buf_dblwr->mutex);
+
+	write_buf = buf_dblwr->write_buf;
+
+	for (ulint len2 = 0, i = 0;
+	     i < buf_dblwr->first_free;
+	     len2 += UNIV_PAGE_SIZE, i++) {
+
+		const buf_block_t*	block;
+
+		block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+
+		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+		    || block->page.zip.data) {
+			/* No simple validate for compressed
+			pages exists. */
+			continue;
+		}
+
+		/* Check that the actual page in the buffer pool is
+		not corrupt and the LSN values are sane. */
+		buf_dblwr_check_block(block);
+
+		/* Check that the page as written to the doublewrite
+		buffer has sane LSN values. */
+		buf_dblwr_check_page_lsn(write_buf + len2);
+	}
+
+	/* Write out the first block of the doublewrite buffer */
+	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+		     buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+
+	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+	       buf_dblwr->block1, 0, len,
+	       (void*) write_buf, NULL);
+
+	if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		/* No unwritten pages in the second block. */
+		goto flush;
+	}
+
+	/* Write out the second block of the doublewrite buffer. */
+	len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+	       * UNIV_PAGE_SIZE;
+
+	write_buf = buf_dblwr->write_buf
+		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+	       buf_dblwr->block2, 0, len,
+	       (void*) write_buf, NULL);
+
+flush:
+	/* increment the doublewrite flushed pages counter */
+	srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+	srv_stats.dblwr_writes.inc();
+
+	/* Now flush the doublewrite buffer data to disk */
+	fil_flush(TRX_SYS_SPACE);
+
+	/* We know that the writes have been flushed to disk now
+	and in recovery we will find them in the doublewrite buffer
+	blocks. Next do the writes to the intended positions. */
+
+	/* Up to this point first_free and buf_dblwr->first_free are
+	same because we have set the buf_dblwr->batch_running flag
+	disallowing any other thread to post any request but we
+	can't safely access buf_dblwr->first_free in the loop below.
+	This is so because it is possible that after we are done with
+	the last iteration and before we terminate the loop, the batch
+	gets finished in the IO helper thread and another thread posts
+	a new batch setting buf_dblwr->first_free to a higher value.
+	If this happens and we are using buf_dblwr->first_free in the
+	loop termination condition then we'll end up dispatching
+	the same block twice from two different threads. */
+	ut_ad(first_free == buf_dblwr->first_free);
+	for (ulint i = 0; i < first_free; i++) {
+		buf_dblwr_write_block_to_datafile(
+			buf_dblwr->buf_block_arr[i], false);
+	}
+
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system. We don't flush the files
+	at this point. We leave it to the IO helper thread to flush
+	datafiles when the whole batch has been processed. */
+	os_aio_simulated_wake_handler_threads();
+}
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+	buf_page_t*	bpage)	/*!< in: buffer block to write */
+{
+	ulint	zip_size;
+
+	ut_a(buf_page_in_file(bpage));
+	ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
+
+try_again:
+	mutex_enter(&buf_dblwr->mutex);
+
+	ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
+
+	if (buf_dblwr->batch_running) {
+
+		/* This is not nearly as bad as it looks. Only the
+		page_cleaner thread does background flushing in
+		batches, therefore it is unlikely to be a contention
+		point. The only exception is when a user thread is
+		forced to do a flush batch because of a sync
+		checkpoint. */
+		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
+		mutex_exit(&buf_dblwr->mutex);
+
+		os_event_wait_low(buf_dblwr->b_event, sig_count);
+		goto try_again;
+	}
+
+	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+		mutex_exit(&(buf_dblwr->mutex));
+		/* Buffer full: flush the batch, then retry from scratch. */
+		buf_dblwr_flush_buffered_writes();
+
+		goto try_again;
+	}
+
+	zip_size = buf_page_get_zip_size(bpage);
+
+	if (zip_size) {
+		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+		/* Copy the compressed page and clear the rest. */
+		memcpy(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+		       bpage->zip.data, zip_size);
+		memset(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+	} else {
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+				   UNIV_PAGE_SIZE);
+
+		memcpy(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+	}
+
+	buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
+
+	buf_dblwr->first_free++;
+	buf_dblwr->b_reserved++;
+
+	ut_ad(!buf_dblwr->batch_running);
+	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+	ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
+
+	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+		mutex_exit(&(buf_dblwr->mutex));
+		/* This page filled the buffer: flush it right away. */
+		buf_dblwr_flush_buffered_writes();
+
+		return;
+	}
+
+	mutex_exit(&(buf_dblwr->mutex));
+}
+
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, sync it, then write
+the page to the datafile and sync the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+	buf_page_t*	bpage,	/*!< in: buffer block to write */
+	bool		sync)	/*!< in: true if sync IO requested */
+{
+	ulint		n_slots;
+	ulint		size;
+	ulint		zip_size;
+	ulint		offset;
+	ulint		i;
+
+	ut_a(buf_page_in_file(bpage));
+	ut_a(srv_use_doublewrite_buf);
+	ut_a(buf_dblwr != NULL);
+
+	/* The slots available for single page flushes range from
+	srv_doublewrite_batch_size to the end of the doublewrite
+	buffer. */
+	size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+	ut_a(size > srv_doublewrite_batch_size);
+	n_slots = size - srv_doublewrite_batch_size;
+
+	if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+
+		/* Check that the actual page in the buffer pool is
+		not corrupt and the LSN values are sane. */
+		buf_dblwr_check_block((buf_block_t*) bpage);
+
+		/* Check that the page as written to the doublewrite
+		buffer has sane LSN values. */
+		if (!bpage->zip.data) {
+			buf_dblwr_check_page_lsn(
+				((buf_block_t*) bpage)->frame);
+		}
+	}
+
+retry:
+	mutex_enter(&buf_dblwr->mutex);
+	if (buf_dblwr->s_reserved == n_slots) {
+
+		/* All slots are reserved. */
+		ib_int64_t	sig_count =
+			os_event_reset(buf_dblwr->s_event);
+		mutex_exit(&buf_dblwr->mutex);
+		os_event_wait_low(buf_dblwr->s_event, sig_count);
+
+		goto retry;
+	}
+	/* Scan the single page flush slots for a free one. */
+	for (i = srv_doublewrite_batch_size; i < size; ++i) {
+
+		if (!buf_dblwr->in_use[i]) {
+			break;
+		}
+	}
+
+	/* We are guaranteed to find a slot. */
+	ut_a(i < size);
+	buf_dblwr->in_use[i] = true;
+	buf_dblwr->s_reserved++;
+	buf_dblwr->buf_block_arr[i] = bpage;
+
+	/* increment the doublewrite flushed pages counter */
+	srv_stats.dblwr_pages_written.inc();
+	srv_stats.dblwr_writes.inc();
+
+	mutex_exit(&buf_dblwr->mutex);
+
+	/* Let's see if we are going to write in the first or second
+	block of the doublewrite buffer. */
+	if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		offset = buf_dblwr->block1 + i;
+	} else {
+		offset = buf_dblwr->block2 + i
+			 - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+	}
+
+	/* We deal with compressed and uncompressed pages a little
+	differently here. In case of uncompressed pages we can
+	directly write the block to the allocated slot in the
+	doublewrite buffer in the system tablespace and then after
+	syncing the system tablespace we can proceed to write the page
+	in the datafile.
+	In case of compressed page we first do a memcpy of the block
+	to the in-memory buffer of doublewrite before proceeding to
+	write it. This is so because we want to pad the remaining
+	bytes in the doublewrite page with zeros. */
+
+	zip_size = buf_page_get_zip_size(bpage);
+	if (zip_size) {
+		memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
+		       bpage->zip.data, zip_size);
+		memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+
+		fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+		       offset, 0, UNIV_PAGE_SIZE,
+		       (void*) (buf_dblwr->write_buf
+				+ UNIV_PAGE_SIZE * i), NULL);
+	} else {
+		/* It is a regular page. Write it directly to the
+		doublewrite buffer */
+		fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+		       offset, 0, UNIV_PAGE_SIZE,
+		       (void*) ((buf_block_t*) bpage)->frame,
+		       NULL);
+	}
+
+	/* Now flush the doublewrite buffer data to disk */
+	fil_flush(TRX_SYS_SPACE);
+
+	/* We know that the write has been flushed to disk now
+	and during recovery we will find it in the doublewrite buffer
+	blocks. Next do the write to the intended position. */
+	buf_dblwr_write_block_to_datafile(bpage, sync);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0dump.cc b/storage/xtradb/buf/buf0dump.cc
new file mode 100644
index 00000000000..090e8cac63b
--- /dev/null
+++ b/storage/xtradb/buf/buf0dump.cc
@@ -0,0 +1,621 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdarg.h> /* va_* */
+#include <string.h> /* strerror() */
+
+#include "buf0buf.h" /* srv_buf_pool_instances */
+#include "buf0dump.h"
+#include "db0err.h"
+#include "dict0dict.h" /* dict_operation_lock */
+#include "os0file.h" /* OS_FILE_MAX_PATH */
+#include "os0sync.h" /* os_event* */
+#include "os0thread.h" /* os_thread_* */
+#include "srv0srv.h" /* srv_fast_shutdown, srv_buf_dump* */
+#include "srv0start.h" /* srv_shutdown_state */
+#include "sync0rw.h" /* rw_lock_s_lock() */
+#include "ut0byte.h" /* ut_ull_create() */
+#include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */
+
+enum status_severity {
+	STATUS_INFO,	/*!< only stored in the status variable */
+	STATUS_NOTICE,	/*!< stored and also printed to stderr */
+	STATUS_ERR	/*!< stored and also printed to stderr */
+};
+
+#define SHUTTING_DOWN()	(UNIV_UNLIKELY(srv_shutdown_state \
+				       != SRV_SHUTDOWN_NONE))
+
+/* Flags that tell the buffer pool dump/load thread which action it should
+take after being woken up. */
+static ibool	buf_dump_should_start = FALSE;
+static ibool	buf_load_should_start = FALSE;
+/* Raised by buf_load_abort(), polled and cleared by buf_load(). */
+static ibool	buf_load_abort_flag = FALSE;
+
+/* Used to temporarily store dump info in order to avoid IO while holding
+the buffer pool LRU list mutex during dump and also to sort the contents of
+the dump before reading the pages from disk during load.
+We store the space id in the high 32 bits and page no in low 32 bits. */
+typedef ib_uint64_t	buf_dump_t;
+
+/* Aux macros to create buf_dump_t and to extract space and page from it */
+#define BUF_DUMP_CREATE(space, page)	ut_ull_create(space, page)
+#define BUF_DUMP_SPACE(a)		((ulint) ((a) >> 32))
+#define BUF_DUMP_PAGE(a)		((ulint) ((a) & 0xFFFFFFFFUL))
+
+/*****************************************************************//**
+Requests a buffer pool dump: raises buf_dump_should_start and signals
+srv_buf_dump_event to wake the dump/load thread. Called by MySQL code via
+buffer_pool_dump_now(); it must return immediately because the whole
+MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_dump_start()
+/*============*/
+{
+	buf_dump_should_start = TRUE;
+	os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Requests a buffer pool load: raises buf_load_should_start and signals
+srv_buf_dump_event to wake the dump/load thread. Called by MySQL code via
+buffer_pool_load_now(); it must return immediately because the whole
+MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_start()
+/*============*/
+{
+	buf_load_should_start = TRUE;
+	os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Formats a printf(3)-style message into the global variable behind
+MySQL's innodb_buffer_pool_dump_status. Messages of severity
+STATUS_NOTICE or STATUS_ERR are additionally written to stderr.
+The current value can be read with:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or with:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+	enum status_severity	severity,/*!< in: how important the message is */
+	const char*		fmt,	/*!< in: printf(3) format string */
+	...)				/*!< in: arguments consumed
+					by fmt */
+{
+	va_list	args;
+
+	va_start(args, fmt);
+	ut_vsnprintf(export_vars.innodb_buffer_pool_dump_status,
+		     sizeof(export_vars.innodb_buffer_pool_dump_status),
+		     fmt, args);
+	va_end(args);
+
+	/* Purely informational updates are not echoed to stderr. */
+	if (severity != STATUS_NOTICE && severity != STATUS_ERR) {
+		return;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: %s\n",
+		export_vars.innodb_buffer_pool_dump_status);
+}
+
+/*****************************************************************//**
+Formats a printf(3)-style message into the global variable behind
+MySQL's innodb_buffer_pool_load_status. Messages of severity
+STATUS_NOTICE or STATUS_ERR are additionally written to stderr.
+The current value can be read with:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or with:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+	enum status_severity	severity,/*!< in: how important the message is */
+	const char*	fmt,	/*!< in: printf(3) format string */
+	...)			/*!< in: arguments consumed by fmt */
+{
+	va_list	args;
+
+	va_start(args, fmt);
+	ut_vsnprintf(export_vars.innodb_buffer_pool_load_status,
+		     sizeof(export_vars.innodb_buffer_pool_load_status),
+		     fmt, args);
+	va_end(args);
+
+	/* Purely informational updates are not echoed to stderr. */
+	if (severity != STATUS_NOTICE && severity != STATUS_ERR) {
+		return;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: %s\n",
+		export_vars.innodb_buffer_pool_load_status);
+}
+
+/*****************************************************************//**
+Dump the buffer pool(s) into innodb_buffer_pool_filename (relative to
+srv_data_home). The data is written to a '.incomplete' file which is
+renamed over the real one at the end. Errors are reported via
+innodb_buffer_pool_dump_status, see buf_dump_status(). Set the name with:
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+	ibool	obey_shutdown)	/*!< in: quit if we are in a shutting down
+				state */
+{
+#define SHOULD_QUIT()	(SHUTTING_DOWN() && obey_shutdown)
+
+	char	full_filename[OS_FILE_MAX_PATH];
+	char	tmp_filename[OS_FILE_MAX_PATH];
+	char	now[32];
+	FILE*	f;
+	ulint	i;
+	int	ret;
+
+	ut_snprintf(full_filename, sizeof(full_filename),
+		    "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+		    srv_buf_dump_filename);
+
+	ut_snprintf(tmp_filename, sizeof(tmp_filename),
+		    "%s.incomplete", full_filename);
+
+	buf_dump_status(STATUS_NOTICE, "Dumping buffer pool(s) to %s",
+			full_filename);
+
+	f = fopen(tmp_filename, "w");
+	if (f == NULL) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot open '%s' for writing: %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	/* the temporary file is open for writing */
+
+	/* walk through each buffer pool */
+	for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
+		buf_pool_t*		buf_pool;
+		const buf_page_t*	bpage;
+		buf_dump_t*		dump;
+		ulint			n_pages;
+		ulint			j;
+
+		buf_pool = buf_pool_from_array(i);
+
+		/* obtain buf_pool LRU list mutex before allocating, since
+		UT_LIST_GET_LEN(buf_pool->LRU) could change */
+		mutex_enter(&buf_pool->LRU_list_mutex);
+
+		n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+
+		/* skip empty buffer pools */
+		if (n_pages == 0) {
+			mutex_exit(&buf_pool->LRU_list_mutex);
+			continue;
+		}
+
+		dump = static_cast<buf_dump_t*>(
+			ut_malloc(n_pages * sizeof(*dump))) ;
+
+		if (dump == NULL) {
+			mutex_exit(&buf_pool->LRU_list_mutex);
+			fclose(f);
+			buf_dump_status(STATUS_ERR,
+					"Cannot allocate " ULINTPF " bytes: %s",
+					(ulint) (n_pages * sizeof(*dump)),
+					strerror(errno));
+			/* tmp_filename is left on disk */
+			return;
+		}
+
+		for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), j = 0;
+		     bpage != NULL;
+		     bpage = UT_LIST_GET_PREV(LRU, bpage), j++) {
+
+			ut_a(buf_page_in_file(bpage));
+
+			dump[j] = BUF_DUMP_CREATE(buf_page_get_space(bpage),
+						  buf_page_get_page_no(bpage));
+		}
+
+		ut_a(j == n_pages);
+
+		mutex_exit(&buf_pool->LRU_list_mutex);
+
+		for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+			ret = fprintf(f, ULINTPF "," ULINTPF "\n",
+				      BUF_DUMP_SPACE(dump[j]),
+				      BUF_DUMP_PAGE(dump[j]));
+			if (ret < 0) {
+				ut_free(dump);
+				fclose(f);
+				buf_dump_status(STATUS_ERR,
+						"Cannot write to '%s': %s",
+						tmp_filename, strerror(errno));
+				/* tmp_filename is left on disk */
+				return;
+			}
+
+			if (j % 128 == 0) {
+				buf_dump_status(
+					STATUS_INFO,
+					"Dumping buffer pool "
+					ULINTPF "/" ULINTPF ", "
+					"page " ULINTPF "/" ULINTPF,
+					i + 1, srv_buf_pool_instances,
+					j + 1, n_pages);
+			}
+		}
+
+		ut_free(dump);
+	}
+
+	ret = fclose(f);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot close '%s': %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	/* NOTE(review): also reached if SHOULD_QUIT() cut the loops short */
+
+	ret = unlink(full_filename);
+	if (ret != 0 && errno != ENOENT) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot delete '%s': %s",
+				full_filename, strerror(errno));
+		/* tmp_filename is left on disk */
+		return;
+	}
+	/* the old dump is gone (or never existed) */
+
+	ret = rename(tmp_filename, full_filename);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot rename '%s' to '%s': %s",
+				tmp_filename, full_filename,
+				strerror(errno));
+		/* tmp_filename is left on disk */
+		return;
+	}
+	/* the new dump is now in place */
+
+	/* success */
+
+	ut_sprintf_timestamp(now);
+
+	buf_dump_status(STATUS_NOTICE,
+			"Buffer pool(s) dump completed at %s", now);
+}
+
+/*****************************************************************//**
+Three-way comparison of two buffer pool dump entries. Sorting the dump
+with it orders the entries on space_no,page_no, which increases the
+chance for sequential IO during load.
+@return -1/0/1 if entry 1 is smaller/equal/bigger than entry 2 */
+static
+lint
+buf_dump_cmp(
+/*=========*/
+	const buf_dump_t	d1,	/*!< in: buffer pool dump entry 1 */
+	const buf_dump_t	d2)	/*!< in: buffer pool dump entry 2 */
+{
+	if (d1 == d2) {
+		return(0);
+	}
+
+	/* The entries differ, so only the direction is left
+	to decide. */
+	return(d1 < d2 ? -1 : 1);
+}
+
+/*****************************************************************//**
+Sort dump[low, high) on space_no, page_no, using tmp as temp storage. */
+static
+void
+buf_dump_sort(
+/*==========*/
+	buf_dump_t*	dump,	/*!< in/out: buffer pool dump to sort */
+	buf_dump_t*	tmp,	/*!< in/out: temp storage */
+	ulint		low,	/*!< in: lowest index (inclusive) */
+	ulint		high)	/*!< in: highest index (non-inclusive) */
+{
+	UT_SORT_FUNCTION_BODY(buf_dump_sort, dump, tmp, low, high,
+			      buf_dump_cmp);
+}
+
+/*****************************************************************//**
+Load the buffer pool(s) from innodb_buffer_pool_filename (relative to
+srv_data_home): parse the dump, sort it on space,page and call
+buf_read_page_async() for every listed page. Errors are reported via
+innodb_buffer_pool_load_status, see buf_load_status(). Set the name with:
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+	char		full_filename[OS_FILE_MAX_PATH];
+	char		now[32];
+	FILE*		f;
+	buf_dump_t*	dump;
+	buf_dump_t*	dump_tmp;
+	ulint		dump_n;
+	ulint		total_buffer_pools_pages;
+	ulint		i;
+	ulint		space_id;
+	ulint		page_no;
+	int		fscanf_ret;
+
+	/* Ignore any leftovers from before */
+	buf_load_abort_flag = FALSE;
+
+	ut_snprintf(full_filename, sizeof(full_filename),
+		    "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+		    srv_buf_dump_filename);
+
+	buf_load_status(STATUS_NOTICE,
+			"Loading buffer pool(s) from %s", full_filename);
+
+	f = fopen(full_filename, "r");
+	if (f == NULL) {
+		buf_load_status(STATUS_ERR,
+				"Cannot open '%s' for reading: %s",
+				full_filename, strerror(errno));
+		return;
+	}
+
+	/* First scan the file to estimate how many entries are in it.
+	This file is tiny (approx 500KB per 1GB buffer pool), reading it
+	two times is fine. */
+	dump_n = 0;
+	while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+	       && !SHUTTING_DOWN()) {
+		dump_n++;
+	}
+
+	if (!SHUTTING_DOWN() && !feof(f)) {
+		/* fscanf() returned != 2 */
+		const char*	what;
+		if (ferror(f)) {
+			what = "reading";
+		} else {
+			what = "parsing";
+		}
+		fclose(f);
+		buf_load_status(STATUS_ERR, "Error %s '%s', "
+				"unable to load buffer pool (stage 1)",
+				what, full_filename);
+		return;
+	}
+
+	/* If dump is larger than the buffer pool(s), then we ignore the
+	extra trailing entries. This could happen if a dump is made, then
+	the buffer pool is shrunk and then a load is attempted. */
+	total_buffer_pools_pages = buf_pool_get_n_pages()
+		* srv_buf_pool_instances;
+	if (dump_n > total_buffer_pools_pages) {
+		dump_n = total_buffer_pools_pages;
+	}
+
+	dump = static_cast<buf_dump_t*>(ut_malloc(dump_n * sizeof(*dump)));
+
+	if (dump == NULL) {
+		fclose(f);
+		buf_load_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (dump_n * sizeof(*dump)),
+				strerror(errno));
+		return;
+	}
+
+	dump_tmp = static_cast<buf_dump_t*>(
+		ut_malloc(dump_n * sizeof(*dump_tmp)));
+
+	if (dump_tmp == NULL) {
+		ut_free(dump);
+		fclose(f);
+		buf_load_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (dump_n * sizeof(*dump_tmp)),
+				strerror(errno));
+		return;
+	}
+
+	rewind(f);
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+		fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
+				    &space_id, &page_no);
+
+		if (fscanf_ret != 2) {
+			if (feof(f)) {
+				break;
+			}
+
+			ut_free(dump);
+			ut_free(dump_tmp);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s', unable "
+					"to load buffer pool (stage 2)",
+					full_filename);
+			return;
+		}
+
+		if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+			ut_free(dump);
+			ut_free(dump_tmp);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s': bogus "
+					"space,page " ULINTPF "," ULINTPF
+					" at line " ULINTPF ", "
+					"unable to load buffer pool",
+					full_filename,
+					space_id, page_no,
+					i);
+			return;
+		}
+
+		dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+	}
+
+	/* Set dump_n to the actual number of initialized elements,
+	i could be smaller than dump_n here if the file got truncated after
+	we read it the first time or if a shutdown was detected. */
+	dump_n = i;
+
+	fclose(f);
+
+	if (dump_n == 0) {
+		/* Nothing to sort or read: release both arrays. */
+		ut_free(dump);
+		ut_free(dump_tmp);
+		ut_sprintf_timestamp(now);
+		buf_load_status(STATUS_NOTICE,
+				"Buffer pool(s) load completed at %s "
+				"(%s was empty)", now, full_filename);
+		return;
+	}
+
+	if (!SHUTTING_DOWN()) {
+		buf_dump_sort(dump, dump_tmp, 0, dump_n);
+	}
+
+	ut_free(dump_tmp);
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+		buf_read_page_async(BUF_DUMP_SPACE(dump[i]),
+				    BUF_DUMP_PAGE(dump[i]));
+
+		if (i % 64 == 63) {
+			os_aio_simulated_wake_handler_threads();
+		}
+
+		if (i % 128 == 0) {
+			buf_load_status(STATUS_INFO,
+					"Loaded " ULINTPF "/" ULINTPF " pages",
+					i + 1, dump_n);
+		}
+
+		if (buf_load_abort_flag) {
+			buf_load_abort_flag = FALSE;
+			ut_free(dump);
+			buf_load_status(
+				STATUS_NOTICE,
+				"Buffer pool(s) load aborted on request");
+			return;
+		}
+	}
+
+	ut_free(dump);
+
+	ut_sprintf_timestamp(now);
+
+	buf_load_status(STATUS_NOTICE,
+			"Buffer pool(s) load completed at %s", now);
+}
+
+/*****************************************************************//**
+Asks a running buffer pool load to stop by raising buf_load_abort_flag,
+which buf_load() polls in its read loop. Called by MySQL code via
+buffer_pool_load_abort(); must return at once as MySQL is frozen meanwhile. */
+UNIV_INTERN
+void
+buf_load_abort()
+/*============*/
+{
+	buf_load_abort_flag = TRUE;
+}
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for
+srv_buf_dump_event and when woken up performs the requested dump
+and/or load and sleeps again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	ut_ad(!srv_read_only_mode);
+
+	srv_buf_dump_thread_active = TRUE;
+
+	buf_dump_status(STATUS_INFO, "not started");
+	buf_load_status(STATUS_INFO, "not started");
+
+	if (srv_buffer_pool_load_at_startup) {
+		buf_load();
+	}
+
+	while (!SHUTTING_DOWN()) {
+
+		os_event_wait(srv_buf_dump_event);
+
+		if (buf_dump_should_start) {
+			buf_dump_should_start = FALSE;
+			buf_dump(TRUE /* quit on shutdown */);
+		}
+
+		if (buf_load_should_start) {
+			buf_load_should_start = FALSE;
+			buf_load();
+		}
+		/* NOTE(review): a request racing with this reset may stall */
+		os_event_reset(srv_buf_dump_event);
+	}
+
+	if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+		buf_dump(FALSE /* ignore the shutdown flag,
+		keep going even if we are in a shutdown state */);
+	}
+
+	srv_buf_dump_thread_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
deleted file mode 100644
index fea665eba40..00000000000
--- a/storage/xtradb/buf/buf0flu.c
+++ /dev/null
@@ -1,2402 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file buf/buf0flu.c
-The database buffer buf_pool flush algorithm
-
-Created 11/11/1995 Heikki Tuuri
-*******************************************************/
-
-#include "buf0flu.h"
-
-#ifdef UNIV_NONINL
-#include "buf0flu.ic"
-#endif
-
-#include "buf0buf.h"
-#include "srv0srv.h"
-#include "page0zip.h"
-#ifndef UNIV_HOTBACKUP
-#include "ut0byte.h"
-#include "ut0lst.h"
-#include "page0page.h"
-#include "fil0fil.h"
-#include "buf0lru.h"
-#include "buf0rea.h"
-#include "ibuf0ibuf.h"
-#include "log0log.h"
-#include "os0file.h"
-#include "trx0sys.h"
-#include "mysql/plugin.h"
-#include "mysql/service_thd_wait.h"
-
-/**********************************************************************
-These statistics are generated for heuristics used in estimating the
-rate at which we should flush the dirty blocks to avoid bursty IO
-activity. Note that the rate of flushing not only depends on how many
-dirty pages we have in the buffer pool but it is also a fucntion of
-how much redo the workload is generating and at what rate. */
-/* @{ */
-
-/** Number of intervals for which we keep the history of these stats.
-Each interval is 1 second, defined by the rate at which
-srv_error_monitor_thread() calls buf_flush_stat_update(). */
-#define BUF_FLUSH_STAT_N_INTERVAL 20
-
-/** Sampled values buf_flush_stat_cur.
-Not protected by any mutex.  Updated by buf_flush_stat_update(). */
-static buf_flush_stat_t	buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
-
-/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
-static ulint		buf_flush_stat_arr_ind;
-
-/** Values at start of the current interval. Reset by
-buf_flush_stat_update(). */
-static buf_flush_stat_t	buf_flush_stat_cur;
-
-/** Running sum of past values of buf_flush_stat_cur.
-Updated by buf_flush_stat_update(). Not protected by any mutex. */
-static buf_flush_stat_t	buf_flush_stat_sum;
-
-/** Number of pages flushed through non flush_list flushes. */
-// static ulint buf_lru_flush_page_count = 0;
-
-/* @} */
-
-/******************************************************************//**
-Increases flush_list size in bytes with zip_size for compressed page,
-UNIV_PAGE_SIZE for uncompressed page in inline function */
-static inline
-void
-incr_flush_list_size_in_bytes(
-/*==========================*/
-	buf_block_t*	block,		/*!< in: control block */
-	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
-{
-	ulint		zip_size;
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-	zip_size = page_zip_get_size(&block->page.zip);
-	buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
-	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
-}
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return	TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
-	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
-
-/******************************************************************//**
-Validates the flush list some of the time.
-@return	TRUE if ok or the check was skipped */
-static
-ibool
-buf_flush_validate_skip(
-/*====================*/
-	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
-{
-/** Try buf_flush_validate_low() every this many times */
-# define BUF_FLUSH_VALIDATE_SKIP	23
-
-	/** The buf_flush_validate_low() call skip counter.
-	Use a signed type because of the race condition below. */
-	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
-
-	/* There is a race condition below, but it does not matter,
-	because this call is only for heuristic purposes. We want to
-	reduce the call frequency of the costly buf_flush_validate_low()
-	check in debug builds. */
-	if (--buf_flush_validate_count > 0) {
-		return(TRUE);
-	}
-
-	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
-	return(buf_flush_validate_low(buf_pool));
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return	pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
-{
-	const ib_rbt_node_t*	c_node;
-	const ib_rbt_node_t*	p_node;
-	buf_page_t*		prev = NULL;
-	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	/* Insert this buffer into the rbt. */
-	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
-	ut_a(c_node != NULL);
-
-	/* Get the predecessor. */
-	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
-
-	if (p_node != NULL) {
-		buf_page_t**	value;
-		value = rbt_value(buf_page_t*, p_node);
-		prev = *value;
-		ut_a(prev != NULL);
-	}
-
-	return(prev);
-}
-
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
-{
-#ifdef UNIV_DEBUG
-	ibool		ret = FALSE;
-#endif /* UNIV_DEBUG */
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-#ifdef UNIV_DEBUG
-	ret =
-#endif /* UNIV_DEBUG */
-	rbt_delete(buf_pool->flush_rbt, &bpage);
-
-	ut_ad(ret);
-}
-
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintian ordering of blocks in the
-buf_pool->flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return	 < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
-	const void*	p1,		/*!< in: block1 */
-	const void*	p2)		/*!< in: block2 */
-{
-	int			ret;
-	const buf_page_t*	b1 = *(const buf_page_t**) p1;
-	const buf_page_t*	b2 = *(const buf_page_t**) p2;
-#ifdef UNIV_DEBUG
-	buf_pool_t*		buf_pool = buf_pool_from_bpage(b1);
-#endif /* UNIV_DEBUG */
-
-	ut_ad(b1 != NULL);
-	ut_ad(b2 != NULL);
-
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	ut_ad(b1->in_flush_list);
-	ut_ad(b2->in_flush_list);
-
-	if (b2->oldest_modification > b1->oldest_modification) {
-		return(1);
-	} else if (b2->oldest_modification < b1->oldest_modification) {
-		return(-1);
-	}
-
-	/* If oldest_modification is same then decide on the space. */
-	ret = (int)(b2->space - b1->space);
-
-	/* Or else decide ordering on the offset field. */
-	return(ret ? ret : (int)(b2->offset - b1->offset));
-}
-
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-UNIV_INTERN
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		/* Create red black tree for speedy insertions in flush list. */
-		buf_pool->flush_rbt = rbt_create(
-			sizeof(buf_page_t*), buf_flush_block_cmp);
-
-		buf_flush_list_mutex_exit(buf_pool);
-	}
-}
-
-/********************************************************************//**
-Frees up the red-black tree. */
-UNIV_INTERN
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-		rbt_free(buf_pool->flush_rbt);
-		buf_pool->flush_rbt = NULL;
-
-		buf_flush_list_mutex_exit(buf_pool);
-	}
-}
-
-/********************************************************************//**
-Inserts a modified block into the flush list. */
-UNIV_INTERN
-void
-buf_flush_insert_into_flush_list(
-/*=============================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn)		/*!< in: oldest modification */
-{
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(log_flush_order_mutex_own());
-	ut_ad(mutex_own(&block->mutex));
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
-	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
-		  <= lsn));
-
-	/* If we are in the recovery then we need to update the flush
-	red-black tree as well. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		buf_flush_list_mutex_exit(buf_pool);
-		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
-		return;
-	}
-
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(!block->page.in_flush_list);
-
-	ut_d(block->page.in_flush_list = TRUE);
-	block->page.oldest_modification = lsn;
-	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
-	incr_flush_list_size_in_bytes(block, buf_pool);
-
-#ifdef UNIV_DEBUG_VALGRIND
-	{
-		ulint	zip_size = buf_block_get_zip_size(block);
-
-		if (UNIV_UNLIKELY(zip_size)) {
-			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
-		} else {
-			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
-		}
-	}
-#endif /* UNIV_DEBUG_VALGRIND */
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Inserts a modified block into the flush list in the right sorted position.
-This function is used by recovery, because there the modifications do not
-necessarily come in the order of lsn's. */
-UNIV_INTERN
-void
-buf_flush_insert_sorted_into_flush_list(
-/*====================================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn)		/*!< in: oldest modification */
-{
-	buf_page_t*	prev_b;
-	buf_page_t*	b;
-
-	ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(log_flush_order_mutex_own());
-	ut_ad(mutex_own(&block->mutex));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	/* The field in_LRU_list is protected by buf_pool->mutex, which
-	we are not holding.  However, while a block is in the flush
-	list, it is dirty and cannot be discarded, not from the
-	page_hash or from the LRU list.  At most, the uncompressed
-	page frame of a compressed block may be discarded or created
-	(copying the block->page to or from a buf_page_t that is
-	dynamically allocated from buf_buddy_alloc()).  Because those
-	transitions hold block->mutex and the flush list mutex (via
-	buf_flush_relocate_on_flush_list()), there is no possibility
-	of a race condition in the assertions below. */
-	ut_ad(block->page.in_LRU_list);
-	ut_ad(block->page.in_page_hash);
-	/* buf_buddy_block_register() will take a block in the
-	BUF_BLOCK_MEMORY state, not a file page. */
-	ut_ad(!block->page.in_zip_hash);
-
-	ut_ad(!block->page.in_flush_list);
-	ut_d(block->page.in_flush_list = TRUE);
-	block->page.oldest_modification = lsn;
-
-#ifdef UNIV_DEBUG_VALGRIND
-	{
-		ulint	zip_size = buf_block_get_zip_size(block);
-
-		if (UNIV_UNLIKELY(zip_size)) {
-			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
-		} else {
-			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
-		}
-	}
-#endif /* UNIV_DEBUG_VALGRIND */
-
-	prev_b = NULL;
-
-	/* For the most part when this function is called the flush_rbt
-	should not be NULL. In a very rare boundary case it is possible
-	that the flush_rbt has already been freed by the recovery thread
-	before the last page was hooked up in the flush_list by the
-	io-handler thread. In that case we'll  just do a simple
-	linear search in the else block. */
-	if (buf_pool->flush_rbt) {
-
-		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
-
-	} else {
-
-		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-		while (b && b->oldest_modification
-		       > block->page.oldest_modification) {
-			ut_ad(b->in_flush_list);
-			prev_b = b;
-			b = UT_LIST_GET_NEXT(flush_list, b);
-		}
-	}
-
-	if (prev_b == NULL) {
-		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
-	} else {
-		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
-				     prev_b, &block->page);
-	}
-
-	incr_flush_list_size_in_bytes(block, buf_pool);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Returns TRUE if the file page block is immediately suitable for replacement,
-i.e., the transition FILE_PAGE => NOT_USED allowed.
-@return	TRUE if can replace immediately */
-UNIV_INTERN
-ibool
-buf_flush_ready_for_replace(
-/*========================*/
-	buf_page_t*	bpage)	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) and in the LRU list */
-{
-#ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	//ut_ad(bpage->in_LRU_list);
-
-	if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {
-
-		return(bpage->oldest_modification == 0
-		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
-		       && bpage->buf_fix_count == 0);
-	}
-
-	/* permited not to own LRU_mutex..  */
-/*
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		"  InnoDB: Error: buffer block state %lu"
-		" in the LRU list!\n",
-		(ulong) buf_page_get_state(bpage));
-	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
-	putc('\n', stderr);
-*/
-
-	return(FALSE);
-}
-
-/********************************************************************//**
-Returns TRUE if the block is modified and ready for flushing.
-@return	TRUE if can flush immediately */
-UNIV_INLINE
-ibool
-buf_flush_ready_for_flush(
-/*======================*/
-	buf_page_t*	bpage,	/*!< in: buffer control block, must be
-				buf_page_in_file(bpage) */
-	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
-{
-#ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
-	//ut_a(buf_page_in_file(bpage));
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-	ut_ad(mutex_own(buf_page_get_mutex(bpage))
-	      || flush_type == BUF_FLUSH_LIST);
-
-	if (buf_page_in_file(bpage) && bpage->oldest_modification != 0
-	    && buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) {
-		ut_ad(bpage->in_flush_list);
-
-		if (flush_type != BUF_FLUSH_LRU) {
-
-			return(TRUE);
-
-		} else if (bpage->buf_fix_count == 0) {
-
-			/* If we are flushing the LRU list, to avoid deadlocks
-			we require the block not to be bufferfixed, and hence
-			not latched. */
-
-			return(TRUE);
-		}
-	}
-
-	return(FALSE);
-}
-
-/********************************************************************//**
-Remove a block from the flush list of modified blocks. */
-UNIV_INTERN
-void
-buf_flush_remove(
-/*=============*/
-	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
-{
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ulint		zip_size;
-
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
-	      || mutex_own(&buf_pool->LRU_list_mutex));
-#endif
-	ut_ad(bpage->in_flush_list);
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_PAGE:
-		/* Clean compressed pages should not be on the flush list */
-	case BUF_BLOCK_ZIP_FREE:
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
-		return;
-	case BUF_BLOCK_ZIP_DIRTY:
-		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
-		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		buf_LRU_insert_zip_clean(bpage);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
-		break;
-	}
-
-	/* If the flush_rbt is active then delete from there as well. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-	}
-
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on in_flush_list in comparison function. */
-	ut_d(bpage->in_flush_list = FALSE);
-
-	zip_size = page_zip_get_size(&bpage->zip);
-	buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
-
-	bpage->oldest_modification = 0;
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
-}
-
-/*******************************************************************//**
-Relocates a buffer control block on the flush_list.
-Note that it is assumed that the contents of bpage have already been
-copied to dpage.
-IMPORTANT: When this function is called bpage and dpage are not
-exact copies of each other. For example, they both will have different
-::state. Also the ::list pointers in dpage may be stale. We need to
-use the current list node (bpage) to do the list manipulation because
-the list pointers could have changed between the time that we copied
-the contents of bpage to the dpage and the flush list manipulation
-below. */
-UNIV_INTERN
-void
-buf_flush_relocate_on_flush_list(
-/*=============================*/
-	buf_page_t*	bpage,	/*!< in/out: control block being moved */
-	buf_page_t*	dpage)	/*!< in/out: destination block */
-{
-	buf_page_t*	prev;
-	buf_page_t* 	prev_b = NULL;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	/* Must reside in the same buffer pool. */
-	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
-
-	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	/* FIXME: At this point we have both buf_pool and flush_list
-	mutexes. Theoretically removal of a block from flush list is
-	only covered by flush_list mutex but currently we do
-	have buf_pool mutex in buf_flush_remove() therefore this block
-	is guaranteed to be in the flush list. We need to check if
-	this will work without the assumption of block removing code
-	having the buf_pool mutex. */
-	ut_ad(bpage->in_flush_list);
-	ut_ad(dpage->in_flush_list);
-
-	/* If recovery is active we must swap the control blocks in
-	the flush_rbt as well. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-		prev_b = buf_flush_insert_in_flush_rbt(dpage);
-	}
-
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on in_flush_list in comparison function. */
-	ut_d(bpage->in_flush_list = FALSE);
-
-	prev = UT_LIST_GET_PREV(flush_list, bpage);
-	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
-
-	if (prev) {
-		ut_ad(prev->in_flush_list);
-		UT_LIST_INSERT_AFTER(
-			flush_list,
-			buf_pool->flush_list,
-			prev, dpage);
-	} else {
-		UT_LIST_ADD_FIRST(
-			flush_list,
-			buf_pool->flush_list,
-			dpage);
-	}
-
-	/* Just an extra check. Previous in flush_list
-	should be the same control block as in flush_rbt. */
-	ut_a(!buf_pool->flush_rbt || prev_b == prev);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-	buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Updates the flush system data structures when a write is completed. */
-UNIV_INTERN
-void
-buf_flush_write_complete(
-/*=====================*/
-	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
-{
-	enum buf_flush	flush_type;
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-
-	ut_ad(bpage);
-
-	buf_flush_remove(bpage);
-
-	flush_type = buf_page_get_flush_type(bpage);
-	buf_pool->n_flush[flush_type]--;
-
-	if (flush_type == BUF_FLUSH_LRU) {
-		/* Put the block to the end of the LRU list to wait to be
-		moved to the free list */
-
-		buf_LRU_make_block_old(bpage);
-
-		buf_pool->LRU_flush_ended++;
-	}
-
-	/* fprintf(stderr, "n pending flush %lu\n",
-	buf_pool->n_flush[flush_type]); */
-
-	if (buf_pool->n_flush[flush_type] == 0
-	    && buf_pool->init_flush[flush_type] == FALSE) {
-
-		/* The running flush batch has ended */
-
-		os_event_set(buf_pool->no_flush[flush_type]);
-	}
-}
-
-/********************************************************************//**
-Flush a batch of writes to the datafiles that have already been
-written by the OS. */
-static
-void
-buf_flush_sync_datafiles(void)
-/*==========================*/
-{
-	/* Wake possible simulated aio thread to actually post the
-	writes to the operating system */
-	os_aio_simulated_wake_handler_threads();
-
-	/* Wait that all async writes to tablespaces have been posted to
-	the OS */
-	os_aio_wait_until_no_pending_writes();
-
-	/* Now we flush the data to disk (for example, with fsync) */
-	fil_flush_file_spaces(FIL_TABLESPACE);
-
-	return;
-}
-
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk,
-and also wakes up the aio thread if simulated aio is used. It is very
-important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-static
-void
-buf_flush_buffered_writes(void)
-/*===========================*/
-{
-	byte*		write_buf;
-	ulint		len;
-	ulint		len2;
-	ulint		i;
-
-	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
-		/* Sync the writes to the disk. */
-		buf_flush_sync_datafiles();
-		return;
-	}
-
-	mutex_enter(&(trx_doublewrite->mutex));
-
-	/* Write first to doublewrite buffer blocks. We use synchronous
-	aio and thus know that file write has been completed when the
-	control returns. */
-
-	if (trx_doublewrite->first_free == 0) {
-
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		return;
-	}
-
-	for (i = 0; i < trx_doublewrite->first_free; i++) {
-
-		const buf_block_t*	block;
-
-		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
-
-		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
-		    || block->page.zip.data) {
-			/* No simple validate for compressed pages exists. */
-			continue;
-		}
-
-		if (UNIV_UNLIKELY
-		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
-			    block->frame + (UNIV_PAGE_SIZE
-					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-			    4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the buffer pool\n"
-				"InnoDB: before posting to the"
-				" doublewrite buffer.\n");
-		}
-
-		if (!block->check_index_page_at_flush) {
-		} else if (page_is_comp(block->frame)) {
-			if (UNIV_UNLIKELY
-			    (!page_simple_validate_new(block->frame))) {
-corrupted_page:
-				buf_page_print(block->frame, 0,
-					       BUF_PAGE_PRINT_NO_CRASH);
-
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: Apparent corruption of an"
-					" index page n:o %lu in space %lu\n"
-					"InnoDB: to be written to data file."
-					" We intentionally crash server\n"
-					"InnoDB: to prevent corrupt data"
-					" from ending up in data\n"
-					"InnoDB: files.\n",
-					(ulong) buf_block_get_page_no(block),
-					(ulong) buf_block_get_space(block));
-
-				ut_error;
-			}
-		} else if (UNIV_UNLIKELY
-			   (!page_simple_validate_old(block->frame))) {
-
-			goto corrupted_page;
-		}
-	}
-
-	/* increment the doublewrite flushed pages counter */
-	srv_dblwr_pages_written+= trx_doublewrite->first_free;
-	srv_dblwr_writes++;
-
-	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
-		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
-
-	write_buf = trx_doublewrite->write_buf;
-	i = 0;
-
-	fil_io(OS_FILE_WRITE, TRUE,
-	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
-	       trx_doublewrite->block1, 0, len,
-	       (void*) write_buf, NULL);
-
-	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
-	     len2 += UNIV_PAGE_SIZE, i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		if (UNIV_LIKELY(!block->page.zip.data)
-		    && UNIV_LIKELY(buf_block_get_state(block)
-				   == BUF_BLOCK_FILE_PAGE)
-		    && UNIV_UNLIKELY
-		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
-			    write_buf + len2
-			    + (UNIV_PAGE_SIZE
-			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the doublewrite block1.\n");
-		}
-	}
-
-	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		goto flush;
-	}
-
-	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-		* UNIV_PAGE_SIZE;
-
-	write_buf = trx_doublewrite->write_buf
-		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
-	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
-
-	fil_io(OS_FILE_WRITE, TRUE,
-	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
-	       trx_doublewrite->block2, 0, len,
-	       (void*) write_buf, NULL);
-
-	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
-	     len2 += UNIV_PAGE_SIZE, i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		if (UNIV_LIKELY(!block->page.zip.data)
-		    && UNIV_LIKELY(buf_block_get_state(block)
-				   == BUF_BLOCK_FILE_PAGE)
-		    && UNIV_UNLIKELY
-		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
-			    write_buf + len2
-			    + (UNIV_PAGE_SIZE
-			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be"
-				" written seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in"
-				" the doublewrite block2.\n");
-		}
-	}
-
-flush:
-	/* Now flush the doublewrite buffer data to disk */
-
-	fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE);
-
-	/* We know that the writes have been flushed to disk now
-	and in recovery we will find them in the doublewrite buffer
-	blocks. Next do the writes to the intended positions. */
-
-	for (i = 0; i < trx_doublewrite->first_free; i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		ut_a(buf_page_in_file(&block->page));
-		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
-			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
-			       FALSE, buf_page_get_space(&block->page),
-			       buf_page_get_zip_size(&block->page),
-			       buf_page_get_page_no(&block->page), 0,
-			       buf_page_get_zip_size(&block->page),
-			       (void*)block->page.zip.data,
-			       (void*)block);
-
-			/* Increment the counter of I/O operations used
-			for selecting LRU policy. */
-			buf_LRU_stat_inc_io();
-
-			continue;
-		}
-
-		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
-		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
-					 block->frame
-					 + (UNIV_PAGE_SIZE
-					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-					 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the buffer pool\n"
-				"InnoDB: after posting and flushing"
-				" the doublewrite buffer.\n"
-				"InnoDB: Page buf fix count %lu,"
-				" io fix %lu, state %lu\n",
-				(ulong)block->page.buf_fix_count,
-				(ulong)buf_block_get_io_fix_unlocked(block),
-				(ulong)buf_block_get_state(block));
-		}
-
-		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
-		       FALSE, buf_block_get_space(block), 0,
-		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
-		       (void*)block->frame, (void*)block);
-
-		/* Increment the counter of I/O operations used
-		for selecting LRU policy. */
-		buf_LRU_stat_inc_io();
-	}
-
-	/* Sync the writes to the disk. */
-	buf_flush_sync_datafiles();
-
-	/* We can now reuse the doublewrite memory buffer: */
-	trx_doublewrite->first_free = 0;
-
-	mutex_exit(&(trx_doublewrite->mutex));
-}
-
-/********************************************************************//**
-Posts a buffer page for writing. If the doublewrite memory buffer is
-full, calls buf_flush_buffered_writes and waits for for free space to
-appear. */
-static
-void
-buf_flush_post_to_doublewrite_buf(
-/*==============================*/
-	buf_page_t*	bpage)	/*!< in: buffer block to write */
-{
-	ulint	zip_size;
-try_again:
-	mutex_enter(&(trx_doublewrite->mutex));
-
-	ut_a(buf_page_in_file(bpage));
-
-	if (trx_doublewrite->first_free
-	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		buf_flush_buffered_writes();
-
-		goto try_again;
-	}
-
-	zip_size = buf_page_get_zip_size(bpage);
-
-	if (UNIV_UNLIKELY(zip_size)) {
-		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
-		/* Copy the compressed page and clear the rest. */
-		memcpy(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
-		       bpage->zip.data, zip_size);
-		memset(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
-		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
-	} else {
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
-				   UNIV_PAGE_SIZE);
-
-		memcpy(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
-		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
-	}
-
-	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
-
-	trx_doublewrite->first_free++;
-
-	if (trx_doublewrite->first_free
-	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		buf_flush_buffered_writes();
-
-		return;
-	}
-
-	mutex_exit(&(trx_doublewrite->mutex));
-}
-#endif /* !UNIV_HOTBACKUP */
-
-/********************************************************************//**
-Initializes a page for writing to the tablespace. */
-UNIV_INTERN
-void
-buf_flush_init_for_writing(
-/*=======================*/
-	byte*		page,		/*!< in/out: page */
-	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
-	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
-					to the page */
-{
-	ut_ad(page);
-
-	if (page_zip_) {
-		page_zip_des_t*	page_zip = page_zip_;
-		ulint		zip_size = page_zip_get_size(page_zip);
-		ut_ad(zip_size);
-		ut_ad(ut_is_2pow(zip_size));
-		ut_ad(zip_size <= UNIV_PAGE_SIZE);
-
-		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
-		case FIL_PAGE_TYPE_ALLOCATED:
-		case FIL_PAGE_INODE:
-		case FIL_PAGE_IBUF_BITMAP:
-		case FIL_PAGE_TYPE_FSP_HDR:
-		case FIL_PAGE_TYPE_XDES:
-			/* These are essentially uncompressed pages. */
-			memcpy(page_zip->data, page, zip_size);
-			/* fall through */
-		case FIL_PAGE_TYPE_ZBLOB:
-		case FIL_PAGE_TYPE_ZBLOB2:
-		case FIL_PAGE_INDEX:
-			mach_write_to_8(page_zip->data
-					+ FIL_PAGE_LSN, newest_lsn);
-			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
-			mach_write_to_4(page_zip->data
-					+ FIL_PAGE_SPACE_OR_CHKSUM,
-					srv_use_checksums
-					? page_zip_calc_checksum(
-						page_zip->data, zip_size)
-					: BUF_NO_CHECKSUM_MAGIC);
-			return;
-		}
-
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: ERROR: The compressed page to be written"
-		      " seems corrupt:", stderr);
-		ut_print_buf(stderr, page, zip_size);
-		fputs("\nInnoDB: Possibly older version of the page:", stderr);
-		ut_print_buf(stderr, page_zip->data, zip_size);
-		putc('\n', stderr);
-		ut_error;
-	}
-
-	/* Write the newest modification lsn to the page header and trailer */
-	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
-
-	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
-			newest_lsn);
-
-	/* Store the new formula checksum */
-
-	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
-			srv_use_checksums
-			? (!srv_fast_checksum
-			   ? buf_calc_page_new_checksum(page)
-			   : buf_calc_page_new_checksum_32(page))
-			: BUF_NO_CHECKSUM_MAGIC);
-
-	/* We overwrite the first 4 bytes of the end lsn field to store
-	the old formula checksum. Since it depends also on the field
-	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
-	new formula checksum. */
-
-	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
-			srv_use_checksums
-			? buf_calc_page_old_checksum(page)
-			: BUF_NO_CHECKSUM_MAGIC);
-}
-
-#ifndef UNIV_HOTBACKUP
-/********************************************************************//**
-Does an asynchronous write of a buffer page. NOTE: in simulated aio and
-also when the doublewrite buffer is used, we must call
-buf_flush_buffered_writes after we have posted a batch of writes! */
-static
-void
-buf_flush_write_block_low(
-/*======================*/
-	buf_page_t*	bpage)	/*!< in: buffer block to write */
-{
-	ulint	zip_size	= buf_page_get_zip_size(bpage);
-	page_t*	frame		= NULL;
-
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif
-
-#ifdef UNIV_LOG_DEBUG
-	static ibool univ_log_debug_warned;
-#endif /* UNIV_LOG_DEBUG */
-
-	ut_ad(buf_page_in_file(bpage));
-
-	/* We are not holding buf_pool->mutex or block_mutex here.
-	Nevertheless, it is safe to access bpage, because it is
-	io_fixed and oldest_modification != 0.  Thus, it cannot be
-	relocated in the buffer pool or removed from flush_list or
-	LRU_list. */
-	//ut_ad(!buf_pool_mutex_own(buf_pool));
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-	ut_ad(!buf_flush_list_mutex_own(buf_pool));
-	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
-	ut_ad(bpage->oldest_modification != 0);
-
-#ifdef UNIV_IBUF_COUNT_DEBUG
-	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
-#endif
-	ut_ad(bpage->newest_modification != 0);
-
-#ifdef UNIV_LOG_DEBUG
-	if (!univ_log_debug_warned) {
-		univ_log_debug_warned = TRUE;
-		fputs("Warning: cannot force log to disk if"
-		      " UNIV_LOG_DEBUG is defined!\n"
-		      "Crash recovery will not work!\n",
-		      stderr);
-	}
-#else
-	/* Force the log to the disk before writing the modified block */
-	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
-#endif
-	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_FREE:
-	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
-	case BUF_BLOCK_NOT_USED:
-	case BUF_BLOCK_READY_FOR_USE:
-	case BUF_BLOCK_MEMORY:
-	case BUF_BLOCK_REMOVE_HASH:
-		ut_error;
-		break;
-	case BUF_BLOCK_ZIP_DIRTY:
-		frame = bpage->zip.data;
-		if (UNIV_LIKELY(srv_use_checksums)) {
-			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
-			     == page_zip_calc_checksum(frame, zip_size));
-		}
-		mach_write_to_8(frame + FIL_PAGE_LSN,
-				bpage->newest_modification);
-		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
-		break;
-	case BUF_BLOCK_FILE_PAGE:
-		frame = bpage->zip.data;
-		if (!frame) {
-			frame = ((buf_block_t*) bpage)->frame;
-		}
-
-		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
-					   bpage->zip.data
-					   ? &bpage->zip : NULL,
-					   bpage->newest_modification);
-		break;
-	}
-
-	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
-		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
-		       FALSE, buf_page_get_space(bpage), zip_size,
-		       buf_page_get_page_no(bpage), 0,
-		       zip_size ? zip_size : UNIV_PAGE_SIZE,
-		       frame, bpage);
-	} else {
-		buf_flush_post_to_doublewrite_buf(bpage);
-	}
-}
-
-# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: block->mutex must be held upon entering this function, and it will be
-released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
-@return TRUE if the page was flushed and the mutex released */
-UNIV_INTERN
-ibool
-buf_flush_page_try(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_block_t*	block)		/*!< in/out: buffer control block */
-{
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(mutex_own(&block->mutex));
-
-	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
-		return(FALSE);
-	}
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
-	    || buf_pool->init_flush[BUF_FLUSH_LRU]) {
-		buf_pool_mutex_exit(buf_pool);
-		/* There is already a flush batch of the same type running */
-		return(FALSE);
-	}
-
-	buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;
-
-	buf_page_set_io_fix(&block->page, BUF_IO_WRITE);
-
-	buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {
-
-		os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
-	}
-
-	/* VERY IMPORTANT:
-	Because any thread may call the LRU flush, even when owning
-	locks on pages, to avoid deadlocks, we must make sure that the
-	s-lock is acquired on the page without waiting: this is
-	accomplished because buf_flush_ready_for_flush() must hold,
-	and that requires the page not to be bufferfixed. */
-
-	rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);
-
-	/* Note that the s-latch is acquired before releasing the
-	buf_pool mutex: this ensures that the latch is acquired
-	immediately. */
-
-	mutex_exit(&block->mutex);
-	buf_pool_mutex_exit(buf_pool);
-
-	/* Even though block is not protected by any mutex at this
-	point, it is safe to access block, because it is io_fixed and
-	oldest_modification != 0.  Thus, it cannot be relocated in the
-	buffer pool or removed from flush_list or LRU_list. */
-
-	buf_flush_write_block_low(&block->page);
-
-	buf_pool_mutex_enter(buf_pool);
-	buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
-		/* The running flush batch has ended */
-		os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-	buf_flush_buffered_writes();
-
-	return(TRUE);
-}
-# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: in simulated aio we must call
-os_aio_simulated_wake_handler_threads after we have posted a batch of
-writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
-held upon entering this function, and they will be released by this
-function. */
-static
-void
-buf_flush_page(
-/*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in: buffer control block */
-	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-{
-	mutex_t*	block_mutex;
-	ibool		is_uncompressed;
-
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED));
-#endif
-	ut_ad(buf_page_in_file(bpage));
-
-	block_mutex = buf_page_get_mutex(bpage);
-	ut_ad(mutex_own(block_mutex));
-
-	buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
-	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
-
-	buf_page_set_io_fix(bpage, BUF_IO_WRITE);
-
-	buf_page_set_flush_type(bpage, flush_type);
-
-	if (buf_pool->n_flush[flush_type] == 0) {
-
-		os_event_reset(buf_pool->no_flush[flush_type]);
-	}
-
-	buf_pool->n_flush[flush_type]++;
-
-	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
-
-	switch (flush_type) {
-		ibool	is_s_latched;
-	case BUF_FLUSH_LIST:
-		/* If the simulated aio thread is not running, we must
-		not wait for any latch, as we may end up in a deadlock:
-		if buf_fix_count == 0, then we know we need not wait */
-
-		is_s_latched = (bpage->buf_fix_count == 0);
-		if (is_s_latched && is_uncompressed) {
-			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
-					   BUF_IO_WRITE);
-		}
-
-		mutex_exit(block_mutex);
-		buf_pool_mutex_exit(buf_pool);
-
-		/* Even though bpage is not protected by any mutex at
-		this point, it is safe to access bpage, because it is
-		io_fixed and oldest_modification != 0.  Thus, it
-		cannot be relocated in the buffer pool or removed from
-		flush_list or LRU_list. */
-
-		if (!is_s_latched) {
-			buf_flush_buffered_writes();
-
-			if (is_uncompressed) {
-				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
-						   ->lock, BUF_IO_WRITE);
-			}
-		}
-
-		break;
-
-	case BUF_FLUSH_LRU:
-		/* VERY IMPORTANT:
-		Because any thread may call the LRU flush, even when owning
-		locks on pages, to avoid deadlocks, we must make sure that the
-		s-lock is acquired on the page without waiting: this is
-		accomplished because buf_flush_ready_for_flush() must hold,
-		and that requires the page not to be bufferfixed. */
-
-		if (is_uncompressed) {
-			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
-					   BUF_IO_WRITE);
-		}
-
-		/* Note that the s-latch is acquired before releasing the
-		buf_pool mutex: this ensures that the latch is acquired
-		immediately. */
-
-		mutex_exit(block_mutex);
-		buf_pool_mutex_exit(buf_pool);
-		break;
-
-	default:
-		ut_error;
-	}
-
-	/* Even though bpage is not protected by any mutex at this
-	point, it is safe to access bpage, because it is io_fixed and
-	oldest_modification != 0.  Thus, it cannot be relocated in the
-	buffer pool or removed from flush_list or LRU_list. */
-
-#ifdef UNIV_DEBUG
-	if (buf_debug_prints) {
-		fprintf(stderr,
-			"Flushing %u space %u page %u\n",
-			flush_type, bpage->space, bpage->offset);
-	}
-#endif /* UNIV_DEBUG */
-	buf_flush_write_block_low(bpage);
-}
-
-/***********************************************************//**
-Flushes to disk all flushable pages within the flush area.
-@return	number of pages flushed */
-static
-ulint
-buf_flush_try_neighbors(
-/*====================*/
-	ulint		space,		/*!< in: space id */
-	ulint		offset,		/*!< in: page offset */
-	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
-					BUF_FLUSH_LIST */
-	ulint		n_flushed,	/*!< in: number of pages
-					flushed so far in this batch */
-	ulint		n_to_flush)	/*!< in: maximum number of pages
-					we are allowed to flush */
-{
-	ulint		i;
-	ulint		low;
-	ulint		high;
-	ulint		count = 0;
-	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
-	ibool		is_forward_scan;
-
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-
-	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
-		/* If there is little space, it is better not to flush
-		any block except from the end of the LRU list */
-
-		low = offset;
-		high = offset + 1;
-	} else {
-		/* When flushed, dirty blocks are searched in
-		neighborhoods of this size, and flushed along with the
-		original page. */
-
-		ulint	buf_flush_area;
-	
-		buf_flush_area	= ut_min(
-			BUF_READ_AHEAD_AREA(buf_pool),
-			buf_pool->curr_size / 16);
-
-		low = (offset / buf_flush_area) * buf_flush_area;
-		high = (offset / buf_flush_area + 1) * buf_flush_area;
-	}
-
-	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
-
-	if (high > fil_space_get_size(space)) {
-		high = fil_space_get_size(space);
-	}
-
-	if (srv_flush_neighbor_pages == 2) {
-
-		/* In the case of contiguous flush where the requested page
-		does not fall at the start of flush area, first scan backward
-		from the page and later forward from it. */
-		is_forward_scan = (offset == low);
-	}
-	else {
-		is_forward_scan = TRUE;
-	}
-
-scan:
-	if (srv_flush_neighbor_pages == 2) {
-		if (is_forward_scan) {
-			i = offset;
-		}
-		else {
-			i = offset - 1;
-		}
-	}
-	else {
-		i = low;
-	}
-
-	for (; is_forward_scan ? (i < high) : (i >= low);
-	     is_forward_scan ? i++ : i--) {
-
-		buf_page_t*	bpage;
-
-		if ((count + n_flushed) >= n_to_flush) {
-
-			/* We have already flushed enough pages and
-			should call it a day. There is, however, one
-			exception. If the page whose neighbors we
-			are flushing has not been flushed yet then
-			we'll try to flush the victim that we
-			selected originally. */
-			if (i <= offset) {
-				i = offset;
-			} else {
-				break;
-			}
-		}
-
-		buf_pool = buf_pool_get(space, i);
-
-		//buf_pool_mutex_enter(buf_pool);
-		rw_lock_s_lock(&buf_pool->page_hash_latch);
-
-		/* We only want to flush pages from this buffer pool. */
-		bpage = buf_page_hash_get(buf_pool, space, i);
-
-		if (!bpage) {
-
-			//buf_pool_mutex_exit(buf_pool);
-			rw_lock_s_unlock(&buf_pool->page_hash_latch);
-			if (srv_flush_neighbor_pages == 2) {
-
-				/* This is contiguous neighbor page flush and
-				the pages here are not contiguous. */
-				break;
-			}
-			continue;
-		}
-
-		ut_a(buf_page_in_file(bpage));
-
-		/* We avoid flushing 'non-old' blocks in an LRU flush,
-		because the flushed blocks are soon freed */
-
-		if (flush_type != BUF_FLUSH_LRU
-		    || i == offset
-		    || buf_page_is_old(bpage)) {
-			mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
-
-			if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)
-			    && (i == offset || !bpage->buf_fix_count)) {
-				/* We only try to flush those
-				neighbors != offset where the buf fix
-				count is zero, as we then know that we
-				probably can latch the page without a
-				semaphore wait. Semaphore waits are
-				expensive because we must flush the
-				doublewrite buffer before we start
-				waiting. */
-
-				buf_flush_page(buf_pool, bpage, flush_type);
-				ut_ad(!mutex_own(block_mutex));
-				ut_ad(!buf_pool_mutex_own(buf_pool));
-				count++;
-				continue;
-			} else if (block_mutex) {
-				mutex_exit(block_mutex);
-			}
-		}
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
-		if (srv_flush_neighbor_pages == 2) {
-
-			/* We are trying to do the contiguous neighbor page
-			flush, but the last page we checked was unflushable,
-			making a "hole" in the flush, so stop this attempt. */
-			break;
-		}
-	}
-
-	if (!is_forward_scan) {
-
-		/* Backward scan done, now do the forward scan */
-		ut_a (srv_flush_neighbor_pages == 2);
-		is_forward_scan = TRUE;
-		goto scan;
-	}
-
-	return(count);
-}
-
-/********************************************************************//**
-Check if the block is modified and ready for flushing. If the the block
-is ready to flush then flush the page and try o flush its neighbors.
-
-@return	TRUE if LRU list mutex was not released during this function.
-This does not guarantee that some pages were written as well.
-Number of pages written are incremented to the count. */
-static
-ibool
-buf_flush_page_and_try_neighbors(
-/*=============================*/
-	buf_page_t*	bpage,		/*!< in: buffer control block */
-	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-	ulint		n_to_flush,	/*!< in: number of pages to
-					flush */
-	ulint*		count)		/*!< in/out: number of pages
-					flushed */
-{
-	mutex_t*	block_mutex = NULL;
-	ibool		flushed = FALSE;
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-#endif /* UNIV_DEBUG */
-
-	ut_ad((flush_type == BUF_FLUSH_LRU
-	       && mutex_own(&buf_pool->LRU_list_mutex))
-	      || (flush_type == BUF_FLUSH_LIST
-		  && buf_flush_list_mutex_own(buf_pool)));
-
-	if (flush_type == BUF_FLUSH_LRU) {
-		block_mutex = buf_page_get_mutex_enter(bpage);
-		ut_ad(block_mutex);
-	}
-
-	ut_a(buf_page_in_file(bpage));
-
-	if (buf_flush_ready_for_flush(bpage, flush_type)) {
-		ulint		space;
-		ulint		offset;
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_bpage(bpage);
-
-		//buf_pool_mutex_exit(buf_pool);
-		if (flush_type == BUF_FLUSH_LRU) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-		}
-
-		/* These fields are protected by both the
-		buffer pool mutex and block mutex. */
-		space = buf_page_get_space(bpage);
-		offset = buf_page_get_page_no(bpage);
-
-		if (flush_type == BUF_FLUSH_LRU) {
-			mutex_exit(block_mutex);
-		} else {
-			buf_flush_list_mutex_exit(buf_pool);
-		}
-
-		/* Try to flush also all the neighbors */
-		*count += buf_flush_try_neighbors(space,
-						  offset,
-						  flush_type,
-						  *count,
-						  n_to_flush);
-
-		if (flush_type == BUF_FLUSH_LRU) {
-			mutex_enter(&buf_pool->LRU_list_mutex);
-		} else {
-			buf_flush_list_mutex_enter(buf_pool);
-		}
-		flushed = TRUE;
-	} else if (block_mutex) {
-		mutex_exit(block_mutex);
-	}
-
-	ut_ad((flush_type == BUF_FLUSH_LRU
-	       && mutex_own(&buf_pool->LRU_list_mutex))
-	      || buf_flush_list_mutex_own(buf_pool));
-
-	return(flushed);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-In the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it
-cannot end up waiting for these latches!
-@return number of blocks for which the write request was queued. */
-static
-ulint
-buf_flush_LRU_list_batch(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		max)		/*!< in: max of blocks to flush */
-{
-	buf_page_t*	bpage;
-	ulint		count = 0;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-
-	do {
-		/* Start from the end of the list looking for a
-		suitable block to be flushed. */
-		bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-
-		/* Iterate backwards over the flush list till we find
-		a page that isn't ready for flushing. */
-		while (bpage != NULL
-		       && !buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LRU, max, &count)) {
-
-			bpage = UT_LIST_GET_PREV(LRU, bpage);
-		}
-	} while (bpage != NULL && count < max);
-
-	/* We keep track of all flushes happening as part of LRU
-	flush. When estimating the desired rate at which flush_list
-	should be flushed, we factor in this value. */
-	buf_lru_flush_page_count += count;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-
-	return(count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the flush_list.
-the calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already
-running */
-static
-ulint
-buf_flush_flush_list_batch(
-/*=======================*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		min_n,		/*!< in: wished minimum mumber
-					of blocks flushed (it is not
-					guaranteed that the actual
-					number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< all blocks whose
-					oldest_modification is smaller
-					than this should be flushed (if
-					their number does not exceed
-					min_n) */
-{
-	ulint		len;
-	buf_page_t*	bpage;
-	ulint		count = 0;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-
-	/* If we have flushed enough, leave the loop */
-	do {
-		/* Start from the end of the list looking for a suitable
-		block to be flushed. */
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		/* We use len here because theoretically insertions can
-		happen in the flush_list below while we are traversing
-		it for a suitable candidate for flushing. We'd like to
-		set a limit on how farther we are willing to traverse
-		the list. */
-		len = UT_LIST_GET_LEN(buf_pool->flush_list);
-		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-
-		if (bpage) {
-			ut_a(bpage->oldest_modification > 0);
-		}
-
-		if (!bpage || bpage->oldest_modification >= lsn_limit) {
-
-			/* We have flushed enough */
-			buf_flush_list_mutex_exit(buf_pool);
-			break;
-		}
-
-		ut_a(bpage->oldest_modification > 0);
-
-		ut_ad(bpage->in_flush_list);
-
-		/* The list may change during the flushing and we cannot
-		safely preserve within this function a pointer to a
-		block in the list! */
-		while (bpage != NULL
-		       && len > 0
-		       && !buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LIST, min_n, &count)) {
-
-			/* If we are here that means that buf_pool->mutex
-			 was not released in buf_flush_page_and_try_neighbors()
-			above and this guarantees that bpage didn't get
-			relocated since we released the flush_list
-			mutex above. There is a chance, however, that
-			the bpage got removed from flush_list (not
-			currently possible because flush_list_remove()
-			also obtains buf_pool mutex but that may change
-			in future). To avoid this scenario we check
-			the oldest_modification and if it is zero
-			we start all over again. */
-			if (bpage->oldest_modification == 0) {
-				buf_flush_list_mutex_exit(buf_pool);
-				break;
-			}
-
-			bpage = UT_LIST_GET_PREV(flush_list, bpage);
-
-			ut_ad(!bpage || bpage->in_flush_list);
-
-			--len;
-		}
-
-		buf_flush_list_mutex_exit(buf_pool);
-
-	} while (count < min_n && bpage != NULL && len > 0);
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-
-	return(count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list or flush_list.
-NOTE 1: in the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it cannot
-end up waiting for these latches! NOTE 2: in the case of a flush list flush,
-the calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-static
-ulint
-buf_flush_batch(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
-					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
-					then the caller must not own any
-					latches on pages */
-	ulint		min_n,		/*!< in: wished minimum mumber of blocks
-					flushed (it is not guaranteed that the
-					actual number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
-					all blocks whose oldest_modification is
-					smaller than this should be flushed
-					(if their number does not exceed
-					min_n), otherwise ignored */
-{
-	ulint		count	= 0;
-
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad((flush_type != BUF_FLUSH_LIST)
-	      || sync_thread_levels_empty_except_dict());
-#endif /* UNIV_SYNC_DEBUG */
-
-	//buf_pool_mutex_enter(buf_pool);
-
-	/* Note: The buffer pool mutex is released and reacquired within
-	the flush functions. */
-	switch(flush_type) {
-	case BUF_FLUSH_LRU:
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		count = buf_flush_LRU_list_batch(buf_pool, min_n);
-		mutex_exit(&buf_pool->LRU_list_mutex);
-		break;
-	case BUF_FLUSH_LIST:
-		count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
-		break;
-	default:
-		ut_error;
-	}
-
-	//buf_pool_mutex_exit(buf_pool);
-
-	buf_flush_buffered_writes();
-
-#ifdef UNIV_DEBUG
-	if (buf_debug_prints && count > 0) {
-		fprintf(stderr, flush_type == BUF_FLUSH_LRU
-			? "Flushed %lu pages in LRU flush\n"
-			: "Flushed %lu pages in flush list flush\n",
-			(ulong) count);
-	}
-#endif /* UNIV_DEBUG */
-
-	return(count);
-}
-
-/******************************************************************//**
-Gather the aggregated stats for both flush list and LRU list flushing */
-static
-void
-buf_flush_common(
-/*=============*/
-	enum buf_flush	flush_type,	/*!< in: type of flush */
-	ulint		page_count)	/*!< in: number of pages flushed */
-{
-	buf_flush_buffered_writes();
-
-	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-
-#ifdef UNIV_DEBUG
-	if (buf_debug_prints && page_count > 0) {
-		fprintf(stderr, flush_type == BUF_FLUSH_LRU
-			? "Flushed %lu pages in LRU flush\n"
-			: "Flushed %lu pages in flush list flush\n",
-			(ulong) page_count);
-	}
-#endif /* UNIV_DEBUG */
-
-	srv_buf_pool_flushed += page_count;
-}
-
-/******************************************************************//**
-Start a buffer flush batch for LRU or flush list */
-static
-ibool
-buf_flush_start(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-{
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->n_flush[flush_type] > 0
-	   || buf_pool->init_flush[flush_type] == TRUE) {
-
-		/* There is already a flush batch of the same type running */
-
-		buf_pool_mutex_exit(buf_pool);
-
-		return(FALSE);
-	}
-
-	buf_pool->init_flush[flush_type] = TRUE;
-
-	buf_pool_mutex_exit(buf_pool);
-
-	return(TRUE);
-}
-
-/******************************************************************//**
-End a buffer flush batch for LRU or flush list */
-static
-void
-buf_flush_end(
-/*==========*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-{
-	buf_pool_mutex_enter(buf_pool);
-
-	buf_pool->init_flush[flush_type] = FALSE;
-
-	if (buf_pool->n_flush[flush_type] == 0) {
-
-		/* The running flush batch has ended */
-
-		os_event_set(buf_pool->no_flush[flush_type]);
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-}
-
-/******************************************************************//**
-Waits until a flush batch of the given type ends */
-UNIV_INTERN
-void
-buf_flush_wait_batch_end(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	enum buf_flush	type)		/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
-{
-	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
-
-	if (buf_pool == NULL) {
-		ulint	i;
-
-		for (i = 0; i < srv_buf_pool_instances; ++i) {
-			buf_pool_t*	buf_pool;
-
-			buf_pool = buf_pool_from_array(i);
-
-			thd_wait_begin(NULL, THD_WAIT_DISKIO);
-			os_event_wait(buf_pool->no_flush[type]);
-			thd_wait_end(NULL);
-		}
-	} else {
-		thd_wait_begin(NULL, THD_WAIT_DISKIO);
-		os_event_wait(buf_pool->no_flush[type]);
-		thd_wait_end(NULL);
-	}
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-NOTE: The calling thread may own latches to pages: to avoid deadlocks,
-this function must be written so that it cannot end up waiting for these
-latches!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
-ulint
-buf_flush_LRU(
-/*==========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		min_n)		/*!< in: wished minimum mumber of blocks
-					flushed (it is not guaranteed that the
-					actual number is that big, though) */
-{
-	ulint		page_count;
-
-	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
-		return(ULINT_UNDEFINED);
-	}
-
-	page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
-
-	buf_flush_end(buf_pool, BUF_FLUSH_LRU);
-
-	buf_flush_common(BUF_FLUSH_LRU, page_count);
-
-	return(page_count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the flush list of
-all buffer pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
-ulint
-buf_flush_list(
-/*===========*/
-	ulint		min_n,		/*!< in: wished minimum mumber of blocks
-					flushed (it is not guaranteed that the
-					actual number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< in the case BUF_FLUSH_LIST all
-					blocks whose oldest_modification is
-					smaller than this should be flushed
-					(if their number does not exceed
-					min_n), otherwise ignored */
-{
-	ulint		i;
-	ulint		total_page_count = 0;
-	ibool		skipped = FALSE;
-
-	if (min_n != ULINT_MAX) {
-		/* Ensure that flushing is spread evenly amongst the
-		buffer pool instances. When min_n is ULINT_MAX
-		we need to flush everything up to the lsn limit
-		so no limit here. */
-		min_n = (min_n + srv_buf_pool_instances - 1)
-			 / srv_buf_pool_instances;
-	}
-
-	/* Flush to lsn_limit in all buffer pool instances */
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-		ulint		page_count = 0;
-
-		buf_pool = buf_pool_from_array(i);
-
-		if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
-			/* We have two choices here. If lsn_limit was
-			specified then skipping an instance of buffer
-			pool means we cannot guarantee that all pages
-			up to lsn_limit has been flushed. We can
-			return right now with failure or we can try
-			to flush remaining buffer pools up to the
-			lsn_limit. We attempt to flush other buffer
-			pools based on the assumption that it will
-			help in the retry which will follow the
-			failure. */
-			skipped = TRUE;
-
-			continue;
-		}
-
-		page_count = buf_flush_batch(
-			buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
-
-		buf_flush_end(buf_pool, BUF_FLUSH_LIST);
-
-		buf_flush_common(BUF_FLUSH_LIST, page_count);
-
-		total_page_count += page_count;
-	}
-
-	return(lsn_limit != IB_ULONGLONG_MAX && skipped
-	       ? ULINT_UNDEFINED : total_page_count);
-}
- 
-/******************************************************************//**
-Gives a recommendation of how many blocks should be flushed to establish
-a big enough margin of replaceable blocks near the end of the LRU list
-and in the free list.
-@return number of blocks which should be flushed from the end of the
-LRU list */
-static
-ulint
-buf_flush_LRU_recommendation(
-/*=========================*/
-	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
-{
-	buf_page_t*	bpage;
-	ulint		n_replaceable;
-	ulint		distance	= 0;
-	ibool		have_LRU_mutex = FALSE;
-
-	if(UT_LIST_GET_LEN(buf_pool->unzip_LRU))
-		have_LRU_mutex = TRUE;
-retry:
-	//buf_pool_mutex_enter(buf_pool);
-	if (have_LRU_mutex)
-		mutex_enter(&buf_pool->LRU_list_mutex);
-
-	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
-
-	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-
-	while ((bpage != NULL)
-	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
-		   + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
-	       && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {
-
-		mutex_t* block_mutex;
-		if (!bpage->in_LRU_list) {
-			/* reatart. but it is very optimistic */
-			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-			continue;
-		}
-		block_mutex = buf_page_get_mutex_enter(bpage);
-
-		if (block_mutex && buf_flush_ready_for_replace(bpage)) {
-			n_replaceable++;
-		}
-
-		if (block_mutex) {
-			mutex_exit(block_mutex);
-		}
-
-		distance++;
-
-		bpage = UT_LIST_GET_PREV(LRU, bpage);
-	}
-
-	//buf_pool_mutex_exit(buf_pool);
-	if (have_LRU_mutex)
-		mutex_exit(&buf_pool->LRU_list_mutex);
-
-	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {
-
-		return(0);
-	} else if (!have_LRU_mutex) {
-		/* confirm it again with LRU_mutex for exactness */
-		have_LRU_mutex = TRUE;
-		distance = 0;
-		goto retry;
-	}
-
-	return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
-	       + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
-	       - n_replaceable);
-}
-
-/*********************************************************************//**
-Flushes pages from the end of the LRU list if there is too small a margin
-of replaceable pages there or in the free list. VERY IMPORTANT: this function
-is called also by threads which have locks on pages. To avoid deadlocks, we
-flush only pages such that the s-lock required for flushing can be acquired
-immediately, without waiting. */
-UNIV_INTERN
-void
-buf_flush_free_margin(
-/*==================*/
-	buf_pool_t*	buf_pool,		/*!< in: Buffer pool instance */
-	ibool		wait)
-{
-	ulint	n_to_flush;
-
-	n_to_flush = buf_flush_LRU_recommendation(buf_pool);
-
-	if (n_to_flush > 0) {
-		ulint	n_flushed;
-
-		n_flushed = buf_flush_LRU(buf_pool, n_to_flush);
-
-		if (wait && n_flushed == ULINT_UNDEFINED) {
-			/* There was an LRU type flush batch already running;
-			let us wait for it to end */
-
-			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
-		}
-	}
-}
-
-/*********************************************************************//**
-Flushes pages from the end of all the LRU lists. */
-UNIV_INTERN
-void
-buf_flush_free_margins(
-/*========================*/
-	ibool	wait)
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_flush_free_margin(buf_pool, wait);
-	}
-}
-
-/*********************************************************************
-Update the historical stats that we are collecting for flush rate
-heuristics at the end of each interval.
-Flush rate heuristic depends on (a) rate of redo log generation and
-(b) the rate at which LRU flush is happening. */
-UNIV_INTERN
-void
-buf_flush_stat_update(void)
-/*=======================*/
-{
-	buf_flush_stat_t*	item;
-	ib_uint64_t		lsn_diff;
-	ib_uint64_t		lsn;
-	ulint			n_flushed;
-
-	lsn = log_get_lsn();
-	if (buf_flush_stat_cur.redo == 0) {
-		/* First time around. Just update the current LSN
-		and return. */
-		buf_flush_stat_cur.redo = lsn;
-		return;
-	}
-
-	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
-
-	/* values for this interval */
-	lsn_diff = lsn - buf_flush_stat_cur.redo;
-	n_flushed = buf_lru_flush_page_count
-		    - buf_flush_stat_cur.n_flushed;
-
-	/* add the current value and subtract the obsolete entry. */
-	buf_flush_stat_sum.redo += lsn_diff - item->redo;
-	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
-
-	/* put current entry in the array. */
-	item->redo = lsn_diff;
-	item->n_flushed = n_flushed;
-
-	/* update the index */
-	buf_flush_stat_arr_ind++;
-	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
-
-	/* reset the current entry. */
-	buf_flush_stat_cur.redo = lsn;
-	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
-}
-
-/*********************************************************************
-Determines the fraction of dirty pages that need to be flushed based
-on the speed at which we generate redo log. Note that if redo log
-is generated at a significant rate without corresponding increase
-in the number of dirty pages (for example, an in-memory workload)
-it can cause IO bursts of flushing. This function implements heuristics
-to avoid this burstiness.
-@return	number of dirty pages to be flushed / second */
-UNIV_INTERN
-ulint
-buf_flush_get_desired_flush_rate(void)
-/*==================================*/
-{
-	ulint		i;
-	lint		rate;
-	ulint		redo_avg;
-	ulint		n_dirty = 0;
-	ulint		n_flush_req;
-	ulint		lru_flush_avg;
-	ib_uint64_t	lsn = log_get_lsn();
-	ulint		log_capacity = log_get_capacity();
-
-	/* log_capacity should never be zero after the initialization
-	of log subsystem. */
-	ut_ad(log_capacity != 0);
-
-	/* Get total number of dirty pages. It is OK to access
-	flush_list without holding any mutex as we are using this
-	only for heuristics. */
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
-	}
-
-	/* An overflow can happen if we generate more than 2^32 bytes
-	of redo in this interval i.e.: 4G of redo in 1 second. We can
-	safely consider this as infinity because if we ever come close
-	to 4G we'll start a synchronous flush of dirty pages. */
-	/* redo_avg below is average at which redo is generated in
-	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
-	interval. */
-	redo_avg = (ulint) (buf_flush_stat_sum.redo
-			    / BUF_FLUSH_STAT_N_INTERVAL
-			    + (lsn - buf_flush_stat_cur.redo));
-
-	/* An overflow can happen possibly if we flush more than 2^32
-	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
-	unlikely scenario. Even when this happens it means that our
-	flush rate will be off the mark. It won't affect correctness
-	of any subsystem. */
-	/* lru_flush_avg below is rate at which pages are flushed as
-	part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
-	number of pages flushed in the current interval. */
-	lru_flush_avg = buf_flush_stat_sum.n_flushed
-			/ BUF_FLUSH_STAT_N_INTERVAL
-			+ (buf_lru_flush_page_count
-			   - buf_flush_stat_cur.n_flushed);
-
-	n_flush_req = (n_dirty * redo_avg) / log_capacity;
-
-	/* The number of pages that we want to flush from the flush
-	list is the difference between the required rate and the
-	number of pages that we are historically flushing from the
-	LRU list */
-	rate = n_flush_req - lru_flush_avg;
-	return(rate > 0 ? (ulint) rate : 0);
-}
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return	TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
-	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
-{
-	buf_page_t*		bpage;
-	const ib_rbt_node_t*	rnode = NULL;
-
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
-			 ut_ad(ut_list_node_313->in_flush_list));
-
-	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-	/* If we are in recovery mode i.e.: flush_rbt != NULL
-	then each block in the flush_list must also be present
-	in the flush_rbt. */
-	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-		rnode = rbt_first(buf_pool->flush_rbt);
-	}
-
-	while (bpage != NULL) {
-		const ib_uint64_t om = bpage->oldest_modification;
-
-		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
-
-		ut_ad(bpage->in_flush_list);
-
-		/* A page in buf_pool->flush_list can be in
-		BUF_BLOCK_REMOVE_HASH state. This happens when a page
-		is in the middle of being relocated. In that case the
-		original descriptor can have this state and still be
-		in the flush list waiting to acquire the
-		buf_pool->flush_list_mutex to complete the relocation. */
-		ut_a(buf_page_in_file(bpage)
-		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
-		ut_a(om > 0);
-
-		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
-			buf_page_t** prpage;
-
-			ut_a(rnode);
-			prpage = rbt_value(buf_page_t*, rnode);
-
-			ut_a(*prpage);
-			ut_a(*prpage == bpage);
-			rnode = rbt_next(buf_pool->flush_rbt, rnode);
-		}
-
-		bpage = UT_LIST_GET_NEXT(flush_list, bpage);
-
-		ut_a(!bpage || om >= bpage->oldest_modification);
-	}
-
-	/* By this time we must have exhausted the traversal of
-	flush_rbt (if active) as well. */
-	ut_a(rnode == NULL);
-
-	return(TRUE);
-}
-
-/******************************************************************//**
-Validates the flush list.
-@return	TRUE if ok */
-UNIV_INTERN
-ibool
-buf_flush_validate(
-/*===============*/
-	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
-{
-	ibool	ret;
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	ret = buf_flush_validate_low(buf_pool);
-
-	buf_flush_list_mutex_exit(buf_pool);
-
-	return(ret);
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc
new file mode 100644
index 00000000000..abcee504d2e
--- /dev/null
+++ b/storage/xtradb/buf/buf0flu.cc
@@ -0,0 +1,2938 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/** Number of pages flushed through non flush_list flushes. */
+// static ulint buf_lru_flush_page_count = 0;
+
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore no
+need to protect it by a mutex. It is only ever read by the thread
+doing the shutdown */
+UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
+/* @} */
+
+/** Handled page counters for a single flush */
+struct flush_counters_t {
+	ulint	flushed;	/*!< number of dirty pages flushed */
+	ulint	evicted;	/*!< number of clean pages evicted */
+};
+
+/******************************************************************//**
+Increases the flush_list size in bytes: by zip_size for a compressed page,
+or by UNIV_PAGE_SIZE for an uncompressed page. */
+static inline
+void
+incr_flush_list_size_in_bytes(
+/*==========================*/
+	buf_block_t*	block,		/*!< in: control block */
+	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+{
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+	ulint zip_size = page_zip_get_size(&block->page.zip);
+	buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
+	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return	TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
+
+/******************************************************************//**
+Validates the flush list some of the time.
+@return	TRUE if ok or the check was skipped */
+static
+ibool
+buf_flush_validate_skip(
+/*====================*/
+	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP	23
+
+	/** The buf_flush_validate_low() call skip counter.
+	Use a signed type because of the race condition below. */
+	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+	/* There is a race condition below, but it does not matter,
+	because this call is only for heuristic purposes. We want to
+	reduce the call frequency of the costly buf_flush_validate_low()
+	check in debug builds. */
+	if (--buf_flush_validate_count > 0) {
+		return(TRUE);
+	}
+
+	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+	return(buf_flush_validate_low(buf_pool));
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/*******************************************************************//**
+Sets hazard pointer during flush_list iteration. */
+UNIV_INLINE
+void
+buf_flush_set_hp(
+/*=============*/
+	buf_pool_t*		buf_pool,/*!< in/out: buffer pool instance */
+	const buf_page_t*	bpage)	/*!< in: buffer control block */
+{
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+	ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
+	ut_ad(!bpage || buf_page_in_file(bpage)
+	      || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+	ut_ad(!bpage || bpage->in_flush_list);
+	ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
+
+	buf_pool->flush_list_hp = bpage;
+}
+
+/*******************************************************************//**
+Checks if the given block is a hazard pointer
+@return true if bpage is hazard pointer */
+UNIV_INLINE
+bool
+buf_flush_is_hp(
+/*============*/
+	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
+	const buf_page_t*	bpage)	/*!< in: buffer control block */
+{
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+	return(buf_pool->flush_list_hp == bpage);
+}
+
+/*******************************************************************//**
+Whenever we move a block in flush_list (either to remove it or to
+relocate it) we check the hazard pointer set by some other thread
+doing the flush list scan. If the hazard pointer is the same as the
+one we are about to move then we set it to NULL to force a rescan
+in the thread doing the batch. */
+UNIV_INLINE
+void
+buf_flush_update_hp(
+/*================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_page_t*	bpage)		/*!< in: buffer control block */
+{
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+	if (buf_flush_is_hp(buf_pool, bpage)) {
+		buf_flush_set_hp(buf_pool, NULL);
+		MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
+	}
+}
+
+/******************************************************************//**
+Insert a block in the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return	pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
+{
+	const ib_rbt_node_t*	c_node;
+	const ib_rbt_node_t*	p_node;
+	buf_page_t*		prev = NULL;
+	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
+
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+	/* Insert this buffer into the rbt. */
+	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+	ut_a(c_node != NULL);
+
+	/* Get the predecessor. */
+	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+	if (p_node != NULL) {
+		buf_page_t**	value;
+		value = rbt_value(buf_page_t*, p_node);
+		prev = *value;
+		ut_a(prev != NULL);
+	}
+
+	return(prev);
+}
+
+/*********************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
+{
+#ifdef UNIV_DEBUG
+	ibool		ret = FALSE;
+#endif /* UNIV_DEBUG */
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+#ifdef UNIV_DEBUG
+	ret =
+#endif /* UNIV_DEBUG */
+	rbt_delete(buf_pool->flush_rbt, &bpage);
+
+	ut_ad(ret);
+}
+
+/*****************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return	 < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+	const void*	p1,		/*!< in: block1 */
+	const void*	p2)		/*!< in: block2 */
+{
+	int			ret;
+	const buf_page_t*	b1 = *(const buf_page_t**) p1;
+	const buf_page_t*	b2 = *(const buf_page_t**) p2;
+#ifdef UNIV_DEBUG
+	buf_pool_t*		buf_pool = buf_pool_from_bpage(b1);
+#endif /* UNIV_DEBUG */
+
+	ut_ad(b1 != NULL);
+	ut_ad(b2 != NULL);
+
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+	ut_ad(b1->in_flush_list);
+	ut_ad(b2->in_flush_list);
+
+	if (b2->oldest_modification > b1->oldest_modification) {
+		return(1);
+	} else if (b2->oldest_modification < b1->oldest_modification) {
+		return(-1);
+	}
+
+	/* If oldest_modification is same then decide on the space. */
+	ret = (int)(b2->space - b1->space);
+
+	/* Or else decide ordering on the offset field. */
+	return(ret ? ret : (int)(b2->offset - b1->offset));
+}
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		buf_flush_list_mutex_enter(buf_pool);
+
+		/* Create red black tree for speedy insertions in flush list. */
+		buf_pool->flush_rbt = rbt_create(
+			sizeof(buf_page_t*), buf_flush_block_cmp);
+
+		buf_flush_list_mutex_exit(buf_pool);
+	}
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		buf_flush_list_mutex_enter(buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+		rbt_free(buf_pool->flush_rbt);
+		buf_pool->flush_rbt = NULL;
+
+		buf_flush_list_mutex_exit(buf_pool);
+	}
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
+	buf_block_t*	block,		/*!< in/out: block which is modified */
+	lsn_t		lsn)		/*!< in: oldest modification */
+{
+	ut_ad(log_flush_order_mutex_own());
+	ut_ad(mutex_own(&block->mutex));
+
+	buf_flush_list_mutex_enter(buf_pool);
+
+	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+		  <= lsn));
+
+	/* If we are in the recovery then we need to update the flush
+	red-black tree as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_list_mutex_exit(buf_pool);
+		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
+		return;
+	}
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(!block->page.in_flush_list);
+
+	ut_d(block->page.in_flush_list = TRUE);
+	block->page.oldest_modification = lsn;
+	UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+	incr_flush_list_size_in_bytes(block, buf_pool);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		ulint	zip_size = buf_block_get_zip_size(block);
+
+		if (zip_size) {
+			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+		} else {
+			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+		}
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_block_t*	block,		/*!< in/out: block which is modified */
+	lsn_t		lsn)		/*!< in: oldest modification */
+{
+	buf_page_t*	prev_b;
+	buf_page_t*	b;
+
+	ut_ad(log_flush_order_mutex_own());
+	ut_ad(mutex_own(&block->mutex));
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	buf_flush_list_mutex_enter(buf_pool);
+
+	/* The field in_LRU_list is protected by buf_pool->LRU_list_mutex,
+	which we are not holding.  However, while a block is in the flush
+	list, it is dirty and cannot be discarded, neither from the
+	page_hash nor from the LRU list.  At most, the uncompressed
+	page frame of a compressed block may be discarded or created
+	(copying the block->page to or from a buf_page_t that is
+	dynamically allocated from buf_buddy_alloc()).  Because those
+	transitions hold block->mutex and the flush list mutex (via
+	buf_flush_relocate_on_flush_list()), there is no possibility
+	of a race condition in the assertions below. */
+	ut_ad(block->page.in_LRU_list);
+	ut_ad(block->page.in_page_hash);
+	/* buf_buddy_block_register() will take a block in the
+	BUF_BLOCK_MEMORY state, not a file page. */
+	ut_ad(!block->page.in_zip_hash);
+
+	ut_ad(!block->page.in_flush_list);
+	ut_d(block->page.in_flush_list = TRUE);
+	block->page.oldest_modification = lsn;
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		ulint	zip_size = buf_block_get_zip_size(block);
+
+		if (zip_size) {
+			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+		} else {
+			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+		}
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	prev_b = NULL;
+
+	/* For the most part when this function is called the flush_rbt
+	should not be NULL. In a very rare boundary case it is possible
+	that the flush_rbt has already been freed by the recovery thread
+	before the last page was hooked up in the flush_list by the
+	io-handler thread. In that case we'll just do a simple
+	linear search in the else block. */
+	if (buf_pool->flush_rbt) {
+
+		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+	} else {
+
+		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+		while (b && b->oldest_modification
+		       > block->page.oldest_modification) {
+			ut_ad(b->in_flush_list);
+			prev_b = b;
+			b = UT_LIST_GET_NEXT(list, b);
+		}
+	}
+
+	if (prev_b == NULL) {
+		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+	} else {
+		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
+				     prev_b, &block->page);
+	}
+
+	incr_flush_list_size_in_bytes(block, buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED allowed.
+@return	TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+	buf_page_t*	bpage)	/*!< in: buffer control block, must be
+				buf_page_in_file(bpage) and in the LRU list */
+{
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(bpage->in_LRU_list);
+
+	if (UNIV_LIKELY(buf_page_in_file(bpage))) {
+
+		return(bpage->oldest_modification == 0
+		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+		       && bpage->buf_fix_count == 0);
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Error: buffer block state %lu"
+		" in the LRU list!\n",
+		(ulong) buf_page_get_state(bpage));
+	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+	putc('\n', stderr);
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Returns true if the block is modified and ready for flushing.
+@return	true if can flush immediately */
+UNIV_INTERN
+bool
+buf_flush_ready_for_flush(
+/*======================*/
+	buf_page_t*	bpage,	/*!< in: buffer control block, must be
+				buf_page_in_file(bpage) */
+	buf_flush_t	flush_type)/*!< in: type of flush */
+{
+	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+	ut_ad(mutex_own(buf_page_get_mutex(bpage))
+	      || flush_type == BUF_FLUSH_LIST);
+	ut_a(buf_page_in_file(bpage));
+
+	if (bpage->oldest_modification == 0
+	    || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) {
+		return(false);
+	}
+
+	ut_ad(bpage->in_flush_list);
+
+	switch (flush_type) {
+	case BUF_FLUSH_LIST:
+	case BUF_FLUSH_LRU:
+	case BUF_FLUSH_SINGLE_PAGE:
+		/* Because any thread may call single page flush, even
+		when owning locks on pages, to avoid deadlocks, we must
+		make sure that the page is not buffer fixed.
+		The same holds true for LRU flush because a user thread
+		may end up waiting for an LRU flush to end while
+		holding locks on other pages. */
+		return(bpage->buf_fix_count == 0);
+	case BUF_FLUSH_N_TYPES:
+		break;
+	}
+
+	ut_error;
+	return(false);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
+{
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ulint		zip_size;
+
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
+	      || mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+	ut_ad(bpage->in_flush_list);
+
+	buf_flush_list_mutex_enter(buf_pool);
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_POOL_WATCH:
+	case BUF_BLOCK_ZIP_PAGE:
+		/* Clean compressed pages should not be on the flush list */
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		return;
+	case BUF_BLOCK_ZIP_DIRTY:
+		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+		break;
+	}
+
+	/* If the flush_rbt is active then delete from there as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_delete_from_flush_rbt(bpage);
+	}
+
+	/* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in comparison function. */
+	ut_d(bpage->in_flush_list = FALSE);
+
+	zip_size = page_zip_get_size(&bpage->zip);
+	buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+	bpage->oldest_modification = 0;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	buf_flush_update_hp(buf_pool, bpage);
+	buf_flush_list_mutex_exit(buf_pool);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage)	/*!< in/out: destination block */
+{
+	buf_page_t*	prev;
+	buf_page_t*	prev_b = NULL;
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	/* Must reside in the same buffer pool. */
+	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
+
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	buf_flush_list_mutex_enter(buf_pool);
+
+	ut_ad(bpage->in_flush_list);
+	ut_ad(dpage->in_flush_list);
+
+	/* If recovery is active we must swap the control blocks in
+	the flush_rbt as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_delete_from_flush_rbt(bpage);
+		prev_b = buf_flush_insert_in_flush_rbt(dpage);
+	}
+
+	/* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in comparison function. */
+	ut_d(bpage->in_flush_list = FALSE);
+
+	prev = UT_LIST_GET_PREV(list, bpage);
+	UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+	if (prev) {
+		ut_ad(prev->in_flush_list);
+		UT_LIST_INSERT_AFTER(
+			list,
+			buf_pool->flush_list,
+			prev, dpage);
+	} else {
+		UT_LIST_ADD_FIRST(
+			list,
+			buf_pool->flush_list,
+			dpage);
+	}
+
+	/* Just an extra check. Previous in flush_list
+	should be the same control block as in flush_rbt. */
+	ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	buf_flush_update_hp(buf_pool, bpage);
+	buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Bookkeeping done once the write of a page has finished. Under
+flush_state_mutex the page is passed to buf_flush_remove(), its io-fix is
+cleared and the pending-write counter of its flush type is decremented.
+If that was the last pending write and no batch of the type is being
+initiated, the no_flush event of the type is set. Finally the doublewrite
+buffer is notified through buf_dblwr_update(). */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: control block whose write completed */
+{
+	const buf_flush_t	type = buf_page_get_flush_type(bpage);
+	buf_pool_t*		pool = buf_pool_from_bpage(bpage);
+
+	mutex_enter(&pool->flush_state_mutex);
+
+	buf_flush_remove(bpage);
+	buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+	/* One write fewer is pending for this flush type. */
+	--pool->n_flush[type];
+
+	const bool	batch_ended = pool->n_flush[type] == 0
+		&& pool->init_flush[type] == FALSE;
+
+	if (batch_ended) {
+		/* Nothing pending and nothing being started: wake up
+		whoever waits on the no_flush event. */
+		os_event_set(pool->no_flush[type]);
+	}
+
+	buf_dblwr_update(bpage, type);
+
+	mutex_exit(&pool->flush_state_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Stamps a page of a compressed table: computes its checksum with the
+algorithm selected by srv_checksum_algorithm, writes the given LSN to
+FIL_PAGE_LSN, zeroes the 8 bytes at FIL_PAGE_FILE_FLUSH_LSN and stores
+the checksum in FIL_PAGE_SPACE_OR_CHKSUM. */
+UNIV_INTERN
+void
+buf_flush_update_zip_checksum(
+/*==========================*/
+	buf_frame_t*	page,		/*!< in/out: Page to update */
+	ulint		zip_size,	/*!< in: Compressed page size */
+	lsn_t		lsn)		/*!< in: Lsn to stamp on the page */
+{
+	ut_a(zip_size > 0);
+
+	/* The checksum is taken first; the header fields are written
+	afterwards, in the same order as before. */
+	const srv_checksum_algorithm_t	algo
+		= static_cast<srv_checksum_algorithm_t>(
+			srv_checksum_algorithm);
+	const ib_uint32_t		zip_checksum
+		= page_zip_calc_checksum(page, zip_size, algo);
+
+	byte*	lsn_field = page + FIL_PAGE_LSN;
+	byte*	flush_lsn_field = page + FIL_PAGE_FILE_FLUSH_LSN;
+	byte*	checksum_field = page + FIL_PAGE_SPACE_OR_CHKSUM;
+
+	mach_write_to_8(lsn_field, lsn);
+	memset(flush_lsn_field, 0, 8);
+	mach_write_to_4(checksum_field, zip_checksum);
+}
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace.
+Compressed page (page_zip_ != NULL): for the page types listed in the
+switch the LSN and checksum are stamped on page_zip->data through
+buf_flush_update_zip_checksum() and the function returns; for the
+"essentially uncompressed" types the first zip_size bytes of page are
+copied into page_zip->data beforehand. Any other page type dumps both
+page images to stderr and aborts via ut_error.
+Uncompressed page: newest_lsn is written to the header (FIL_PAGE_LSN) and
+to the trailer, then the checksum chosen by srv_checksum_algorithm is
+stored in FIL_PAGE_SPACE_OR_CHKSUM and in the first 4 bytes of the
+trailer field (recomputed with buf_calc_page_old_checksum() for the
+INNODB algorithms only). */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*	page,		/*!< in/out: page */
+	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
+	lsn_t	newest_lsn)	/*!< in: newest modification lsn
+				to the page */
+{
+	ib_uint32_t	checksum = 0 /* silence bogus gcc warning */;
+
+	ut_ad(page);
+
+	if (page_zip_) {
+		page_zip_des_t*	page_zip;
+		ulint		zip_size;
+
+		page_zip = static_cast<page_zip_des_t*>(page_zip_);
+		zip_size = page_zip_get_size(page_zip);
+
+		ut_ad(zip_size);
+		ut_ad(ut_is_2pow(zip_size));
+		ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+		case FIL_PAGE_TYPE_ALLOCATED:
+		case FIL_PAGE_INODE:
+		case FIL_PAGE_IBUF_BITMAP:
+		case FIL_PAGE_TYPE_FSP_HDR:
+		case FIL_PAGE_TYPE_XDES:
+			/* These are essentially uncompressed pages. */
+			memcpy(page_zip->data, page, zip_size);
+			/* fall through */
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+		case FIL_PAGE_INDEX:
+
+			buf_flush_update_zip_checksum(
+				page_zip->data, zip_size, newest_lsn);
+
+			return;
+		}
+
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: ERROR: The compressed page to be written"
+		      " seems corrupt:", stderr);
+		ut_print_buf(stderr, page, zip_size);
+		fputs("\nInnoDB: Possibly older version of the page:", stderr);
+		ut_print_buf(stderr, page_zip->data, zip_size);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	/* Write the newest modification lsn to the page header and trailer */
+	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
+
+	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			newest_lsn);
+
+	/* Store the new formula checksum */
+
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		checksum = buf_calc_page_crc32(page);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		checksum = BUF_NO_CHECKSUM_MAGIC;
+		break;
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
+
+	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+
+	/* We overwrite the first 4 bytes of the end lsn field to store
+	the old formula checksum. Since it depends also on the field
+	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
+	new formula checksum. */
+
+	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+	    || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+
+		checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
+
+		/* In other cases we use the value assigned from above.
+		If CRC32 is used then it is faster to use that checksum
+		(calculated above) instead of calculating another one.
+		We can afford to store something other than
+		buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
+		this field because the file will not be readable by old
+		versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
+	}
+
+	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			checksum);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_dblwr_flush_buffered_writes after we have posted a batch of
+writes!
+Sequence: (1) unless UNIV_LOG_DEBUG is defined, the redo log is forced to
+disk up to bpage->newest_modification; (2) the frame to write is prepared
+according to the page state; (3) the write is issued directly through
+fil_io() when the doublewrite buffer is not in use, otherwise it goes
+through buf_dblwr_write_single_page() or buf_dblwr_add_to_batch();
+(4) for a sync request the tablespace is flushed and
+buf_page_io_complete() is called before returning. */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+	buf_page_t*	bpage,		/*!< in: buffer block to write */
+	buf_flush_t	flush_type,	/*!< in: type of flush */
+	bool		sync)		/*!< in: true if sync IO request */
+{
+	ulint	zip_size	= buf_page_get_zip_size(bpage);
+	page_t*	frame		= NULL;
+
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+
+#ifdef UNIV_LOG_DEBUG
+	static ibool	univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+	ut_ad(buf_page_in_file(bpage));
+
+	/* We are not holding block_mutex here.
+	Nevertheless, it is safe to access bpage, because it is
+	io_fixed and oldest_modification != 0.  Thus, it cannot be
+	relocated in the buffer pool or removed from flush_list or
+	LRU_list. */
+	ut_ad(!buf_flush_list_mutex_own(buf_pool));
+	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
+	ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+	ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+	if (!univ_log_debug_warned) {
+		univ_log_debug_warned = TRUE;
+		fputs("Warning: cannot force log to disk if"
+		      " UNIV_LOG_DEBUG is defined!\n"
+		      "Crash recovery will not work!\n",
+		      stderr);
+	}
+#else
+	/* Force the log to the disk before writing the modified block */
+	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_POOL_WATCH:
+	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		break;
+	case BUF_BLOCK_ZIP_DIRTY:
+		frame = bpage->zip.data;
+
+		ut_a(page_zip_verify_checksum(frame, zip_size));
+
+		mach_write_to_8(frame + FIL_PAGE_LSN,
+				bpage->newest_modification);
+		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		frame = bpage->zip.data;
+		if (!frame) {
+			frame = ((buf_block_t*) bpage)->frame;
+		}
+
+		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+					   bpage->zip.data
+					   ? &bpage->zip : NULL,
+					   bpage->newest_modification);
+		break;
+	}
+
+	if (!srv_use_doublewrite_buf || !buf_dblwr) {
+		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+		       sync, buf_page_get_space(bpage), zip_size,
+		       buf_page_get_page_no(bpage), 0,
+		       zip_size ? zip_size : UNIV_PAGE_SIZE,
+		       frame, bpage);
+	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+		buf_dblwr_write_single_page(bpage, sync);
+	} else {
+		ut_ad(!sync);
+		buf_dblwr_add_to_batch(bpage);
+	}
+
+	/* When doing single page flushing the IO is done synchronously
+	and we flush the changes to disk only for the tablespace we
+	are working on. */
+	if (sync) {
+		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
+		fil_flush(buf_page_get_space(bpage));
+		buf_page_io_complete(bpage);
+	}
+
+	/* Increment the counter of I/O operations used
+	for selecting LRU policy. */
+	buf_LRU_stat_inc_io();
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this
+function, and it will be released by this function.
+Under flush_state_mutex the page is io-fixed for write, tagged with
+flush_type and counted in buf_pool->n_flush[flush_type] (the no_flush
+event is reset when the counter was 0). For an uncompressed page
+(BUF_BLOCK_FILE_PAGE) the block lock is s-latched; the block mutex is
+released and the write is handed to buf_flush_write_block_low(). */
+UNIV_INTERN
+void
+buf_flush_page(
+/*===========*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_page_t*	bpage,		/*!< in: buffer control block */
+	buf_flush_t	flush_type,	/*!< in: type of flush */
+	bool		sync)		/*!< in: true if sync IO request */
+{
+	ib_mutex_t*	block_mutex;
+	ibool		is_uncompressed;
+
+	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
+
+	block_mutex = buf_page_get_mutex(bpage);
+	ut_ad(mutex_own(block_mutex));
+
+	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+	mutex_enter(&buf_pool->flush_state_mutex);
+
+	buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+	buf_page_set_flush_type(bpage, flush_type);
+
+	if (buf_pool->n_flush[flush_type] == 0) {
+
+		os_event_reset(buf_pool->no_flush[flush_type]);
+	}
+
+	buf_pool->n_flush[flush_type]++;
+
+	mutex_exit(&buf_pool->flush_state_mutex);
+
+	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
+
+	switch (flush_type) {
+		ibool	is_s_latched;
+	case BUF_FLUSH_LIST:
+		/* If the simulated aio thread is not running, we must
+		not wait for any latch, as we may end up in a deadlock:
+		if buf_fix_count == 0, then we know we need not wait.
+		is_s_latched records that case: the s-latch (uncompressed
+		pages only) is then taken right here, before the block
+		mutex is released. */
+
+		is_s_latched = (bpage->buf_fix_count == 0);
+		if (is_s_latched && is_uncompressed) {
+			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+					   BUF_IO_WRITE);
+		}
+
+		mutex_exit(block_mutex);
+
+		/* Even though bpage is not protected by any mutex at
+		this point, it is safe to access bpage, because it is
+		io_fixed and oldest_modification != 0.  Thus, it
+		cannot be relocated in the buffer pool or removed from
+		flush_list or LRU_list.
+		When the latch was not taken above, the buffered
+		doublewrite writes are flushed first and only then is the
+		s-latch requested. */
+
+		if (!is_s_latched) {
+			buf_dblwr_flush_buffered_writes();
+
+			if (is_uncompressed) {
+				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
+						   ->lock, BUF_IO_WRITE);
+			}
+		}
+
+		break;
+
+	case BUF_FLUSH_LRU:
+	case BUF_FLUSH_SINGLE_PAGE:
+		/* VERY IMPORTANT:
+		Because any thread may call single page flush, even when
+		owning locks on pages, to avoid deadlocks, we must make
+		sure that the s-lock is acquired on the page without
+		waiting: this is accomplished because
+		buf_flush_ready_for_flush() must hold, and that requires
+		the page not to be bufferfixed.
+		The same holds true for LRU flush because a user thread
+		may end up waiting for an LRU flush to end while
+		holding locks on other pages. */
+
+		if (is_uncompressed) {
+			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+					   BUF_IO_WRITE);
+		}
+
+		/* Note that the s-latch is acquired before releasing the
+		buf_page_get_mutex() mutex: this ensures that the latch is
+		acquired immediately. */
+
+		mutex_exit(block_mutex);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	/* Even though bpage is not protected by any mutex at this
+	point, it is safe to access bpage, because it is io_fixed and
+	oldest_modification != 0.  Thus, it cannot be relocated in the
+	buffer pool or removed from flush_list or LRU_list. */
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Flushing %u space %u page %u\n",
+			flush_type, bpage->space, bpage->offset);
+	}
+#endif /* UNIV_DEBUG */
+	buf_flush_write_block_low(bpage, flush_type, sync);
+}
+
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Debug-build helper that attempts a synchronous single-page flush of an
+uncompressed block. The caller must hold block->mutex. When the page is
+ready for a BUF_FLUSH_SINGLE_PAGE flush it is passed to buf_flush_page(),
+which releases the block mutex; otherwise nothing is done and the mutex
+stays held.
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	buf_block_t*	block)		/*!< in/out: buffer control block */
+{
+	buf_page_t*	bpage = &block->page;
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(mutex_own(&block->mutex));
+
+	if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
+		/* buf_flush_page() releases the block mutex. */
+		buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/***********************************************************//**
+Tells whether the page (space, offset) is present in the buffer pool and
+could be flushed with the given flush type. The page hash is looked up
+under its s-lock, which is released once the block mutex is held. For an
+LRU flush only 'old' blocks qualify.
+@return	true if the page can be flushed. */
+static
+bool
+buf_flush_check_neighbor(
+/*=====================*/
+	ulint		space,		/*!< in: space id */
+	ulint		offset,		/*!< in: page offset */
+	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST */
+{
+	prio_rw_lock_t*	hash_lock;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+
+	ut_ad(flush_type == BUF_FLUSH_LRU
+	      || flush_type == BUF_FLUSH_LIST);
+
+	/* Look the page up in the buffer pool instance it maps to. */
+	buf_page_t*	bpage = buf_page_hash_get_s_locked(
+		buf_pool, space, offset, &hash_lock);
+
+	if (bpage == NULL) {
+		return(false);
+	}
+
+	/* Switch from the hash lock to the block mutex. */
+	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+
+	mutex_enter(block_mutex);
+	rw_lock_s_unlock(hash_lock);
+
+	ut_a(buf_page_in_file(bpage));
+
+	/* 'Non-old' blocks are not flushed in an LRU flush, because the
+	flushed blocks are soon freed. */
+	const bool	flushable
+		= (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage))
+		&& buf_flush_ready_for_flush(bpage, flush_type);
+
+	mutex_exit(block_mutex);
+
+	return(flushable);
+}
+
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+The area is just the page itself when the LRU list is shorter than
+BUF_LRU_OLD_MIN_LEN or srv_flush_neighbors == 0. Otherwise it is the
+aligned range of min(BUF_READ_AHEAD_AREA, curr_size / 16) pages around
+offset, narrowed to the contiguous run of flushable pages when
+srv_flush_neighbors == 1, and clamped to the tablespace size.
+Once n_flushed + the pages flushed here reach n_to_flush, only the
+original page (offset) is still attempted.
+@return	number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+	ulint		space,		/*!< in: space id */
+	ulint		offset,		/*!< in: page offset */
+	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST */
+	ulint		n_flushed,	/*!< in: number of pages
+					flushed so far in this batch */
+	ulint		n_to_flush)	/*!< in: maximum number of pages
+					we are allowed to flush */
+{
+	ulint		i;
+	ulint		low;
+	ulint		high;
+	ulint		count = 0;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+	ut_ad(!buf_flush_list_mutex_own(buf_pool));
+
+	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
+	    || srv_flush_neighbors == 0) {
+		/* If there is little space or neighbor flushing is
+		not enabled then just flush the victim. */
+		low = offset;
+		high = offset + 1;
+	} else {
+		/* When flushed, dirty blocks are searched in
+		neighborhoods of this size, and flushed along with the
+		original page. */
+
+		ulint	buf_flush_area;
+
+		buf_flush_area	= ut_min(
+			BUF_READ_AHEAD_AREA(buf_pool),
+			buf_pool->curr_size / 16);
+
+		low = (offset / buf_flush_area) * buf_flush_area;
+		high = (offset / buf_flush_area + 1) * buf_flush_area;
+
+		if (srv_flush_neighbors == 1) {
+			/* adjust 'low' and 'high' to limit
+			   for contiguous dirty area */
+			if (offset > low) {
+				for (i = offset - 1; i >= low; i--) {
+					if (!buf_flush_check_neighbor(
+						space, i, flush_type)) {
+
+						break;
+					}
+
+					if (i == low) {
+						/* 'i' is unsigned: when
+						low == 0 the test i >= low
+						can never fail, so stop
+						here instead of wrapping
+						around and probing page
+						number (ulint) -1. */
+						i--;
+						break;
+					}
+				}
+				low = i + 1;
+			}
+
+			for (i = offset + 1;
+			     i < high
+			     && buf_flush_check_neighbor(
+						space, i, flush_type);
+			     i++) {
+				/* do nothing */
+			}
+			high = i;
+		}
+	}
+
+	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+	/* Read the tablespace size once so that the comparison and the
+	clamp use the same value. */
+	const ulint	space_size = fil_space_get_size(space);
+
+	if (high > space_size) {
+		high = space_size;
+	}
+
+	for (i = low; i < high; i++) {
+
+		buf_page_t*	bpage;
+		prio_rw_lock_t*	hash_lock;
+		ib_mutex_t*	block_mutex;
+
+		if ((count + n_flushed) >= n_to_flush) {
+
+			/* We have already flushed enough pages and
+			should call it a day. There is, however, one
+			exception. If the page whose neighbors we
+			are flushing has not been flushed yet then
+			we'll try to flush the victim that we
+			selected originally. */
+			if (i <= offset) {
+				i = offset;
+			} else {
+				break;
+			}
+		}
+
+		buf_pool = buf_pool_get(space, i);
+
+		/* We only want to flush pages from this buffer pool. */
+		bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
+						   &hash_lock);
+
+		if (!bpage) {
+
+			continue;
+		}
+
+		block_mutex = buf_page_get_mutex(bpage);
+
+		mutex_enter(block_mutex);
+
+		rw_lock_s_unlock(hash_lock);
+
+		ut_a(buf_page_in_file(bpage));
+
+		/* We avoid flushing 'non-old' blocks in an LRU flush,
+		because the flushed blocks are soon freed */
+
+		if (flush_type != BUF_FLUSH_LRU
+		    || i == offset
+		    || buf_page_is_old(bpage)) {
+
+			if (buf_flush_ready_for_flush(bpage, flush_type)
+			    && (i == offset || !bpage->buf_fix_count)) {
+				/* We only try to flush those
+				neighbors != offset where the buf fix
+				count is zero, as we then know that we
+				probably can latch the page without a
+				semaphore wait. Semaphore waits are
+				expensive because we must flush the
+				doublewrite buffer before we start
+				waiting. */
+
+				buf_flush_page(buf_pool, bpage, flush_type, false);
+				ut_ad(!mutex_own(block_mutex));
+				count++;
+				continue;
+			}
+		}
+
+		mutex_exit(block_mutex);
+	}
+
+	if (count > 0) {
+		/* The victim page itself is not counted as a neighbor. */
+		MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+					MONITOR_FLUSH_NEIGHBOR_COUNT,
+					MONITOR_FLUSH_NEIGHBOR_PAGES,
+					(count - 1));
+	}
+
+	return(count);
+}
+
+/********************************************************************//**
+Check if the block is modified and ready for flushing. If the block
+is ready to flush then flush the page and try to flush its neighbors.
+On entry the caller holds the LRU list mutex (BUF_FLUSH_LRU) or the flush
+list mutex (BUF_FLUSH_LIST); the same mutex is held again on return. It
+is dropped around the call to buf_flush_try_neighbors().
+
+@return	TRUE if, depending on the flush type, either LRU or flush list
+mutex was released during this function.  This does not guarantee that some
+pages were written as well.
+Number of pages written are incremented to the count. */
+static
+ibool
+buf_flush_page_and_try_neighbors(
+/*=============================*/
+	buf_page_t*	bpage,		/*!< in: buffer control block,
+					must be
+					buf_page_in_file(bpage) */
+	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+	ulint		n_to_flush,	/*!< in: number of pages to
+					flush */
+	ulint*		count)		/*!< in/out: number of pages
+					flushed */
+{
+	ib_mutex_t*	block_mutex = NULL;
+	ibool		flushed = FALSE;
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+#endif /* UNIV_DEBUG */
+
+	ut_ad((flush_type == BUF_FLUSH_LRU
+	       && mutex_own(&buf_pool->LRU_list_mutex))
+	      || (flush_type == BUF_FLUSH_LIST
+		  && buf_flush_list_mutex_own(buf_pool)));
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+	}
+
+	if (UNIV_UNLIKELY(buf_page_get_state(bpage)
+			  == BUF_BLOCK_REMOVE_HASH)) {
+
+		/* In case we don't hold the LRU list mutex, we may see a page
+		that is about to be relocated on the flush list.  Do not
+		attempt to flush it.
+		NOTE(review): this early return does not release
+		block_mutex. That is only correct because the branch is
+		asserted to be reached for BUF_FLUSH_LIST, where
+		block_mutex is never acquired in this function. */
+		ut_ad(flush_type == BUF_FLUSH_LIST);
+		return (flushed);
+	}
+
+	ut_a(buf_page_in_file(bpage));
+
+	if (buf_flush_ready_for_flush(bpage, flush_type)) {
+		ulint		space;
+		ulint		offset;
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_bpage(bpage);
+
+		if (flush_type == BUF_FLUSH_LRU) {
+			mutex_exit(&buf_pool->LRU_list_mutex);
+		}
+
+		/* These fields are protected by the buf_page_get_mutex()
+		mutex. */
+		space = buf_page_get_space(bpage);
+		offset = buf_page_get_page_no(bpage);
+
+		if (flush_type == BUF_FLUSH_LRU) {
+			mutex_exit(block_mutex);
+		} else {
+			buf_flush_list_mutex_exit(buf_pool);
+		}
+
+		/* Try to flush also all the neighbors */
+		*count += buf_flush_try_neighbors(space,
+						  offset,
+						  flush_type,
+						  *count,
+						  n_to_flush);
+
+		if (flush_type == BUF_FLUSH_LRU) {
+			mutex_enter(&buf_pool->LRU_list_mutex);
+		} else {
+			buf_flush_list_mutex_enter(buf_pool);
+		}
+		flushed = TRUE;
+	} else if (flush_type == BUF_FLUSH_LRU) {
+		mutex_exit(block_mutex);
+	}
+
+	ut_ad((flush_type == BUF_FLUSH_LRU
+	       && mutex_own(&buf_pool->LRU_list_mutex))
+	      || (flush_type == BUF_FLUSH_LIST
+		  && buf_flush_list_mutex_own(buf_pool)));
+
+	return(flushed);
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that it is a best effort attempt and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+The scan walks unzip_LRU from its tail and stops when 'max' blocks have
+been freed, when the free list length reaches srv_LRU_scan_depth, or when
+unzip_LRU is no longer than a tenth of the LRU list. The caller must hold
+LRU_list_mutex, and it is held again on return.
+@return number of blocks moved to the free list. */
+static
+ulint
+buf_free_from_unzip_LRU_list_batch(
+/*===============================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		max)		/*!< in: desired number of
+					blocks in the free_list */
+{
+	buf_block_t*	block;
+	ulint		scanned = 0;
+	ulint		count = 0;
+	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
+	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+	while (block != NULL && count < max
+	       && free_len < srv_LRU_scan_depth
+	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(&block->page);
+
+		++scanned;
+
+		mutex_enter(block_mutex);
+
+		if (buf_LRU_free_page(&block->page, false)) {
+
+			mutex_exit(block_mutex);
+			/* Block was freed. LRU list mutex potentially
+			released and reacquired.
+			NOTE(review): LRU_list_mutex is entered right
+			below, so buf_LRU_free_page() presumably returns
+			with it released on success - confirm against its
+			definition. The scan restarts from the tail. */
+			++count;
+			mutex_enter(&buf_pool->LRU_list_mutex);
+			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+
+		} else {
+
+			mutex_exit(block_mutex);
+			block = UT_LIST_GET_PREV(unzip_LRU, block);
+		}
+
+		free_len = UT_LIST_GET_LEN(buf_pool->free);
+		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+	}
+
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+	if (scanned) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_SCANNED,
+			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+			scanned);
+	}
+
+	return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list.
+The calling thread is not allowed to own any latches on pages!
+It attempts to make 'max' blocks available in the free list. Note that
+it is a best effort attempt and it is not guaranteed that after a call
+to this function there will be 'max' blocks in the free list.
+The function returns nothing; the number of pages for which a write was
+queued and the number of pages evicted are reported through n, both of
+which are reset to 0 on entry. The caller must hold LRU_list_mutex, and
+it is held again on return. */
+__attribute__((nonnull))
+static
+void
+buf_flush_LRU_list_batch(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		max,		/*!< in: desired number of
+					blocks in the free_list */
+	bool		limited_scan,	/*!< in: if true, allow to scan only up
+					to srv_LRU_scan_depth pages in total */
+	flush_counters_t*	n)	/*!< out: flushed/evicted page
+					counts */
+{
+	buf_page_t*	bpage;
+	ulint		scanned = 0;
+	ulint		lru_position = 0;
+	ulint		max_lru_position;
+	ulint		max_scanned_pages;
+	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
+	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+	n->flushed = 0;
+	n->evicted = 0;
+
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+	max_scanned_pages = limited_scan ? srv_LRU_scan_depth : lru_len * max;
+	max_lru_position = ut_min(srv_LRU_scan_depth, lru_len);
+
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	while (bpage != NULL
+	       && (srv_cleaner_eviction_factor ? n->evicted : n->flushed) < max
+	       && free_len < srv_LRU_scan_depth
+	       && lru_len > BUF_LRU_MIN_LEN
+	       && lru_position < max_lru_position
+	       && scanned < max_scanned_pages) {
+
+		ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+		ibool	 evict;
+		ulint	failed_acquire;
+
+		++scanned;
+		++lru_position;
+
+		failed_acquire = mutex_enter_nowait(block_mutex);
+
+		evict = UNIV_LIKELY(!failed_acquire)
+			&& buf_flush_ready_for_replace(bpage);
+
+		if (UNIV_LIKELY(!failed_acquire) && !evict) {
+
+			mutex_exit(block_mutex);
+		}
+
+		/* If the block is ready to be replaced we try to
+		free it i.e.: put it on the free list.
+		Otherwise we try to flush the block and its
+		neighbors. In this case we'll put it on the
+		free list in the next pass. We do this extra work
+		of putting blocks to the free list instead of
+		just flushing them because after every flush
+		we have to restart the scan from the tail of
+		the LRU list and if we don't clear the tail
+		of the flushed pages then the scan becomes
+		O(n*n).
+		NOTE(review): when mutex_enter_nowait() fails neither
+		branch below runs and bpage is not advanced, so the same
+		block is retried on the next iteration until the scanned /
+		lru_position limits stop the loop - confirm that this is
+		intended. */
+		if (evict) {
+			if (buf_LRU_free_page(bpage, true)) {
+
+				mutex_exit(block_mutex);
+				n->evicted++;
+				lru_position = 0;
+				mutex_enter(&buf_pool->LRU_list_mutex);
+				bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+			} else {
+
+				bpage = UT_LIST_GET_PREV(LRU, bpage);
+				mutex_exit(block_mutex);
+			}
+		} else if (UNIV_LIKELY(!failed_acquire)) {
+
+			if (buf_flush_page_and_try_neighbors(
+				bpage,
+				BUF_FLUSH_LRU, max, &n->flushed)) {
+
+				lru_position = 0;
+
+				/* LRU list mutex was released.
+				Restart the scan. */
+				bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+			} else {
+
+				bpage = UT_LIST_GET_PREV(LRU, bpage);
+			}
+		}
+
+		free_len = UT_LIST_GET_LEN(buf_pool->free);
+		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+	}
+
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+	/* We keep track of all flushes happening as part of LRU
+	flush. When estimating the desired rate at which flush_list
+	should be flushed, we factor in this value. */
+	buf_pool->stat.buf_lru_flush_page_count += n->flushed;
+
+	if (scanned) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_SCANNED,
+			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+			scanned);
+	}
+}
+
+/*******************************************************************//**
+Frees and/or flushes pages so that blocks become available in the free
+list. When buf_LRU_evict_from_unzip_LRU() says so, uncompressed frames
+are first detached from the tail of unzip_LRU; whatever remains of 'max'
+is then requested from the regular LRU list. The function returns
+nothing: n->flushed receives the number of queued writes and n->evicted
+the number of evicted pages, including the blocks freed from unzip_LRU. */
+__attribute__((nonnull))
+static
+void
+buf_do_LRU_batch(
+/*=============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		max,		/*!< in: desired number of
+					blocks in the free_list */
+	bool		limited_scan,	/*!< in: if true, allow to scan only up
+					to srv_LRU_scan_depth pages in total */
+	flush_counters_t*	n)	/*!< out: flushed/evicted page
+					counts */
+{
+	ulint	n_unzip_freed = 0;
+
+	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+		n_unzip_freed = buf_free_from_unzip_LRU_list_batch(
+			buf_pool, max);
+	}
+
+	if (n_unzip_freed >= max) {
+		/* The unzip_LRU pass alone satisfied the request. */
+		n->flushed = 0;
+		n->evicted = n_unzip_freed;
+
+		return;
+	}
+
+	buf_flush_LRU_list_batch(buf_pool, max - n_unzip_freed,
+				 limited_scan, n);
+
+	n->evicted += n_unzip_freed;
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush_list.
+the calling thread is not allowed to own any latches on pages!
+The flush list mutex is acquired and released inside this function.
+@return number of blocks for which the write request was queued
+(the code below never returns ULINT_UNDEFINED) */
+static
+ulint
+buf_do_flush_list_batch(
+/*====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number
+					of blocks flushed (it is not
+					guaranteed that the actual
+					number is that big, though) */
+	lsn_t		lsn_limit)	/*!< all blocks whose
+					oldest_modification is smaller
+					than this should be flushed (if
+					their number does not exceed
+					min_n) */
+{
+	ulint		count = 0;
+	ulint		scanned = 0;
+
+	/* Start from the end of the list looking for a suitable
+	block to be flushed. */
+	buf_flush_list_mutex_enter(buf_pool);
+	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+	/* In order not to degenerate this scan to O(n*n) we attempt
+	to preserve pointer of previous block in the flush list. To do
+	so we declare it a hazard pointer. Any thread working on the
+	flush list must check the hazard pointer and if it is removing
+	the same block then it must reset it. */
+	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+	     count < min_n && bpage != NULL && len > 0
+	     && bpage->oldest_modification < lsn_limit;
+	     ++scanned) {
+
+		buf_page_t*	prev;
+
+		ut_a(bpage->oldest_modification > 0);
+		ut_ad(bpage->in_flush_list);
+
+		prev = UT_LIST_GET_PREV(list, bpage);
+		buf_flush_set_hp(buf_pool, prev);
+
+#ifdef UNIV_DEBUG
+		bool flushed =
+#endif /* UNIV_DEBUG */
+		buf_flush_page_and_try_neighbors(
+			bpage, BUF_FLUSH_LIST, min_n, &count);
+
+		ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
+
+		if (!buf_flush_is_hp(buf_pool, prev)) {
+			/* The hazard pointer was reset by some other
+			thread. Restart the scan. */
+			ut_ad(buf_flush_is_hp(buf_pool, NULL));
+			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+			len = UT_LIST_GET_LEN(buf_pool->flush_list);
+		} else {
+			bpage = prev;
+			--len;
+			buf_flush_set_hp(buf_pool, NULL);
+		}
+
+		ut_ad(!bpage || bpage->in_flush_list);
+	}
+
+	buf_flush_list_mutex_exit(buf_pool);
+
+	MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+				     MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+				     MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+				     scanned);
+
+	return(count);
+}
+
+/*******************************************************************//**
+Runs one flush batch of the requested type on a buffer pool instance.
+An LRU batch is executed by buf_do_LRU_batch() with LRU_list_mutex held
+around the call; a flush list batch is executed by
+buf_do_flush_list_batch(). NOTE 1: in the case of an LRU flush the
+calling thread may own latches to pages: to avoid deadlocks, this
+function must be written so that it cannot end up waiting for these
+latches! NOTE 2: in the case of a flush list flush, the calling thread
+is not allowed to own any latches on pages!
+The function returns nothing; the page counts are reported through n. */
+__attribute__((nonnull))
+static
+void
+buf_flush_batch(
+/*============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+					then the caller must not own any
+					latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	lsn_t		lsn_limit,	/*!< in: in the case of BUF_FLUSH_LIST
+					all blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+	bool		limited_lru_scan,/*!< in: for LRU flushes, if true,
+					allow to scan only up to
+					srv_LRU_scan_depth pages in total */
+	flush_counters_t*	n)	/*!< out: flushed/evicted page
+					counts  */
+{
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad((flush_type != BUF_FLUSH_LIST)
+	      || sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Note: The buffer pool mutexes are released and reacquired within
+	the flush functions. */
+	if (flush_type == BUF_FLUSH_LRU) {
+
+		mutex_enter(&buf_pool->LRU_list_mutex);
+		buf_do_LRU_batch(buf_pool, min_n, limited_lru_scan, n);
+		mutex_exit(&buf_pool->LRU_list_mutex);
+
+	} else if (flush_type == BUF_FLUSH_LIST) {
+
+		ut_ad(!limited_lru_scan);
+
+		/* A flush list batch never evicts anything. */
+		n->flushed = buf_do_flush_list_batch(buf_pool, min_n,
+						     lsn_limit);
+		n->evicted = 0;
+
+	} else {
+		ut_error;
+	}
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && n->flushed > 0) {
+		if (flush_type == BUF_FLUSH_LRU) {
+			fprintf(stderr,
+				"Flushed %lu pages in LRU flush\n",
+				(ulong) n->flushed);
+		} else {
+			fprintf(stderr,
+				"Flushed %lu pages in flush list flush\n",
+				(ulong) n->flushed);
+		}
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************************//**
+Common post-processing after an LRU or flush list batch: calls
+buf_dblwr_flush_buffered_writes() when at least one page was flushed,
+optionally prints a debug message, and adds the page count to the
+srv_stats.buf_pool_flushed counter. */
+static
+void
+buf_flush_common(
+/*=============*/
+	buf_flush_t	flush_type,	/*!< in: type of flush */
+	ulint		page_count)	/*!< in: number of pages flushed */
+{
+	const bool	any_pages = (page_count != 0);
+
+	if (any_pages) {
+		buf_dblwr_flush_buffered_writes();
+	}
+
+	/* Only the two batch flush types are valid here */
+	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && any_pages) {
+		if (flush_type == BUF_FLUSH_LRU) {
+			fprintf(stderr, "Flushed %lu pages in LRU flush\n",
+				(ulong) page_count);
+		} else {
+			fprintf(stderr,
+				"Flushed %lu pages in flush list flush\n",
+				(ulong) page_count);
+		}
+	}
+#endif /* UNIV_DEBUG */
+
+	srv_stats.buf_pool_flushed.add(page_count);
+}
+
+/******************************************************************//**
+Tries to mark the start of a buffer flush batch for LRU or flush list.
+Both the check and the update of init_flush[] are performed while
+holding flush_state_mutex.
+@return TRUE if the caller may run the batch, FALSE if init_flush[] is
+already set or n_flush[] is non-zero for this flush type */
+static
+ibool
+buf_flush_start(
+/*============*/
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
+	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+{
+	ibool	may_start = FALSE;
+
+	mutex_enter(&buf_pool->flush_state_mutex);
+
+	if (!(buf_pool->n_flush[flush_type] > 0)
+	    && buf_pool->init_flush[flush_type] != TRUE) {
+
+		/* No flush batch of the same type is running: claim it */
+
+		buf_pool->init_flush[flush_type] = TRUE;
+
+		may_start = TRUE;
+	}
+
+	mutex_exit(&buf_pool->flush_state_mutex);
+
+	return(may_start);
+}
+
+/******************************************************************//**
+Marks the end of a buffer flush batch for LRU or flush list: clears
+init_flush[], sets try_LRU_scan, and signals the no_flush[] event if the
+n_flush[] count for this flush type is zero. All of this is done while
+holding flush_state_mutex. */
+static
+void
+buf_flush_end(
+/*==========*/
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
+	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+{
+	mutex_enter(&buf_pool->flush_state_mutex);
+
+	buf_pool->init_flush[flush_type] = FALSE;
+	buf_pool->try_LRU_scan = TRUE;
+
+	if (!buf_pool->n_flush[flush_type]) {
+
+		/* Nothing is counted in n_flush[] any more, so the
+		running flush batch has ended: wake up the waiters */
+
+		os_event_set(buf_pool->no_flush[flush_type]);
+	}
+
+	mutex_exit(&buf_pool->flush_state_mutex);
+}
+
+/******************************************************************//**
+Waits until a flush batch of the given type ends, by waiting on the
+no_flush[] event of the buffer pool instance. If buf_pool is NULL, the
+wait is performed on every buffer pool instance in turn. */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance, or NULL
+					to wait on all instances */
+	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+{
+	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
+
+	if (buf_pool != NULL) {
+		/* A single instance was requested */
+		thd_wait_begin(NULL, THD_WAIT_DISKIO);
+		os_event_wait(buf_pool->no_flush[type]);
+		thd_wait_end(NULL);
+
+		return;
+	}
+
+	/* No instance given: wait on each instance, one after another */
+	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
+		buf_pool_t*	instance = buf_pool_from_array(i);
+
+		thd_wait_begin(NULL, THD_WAIT_DISKIO);
+		os_event_wait(instance->no_flush[type]);
+		thd_wait_end(NULL);
+	}
+}
+
+/*******************************************************************//**
+Runs one LRU flush batch on a buffer pool instance: buf_flush_start(),
+buf_flush_batch(), buf_flush_end() and buf_flush_common(), in that order.
+If the batch cannot be started, both counters in n are set to 0.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully. false if another batch
+of same type was already running. */
+__attribute__((nonnull))
+static
+bool
+buf_flush_LRU(
+/*==========*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	bool			limited_scan,	/*!< in: if true, allow to scan
+						only up to srv_LRU_scan_depth
+						pages in total */
+	flush_counters_t	*n)	/*!< out: flushed/evicted page
+					counts */
+{
+	if (buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+
+		/* The lsn_limit argument (0) is only meaningful for
+		flush list batches */
+		buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0,
+				limited_scan, n);
+
+		buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+		buf_flush_common(BUF_FLUSH_LRU, n->flushed);
+
+		return(true);
+	}
+
+	/* Could not start a batch: report that nothing was done */
+	n->flushed = 0;
+	n->evicted = 0;
+
+	return(false);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+The instances are visited round-robin. On each visit an instance is asked
+for at most srv_cleaner_flush_chunk_size pages, until it has been asked for
+its share of min_n in total or a batch on it flushes nothing. An instance
+on which a flush list batch is already running is skipped and retried on
+the next round. When both a page limit (min_n != ULINT_MAX) and an LSN
+limit (lsn_limit != LSN_MAX) are given, the work is abandoned as soon as
+srv_cleaner_max_flush_time (compared against ut_time_ms() differences)
+has elapsed.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully for each buffer pool
+instance. false if another batch of same type was already running in
+at least one of the buffer pool instance */
+UNIV_INTERN
+bool
+buf_flush_list(
+/*===========*/
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though);
+					ULINT_MAX means no page limit */
+	lsn_t		lsn_limit,	/*!< in the case BUF_FLUSH_LIST all
+					blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+	ulint*		n_processed)	/*!< out: the number of pages
+					which were processed is passed
+					back to caller. Ignored if NULL */
+
+{
+	ulint		i;
+	/* Per-instance bookkeeping: requested_pages[] accumulates the chunk
+	sizes requested so far (not the pages actually flushed) and
+	active_instance[] tells whether the instance still needs visiting.
+	A flush_start_time of 0 means that no timeout applies. */
+	ulint		requested_pages[MAX_BUFFER_POOLS];
+	bool		active_instance[MAX_BUFFER_POOLS];
+	ulint		remaining_instances = srv_buf_pool_instances;
+	bool		timeout = false;
+	ulint		flush_start_time = 0;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		requested_pages[i] = 0;
+		active_instance[i] = true;
+	}
+
+	if (n_processed) {
+		*n_processed = 0;
+	}
+
+	if (min_n != ULINT_MAX) {
+		/* Ensure that flushing is spread evenly amongst the
+		buffer pool instances. When min_n is ULINT_MAX
+		we need to flush everything up to the lsn limit
+		so no limit here. The division rounds up. The timeout
+		clock is started only if an LSN limit was given too. */
+		min_n = (min_n + srv_buf_pool_instances - 1)
+			 / srv_buf_pool_instances;
+		if (lsn_limit != LSN_MAX) {
+			flush_start_time = ut_time_ms();
+		}
+	}
+
+	/* Flush to lsn_limit in all buffer pool instances */
+	while (remaining_instances && !timeout) {
+
+		ulint flush_common_batch = 0;
+
+		for (i = 0; i < srv_buf_pool_instances; i++) {
+
+			if (flush_start_time
+			    && (ut_time_ms() - flush_start_time
+				>= srv_cleaner_max_flush_time)) {
+
+				timeout = true;
+				break;
+			}
+
+			if (active_instance[i]) {
+
+				buf_pool_t*	buf_pool;
+				ulint		chunk_size;
+				flush_counters_t n;
+
+				chunk_size = ut_min(
+					srv_cleaner_flush_chunk_size,
+					min_n - requested_pages[i]);
+
+				buf_pool = buf_pool_from_array(i);
+
+				if (!buf_flush_start(buf_pool,
+						     BUF_FLUSH_LIST)) {
+					/* A flush list batch is already
+					running on this instance. It stays
+					active, so it is retried on the next
+					round of the enclosing while loop. */
+					continue;
+				}
+
+				buf_flush_batch(buf_pool, BUF_FLUSH_LIST,
+						chunk_size, lsn_limit, false,
+						&n);
+
+				buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+
+				flush_common_batch += n.flushed;
+
+				if (n_processed) {
+					*n_processed += n.flushed;
+				}
+
+				requested_pages[i] += chunk_size;
+
+				if (requested_pages[i] >= min_n
+				    || !n.flushed) {
+
+					active_instance[i] = false;
+					remaining_instances--;
+				}
+
+				if (n.flushed) {
+					MONITOR_INC_VALUE_CUMULATIVE(
+						MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+						MONITOR_FLUSH_BATCH_COUNT,
+						MONITOR_FLUSH_BATCH_PAGES,
+						n.flushed);
+				}
+			}
+		}
+
+		buf_flush_common(BUF_FLUSH_LIST, flush_common_batch);
+	}
+
+	/* If we haven't flushed all the instances due to timeout or a repeat
+	failure to start a flush, return failure */
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		if (active_instance[i]) {
+			return(false);
+		}
+	}
+
+	return(true);
+}
+
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+The work is done in two separate scans of the LRU list, both starting
+from its tail: the first one looks for a page that is ready for a
+BUF_FLUSH_SINGLE_PAGE flush and flushes it, the second one looks for a
+page that is ready for replacement and tries to free it. The page freed
+by the second scan is therefore not necessarily the one that was flushed.
+@return TRUE if a page was flushed and a page was then freed; FALSE if
+no flushable page or no replaceable page was found, or if
+buf_LRU_free_page() failed */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+{
+	ulint		scanned;
+	buf_page_t*	bpage;
+	ib_mutex_t*	block_mutex;
+	ibool		freed;
+	bool		evict_zip;
+
+	mutex_enter(&buf_pool->LRU_list_mutex);
+
+	/* First scan: find a flushable page. The loop is left with the
+	block mutex of the chosen page still held. Note that scanned
+	starts from 1, so 1 is reported even for an empty LRU list. */
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
+
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		if (buf_flush_ready_for_flush(bpage,
+					      BUF_FLUSH_SINGLE_PAGE)) {
+			/* buf_flush_page() will release the block
+			mutex */
+			break;
+		}
+		mutex_exit(block_mutex);
+	}
+
+	mutex_exit(&buf_pool->LRU_list_mutex);
+
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+		scanned);
+
+	if (!bpage) {
+		/* Can't find a single flushable page. */
+		return(FALSE);
+	}
+
+	/* The following call will release the buf_page_get_mutex() mutex. */
+	buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
+
+	/* At this point the page has been written to the disk.
+	As we are not holding LRU list or buf_page_get_mutex() mutex therefore
+	we cannot use the bpage safely. It may have been plucked out
+	of the LRU list by some other thread or it may even have
+	relocated in case of a compressed page. We need to start
+	the scan of LRU list again to remove the block from the LRU
+	list and put it on the free list. */
+	mutex_enter(&buf_pool->LRU_list_mutex);
+
+	/* Second scan: find a replaceable page. As above, the loop is
+	left with the block mutex of the chosen page still held. */
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+		ibool	ready;
+
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		ready = buf_flush_ready_for_replace(bpage);
+		if (ready) {
+			break;
+		}
+		mutex_exit(block_mutex);
+
+	}
+
+	if (!bpage) {
+		/* Can't find a single replaceable page. */
+		mutex_exit(&buf_pool->LRU_list_mutex);
+		return(FALSE);
+	}
+
+	evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
+
+	freed = buf_LRU_free_page(bpage, evict_zip);
+	if (!freed) {
+		/* The LRU list mutex is released here on failure only.
+		NOTE(review): presumably buf_LRU_free_page() releases it
+		itself when it succeeds -- confirm against its definition. */
+		mutex_exit(&buf_pool->LRU_list_mutex);
+	}
+
+	/* The block mutex taken in the second scan is released in both
+	the success and the failure case. */
+	mutex_exit(block_mutex);
+
+	return(freed);
+}
+
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+Each active buffer pool instance is processed in batches of
+srv_cleaner_lru_chunk_size pages. An instance is batched repeatedly
+while it stays active and its free list length is at or below
+free_list_lwm. It becomes inactive once the requested page total reaches
+its scan depth, or once a batch evicts nothing (or flushes nothing,
+when srv_cleaner_eviction_factor is zero). The whole operation stops when
+no instance is active or when srv_cleaner_max_lru_time has elapsed; the
+time is checked only between rounds over the instances.
+NOTE(review): free_list_lwm is computed as
+srv_LRU_scan_depth / 100 * srv_cleaner_free_list_lwm, i.e. the division
+is done first. If these are integers (the result is stored in a ulint),
+the remainder of srv_LRU_scan_depth / 100 is dropped -- confirm that this
+is intended.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_flush_LRU_tail(void)
+/*====================*/
+{
+	ulint	total_flushed = 0;
+	ulint	start_time = ut_time_ms();
+	ulint	scan_depth[MAX_BUFFER_POOLS];
+	ulint	requested_pages[MAX_BUFFER_POOLS];
+	bool	active_instance[MAX_BUFFER_POOLS];
+	bool	limited_scan[MAX_BUFFER_POOLS];
+	ulint	previous_evicted[MAX_BUFFER_POOLS];
+	ulint	remaining_instances = srv_buf_pool_instances;
+	ulint	lru_chunk_size = srv_cleaner_lru_chunk_size;
+	ulint	free_list_lwm = srv_LRU_scan_depth / 100
+		* srv_cleaner_free_list_lwm;
+
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+		const buf_pool_t* buf_pool = buf_pool_from_array(i);
+
+		scan_depth[i] = ut_min(srv_LRU_scan_depth,
+				       UT_LIST_GET_LEN(buf_pool->LRU));
+		requested_pages[i] = 0;
+		active_instance[i] = true;
+		limited_scan[i] = true;
+		previous_evicted[i] = 0;
+	}
+
+	while (remaining_instances) {
+
+		if (ut_time_ms() - start_time >= srv_cleaner_max_lru_time) {
+
+			break;
+		}
+
+		for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+			if (!active_instance[i]) {
+				continue;
+			}
+
+			ulint free_len = free_list_lwm;
+			buf_pool_t* buf_pool = buf_pool_from_array(i);
+
+			do {
+				flush_counters_t	n;
+
+				ut_ad(requested_pages[i] <= scan_depth[i]);
+
+				/* Currently page_cleaner is the only thread
+				that can trigger an LRU flush. It is possible
+				that a batch triggered during last iteration is
+				still running. In that case buf_flush_LRU()
+				returns false with both counters in n set to
+				zero. */
+				if (buf_flush_LRU(buf_pool, lru_chunk_size,
+						  limited_scan[i], &n)) {
+
+					/* Allowed only one batch per
+					buffer pool instance. */
+					buf_flush_wait_batch_end(
+						buf_pool, BUF_FLUSH_LRU);
+				}
+
+				total_flushed += n.flushed;
+
+				/* When we evict fewer pages than we did on a
+				previous try we relax the LRU scan limit in
+				order to attempt to evict more.
+				NOTE(review): as written, limited_scan[i]
+				becomes true exactly when fewer pages were
+				evicted than on the previous try, and false
+				otherwise. This looks inverted relative to
+				the intent stated above -- confirm. */
+				limited_scan[i]
+					= (previous_evicted[i] > n.evicted);
+				previous_evicted[i] = n.evicted;
+
+				requested_pages[i] += lru_chunk_size;
+
+				if (requested_pages[i] >= scan_depth[i]
+				    || !(srv_cleaner_eviction_factor
+					 ? n.evicted : n.flushed)) {
+
+					active_instance[i] = false;
+					remaining_instances--;
+				} else {
+
+					free_len = UT_LIST_GET_LEN(
+						buf_pool->free);
+				}
+			} while (active_instance[i]
+				 && free_len <= free_list_lwm);
+		}
+	}
+
+	if (total_flushed) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_TOTAL_PAGE,
+			MONITOR_LRU_BATCH_COUNT,
+			MONITOR_LRU_BATCH_PAGES,
+			total_flushed);
+	}
+
+	return(total_flushed);
+}
+
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. Each
+buffer pool instance is checked in turn; the wait itself is done after
+flush_state_mutex has been released. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void)
+/*==============================*/
+{
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	instance = buf_pool_from_array(i);
+		bool		lru_batch_running;
+
+		/* Sample the LRU batch state under the mutex */
+		mutex_enter(&instance->flush_state_mutex);
+
+		lru_batch_running
+			= instance->n_flush[BUF_FLUSH_LRU] > 0
+			|| instance->init_flush[BUF_FLUSH_LRU];
+
+		mutex_exit(&instance->flush_state_mutex);
+
+		if (lru_batch_running) {
+			buf_flush_wait_batch_end(instance, BUF_FLUSH_LRU);
+		}
+	}
+}
+
+/*********************************************************************//**
+Flush a batch of dirty pages from the flush list by calling
+buf_flush_list() and passing its page count back to the caller.
+@return number of pages flushed, 0 if no page is flushed or if another
+flush_list type batch is running */
+static
+ulint
+page_cleaner_do_flush_batch(
+/*========================*/
+	ulint		n_to_flush,	/*!< in: number of pages that
+					we should attempt to flush. */
+	lsn_t		lsn_limit)	/*!< in: LSN up to which flushing
+					must happen */
+{
+	ulint	n_done;
+
+	/* Only the page count is used here; the boolean result of
+	buf_flush_list() is not examined. buf_flush_list() resets the
+	counter to zero before it does any work. */
+	buf_flush_list(n_to_flush, lsn_limit, &n_done);
+
+	return(n_done);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on number of dirty pages in
+the buffer pool.
+With srv_max_dirty_pages_pct_lwm set to 0, the result is 100 once the
+dirty ratio exceeds srv_max_buf_pool_modified_pct and 0 otherwise. With a
+non-zero low water mark, the result grows with the dirty ratio as soon as
+the ratio exceeds the low water mark.
+@return percent of io_capacity to flush to manage dirty page ratio */
+static
+ulint
+af_get_pct_for_dirty()
+/*==================*/
+{
+	const ulint	dirty_pct = buf_get_modified_ratio_pct();
+
+	ut_a(srv_max_dirty_pages_pct_lwm
+	     <= srv_max_buf_pool_modified_pct);
+
+	if (srv_max_dirty_pages_pct_lwm != 0) {
+
+		if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
+			/* Above the low water mark: start flushing
+			pages gradually. */
+			return((dirty_pct * 100)
+			       / (srv_max_buf_pool_modified_pct + 1));
+		}
+
+		return(0);
+	}
+
+	/* The user has not set the option to preflush dirty pages as
+	we approach the high water mark. */
+	if (dirty_pct > srv_max_buf_pool_modified_pct) {
+		/* The high water mark of dirty pages has been crossed:
+		flush at 100% of innodb_io_capacity. */
+		return(100);
+	}
+
+	return(0);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+Returns 0 while age is below the adaptive flushing low water mark
+(srv_adaptive_flushing_lwm percent of log_get_capacity()), and also while
+age is below log_get_max_modified_age_async() with adaptive flushing
+disabled. Otherwise the result is derived from lsn_age_factor, i.e. age
+expressed as a percentage of the max async age, using the formula selected
+by srv_cleaner_lsn_age_factor.
+NOTE(review): srv_max_io_capacity / srv_io_capacity is evaluated before
+the floating-point multiplication; if both are integers the ratio is
+truncated -- confirm that this is intended.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+	lsn_t	age)	/*!< in: current age of LSN. */
+{
+	lsn_t	max_async_age;
+	lsn_t	lsn_age_factor;
+	lsn_t	af_lwm = (srv_adaptive_flushing_lwm
+			  * log_get_capacity()) / 100;
+
+	if (age < af_lwm) {
+		/* No adaptive flushing. */
+		return(0);
+	}
+
+	max_async_age = log_get_max_modified_age_async();
+
+	if (age < max_async_age && !srv_adaptive_flushing) {
+		/* We have still not reached the max_async point and
+		the user has disabled adaptive flushing. */
+		return(0);
+	}
+
+	/* If we are here then we know that either:
+	1) User has enabled adaptive flushing
+	2) User may have disabled adaptive flushing but we have reached
+	max_async_age. */
+	lsn_age_factor = (age * 100) / max_async_age;
+
+	ut_ad(srv_max_io_capacity >= srv_io_capacity);
+	switch ((srv_cleaner_lsn_age_factor_t)srv_cleaner_lsn_age_factor) {
+	case SRV_CLEANER_LSN_AGE_FACTOR_LEGACY:
+		return(static_cast<ulint>(
+			       ((srv_max_io_capacity / srv_io_capacity)
+				* (lsn_age_factor
+				   * sqrt((double)lsn_age_factor)))
+			       / 7.5));
+	case SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT:
+		return(static_cast<ulint>(
+			       ((srv_max_io_capacity / srv_io_capacity)
+				* (lsn_age_factor * lsn_age_factor
+				   * sqrt((double)lsn_age_factor)))
+			       / 700.5));
+	default:
+		ut_error;
+	}
+}
+
+/*********************************************************************//**
+This function is called approximately once every second by the
+page_cleaner thread. Based on various factors it decides if there is a
+need to do flushing. If flushing is needed it is performed and the
+number of pages flushed is returned.
+The function keeps its history in function-local static variables:
+lsn_avg_rate and avg_page_rate are running averages that are refreshed
+every srv_flushing_avg_loops calls, prev_lsn is the LSN seen at the last
+refresh, sum_pages is the number of pages flushed since the last refresh,
+and last_lsn, last_pages and prev_pages describe the previous flush
+request. Nothing is done on the very first call, nor when the current LSN
+equals prev_lsn.
+@return number of pages flushed */
+static
+ulint
+page_cleaner_flush_pages_if_needed(void)
+/*====================================*/
+{
+	static	lsn_t		lsn_avg_rate = 0;
+	static	lsn_t		prev_lsn = 0;
+	static	lsn_t		last_lsn = 0;
+	static	ulint		sum_pages = 0;
+	static	ulint		last_pages = 0;
+	static	ulint		prev_pages = 0;
+	static	ulint		avg_page_rate = 0;
+	static	ulint		n_iterations = 0;
+	lsn_t			oldest_lsn;
+	lsn_t			cur_lsn;
+	lsn_t			age;
+	lsn_t			lsn_rate;
+	ulint			n_pages = 0;
+	ulint			pct_for_dirty = 0;
+	ulint			pct_for_lsn = 0;
+	ulint			pct_total = 0;
+	int			age_factor = 0;
+
+	cur_lsn = log_get_lsn();
+
+	if (prev_lsn == 0) {
+		/* First time around: only record the starting LSN. */
+		prev_lsn = cur_lsn;
+		return(0);
+	}
+
+	if (prev_lsn == cur_lsn) {
+		return(0);
+	}
+
+	/* We update our variables every srv_flushing_avg_loops
+	iterations to smooth out transition in workload. */
+	if (++n_iterations >= srv_flushing_avg_loops) {
+
+		avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
+				 + avg_page_rate) / 2;
+
+		/* Average amount of LSN generated per iteration since
+		prev_lsn was last updated. */
+		lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
+
+		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+
+		prev_lsn = cur_lsn;
+
+		n_iterations = 0;
+
+		sum_pages = 0;
+	}
+
+	oldest_lsn = buf_pool_get_oldest_modification();
+
+	ut_ad(oldest_lsn <= log_get_lsn());
+
+	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
+
+	pct_for_dirty = af_get_pct_for_dirty();
+	pct_for_lsn = af_get_pct_for_lsn(age);
+
+	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
+
+	/* Turn the percentage into a page count with PCT_IO(), average it
+	with avg_page_rate while the age is below the max async age, and
+	cap the maximum IO capacity that we are going to use by
+	max_io_capacity. */
+	n_pages = PCT_IO(pct_total);
+	if (age < log_get_max_modified_age_async())
+		n_pages = (n_pages + avg_page_rate) / 2;
+
+	if (n_pages > srv_max_io_capacity) {
+		n_pages = srv_max_io_capacity;
+	}
+
+	if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
+		age_factor = prev_pages / last_pages;
+	}
+
+	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+	prev_pages = n_pages;
+	n_pages = page_cleaner_do_flush_batch(
+		n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
+	/* last_pages is stored as the flushed count plus one, so it is
+	never zero once a flush has been attempted. */
+	last_lsn= cur_lsn;
+	last_pages= n_pages + 1;
+
+	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+	if (n_pages) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+			MONITOR_FLUSH_ADAPTIVE_COUNT,
+			MONITOR_FLUSH_ADAPTIVE_PAGES,
+			n_pages);
+
+		sum_pages += n_pages;
+	}
+
+	return(n_pages);
+}
+
+/*********************************************************************//**
+Puts the page_cleaner thread to sleep until next_loop_time, if that
+time has not been reached yet. A single sleep never exceeds 1000000
+microseconds. */
+static
+void
+page_cleaner_sleep_if_needed(
+/*=========================*/
+	ulint	next_loop_time)	/*!< in: time when next loop iteration
+				should start */
+{
+	const ulint	now = ut_time_ms();
+
+	if (next_loop_time <= now) {
+		/* The next iteration is already due: no sleep */
+		return;
+	}
+
+	/* The remaining time is converted to microseconds. ut_min()
+	guards against a long sleep in case of wrap around. */
+	os_thread_sleep(ut_min(1000000,
+			(next_loop_time - now)
+			 * 1000));
+}
+
+/*********************************************************************//**
+Sums up the free list lengths of all buffer pool instances. The lengths
+are read without acquiring any mutex in this function.
+@return total free list length. */
+__attribute__((warn_unused_result))
+static
+ulint
+buf_get_total_free_list_length(void)
+/*================================*/
+{
+	ulint	total_len = 0;
+	ulint	i = 0;
+
+	while (i < srv_buf_pool_instances) {
+
+		total_len += UT_LIST_GET_LEN(buf_pool_from_array(i)->free);
+		i++;
+	}
+
+	return(total_len);
+}
+
+/*********************************************************************//**
+Adjust the desired page cleaner thread sleep time for LRU flushes,
+depending on how full the free lists are relative to
+srv_LRU_scan_depth * srv_buf_pool_instances:
+* below 1%: sleep time is reset to 0
+* above 20%: sleep time grows by 50, capped by srv_cleaner_max_lru_time
+* below 5%: sleep time shrinks by 50, if it is at least 50
+* otherwise: no change */
+__attribute__((nonnull))
+static
+void
+page_cleaner_adapt_lru_sleep_time(
+/*==============================*/
+	ulint*	lru_sleep_time)	/*!< in/out: desired page cleaner thread sleep
+				time for LRU flushes  */
+{
+	const ulint	total_free = buf_get_total_free_list_length();
+	const ulint	target_free
+		= srv_LRU_scan_depth * srv_buf_pool_instances;
+
+	if (total_free < target_free / 100) {
+
+		/* Free lists filled less than 1%, no sleep */
+		*lru_sleep_time = 0;
+
+		return;
+	}
+
+	if (total_free > target_free / 5) {
+
+		/* Free lists filled more than 20%, sleep a bit more,
+		but never longer than the configured maximum */
+		*lru_sleep_time += 50;
+
+		if (*lru_sleep_time > srv_cleaner_max_lru_time) {
+			*lru_sleep_time = srv_cleaner_max_lru_time;
+		}
+
+		return;
+	}
+
+	if (total_free < target_free / 20 && *lru_sleep_time >= 50) {
+
+		/* Free lists filled less than 5%, sleep a bit less */
+		*lru_sleep_time -= 50;
+	}
+
+	/* In all remaining cases the sleep time is left unchanged */
+}
+
+/*********************************************************************//**
+Get the desired page cleaner thread sleep time for flush list flushes.
+The decision is based on the difference between the current LSN and
+log_sys->last_checkpoint_lsn.
+@return 0 if that difference exceeds log_sys->max_modified_age_sync,
+srv_cleaner_max_flush_time otherwise */
+__attribute__((warn_unused_result))
+static
+ulint
+page_cleaner_adapt_flush_sleep_time(void)
+/*=====================================*/
+{
+	const lsn_t	checkpoint_age
+		= log_get_lsn() - log_sys->last_checkpoint_lsn;
+
+	if (checkpoint_age <= log_sys->max_modified_age_sync) {
+
+		/* Not in the sync preflush zone: flush list factors do
+		not influence the page cleaner sleep time */
+		return(srv_cleaner_max_flush_time);
+	}
+
+	/* No sleep if in sync preflush zone */
+	return(0);
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+While srv_shutdown_state is SRV_SHUTDOWN_NONE, each iteration optionally
+sleeps, recomputes the sleep interval, flushes the LRU tails and then
+either runs the adaptive flush list flushing (server activity detected)
+or a PCT_IO(100) background flush list batch (no activity). In the latter
+case n_flushed is overwritten with the flush list count instead of being
+added to the LRU count.
+On shutdown, unless srv_fast_shutdown is 2, the thread keeps flushing
+while the state is SRV_SHUTDOWN_CLEANUP, then waits for running batches
+and flushes until the flush list of every buffer pool instance is empty.
+The thread terminates through os_thread_exit().
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ulint	next_loop_time = ut_time_ms() + 1000;
+	ulint	n_flushed = 0;
+	ulint	last_activity = srv_get_activity_count();
+	ulint	lru_sleep_time = srv_cleaner_max_lru_time;
+
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(buf_page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+	srv_cleaner_tid = os_thread_get_tid();
+
+	os_thread_set_priority(srv_cleaner_tid, srv_sched_priority_cleaner);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	buf_page_cleaner_is_active = TRUE;
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		ulint	flush_sleep_time;
+		ulint	page_cleaner_sleep_time;
+
+		srv_current_thread_priority = srv_cleaner_thread_priority;
+
+		/* The page_cleaner skips sleep if the server is
+		idle and there are no pending IOs in the buffer pool
+		and there is work to do. In other words, it sleeps if
+		there was server activity, or there are pending read IOs,
+		or the previous iteration flushed nothing. */
+		if (srv_check_activity(last_activity)
+		    || buf_get_n_pending_read_ios()
+		    || n_flushed == 0) {
+			page_cleaner_sleep_if_needed(next_loop_time);
+		}
+
+		page_cleaner_adapt_lru_sleep_time(&lru_sleep_time);
+
+		flush_sleep_time = page_cleaner_adapt_flush_sleep_time();
+
+		page_cleaner_sleep_time = ut_min(lru_sleep_time,
+						 flush_sleep_time);
+
+		next_loop_time = ut_time_ms() + page_cleaner_sleep_time;
+
+		/* Flush pages from end of LRU if required */
+		n_flushed = buf_flush_LRU_tail();
+
+		if (srv_check_activity(last_activity)) {
+			last_activity = srv_get_activity_count();
+
+			/* Flush pages from flush_list if required */
+			n_flushed += page_cleaner_flush_pages_if_needed();
+		} else {
+			n_flushed = page_cleaner_do_flush_batch(
+							PCT_IO(100),
+							LSN_MAX);
+
+			if (n_flushed) {
+				MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+					MONITOR_FLUSH_BACKGROUND_COUNT,
+					MONITOR_FLUSH_BACKGROUND_PAGES,
+					n_flushed);
+			}
+		}
+	}
+
+	ut_ad(srv_shutdown_state > 0);
+	if (srv_fast_shutdown == 2) {
+		/* In very fast shutdown we simulate a crash of
+		buffer pool. We are not required to do any flushing */
+		goto thread_exit;
+	}
+
+	/* In case of normal and slow shutdown the page_cleaner thread
+	must wait for all other activity in the server to die down.
+	Note that we can start flushing the buffer pool as soon as the
+	server enters shutdown phase but we must stay alive long enough
+	to ensure that any work done by the master or purge threads is
+	also flushed.
+	During shutdown we pass through two stages. In the first stage,
+	when SRV_SHUTDOWN_CLEANUP is set other threads like the master
+	and the purge threads may be working as well. We start flushing
+	the buffer pool but can't be sure that no new pages are being
+	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
+
+	do {
+		n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
+
+		/* We sleep only if there are no pages to flush */
+		if (n_flushed == 0) {
+			os_thread_sleep(100000);
+		}
+	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+
+	/* At this point all threads including the master and the purge
+	thread must have been suspended. */
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+
+	/* We can now make a final sweep on flushing the buffer pool
+	and exit after we have cleaned the whole buffer pool.
+	It is important that we wait for any running batch that has
+	been triggered by us to finish. Otherwise we can end up
+	considering end of that batch as a finish of our final
+	sweep and we'll come out of the loop leaving behind dirty pages
+	in the flush_list */
+	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+	buf_flush_wait_LRU_batch_end();
+
+	bool	success;
+
+	do {
+
+		success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+	} while (!success || n_flushed > 0);
+
+	/* Some sanity checks: no other thread type is active, we are in
+	the flush phase, and every flush list is empty. */
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t* buf_pool = buf_pool_from_array(i);
+		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
+	}
+
+	/* We have lived our life. Time to die. */
+
+thread_exit:
+	buf_page_cleaner_is_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+
+/** Functor used while validating the flush list: asserts that the given
+page descriptor is flagged as being in the flush list. */
+struct	Check {
+	void	operator()(const buf_page_t* bpage)
+	{
+		ut_a(bpage->in_flush_list);
+	}
+};
+
+/******************************************************************//**
+Validates the flush list. The caller must hold the flush list mutex of
+the buffer pool instance (debug assertion).
+The list is walked from its first element and for every page it is
+checked that the page is in a file (or is in BUF_BLOCK_REMOVE_HASH
+state), that its oldest_modification is non-zero, and that the
+oldest_modification values do not increase along the list. If flush_rbt
+is non-NULL, it is traversed in parallel and must contain exactly the
+same pages in the same order. Any violation fails a ut_a() assertion.
+@return	TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
+{
+	buf_page_t*		bpage;
+	const ib_rbt_node_t*	rnode = NULL;
+
+	ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
+
+	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+	/* If we are in recovery mode i.e.: flush_rbt != NULL
+	then each block in the flush_list must also be present
+	in the flush_rbt. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		rnode = rbt_first(buf_pool->flush_rbt);
+	}
+
+	while (bpage != NULL) {
+		const lsn_t	om = bpage->oldest_modification;
+
+		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+		ut_ad(bpage->in_flush_list);
+
+		/* A page in buf_pool->flush_list can be in
+		BUF_BLOCK_REMOVE_HASH state. This happens when a page
+		is in the middle of being relocated. In that case the
+		original descriptor can have this state and still be
+		in the flush list waiting to acquire the
+		buf_pool->flush_list_mutex to complete the relocation. */
+		ut_a(buf_page_in_file(bpage)
+		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+		ut_a(om > 0);
+
+		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+			buf_page_t** prpage;
+
+			ut_a(rnode);
+			prpage = rbt_value(buf_page_t*, rnode);
+
+			ut_a(*prpage);
+			ut_a(*prpage == bpage);
+			rnode = rbt_next(buf_pool->flush_rbt, rnode);
+		}
+
+		bpage = UT_LIST_GET_NEXT(list, bpage);
+
+		ut_a(!bpage || om >= bpage->oldest_modification);
+	}
+
+	/* By this time we must have exhausted the traversal of
+	flush_rbt (if active) as well. */
+	ut_a(rnode == NULL);
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list of a buffer pool instance. This is a wrapper
+that runs buf_flush_validate_low() while holding the flush list mutex.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
+{
+	buf_flush_list_mutex_enter(buf_pool);
+
+	const ibool	is_valid = buf_flush_validate_low(buf_pool);
+
+	buf_flush_list_mutex_exit(buf_pool);
+
+	return(is_valid);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Counts the pages in the flush list of one buffer pool instance that
+belong to the given space id. The flush list mutex is held for the
+whole traversal.
+@return	number of dirty pages present in a single buffer pool */
+UNIV_INTERN
+ulint
+buf_pool_get_dirty_pages_count(
+/*===========================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
+	ulint		id)		/*!< in: space id to check */
+
+{
+	ulint		n_dirty = 0;
+	buf_page_t*	bpage;
+
+	buf_flush_list_mutex_enter(buf_pool);
+
+	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+	while (bpage != NULL) {
+
+		/* Debug-only sanity checks on each flush list entry */
+		ut_ad(buf_page_in_file(bpage)
+		      || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+		ut_ad(bpage->in_flush_list);
+		ut_ad(bpage->oldest_modification > 0);
+
+		if (bpage->space == id) {
+			n_dirty++;
+		}
+
+		bpage = UT_LIST_GET_NEXT(list, bpage);
+	}
+
+	buf_flush_list_mutex_exit(buf_pool);
+
+	return(n_dirty);
+}
+
+/******************************************************************//**
+Counts the pages that belong to the given space id in the flush lists
+of all buffer pool instances, by adding up the per-instance counts of
+buf_pool_get_dirty_pages_count().
+@return	number of dirty pages present in all the buffer pools */
+UNIV_INTERN
+ulint
+buf_flush_get_dirty_pages_count(
+/*============================*/
+	ulint		id)		/*!< in: space id to check */
+
+{
+	ulint		total = 0;
+	ulint		i = 0;
+
+	while (i < srv_buf_pool_instances) {
+
+		total += buf_pool_get_dirty_pages_count(
+			buf_pool_from_array(i), id);
+		++i;
+	}
+
+	return(total);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.cc
index cfb45232084..8a6d042f4c7 100644
--- a/storage/xtradb/buf/buf0lru.c
+++ b/storage/xtradb/buf/buf0lru.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0lru.c
+@file buf/buf0lru.cc
 The database buffer replacement algorithm
 
 Created 11/5/1995 Heikki Tuuri
@@ -25,6 +25,7 @@ Created 11/5/1995 Heikki Tuuri
 
 #include "buf0lru.h"
 
+#ifndef UNIV_HOTBACKUP
 #ifdef UNIV_NONINL
 #include "buf0lru.ic"
 #endif
@@ -40,6 +41,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "btr0btr.h"
 #include "buf0buddy.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "buf0flu.h"
 #include "buf0rea.h"
 #include "btr0sea.h"
@@ -49,6 +51,10 @@ Created 11/5/1995 Heikki Tuuri
 #include "log0recv.h"
 #include "srv0srv.h"
 #include "srv0start.h"
+#include "srv0mon.h"
+#include "lock0lock.h"
+
+#include "ha_prototypes.h"
 
 /** The number of blocks from the LRU_old pointer onward, including
 the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
@@ -70,7 +76,7 @@ allowed to point to either end of the LRU list. */
 /** When dropping the search hash index entries before deleting an ibd
 file, we build a local array of pages belonging to that tablespace
 in the buffer pool. Following is the size of that array.
-We also release buf_pool->mutex after scanning this many pages of the
+We also release buf_pool->LRU_list_mutex after scanning this many pages of the
 flush_list when dropping a table. This is to ensure that other threads
 are not blocked for extended period of time when using very large
 buffer pools. */
@@ -126,21 +132,25 @@ UNIV_INTERN uint	buf_LRU_old_threshold_ms;
 /******************************************************************//**
 Takes a block out of the LRU list and page hash table.
 If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
-
-If a compressed page or a compressed-only block descriptor is freed,
-other compressed pages or compressed-only block descriptors may be
-relocated.
-@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
-was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
-static
-enum buf_page_state
-buf_LRU_block_remove_hashed_page(
-/*=============================*/
+the object will be freed.
+
+The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static __attribute__((nonnull, warn_unused_result))
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
 	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
 				be in a state where it can be freed; there
 				may or may not be a hash index to the page */
-	ibool		zip);	/*!< in: TRUE if should remove also the
+	bool		zip);	/*!< in: true if should remove also the
 				compressed page of an uncompressed page */
 /******************************************************************//**
 Puts a file page whose has no hash index to the free list. */
@@ -148,9 +158,8 @@ static
 void
 buf_LRU_block_free_hashed_page(
 /*===========================*/
-	buf_block_t*	block,	/*!< in: block, must contain a file page and
+	buf_block_t*	block);	/*!< in: block, must contain a file page and
 				be in a state where it can be freed */
-	ibool		have_page_hash_mutex);
 
 /******************************************************************//**
 Increases LRU size in bytes with zip_size for compressed page,
@@ -162,9 +171,8 @@ incr_LRU_size_in_bytes(
 	buf_page_t*	bpage,		/*!< in: control block */
 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
-	ulint		zip_size;
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-	zip_size = page_zip_get_size(&bpage->zip);
+	ulint zip_size = page_zip_get_size(&bpage->zip);
 	buf_pool->stat.LRU_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
 	ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size);
 }
@@ -173,28 +181,19 @@ incr_LRU_size_in_bytes(
 Determines if the unzip_LRU list should be used for evicting a victim
 instead of the general LRU list.
 @return	TRUE if should use unzip_LRU */
-UNIV_INLINE
+UNIV_INTERN
 ibool
 buf_LRU_evict_from_unzip_LRU(
 /*=========================*/
-	buf_pool_t*	buf_pool,
-	ibool*		have_LRU_mutex)
+	buf_pool_t*	buf_pool)
 {
 	ulint	io_avg;
 	ulint	unzip_avg;
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
-	if (!*have_LRU_mutex) {
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		*have_LRU_mutex = TRUE;
-	}
 	/* If the unzip_LRU list is empty, we can only use the LRU. */
 	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
 		return(FALSE);
 	}
 
@@ -203,26 +202,14 @@ buf_LRU_evict_from_unzip_LRU(
 	decompressed pages in the buffer pool. */
 	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
 	    <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
 		return(FALSE);
 	}
 
 	/* If eviction hasn't started yet, we assume by default
 	that a workload is disk bound. */
 	if (buf_pool->freed_page_clock == 0) {
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
 		return(TRUE);
 	}
-	if (*have_LRU_mutex) {
-		mutex_exit(&buf_pool->LRU_list_mutex);
-		*have_LRU_mutex = FALSE;
-	}
 
 	/* Calculate the average over past intervals, and add the values
 	of the current interval. */
@@ -266,11 +253,9 @@ buf_LRU_drop_page_hash_batch(
 When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
 hash index entries belonging to that table. This function tries to
 do that in batch. Note that this is a 'best effort' attempt and does
-not guarantee that ALL hash entries will be removed.
-
-@return number of hashed pages found*/
+not guarantee that ALL hash entries will be removed. */
 static
-ulint
+void
 buf_LRU_drop_page_hash_for_tablespace(
 /*==================================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
@@ -280,22 +265,18 @@ buf_LRU_drop_page_hash_for_tablespace(
 	ulint*		page_arr;
 	ulint		num_entries;
 	ulint		zip_size;
-	ulint		num_found = 0;
 
 	zip_size = fil_space_get_zip_size(id);
 
 	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
 		/* Somehow, the tablespace does not exist.  Nothing to drop. */
 		ut_ad(0);
-		return num_found;
+		return;
 	}
 
-	page_arr = ut_malloc(
-		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
-
-	//buf_pool_mutex_enter(buf_pool);
+	page_arr = static_cast<ulint*>(ut_malloc(
+		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE));
 
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 	num_entries = 0;
 
@@ -303,17 +284,12 @@ scan_again:
 	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
 
 	while (bpage != NULL) {
-		/* bpage->state,space,io_fix,buf_fix_count are protected by block_mutex at XtraDB */
-		mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
 		buf_page_t*	prev_bpage;
 		ibool		is_fixed;
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
 
-		if (UNIV_UNLIKELY(!block_mutex)) {
-			goto next_page;
-		}
-
 		ut_a(buf_page_in_file(bpage));
 
 		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
@@ -322,40 +298,32 @@ scan_again:
 			/* Compressed pages are never hashed.
 			Skip blocks of other tablespaces.
 			Skip I/O-fixed blocks (to be dealt with later). */
-			mutex_exit(block_mutex);
 next_page:
 			bpage = prev_bpage;
 			continue;
 		}
 
-		//mutex_enter(&((buf_block_t*) bpage)->mutex);
+		mutex_enter(block_mutex);
 		is_fixed = bpage->buf_fix_count > 0
 			|| !((buf_block_t*) bpage)->index;
-		//mutex_exit(&((buf_block_t*) bpage)->mutex);
+		mutex_exit(block_mutex);
 
 		if (is_fixed) {
-			mutex_exit(block_mutex);
 			goto next_page;
 		}
 
 		/* Store the page number so that we can drop the hash
 		index in a batch later. */
 		page_arr[num_entries] = bpage->offset;
-
-		mutex_exit(block_mutex);
-
 		ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
-
 		++num_entries;
-		++num_found;
 
 		if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
 			goto next_page;
 		}
 
-		/* Array full. We release the buf_pool->mutex to obey
+		/* Array full. We release the buf_pool->LRU_list_mutex to obey
 		the latching order. */
-		//buf_pool_mutex_exit(buf_pool);
 		mutex_exit(&buf_pool->LRU_list_mutex);
 
 		buf_LRU_drop_page_hash_batch(
@@ -363,11 +331,9 @@ next_page:
 
 		num_entries = 0;
 
-		//buf_pool_mutex_enter(buf_pool);
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 		mutex_enter(&buf_pool->LRU_list_mutex);
 
-		/* Note that we released the buf_pool mutex above
+		/* Note that we released the buf_pool->LRU_list_mutex above
 		after reading the prev_bpage during processing of a
 		page_hash_batch (i.e.: when the array was full).
 		Because prev_bpage could belong to a compressed-only
@@ -381,52 +347,38 @@ next_page:
 		guarantee that ALL such entries will be dropped. */
 
 		/* If, however, bpage has been removed from LRU list
-		to the free list then we should restart the scan.
-		bpage->state is protected by buf_pool mutex. */
-
-		/* obtain block_mutex again to avoid race condition of bpage->state */
-		block_mutex = buf_page_get_mutex_enter(bpage);
-		if (!block_mutex) {
-			goto scan_again;
-		}
+		to the free list then we should restart the scan.  */
 
 		if (bpage
 		    && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			mutex_exit(block_mutex);
 			goto scan_again;
 		}
-		mutex_exit(block_mutex);
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
 	
 	/* Drop any remaining batch of search hashed pages. */
 	buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries);
 	ut_free(page_arr);
-
-	return num_found;
 }
 
 /******************************************************************//**
 While flushing (or removing dirty) pages from a tablespace we don't
-want to hog the CPU and resources. Release the LRU list and block
+want to hog the CPU and resources. Release the buffer pool and block
 mutex and try to force a context switch. Then reacquire the same mutexes.
 The current page is "fixed" before the release of the mutexes and then
 "unfixed" again once we have reacquired the mutexes. */
-static
+static	__attribute__((nonnull))
 void
 buf_flush_yield(
 /*============*/
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
 	buf_page_t*	bpage)		/*!< in/out: current page */
 {
-	mutex_t*	block_mutex;
-
-	block_mutex = buf_page_get_mutex(bpage);
+	ib_mutex_t*	block_mutex	= buf_page_get_mutex(bpage);
 
-	ut_ad(mutex_own(block_mutex));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+	ut_ad(mutex_own(block_mutex));
 	ut_ad(buf_page_in_file(bpage));
 
 	/* "Fix" the block so that the position cannot be
@@ -434,40 +386,39 @@ buf_flush_yield(
 	block mutexes. */
 	buf_page_set_sticky(bpage);
 
-	/* Now it is safe to release the LRU list mutex. */
+	/* Now it is safe to release the LRU list mutex */
 	mutex_exit(&buf_pool->LRU_list_mutex);
 
 	mutex_exit(block_mutex);
 	/* Try and force a context switch. */
 	os_thread_yield();
 
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
 	mutex_enter(block_mutex);
 	/* "Unfix" the block now that we have both the
-	LRU list and block mutex again. */
+	buffer pool and block mutex again. */
 	buf_page_unset_sticky(bpage);
 	mutex_exit(block_mutex);
 }
 
 /******************************************************************//**
-If we have hogged the resources for too long then release the LRU list
-and flush list mutex and do a thread yield. Set the current page to
-"sticky" so that it is not relocated during the yield.
-@return TRUE if yielded */
-static
-ibool
+If we have hogged the resources for too long then release the buffer
+pool and flush list mutex and do a thread yield. Set the current page
+to "sticky" so that it is not relocated during the yield.
+@return true if yielded */
+static	__attribute__((nonnull(1), warn_unused_result))
+bool
 buf_flush_try_yield(
 /*================*/
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
 	buf_page_t*	bpage,		/*!< in/out: bpage to remove */
 	ulint		processed,	/*!< in: number of pages processed */
-	ibool*		must_restart)	/*!< in/out: if TRUE, we have to
+	bool*		must_restart)	/*!< in/out: if true, we have to
 					restart the flush list scan */
 {
 	/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
-	loop we release buf_pool->mutex to let other threads
+	loop we release buf_pool->LRU_list_mutex to let other threads
 	do their job but only if the block is not IO fixed. This
 	ensures that the block stays in its position in the
 	flush_list. */
@@ -476,7 +427,7 @@ buf_flush_try_yield(
 	    && processed >= BUF_LRU_DROP_SEARCH_SIZE
 	    && buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) {
 
-		mutex_t*	block_mutex;
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 		buf_flush_list_mutex_exit(buf_pool);
 
@@ -485,30 +436,24 @@ buf_flush_try_yield(
 		buf_page_get_gen() won't be called for pages from this
 		tablespace.  */
 
-		block_mutex = buf_page_get_mutex_enter(bpage);
-		if (UNIV_UNLIKELY(block_mutex == NULL)) {
-
-			buf_flush_list_mutex_enter(buf_pool);
-
-			*must_restart = TRUE;
-			return FALSE;
-		}
-
+		mutex_enter(block_mutex);
 		/* Recheck the I/O fix and the flush list presence now that we
 		hold the right mutex */
 		if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
 				  || bpage->oldest_modification == 0)) {
 
 			mutex_exit(block_mutex);
+
+			*must_restart = true;
+
 			buf_flush_list_mutex_enter(buf_pool);
 
-			*must_restart = TRUE;
-			return FALSE;
+			return false;
 		}
 
-		*must_restart = FALSE;
+		*must_restart = false;
 
-		/* Release the LRU list and block mutex
+		/* Release the LRU list and buf_page_get_mutex() mutex
 		to give the other threads a go. */
 
 		buf_flush_yield(buf_pool, bpage);
@@ -521,36 +466,35 @@ buf_flush_try_yield(
 
 		ut_ad(bpage->in_flush_list);
 
-		return(TRUE);
+		return(true);
 	}
 
-	return(FALSE);
+	return(false);
 }
 
 /******************************************************************//**
 Removes a single page from a given tablespace inside a specific
 buffer pool instance.
-@return TRUE if page was removed. */
-static
-ibool
+@return true if page was removed. */
+static	__attribute__((nonnull, warn_unused_result))
+bool
 buf_flush_or_remove_page(
 /*=====================*/
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
 	buf_page_t*	bpage,		/*!< in/out: bpage to remove */
-	ibool*		must_restart)	/*!< in/out: if TRUE, must restart the
+	bool		flush,		/*!< in: flush to disk if true but
+					don't remove else remove without
+					flushing to disk */
+	bool*		must_restart)	/*!< in/out: if true, must restart the
 					flush list scan */
 {
-	mutex_t*	block_mutex;
-	ibool		processed = FALSE;
+	ib_mutex_t*	block_mutex	= buf_page_get_mutex(bpage);
 
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(buf_flush_list_mutex_own(buf_pool));
 
-	block_mutex = buf_page_get_mutex(bpage);
-
-	/* bpage->space and bpage->io_fix are protected by
-	buf_pool->mutex and block_mutex. It is safe to check
-	them while holding buf_pool->mutex only. */
+	/* It is safe to check bpage->space and bpage->io_fix while holding
+	buf_pool->LRU_list_mutex only. */
 
 	if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage)
 			  != BUF_IO_NONE)) {
@@ -558,50 +502,71 @@ buf_flush_or_remove_page(
 		/* We cannot remove this page during this scan
 		yet; maybe the system is currently reading it
 		in, or flushing the modifications to the file */
+		return(false);
 
-	} else {
-
-		/* We have to release the flush_list_mutex to obey the
-		latching order. We are not however guaranteed that the page
-		will stay in the flush_list. */
+	}
 
-		buf_flush_list_mutex_exit(buf_pool);
+	bool		processed = false;
 
-		/* We don't have to worry about bpage becoming a dangling
-		pointer by a compressed page flush list relocation because
-		buf_page_get_gen() won't be called for pages from this
-		tablespace.  */
+	buf_flush_list_mutex_exit(buf_pool);
 
-		mutex_enter(block_mutex);
+	/* We don't have to worry about bpage becoming a dangling
+	pointer by a compressed page flush list relocation because
+	buf_page_get_gen() won't be called for pages from this
+	tablespace.  */
 
-		/* Recheck the page I/O fix and the flush list presence now
-		thatwe hold the right mutex. */
-		if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
-				  || bpage->oldest_modification == 0)) {
+	mutex_enter(block_mutex);
 
-			/* The page became I/O-fixed or is not on the flush
-			list anymore, this invalidates any flush-list-page
-			pointers we have. */
-			*must_restart = TRUE;
+	/* Recheck the page I/O fix and the flush list presence now
+	that we hold the right mutex. */
+	if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
+			  || bpage->oldest_modification == 0)) {
 
-		} else {
+		/* The page became I/O-fixed or is not on the flush
+		list anymore, this invalidates any flush-list-page
+		pointers we have. */
 
-			ut_ad(bpage->oldest_modification != 0);
+		mutex_exit(block_mutex);
 
-			if (bpage->buf_fix_count == 0) {
+		*must_restart = TRUE;
 
-				buf_flush_remove(bpage);
+	} else if (!flush) {
 
-				processed = TRUE;
-			}
-		}
+		buf_flush_remove(bpage);
 
 		mutex_exit(block_mutex);
 
-		buf_flush_list_mutex_enter(buf_pool);
+		processed = true;
+
+	} else if (buf_flush_ready_for_flush(bpage,
+					     BUF_FLUSH_SINGLE_PAGE)) {
+
+		mutex_exit(&buf_pool->LRU_list_mutex);
+
+		/* The following call will release the buf_page_get_mutex()
+		mutex. */
+		buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false);
+		ut_ad(!mutex_own(block_mutex));
+
+		/* Wake possible simulated aio thread to actually
+		post the writes to the operating system */
+		os_aio_simulated_wake_handler_threads();
+
+		mutex_enter(&buf_pool->LRU_list_mutex);
+
+		processed = true;
+	} else {
+		/* Not ready for flush. It can't be IO fixed because we
+		checked for that at the start of the function. It must
+		be buffer fixed. */
+		ut_ad(bpage->buf_fix_count > 0);
+		mutex_exit(block_mutex);
 	}
 
+	buf_flush_list_mutex_enter(buf_pool);
+
 	ut_ad(!mutex_own(block_mutex));
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	return(processed);
 }
@@ -611,49 +576,81 @@ Remove all dirty pages belonging to a given tablespace inside a specific
 buffer pool instance when we are deleting the data file(s) of that
 tablespace. The pages still remain a part of LRU and are evicted from
 the list as they age towards the tail of the LRU.
-@return TRUE if all freed. */
-static
-ibool
+@retval DB_SUCCESS if all freed
+@retval DB_FAIL if not all freed
+@retval DB_INTERRUPTED if the transaction was interrupted */
+static	__attribute__((nonnull(1), warn_unused_result))
+dberr_t
 buf_flush_or_remove_pages(
 /*======================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		id)		/*!< in: target space id for which
+	ulint		id,		/*!< in: target space id for which
 					to remove or flush pages */
+	bool		flush,		/*!< in: flush to disk if true but
+					don't remove else remove without
+					flushing to disk */
+	const trx_t*	trx)		/*!< to check if the operation must
+					be interrupted, can be 0 */
 {
 	buf_page_t*	prev;
 	buf_page_t*	bpage;
 	ulint		processed = 0;
-	ibool		all_freed = TRUE;
-	ibool		must_restart = FALSE;
 
 	buf_flush_list_mutex_enter(buf_pool);
 
+rescan:
+	bool	must_restart = false;
+	bool	all_freed = true;
+
 	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-	     !must_restart && bpage != NULL;
+	     bpage != NULL;
 	     bpage = prev) {
 
 		ut_a(buf_page_in_file(bpage));
-		ut_ad(bpage->in_flush_list);
 
 		/* Save the previous link because once we free the
 		page we can't rely on the links. */
 
-		prev = UT_LIST_GET_PREV(flush_list, bpage);
+		prev = UT_LIST_GET_PREV(list, bpage);
 
 		if (buf_page_get_space(bpage) != id) {
 
 			/* Skip this block, as it does not belong to
 			the target space. */
 
-		} else if (!buf_flush_or_remove_page(buf_pool, bpage,
+		} else if (!buf_flush_or_remove_page(buf_pool, bpage, flush,
 						     &must_restart)) {
 
 			/* Remove was unsuccessful, we have to try again
-			by scanning the entire list from the end. */
+			by scanning the entire list from the end.
+			This also means that we never released the
+			LRU list mutex. Therefore we can trust the prev
+			pointer.
+			buf_flush_or_remove_page() released the
+			flush list mutex but not the LRU list mutex.
+			Therefore it is possible that a new page was
+			added to the flush list. For example, in case
+			where we are at the head of the flush list and
+			prev == NULL. That is OK because we have the
+			tablespace quiesced and no new pages for this
+			space-id should enter flush_list. This is
+			because the only callers of this function are
+			DROP TABLE and FLUSH TABLE FOR EXPORT.
+			We know that we'll have to do at least one more
+			scan but we don't break out of loop here and
+			try to do as much work as we can in this
+			iteration. */
+
+			all_freed = false;
+		} else if (flush) {
+
+			/* The processing was successful. And during the
+			processing we have released all the buf_pool mutexes
+			when calling buf_flush_page(). We cannot trust
+			prev pointer. */
+			goto rescan;
+		} else if (UNIV_UNLIKELY(must_restart)) {
 
-			all_freed = FALSE;
-		}
-		if (UNIV_UNLIKELY(must_restart)) {
 			ut_ad(!all_freed);
 			break;
 		}
@@ -668,15 +665,27 @@ buf_flush_or_remove_pages(
 			/* Reset the batch size counter if we had to yield. */
 
 			processed = 0;
-		} else if (UNIV_UNLIKELY(must_restart)) {
-			all_freed = FALSE;
 		}
 
+#ifdef DBUG_OFF
+		if (flush) {
+			DBUG_EXECUTE_IF("ib_export_flush_crash",
+					static ulint	n_pages;
+					if (++n_pages == 4) {DBUG_SUICIDE();});
+		}
+#endif /* DBUG_OFF */
+
+		/* Checking whether trx is interrupted is expensive, so we
+		only check every N iterations. */
+		if (!processed && trx && trx_is_interrupted(trx)) {
+			buf_flush_list_mutex_exit(buf_pool);
+			return(DB_INTERRUPTED);
+		}
 	}
 
 	buf_flush_list_mutex_exit(buf_pool);
 
-	return(all_freed);
+	return(all_freed ? DB_SUCCESS : DB_FAIL);
 }
 
 /******************************************************************//**
@@ -684,36 +693,47 @@ Remove or flush all the dirty pages that belong to a given tablespace
 inside a specific buffer pool instance. The pages will remain in the LRU
 list and will be evicted from the LRU list as they age and move towards
 the tail of the LRU list. */
-static
+static __attribute__((nonnull(1)))
 void
 buf_flush_dirty_pages(
 /*==================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		id)		/*!< in: space id */
+	ulint		id,		/*!< in: space id */
+	bool		flush,		/*!< in: flush to disk if true otherwise
+					remove the pages without flushing */
+	const trx_t*	trx)		/*!< to check if the operation must
+					be interrupted */
 {
-	ibool	all_freed;
+	dberr_t		err;
 
 	do {
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 		mutex_enter(&buf_pool->LRU_list_mutex);
 
-		all_freed = buf_flush_or_remove_pages(buf_pool, id);
+		err = buf_flush_or_remove_pages(buf_pool, id, flush, trx);
 
 		mutex_exit(&buf_pool->LRU_list_mutex);
 
 		ut_ad(buf_flush_validate(buf_pool));
 
-		if (!all_freed) {
-			os_thread_sleep(20000);
+		if (err == DB_FAIL) {
+			os_thread_sleep(2000);
 		}
 
-	} while (!all_freed);
+		/* DB_FAIL is a soft error, it means that the task wasn't
+		completed, needs to be retried. */
+
+		ut_ad(buf_flush_validate(buf_pool));
+
+	} while (err == DB_FAIL);
+
+	ut_ad(err == DB_INTERRUPTED
+	      || buf_pool_get_dirty_pages_count(buf_pool, id) == 0);
 }
 
 /******************************************************************//**
 Remove all pages that belong to a given tablespace inside a specific
 buffer pool instance when we are DISCARDing the tablespace. */
-static
+static __attribute__((nonnull))
 void
 buf_LRU_remove_all_pages(
 /*=====================*/
@@ -724,10 +744,7 @@ buf_LRU_remove_all_pages(
 	ibool		all_freed;
 
 scan_again:
-	//buf_pool_mutex_enter(buf_pool);
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
 
 	all_freed = TRUE;
 
@@ -735,44 +752,52 @@ scan_again:
 	     bpage != NULL;
 	     /* No op */) {
 
+		prio_rw_lock_t*	hash_lock;
 		buf_page_t*	prev_bpage;
-		mutex_t*	block_mutex;
+		ib_mutex_t*	block_mutex = NULL;
 
 		ut_a(buf_page_in_file(bpage));
 		ut_ad(bpage->in_LRU_list);
 
 		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
 
-		block_mutex = buf_page_get_mutex_enter(bpage);
-
-		if (!block_mutex) {
-			/* It may be impossible case...
-			   Something wrong, so will be scan_again */
-
-			all_freed = FALSE;
-			goto next_page;
-		}
+		/* It is safe to check bpage->space and bpage->io_fix while
+		holding buf_pool->LRU_list_mutex only and later recheck
+		while holding the buf_page_get_mutex() mutex.  */
 
 		if (buf_page_get_space(bpage) != id) {
 			/* Skip this block, as it does not belong to
 			the space that is being invalidated. */
-
-			mutex_exit(block_mutex);
 			goto next_page;
-		} else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+		} else if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage)
+					 != BUF_IO_NONE)) {
 			/* We cannot remove this page during this scan
 			yet; maybe the system is currently reading it
 			in, or flushing the modifications to the file */
 
-			mutex_exit(block_mutex);
 			all_freed = FALSE;
 			goto next_page;
 		} else {
+			ulint	fold = buf_page_address_fold(
+				bpage->space, bpage->offset);
+
+			hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+			rw_lock_x_lock(hash_lock);
 
-			if (bpage->buf_fix_count > 0) {
+			block_mutex = buf_page_get_mutex(bpage);
+			mutex_enter(block_mutex);
+
+			if (UNIV_UNLIKELY(
+				    buf_page_get_space(bpage) != id
+				    || bpage->buf_fix_count > 0
+				    || (buf_page_get_io_fix(bpage)
+					!= BUF_IO_NONE))) {
 
 				mutex_exit(block_mutex);
 
+				rw_lock_x_unlock(hash_lock);
+
 				/* We cannot remove this page during
 				this scan yet; maybe the system is
 				currently reading it in, or flushing
@@ -802,13 +827,14 @@ scan_again:
 			ulint	zip_size;
 
 			mutex_exit(&buf_pool->LRU_list_mutex);
-			rw_lock_x_unlock(&buf_pool->page_hash_latch);
 
 			zip_size = buf_page_get_zip_size(bpage);
 			page_no = buf_page_get_page_no(bpage);
 
 			mutex_exit(block_mutex);
 
+			rw_lock_x_unlock(hash_lock);
+
 			/* Note that the following call will acquire
 			and release block->lock X-latch. */
 
@@ -819,6 +845,7 @@ scan_again:
 		}
 
 		if (bpage->oldest_modification != 0) {
+
 			buf_flush_remove(bpage);
 		}
 
@@ -826,28 +853,28 @@ scan_again:
 
 		/* Remove from the LRU list. */
 
-		if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-		    != BUF_BLOCK_ZIP_FREE) {
+		if (buf_LRU_block_remove_hashed(bpage, true)) {
 
-			buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE);
+			mutex_enter(block_mutex);
+			buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
 			mutex_exit(block_mutex);
-
 		} else {
-			/* The block_mutex should have been released
-			by buf_LRU_block_remove_hashed_page() when it
-			returns BUF_BLOCK_ZIP_FREE. */
 			ut_ad(block_mutex == &buf_pool->zip_mutex);
 		}
 
 		ut_ad(!mutex_own(block_mutex));
 
+#ifdef UNIV_SYNC_DEBUG
+		/* buf_LRU_block_remove_hashed() releases the hash_lock */
+		ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+		ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 next_page:
 		bpage = prev_bpage;
 	}
 
-//	buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
-	rw_lock_x_unlock(&buf_pool->page_hash_latch);
 
 	if (!all_freed) {
 		os_thread_sleep(20000);
@@ -857,17 +884,63 @@ next_page:
 }
 
 /******************************************************************//**
-Removes all pages belonging to a given tablespace. */
+Applies the requested strategy to the pages of a tablespace inside one
+buffer pool instance. BUF_REMOVE_ALL_NO_WRITE removes all its pages.
+BUF_REMOVE_FLUSH_NO_WRITE (trx must be 0) removes its dirty pages from the
+flush list without writing; they stay in the LRU until they age out.
+BUF_REMOVE_FLUSH_WRITE (trx non-0) flushes them, then calls fil_flush(). */
+static	__attribute__((nonnull(1)))
+void
+buf_LRU_remove_pages(
+/*=================*/
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove,	/*!< in: remove or flush strategy */
+	const trx_t*	trx)		/*!< to check if the operation must
+					be interrupted */
+{
+	switch (buf_remove) {
+	case BUF_REMOVE_ALL_NO_WRITE:
+		buf_LRU_remove_all_pages(buf_pool, id);
+		break;
+
+	case BUF_REMOVE_FLUSH_NO_WRITE:
+		ut_a(trx == 0);
+		buf_flush_dirty_pages(buf_pool, id, false, NULL);
+		break;
+
+	case BUF_REMOVE_FLUSH_WRITE:
+		ut_a(trx != 0);
+		buf_flush_dirty_pages(buf_pool, id, true, trx);
+		/* Ensure that all asynchronous IO is completed. */
+		os_aio_wait_until_no_pending_writes();
+		fil_flush(id);
+		break;
+	}
+}
+
+/******************************************************************//**
+Flushes all dirty pages or removes all pages belonging
+to a given tablespace. A PROBLEM: if readahead is being started, what
+guarantees that it will not try to read in pages after this operation
+has completed? */
 UNIV_INTERN
 void
 buf_LRU_flush_or_remove_pages(
 /*==========================*/
-	ulint			id,	/*!< in: space id */
-	enum buf_remove_t	buf_remove)/*!< in: remove or flush
-					strategy */
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove,	/*!< in: remove or flush strategy */
+	const trx_t*	trx)		/*!< to check if the operation must
+					be interrupted */
 {
 	ulint		i;
 
+	/* Before we attempt to drop pages one by one we first
+	attempt to drop page hash index entries in batches to make
+	it more efficient. The batching attempt is a best effort
+	attempt and does not guarantee that all pages hash entries
+	will be dropped. We get rid of remaining page hash entries
+	one by one below. */
 	for (i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
@@ -875,28 +948,21 @@ buf_LRU_flush_or_remove_pages(
 
 		switch (buf_remove) {
 		case BUF_REMOVE_ALL_NO_WRITE:
-			/* A DISCARD tablespace case. Remove AHI entries
-			and evict all pages from LRU. */
-
-			/* Before we attempt to drop pages hash entries
-			one by one we first attempt to drop page hash
-			index entries in batches to make it more
-			efficient. The batching attempt is a best effort
-			attempt and does not guarantee that all pages
-			hash entries will be dropped. We get rid of
-			remaining page hash entries one by one below. */
 			buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
-			buf_LRU_remove_all_pages(buf_pool, id);
 			break;
 
 		case BUF_REMOVE_FLUSH_NO_WRITE:
-			/* A DROP table case. AHI entries are already
-			removed. No need to evict all pages from LRU
-			list. Just evict pages from flush list without
-			writing. */
-			buf_flush_dirty_pages(buf_pool, id);
+			/* It is a DROP TABLE for a single table
+			tablespace. No AHI entries exist because
+			we already dealt with them when freeing up
+			extents. */
+		case BUF_REMOVE_FLUSH_WRITE:
+			/* We allow read-only queries against the
+			table, there is no need to drop the AHI entries. */
 			break;
 		}
+
+		buf_LRU_remove_pages(buf_pool, id, buf_remove, trx);
 	}
 }
 
@@ -912,8 +978,7 @@ buf_LRU_insert_zip_clean(
 	buf_page_t*	b;
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-	//ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(mutex_own(&buf_pool->zip_mutex));
 	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
 
@@ -922,17 +987,17 @@ buf_LRU_insert_zip_clean(
 	b = bpage;
 	do {
 		b = UT_LIST_GET_NEXT(LRU, b);
-	} while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list));
+	} while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
 
 	/* Insert bpage before b, i.e., after the predecessor of b. */
 	if (b) {
-		b = UT_LIST_GET_PREV(zip_list, b);
+		b = UT_LIST_GET_PREV(list, b);
 	}
 
 	if (b) {
-		UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage);
+		UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage);
 	} else {
-		UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage);
+		UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage);
 	}
 }
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
@@ -946,63 +1011,48 @@ ibool
 buf_LRU_free_from_unzip_LRU_list(
 /*=============================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		n_iterations,	/*!< in: how many times this has
-					been called repeatedly without
-					result: a high value means that
-					we should search farther; we will
-					search n_iterations / 5 of the
-					unzip_LRU list, or nothing if
-					n_iterations >= 5 */
-	ibool*		have_LRU_mutex)
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					up to srv_LRU_scan_depth blocks. */
 {
 	buf_block_t*	block;
-	ulint		distance;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ibool 		freed;
+	ulint		scanned;
 
-	/* Theoratically it should be much easier to find a victim
-	from unzip_LRU as we can choose even a dirty block (as we'll
-	be evicting only the uncompressed frame).  In a very unlikely
-	eventuality that we are unable to find a victim from
-	unzip_LRU, we fall back to the regular LRU list.  We do this
-	if we have done five iterations so far. */
-
-	if (UNIV_UNLIKELY(n_iterations >= 5)
-	    || !buf_LRU_evict_from_unzip_LRU(buf_pool, have_LRU_mutex)) {
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
+	if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
 		return(FALSE);
 	}
 
-	distance = 100 + (n_iterations
-			  * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+	for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU),
+	     scanned = 1, freed = FALSE;
+	     block != NULL && !freed
+	     && (scan_all || scanned < srv_LRU_scan_depth);
+	     ++scanned) {
 
-restart:
-	for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
-	     UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
-	     block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
-
-		ibool freed;
+		buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU,
+						block);
 
 		mutex_enter(&block->mutex);
-		if (!block->in_unzip_LRU_list || !block->page.in_LRU_list
-		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
-			mutex_exit(&block->mutex);
-			goto restart;
-		}
 
 		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 		ut_ad(block->in_unzip_LRU_list);
 		ut_ad(block->page.in_LRU_list);
 
-		freed = buf_LRU_free_block(&block->page, FALSE, have_LRU_mutex);
+		freed = buf_LRU_free_page(&block->page, false);
+
 		mutex_exit(&block->mutex);
 
-		if (freed) {
-			return(TRUE);
-		}
+		block = prev_block;
 	}
 
-	return(FALSE);
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+		scanned);
+	return(freed);
 }
 
 /******************************************************************//**
@@ -1012,61 +1062,56 @@ UNIV_INLINE
 ibool
 buf_LRU_free_from_common_LRU_list(
 /*==============================*/
-	buf_pool_t*	buf_pool,
-	ulint		n_iterations,
-				/*!< in: how many times this has been called
-				repeatedly without result: a high value means
-				that we should search farther; if
-				n_iterations < 10, then we search
-				n_iterations / 10 * buf_pool->curr_size
-				pages from the end of the LRU list */
-	ibool*		have_LRU_mutex)
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					up to srv_LRU_scan_depth blocks. */
 {
 	buf_page_t*	bpage;
-	ulint		distance;
-
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ibool		freed;
+	ulint		scanned;
 
-	distance = 100 + (n_iterations * buf_pool->curr_size) / 10;
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
-restart:
-	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-	     UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
-	     bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU),
+	     scanned = 1, freed = FALSE;
+	     bpage != NULL && !freed
+	     && (scan_all || scanned < srv_LRU_scan_depth);
+	     ++scanned) {
 
-		ibool		freed;
 		unsigned	accessed;
-		mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
-
-		if (!block_mutex) {
-			goto restart;
-		}
-
-		if (!bpage->in_LRU_list
-		    || !buf_page_in_file(bpage)) {
-			mutex_exit(block_mutex);
-			goto restart;
-		}
+		buf_page_t*	prev_bpage = UT_LIST_GET_PREV(LRU,
+						bpage);
+		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 		ut_ad(buf_page_in_file(bpage));
 		ut_ad(bpage->in_LRU_list);
 
 		accessed = buf_page_is_accessed(bpage);
-		freed = buf_LRU_free_block(bpage, TRUE, have_LRU_mutex);
+
+		mutex_enter(block_mutex);
+
+		freed = buf_LRU_free_page(bpage, true);
+
 		mutex_exit(block_mutex);
 
-		if (freed) {
+		if (freed && !accessed) {
 			/* Keep track of pages that are evicted without
 			ever being accessed. This gives us a measure of
 			the effectiveness of readahead */
-			if (!accessed) {
-				++buf_pool->stat.n_ra_pages_evicted;
-			}
-			return(TRUE);
+			++buf_pool->stat.n_ra_pages_evicted;
 		}
+
+		bpage = prev_bpage;
 	}
 
-	return(FALSE);
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_SEARCH_SCANNED,
+		MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+		MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+		scanned);
+
+	return(freed);
 }
 
 /******************************************************************//**
@@ -1074,90 +1119,34 @@ Try to free a replaceable block.
 @return	TRUE if found and freed */
 UNIV_INTERN
 ibool
-buf_LRU_search_and_free_block(
-/*==========================*/
-	buf_pool_t*	buf_pool,
-				/*!< in: buffer pool instance */
-	ulint		n_iterations)
-				/*!< in: how many times this has been called
-				repeatedly without result: a high value means
-				that we should search farther; if
-				n_iterations < 10, then we search
-				n_iterations / 10 * buf_pool->curr_size
-				pages from the end of the LRU list; if
-				n_iterations < 5, then we will also search
-				n_iterations / 5 of the unzip_LRU list. */
+buf_LRU_scan_and_free_block(
+/*========================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					up to srv_LRU_scan_depth blocks. */
 {
 	ibool	freed = FALSE;
-	ibool	have_LRU_mutex = FALSE;
+	bool	use_unzip_list = UT_LIST_GET_LEN(buf_pool->unzip_LRU) > 0;
 
-	//buf_pool_mutex_enter(buf_pool);
-	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) {
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		have_LRU_mutex = TRUE;
-	}
-
-	freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations, &have_LRU_mutex);
+	mutex_enter(&buf_pool->LRU_list_mutex);
 
-	if (!freed) {
-		freed = buf_LRU_free_from_common_LRU_list(
-			buf_pool, n_iterations, &have_LRU_mutex);
+	if (use_unzip_list) {
+		freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all);
 	}
 
-	buf_pool_mutex_enter(buf_pool);
 	if (!freed) {
-		buf_pool->LRU_flush_ended = 0;
-	} else if (buf_pool->LRU_flush_ended > 0) {
-		buf_pool->LRU_flush_ended--;
+		freed = buf_LRU_free_from_common_LRU_list(buf_pool, scan_all);
 	}
 
-	buf_pool_mutex_exit(buf_pool);
-	if (have_LRU_mutex)
+	if (!freed) {
 		mutex_exit(&buf_pool->LRU_list_mutex);
+	}
 
 	return(freed);
 }
 
 /******************************************************************//**
-Tries to remove LRU flushed blocks from the end of the LRU list and put them
-to the free list. This is beneficial for the efficiency of the insert buffer
-operation, as flushed pages from non-unique non-clustered indexes are here
-taken out of the buffer pool, and their inserts redirected to the insert
-buffer. Otherwise, the flushed blocks could get modified again before read
-operations need new buffer blocks, and the i/o work done in flushing would be
-wasted. */
-UNIV_INTERN
-void
-buf_LRU_try_free_flushed_blocks(
-/*============================*/
-	buf_pool_t*	buf_pool)		/*!< in: buffer pool instance */
-{
-
-	if (buf_pool == NULL) {
-		ulint	i;
-
-		for (i = 0; i < srv_buf_pool_instances; i++) {
-			buf_pool = buf_pool_from_array(i);
-			buf_LRU_try_free_flushed_blocks(buf_pool);
-		}
-	} else {
-		buf_pool_mutex_enter(buf_pool);
-
-		while (buf_pool->LRU_flush_ended > 0) {
-
-			buf_pool_mutex_exit(buf_pool);
-
-			buf_LRU_search_and_free_block(buf_pool, 1);
-
-			buf_pool_mutex_enter(buf_pool);
-		}
-
-		buf_pool_mutex_exit(buf_pool);
-	}
-}
-
-/******************************************************************//**
 Returns TRUE if less than 25 % of the buffer pool in any instance is
 available. This can be used in heuristics to prevent huge transactions
 eating up the whole buffer pool for their locks.
@@ -1175,11 +1164,6 @@ buf_LRU_buf_pool_running_out(void)
 
 		buf_pool = buf_pool_from_array(i);
 
-		//buf_pool_mutex_enter(buf_pool);
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		mutex_enter(&buf_pool->free_list_mutex);
-
 		if (!recv_recovery_on
 		    && UT_LIST_GET_LEN(buf_pool->free)
 		       + UT_LIST_GET_LEN(buf_pool->LRU)
@@ -1187,10 +1171,6 @@ buf_LRU_buf_pool_running_out(void)
 
 			ret = TRUE;
 		}
-
-		//buf_pool_mutex_exit(buf_pool);
-		mutex_exit(&buf_pool->LRU_list_mutex);
-		mutex_exit(&buf_pool->free_list_mutex);
 	}
 
 	return(ret);
@@ -1208,9 +1188,8 @@ buf_LRU_get_free_only(
 {
 	buf_block_t*	block;
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	mutex_enter_last(&buf_pool->free_list_mutex);
 
-	mutex_enter(&buf_pool->free_list_mutex);
 	block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free);
 
 	if (block) {
@@ -1220,44 +1199,37 @@ buf_LRU_get_free_only(
 		ut_ad(!block->page.in_flush_list);
 		ut_ad(!block->page.in_LRU_list);
 		ut_a(!buf_page_in_file(&block->page));
-		UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+		UT_LIST_REMOVE(list, buf_pool->free, (&block->page));
+		buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
 
 		mutex_exit(&buf_pool->free_list_mutex);
 
 		mutex_enter(&block->mutex);
 
-		buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
 		UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
 
 		ut_ad(buf_pool_from_block(block) == buf_pool);
 
 		mutex_exit(&block->mutex);
-	} else {
-		mutex_exit(&buf_pool->free_list_mutex);
+		return(block);
 	}
 
-	return(block);
+	mutex_exit(&buf_pool->free_list_mutex);
+
+	return(NULL);
 }
 
 /******************************************************************//**
-Returns a free block from the buf_pool. The block is taken off the
-free list. If it is empty, blocks are moved from the end of the
-LRU list to the free list.
-@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
-UNIV_INTERN
-buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static
+void
+buf_LRU_check_size_of_non_data_objects(
+/*===================================*/
+	const buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
-	buf_block_t*	block		= NULL;
-	ibool		freed;
-	ulint		n_iterations	= 1;
-	ibool		mon_value_was	= FALSE;
-	ibool		started_monitor	= FALSE;
-loop:
-	//buf_pool_mutex_enter(buf_pool);
-
 	if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
 	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
 		ut_print_timestamp(stderr);
@@ -1309,7 +1281,7 @@ loop:
 
 			buf_lru_switched_on_innodb_mon = TRUE;
 			srv_print_innodb_monitor = TRUE;
-			os_event_set(srv_lock_timeout_thread_event);
+			os_event_set(lock_sys->timeout_event);
 		}
 	} else if (buf_lru_switched_on_innodb_mon) {
 
@@ -1321,12 +1293,67 @@ loop:
 		buf_lru_switched_on_innodb_mon = FALSE;
 		srv_print_innodb_monitor = FALSE;
 	}
+}
+
+/** The maximum allowed backoff sleep time duration, microseconds */
+#define MAX_FREE_LIST_BACKOFF_SLEEP 10000
+
+/** The sleep reduction factor for high-priority waiter backoff sleeps */
+#define FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER 100
+
+/** The sleep reduction factor for low-priority waiter backoff sleeps */
+#define FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER 1
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If free list is empty, blocks are moved from the end of the
+LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from free list, success:done
+  * if there is an LRU flush batch in progress:
+    * wait for batch to end: retry free list
+  * if buf_pool->try_LRU_scan is set
+    * scan LRU up to srv_LRU_scan_depth to find a clean block
+    * the above will put the block on free list
+    * success:retry the free list
+  * flush one dirty page from tail of LRU to disk
+    * the above will put the block on free list
+    * success: retry the free list
+* iteration 1:
+  * same as iteration 0 except:
+    * scan whole LRU list
+    * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+  * same as iteration 1 but sleep 100ms
+@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+{
+	buf_block_t*	block		= NULL;
+	ibool		freed		= FALSE;
+	ulint		n_iterations	= 0;
+	ulint		flush_failures	= 0;
+	ibool		mon_value_was	= FALSE;
+	ibool		started_monitor	= FALSE;
+
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
+	MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+loop:
+	buf_LRU_check_size_of_non_data_objects(buf_pool);
 
 	/* If there is a block in the free list, take it */
 	block = buf_LRU_get_free_only(buf_pool);
-	//buf_pool_mutex_exit(buf_pool);
 
 	if (block) {
+
 		ut_ad(buf_pool_from_block(block) == buf_pool);
 		memset(&block->page.zip, 0, sizeof block->page.zip);
 
@@ -1337,20 +1364,108 @@ loop:
 		return(block);
 	}
 
-	/* If no block was in the free list, search from the end of the LRU
-	list and try to free a block there */
+	if (srv_empty_free_list_algorithm == SRV_EMPTY_FREE_LIST_BACKOFF
+	    && buf_page_cleaner_is_active
+	    && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
 
-	freed = buf_LRU_search_and_free_block(buf_pool, n_iterations);
+		/* Backoff to minimize the free list mutex contention while the
+		free list is empty */
+		ulint	priority = srv_current_thread_priority;
+
+		if (n_iterations < 3) {
+
+			os_thread_yield();
+			if (!priority) {
+				os_thread_yield();
+			}
+		} else {
+
+			ulint	i, b;
+
+			if (n_iterations < 6) {
+				i = n_iterations - 3;
+			} else if (n_iterations < 8) {
+				i = 4;
+			} else if (n_iterations < 11) {
+				i = 5;
+			} else {
+				i = n_iterations - 5;
+			}
+			/* A shift by >= the type width is undefined; 2^14
+			already exceeds MAX_FREE_LIST_BACKOFF_SLEEP. */
+			b = (i < 14) ? ((ulint) 1 << i)
+				: MAX_FREE_LIST_BACKOFF_SLEEP;
+			os_thread_sleep(b / (priority
+				? FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER
+				: FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER));
+		}
+
+		/* In case of backoff, do not ever attempt single page flushes
+		and wait for the cleaner to free some pages instead.  */
+
+		n_iterations++;
 
-	if (freed > 0) {
+		goto loop;
+	} else {
+
+		/* The cleaner is not running or the legacy (Oracle MySQL 5.6)
+		algorithm was requested: perform a single page flush. */
+		ut_ad((srv_empty_free_list_algorithm
+		       == SRV_EMPTY_FREE_LIST_LEGACY)
+		      || !buf_page_cleaner_is_active
+		      || (srv_shutdown_state != SRV_SHUTDOWN_NONE));
+	}
+
+	mutex_enter(&buf_pool->flush_state_mutex);
+
+	if (buf_pool->init_flush[BUF_FLUSH_LRU]
+	    && srv_use_doublewrite_buf
+	    && buf_dblwr != NULL) {
+
+		mutex_exit(&buf_pool->flush_state_mutex);
+
+		/* If there is an LRU flush happening in the background
+		then we wait for it to end instead of trying a single
+		page flush. If, however, we are not using doublewrite
+		buffer then it is better to do our own single page
+		flush instead of waiting for LRU flush to end. */
+		buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
 		goto loop;
 	}
 
-	if (n_iterations > 30) {
+	mutex_exit(&buf_pool->flush_state_mutex);
+
+	freed = FALSE;
+	if (buf_pool->try_LRU_scan || n_iterations > 0) {
+
+		/* If no block was in the free list, search from the
+		end of the LRU list and try to free a block there.
+		On the first iteration we scan only the tail of the
+		LRU list; on later iterations we scan the whole LRU
+		list. */
+		freed = buf_LRU_scan_and_free_block(buf_pool,
+						    n_iterations > 0);
+
+		if (!freed && n_iterations == 0) {
+			/* Tell other threads that there is no point
+			in scanning the LRU list. This flag is set to
+			TRUE again when we flush a batch from this
+			buffer pool. */
+			buf_pool->try_LRU_scan = FALSE;
+		}
+	}
+
+	if (freed) {
+		goto loop;
+
+	}
+
+	if (n_iterations > 20) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Warning: difficult to find free blocks in\n"
-			"InnoDB: the buffer pool (%lu search iterations)!"
+			"InnoDB: the buffer pool (%lu search iterations)!\n"
+			"InnoDB: %lu failed attempts to flush a page!"
 			" Consider\n"
 			"InnoDB: increasing the buffer pool size.\n"
 			"InnoDB: It is also possible that"
@@ -1369,6 +1484,7 @@ loop:
 			"InnoDB: Starting InnoDB Monitor to print further\n"
 			"InnoDB: diagnostics to the standard output.\n",
 			(ulong) n_iterations,
+			(ulong)	flush_failures,
 			(ulong) fil_n_pending_log_flushes,
 			(ulong) fil_n_pending_tablespace_flushes,
 			(ulong) os_n_file_reads, (ulong) os_n_file_writes,
@@ -1377,35 +1493,35 @@ loop:
 		mon_value_was = srv_print_innodb_monitor;
 		started_monitor = TRUE;
 		srv_print_innodb_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
 	}
 
-	/* No free block was found: try to flush the LRU list */
-
-	buf_flush_free_margin(buf_pool, TRUE);
-	++srv_buf_pool_wait_free;
-
-	os_aio_simulated_wake_handler_threads();
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->LRU_flush_ended > 0) {
-		/* We have written pages in an LRU flush. To make the insert
-		buffer more efficient, we try to move these pages to the free
-		list. */
-
-		buf_pool_mutex_exit(buf_pool);
+	/* If we have scanned the whole LRU and still are unable to
+	find a free block then we should sleep here to let the
+	page_cleaner do an LRU batch for us.
+	TODO: It'd be better if we can signal the page_cleaner. Perhaps
+	we should use timed wait for page_cleaner. */
+	if (n_iterations > 1) {
 
-		buf_LRU_try_free_flushed_blocks(buf_pool);
-	} else {
-		buf_pool_mutex_exit(buf_pool);
+		os_thread_sleep(100000);
 	}
 
-	if (n_iterations > 10) {
-
-		os_thread_sleep(500000);
+	/* No free block was found: try to flush the LRU list.
+	This call will flush one page from the LRU and put it on the
+	free list. That means that the free block is up for grabs for
+	all user threads.
+	TODO: A more elegant way would have been to return the freed
+	up block to the caller here but the code that deals with
+	removing the block from page_hash and LRU_list is fairly
+	involved (particularly in case of compressed pages). We
+	can do that in a separate patch sometime in future. */
+	if (!buf_flush_single_page_from_LRU(buf_pool)) {
+		MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+		++flush_failures;
 	}
 
+	srv_stats.buf_pool_wait_free.add(n_iterations, 1);
+
 	n_iterations++;
 
 	goto loop;
@@ -1424,7 +1540,6 @@ buf_LRU_old_adjust_len(
 	ulint	new_len;
 
 	ut_a(buf_pool->LRU_old);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
 	ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
@@ -1491,7 +1606,6 @@ buf_LRU_old_init(
 {
 	buf_page_t*	bpage;
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
 
@@ -1527,14 +1641,13 @@ buf_unzip_LRU_remove_block_if_needed(
 	ut_ad(buf_pool);
 	ut_ad(bpage);
 	ut_ad(buf_page_in_file(bpage));
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	if (buf_page_belongs_to_unzip_LRU(bpage)) {
 		buf_block_t*	block = (buf_block_t*) bpage;
 
 		ut_ad(block->in_unzip_LRU_list);
-		block->in_unzip_LRU_list = FALSE;
+		ut_d(block->in_unzip_LRU_list = FALSE);
 
 		UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block);
 	}
@@ -1553,7 +1666,6 @@ buf_LRU_remove_block(
 
 	ut_ad(buf_pool);
 	ut_ad(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	ut_a(buf_page_in_file(bpage));
@@ -1584,7 +1696,7 @@ buf_LRU_remove_block(
 
 	/* Remove the block from the LRU list */
 	UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
-	bpage->in_LRU_list = FALSE;
+	ut_d(bpage->in_LRU_list = FALSE);
 
 	zip_size = page_zip_get_size(&bpage->zip);
 	buf_pool->stat.LRU_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
@@ -1634,13 +1746,12 @@ buf_unzip_LRU_add_block(
 
 	ut_ad(buf_pool);
 	ut_ad(block);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
 
 	ut_ad(!block->in_unzip_LRU_list);
-	block->in_unzip_LRU_list = TRUE;
+	ut_d(block->in_unzip_LRU_list = TRUE);
 
 	if (old) {
 		UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block);
@@ -1664,14 +1775,13 @@ buf_LRU_add_block_to_end_low(
 
 	ut_ad(buf_pool);
 	ut_ad(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	ut_a(buf_page_in_file(bpage));
 
 	ut_ad(!bpage->in_LRU_list);
 	UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage);
-	bpage->in_LRU_list = TRUE;
+	ut_d(bpage->in_LRU_list = TRUE);
 
 	incr_LRU_size_in_bytes(bpage, buf_pool);
 
@@ -1721,7 +1831,6 @@ buf_LRU_add_block_low(
 
 	ut_ad(buf_pool);
 	ut_ad(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	ut_a(buf_page_in_file(bpage));
@@ -1747,7 +1856,7 @@ buf_LRU_add_block_low(
 		buf_pool->LRU_old_len++;
 	}
 
-	bpage->in_LRU_list = TRUE;
+	ut_d(bpage->in_LRU_list = TRUE);
 
 	incr_LRU_size_in_bytes(bpage, buf_pool);
 
@@ -1806,7 +1915,6 @@ buf_LRU_make_block_young(
 {
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 
 	if (bpage->old) {
@@ -1833,32 +1941,36 @@ buf_LRU_make_block_old(
 Try to free a block.  If bpage is a descriptor of a compressed-only
 page, the descriptor object will be freed as well.
 
-NOTE: If this function returns TRUE, it will temporarily
-release buf_pool->mutex.  Furthermore, the page frame will no longer be
-accessible via bpage.
+NOTE: If this function returns true, it will release the LRU list mutex,
+and temporarily release and relock the buf_page_get_mutex() mutex.
+Furthermore, the page frame will no longer be accessible via bpage.  If this
+function returns false, the buf_page_get_mutex() might be temporarily released
+and relocked too.
+
+The caller must hold the LRU list and buf_page_get_mutex() mutexes.
 
-The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
-release these two mutexes after the call.  No other
-buf_page_get_mutex() may be held when calling this function.
-@return TRUE if freed, FALSE otherwise. */
+@return true if freed, false otherwise. */
 UNIV_INTERN
-ibool
-buf_LRU_free_block(
+bool
+buf_LRU_free_page(
 /*===============*/
 	buf_page_t*	bpage,	/*!< in: block to be freed */
-	ibool		zip,	/*!< in: TRUE if should remove also the
+	bool		zip)	/*!< in: true if should remove also the
 				compressed page of an uncompressed page */
-	ibool*		have_LRU_mutex)
 {
 	buf_page_t*	b = NULL;
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+	const ulint	fold = buf_page_address_fold(bpage->space,
+						     bpage->offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(mutex_own(block_mutex));
 	ut_ad(buf_page_in_file(bpage));
-	//ut_ad(bpage->in_LRU_list);
-	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+	ut_ad(bpage->in_LRU_list);
+
 #if UNIV_WORD_SIZE == 4
 	/* On 32-bit systems, there is no padding in buf_page_t.  On
 	other systems, Valgrind could complain about uninitialized pad
@@ -1866,10 +1978,10 @@ buf_LRU_free_block(
 	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
 #endif
 
-	if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
+	if (!buf_page_can_relocate(bpage)) {
 
 		/* Do not free buffer-fixed or I/O-fixed blocks. */
-		return(FALSE);
+		return(false);
 	}
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -1881,28 +1993,32 @@ buf_LRU_free_block(
 		/* Do not completely free dirty blocks. */
 
 		if (bpage->oldest_modification) {
-			return(FALSE);
+			return(false);
 		}
-	} else if (bpage->oldest_modification) {
-		/* Do not completely free dirty blocks. */
+	} else if ((bpage->oldest_modification)
+		   && (buf_page_get_state(bpage)
+		       != BUF_BLOCK_FILE_PAGE)) {
 
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			ut_ad(buf_page_get_state(bpage)
-			      == BUF_BLOCK_ZIP_DIRTY);
-			return(FALSE);
-		}
+		ut_ad(buf_page_get_state(bpage)
+		      == BUF_BLOCK_ZIP_DIRTY);
+
+		return(false);
 
-		goto alloc;
 	} else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
-		/* Allocate the control block for the compressed page.
-		If it cannot be allocated (without freeing a block
-		from the LRU list), refuse to free bpage. */
-alloc:
 		b = buf_page_alloc_descriptor();
 		ut_a(b);
-		//memcpy(b, bpage, sizeof *b);
 	}
 
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(bpage->in_LRU_list);
+	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+	/* On 32-bit systems, there is no padding in buf_page_t.  On
+	other systems, Valgrind could complain about uninitialized pad
+	bytes. */
+	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr, "Putting space %lu page %lu to free list\n",
@@ -1911,232 +2027,216 @@ alloc:
 	}
 #endif /* UNIV_DEBUG */
 
-	/* not to break latch order, must re-enter block_mutex */
 	mutex_exit(block_mutex);
 
-	if (!*have_LRU_mutex) {
-		mutex_enter(&buf_pool->LRU_list_mutex); /* optimistic */
-		*have_LRU_mutex = TRUE;
-	}
-	rw_lock_x_lock(&buf_pool->page_hash_latch);
+	rw_lock_x_lock(hash_lock);
 	mutex_enter(block_mutex);
 
-	/* recheck states of block */
-	if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage)
-	    || !buf_page_can_relocate(bpage)) {
+	if (UNIV_UNLIKELY(!buf_page_can_relocate(bpage)
+			  || ((zip || !bpage->zip.data)
+			      && bpage->oldest_modification))) {
+
 not_freed:
+		rw_lock_x_unlock(hash_lock);
 		if (b) {
 			buf_page_free_descriptor(b);
 		}
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-		return(FALSE);
-	} else if (zip || !bpage->zip.data) {
-		if (bpage->oldest_modification)
-			goto not_freed;
-	} else if (bpage->oldest_modification) {
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
-			goto not_freed;
-		}
+
+		return(false);
+	} else if (UNIV_UNLIKELY(bpage->oldest_modification
+				 && (buf_page_get_state(bpage)
+				     != BUF_BLOCK_FILE_PAGE))) {
+
+		ut_ad(buf_page_get_state(bpage)
+		      == BUF_BLOCK_ZIP_DIRTY);
+		goto not_freed;
 	}
 
 	if (b) {
 		memcpy(b, bpage, sizeof *b);
 	}
 
-	if (buf_LRU_block_remove_hashed_page(bpage, zip)
-	    != BUF_BLOCK_ZIP_FREE) {
-		ut_a(bpage->buf_fix_count == 0);
+	if (!buf_LRU_block_remove_hashed(bpage, zip)) {
+
+		mutex_exit(&buf_pool->LRU_list_mutex);
 
 		if (b) {
-			buf_page_t*	hash_b;
-			buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
+			buf_page_free_descriptor(b);
+		}
 
-			const ulint	fold = buf_page_address_fold(
-				bpage->space, bpage->offset);
+		mutex_enter(block_mutex);
 
-			hash_b	= buf_page_hash_get_low(
-				buf_pool, bpage->space, bpage->offset, fold);
+		return(true);
+	}
 
-			ut_a(!hash_b);
+#ifdef UNIV_SYNC_DEBUG
+	/* buf_LRU_block_remove_hashed() releases the hash_lock */
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+	      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
+	then it was a compressed page with an uncompressed frame and
+	we are interested in freeing only the uncompressed frame.
+	Therefore we have to reinsert the compressed page descriptor
+	into the LRU and page_hash (and possibly flush_list).
+	If b == NULL then it was a regular page that has been freed. */
 
-			while (prev_b && !prev_b->in_LRU_list) {
-				prev_b = UT_LIST_GET_PREV(LRU, prev_b);
-			}
+	if (b) {
+		buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
 
-			b->state = b->oldest_modification
-				? BUF_BLOCK_ZIP_DIRTY
-				: BUF_BLOCK_ZIP_PAGE;
-			UNIV_MEM_DESC(b->zip.data,
-				      page_zip_get_size(&b->zip), b);
-
-			/* The fields in_page_hash and in_LRU_list of
-			the to-be-freed block descriptor should have
-			been cleared in
-			buf_LRU_block_remove_hashed_page(), which
-			invokes buf_LRU_remove_block(). */
-			ut_ad(!bpage->in_page_hash);
-			ut_ad(!bpage->in_LRU_list);
-			/* bpage->state was BUF_BLOCK_FILE_PAGE because
-			b != NULL. The type cast below is thus valid. */
-			ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+		rw_lock_x_lock(hash_lock);
+		mutex_enter(block_mutex);
+
+		ut_a(!buf_page_hash_get_low(buf_pool,
+					    bpage->space,
+					    bpage->offset,
+					    fold));
+
+		b->state = b->oldest_modification
+			? BUF_BLOCK_ZIP_DIRTY
+			: BUF_BLOCK_ZIP_PAGE;
+		UNIV_MEM_DESC(b->zip.data,
+			      page_zip_get_size(&b->zip));
+
+		/* The fields in_page_hash and in_LRU_list of
+		the to-be-freed block descriptor should have
+		been cleared in
+		buf_LRU_block_remove_hashed(), which
+		invokes buf_LRU_remove_block(). */
+		ut_ad(!bpage->in_page_hash);
+		ut_ad(!bpage->in_LRU_list);
+		/* bpage->state was BUF_BLOCK_FILE_PAGE because
+		b != NULL. The type cast below is thus valid. */
+		ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
 
-			/* The fields of bpage were copied to b before
-			buf_LRU_block_remove_hashed_page() was invoked. */
-			ut_ad(!b->in_zip_hash);
-			ut_ad(b->in_page_hash);
-			ut_ad(b->in_LRU_list);
+		/* The fields of bpage were copied to b before
+		buf_LRU_block_remove_hashed() was invoked. */
+		ut_ad(!b->in_zip_hash);
+		ut_ad(b->in_page_hash);
+		ut_ad(b->in_LRU_list);
 
-			HASH_INSERT(buf_page_t, hash,
-				    buf_pool->page_hash, fold, b);
+		HASH_INSERT(buf_page_t, hash,
+			    buf_pool->page_hash, fold, b);
 
-			/* Insert b where bpage was in the LRU list. */
-			if (UNIV_LIKELY(prev_b != NULL)) {
-				ulint	lru_len;
+		/* Insert b where bpage was in the LRU list. */
+		if (UNIV_LIKELY(prev_b != NULL)) {
+			ulint	lru_len;
 
-				ut_ad(prev_b->in_LRU_list);
-				ut_ad(buf_page_in_file(prev_b));
+			ut_ad(prev_b->in_LRU_list);
+			ut_ad(buf_page_in_file(prev_b));
 #if UNIV_WORD_SIZE == 4
-				/* On 32-bit systems, there is no
-				padding in buf_page_t.  On other
-				systems, Valgrind could complain about
-				uninitialized pad bytes. */
-				UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
+			/* On 32-bit systems, there is no
+			padding in buf_page_t.  On other
+			systems, Valgrind could complain about
+			uninitialized pad bytes. */
+			UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
 #endif
-				UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
-						     prev_b, b);
+			UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+					     prev_b, b);
 
-				incr_LRU_size_in_bytes(b, buf_pool);
+			incr_LRU_size_in_bytes(b, buf_pool);
 
-				if (buf_page_is_old(b)) {
-					buf_pool->LRU_old_len++;
-					if (UNIV_UNLIKELY
-					    (buf_pool->LRU_old
-					     == UT_LIST_GET_NEXT(LRU, b))) {
+			if (buf_page_is_old(b)) {
+				buf_pool->LRU_old_len++;
+				if (UNIV_UNLIKELY
+				    (buf_pool->LRU_old
+				     == UT_LIST_GET_NEXT(LRU, b))) {
 
-						buf_pool->LRU_old = b;
-					}
+					buf_pool->LRU_old = b;
 				}
+			}
 
-				lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
-
-				if (lru_len > BUF_LRU_OLD_MIN_LEN) {
-					ut_ad(buf_pool->LRU_old);
-					/* Adjust the length of the
-					old block list if necessary */
-					buf_LRU_old_adjust_len(buf_pool);
-				} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
-					/* The LRU list is now long
-					enough for LRU_old to become
-					defined: init it */
-					buf_LRU_old_init(buf_pool);
-				}
+			lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+			if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+				ut_ad(buf_pool->LRU_old);
+				/* Adjust the length of the
+				old block list if necessary */
+				buf_LRU_old_adjust_len(buf_pool);
+			} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+				/* The LRU list is now long
+				enough for LRU_old to become
+				defined: init it */
+				buf_LRU_old_init(buf_pool);
+			}
 #ifdef UNIV_LRU_DEBUG
-				/* Check that the "old" flag is consistent
-				in the block and its neighbours. */
-				buf_page_set_old(b, buf_page_is_old(b));
+			/* Check that the "old" flag is consistent
+			in the block and its neighbours. */
+			buf_page_set_old(b, buf_page_is_old(b));
 #endif /* UNIV_LRU_DEBUG */
-			} else {
-				b->in_LRU_list = FALSE;
-				buf_LRU_add_block_low(b, buf_page_is_old(b));
-			}
+		} else {
+			ut_d(b->in_LRU_list = FALSE);
+			buf_LRU_add_block_low(b, buf_page_is_old(b));
+		}
 
-			mutex_enter(&buf_pool->zip_mutex);
-			if (b->state == BUF_BLOCK_ZIP_PAGE) {
+		mutex_enter(&buf_pool->zip_mutex);
+		rw_lock_x_unlock(hash_lock);
+		if (b->state == BUF_BLOCK_ZIP_PAGE) {
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-				buf_LRU_insert_zip_clean(b);
+			buf_LRU_insert_zip_clean(b);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-			} else {
-				/* Relocate on buf_pool->flush_list. */
-				buf_flush_relocate_on_flush_list(bpage, b);
-			}
-
-			bpage->zip.data = NULL;
-			page_zip_set_size(&bpage->zip, 0);
-
-			/* Prevent buf_page_get_gen() from
-			decompressing the block while we release
-			buf_pool->mutex and block_mutex. */
-			buf_page_set_sticky(b);
-			mutex_exit(&buf_pool->zip_mutex);
+		} else {
+			/* Relocate on buf_pool->flush_list. */
+			buf_flush_relocate_on_flush_list(bpage, b);
 		}
 
-		//buf_pool_mutex_exit(buf_pool);
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
+		bpage->zip.data = NULL;
+		page_zip_set_size(&bpage->zip, 0);
+
+		/* Prevent buf_page_get_gen() from
+		decompressing the block while we release block_mutex. */
+		buf_page_set_sticky(b);
+		mutex_exit(&buf_pool->zip_mutex);
 		mutex_exit(block_mutex);
 
-		/* Remove possible adaptive hash index on the page.
-		The page was declared uninitialized by
-		buf_LRU_block_remove_hashed_page().  We need to flag
-		the contents of the page valid (which it still is) in
-		order to avoid bogus Valgrind warnings.*/
+	}
 
-		UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
-			       UNIV_PAGE_SIZE);
-		btr_search_drop_page_hash_index((buf_block_t*) bpage);
-		UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
-				 UNIV_PAGE_SIZE);
+	mutex_exit(&buf_pool->LRU_list_mutex);
 
-		if (b) {
-			/* Compute and stamp the compressed page
-			checksum while not holding any mutex.  The
-			block is already half-freed
-			(BUF_BLOCK_REMOVE_HASH) and removed from
-			buf_pool->page_hash, thus inaccessible by any
-			other thread. */
-
-			mach_write_to_4(
-				b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
-				UNIV_LIKELY(srv_use_checksums)
-				? page_zip_calc_checksum(
-					b->zip.data,
-					page_zip_get_size(&b->zip))
-				: BUF_NO_CHECKSUM_MAGIC);
-		}
+	/* Remove possible adaptive hash index on the page.
+	The page was declared uninitialized by
+	buf_LRU_block_remove_hashed().  We need to flag
+	the contents of the page valid (which it still is) in
+	order to avoid bogus Valgrind warnings.*/
 
-		//buf_pool_mutex_enter(buf_pool);
-		if (!*have_LRU_mutex) {
-			mutex_enter(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = TRUE;
-		}
-		mutex_enter(block_mutex);
+	UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+		       UNIV_PAGE_SIZE);
+	btr_search_drop_page_hash_index((buf_block_t*) bpage);
+	UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+			 UNIV_PAGE_SIZE);
 
-		if (b) {
-			mutex_enter(&buf_pool->zip_mutex);
-			buf_page_unset_sticky(b);
-			mutex_exit(&buf_pool->zip_mutex);
-		}
+	if (b) {
+		ib_uint32_t	checksum;
+		/* Compute and stamp the compressed page
+		checksum while not holding any mutex.  The
+		block is already half-freed
+		(BUF_BLOCK_REMOVE_HASH) and removed from
+		buf_pool->page_hash, thus inaccessible by any
+		other thread. */
 
-		buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE);
+		checksum = page_zip_calc_checksum(
+			b->zip.data,
+			page_zip_get_size(&b->zip),
+			static_cast<srv_checksum_algorithm_t>(
+				srv_checksum_algorithm));
 
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
+		mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+				checksum);
+	}
 
-	} else {
-		/* The block_mutex should have been released by
-		buf_LRU_block_remove_hashed_page() when it returns
-		BUF_BLOCK_ZIP_FREE. */
-		ut_ad(block_mutex == &buf_pool->zip_mutex);
-		mutex_enter(block_mutex);
+	mutex_enter(block_mutex);
 
-		if (*have_LRU_mutex) {
-			mutex_exit(&buf_pool->LRU_list_mutex);
-			*have_LRU_mutex = FALSE;
-		}
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
+	if (b) {
+		mutex_enter(&buf_pool->zip_mutex);
+		buf_page_unset_sticky(b);
+		mutex_exit(&buf_pool->zip_mutex);
 	}
 
-	return(TRUE);
+	buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+	ut_ad(mutex_own(block_mutex));
+	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+	return(true);
 }
 
 /******************************************************************//**
@@ -2145,14 +2245,12 @@ UNIV_INTERN
 void
 buf_LRU_block_free_non_file_page(
 /*=============================*/
-	buf_block_t*	block,	/*!< in: block, must not contain a file page */
-	ibool		have_page_hash_mutex)
+	buf_block_t*	block)	/*!< in: block, must not contain a file page */
 {
 	void*		data;
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 
 	ut_ad(block);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&block->mutex));
 
 	switch (buf_block_get_state(block)) {
@@ -2170,8 +2268,6 @@ buf_LRU_block_free_non_file_page(
 	ut_ad(!block->page.in_flush_list);
 	ut_ad(!block->page.in_LRU_list);
 
-	buf_block_set_state(block, BUF_BLOCK_NOT_USED);
-
 	UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
 #ifdef UNIV_DEBUG
 	/* Wipe contents of page to reveal possible stale pointers to it */
@@ -2186,19 +2282,17 @@ buf_LRU_block_free_non_file_page(
 	if (data) {
 		block->page.zip.data = NULL;
 		mutex_exit(&block->mutex);
-		//buf_pool_mutex_exit_forbid(buf_pool);
 
 		buf_buddy_free(
-			buf_pool, data, page_zip_get_size(&block->page.zip),
-			have_page_hash_mutex);
+			buf_pool, data, page_zip_get_size(&block->page.zip));
 
-		//buf_pool_mutex_exit_allow(buf_pool);
 		mutex_enter(&block->mutex);
 		page_zip_set_size(&block->page.zip, 0);
 	}
 
-	mutex_enter(&buf_pool->free_list_mutex);
-	UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page));
+	mutex_enter_first(&buf_pool->free_list_mutex);
+	buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+	UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page));
 	ut_d(block->page.in_free_list = TRUE);
 	mutex_exit(&buf_pool->free_list_mutex);
 
@@ -2208,35 +2302,42 @@ buf_LRU_block_free_non_file_page(
 /******************************************************************//**
 Takes a block out of the LRU list and page hash table.
 If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
+the object will be freed.
 
-If a compressed page or a compressed-only block descriptor is freed,
-other compressed pages or compressed-only block descriptors may be
-relocated.
-@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
-was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
 static
-enum buf_page_state
-buf_LRU_block_remove_hashed_page(
-/*=============================*/
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
 	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
 				be in a state where it can be freed; there
 				may or may not be a hash index to the page */
-	ibool		zip)	/*!< in: TRUE if should remove also the
+	bool		zip)	/*!< in: true if should remove also the
 				compressed page of an uncompressed page */
 {
 	ulint			fold;
 	const buf_page_t*	hashed_bpage;
 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
+	prio_rw_lock_t*		hash_lock;
 
 	ut_ad(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 
+	fold = buf_page_address_fold(bpage->space, bpage->offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
 	ut_a(bpage->buf_fix_count == 0);
 
@@ -2312,7 +2413,7 @@ buf_LRU_block_remove_hashed_page(
 		UNIV_MEM_ASSERT_W(bpage->zip.data,
 				  page_zip_get_size(&bpage->zip));
 		break;
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
@@ -2322,9 +2423,8 @@ buf_LRU_block_remove_hashed_page(
 		break;
 	}
 
-	fold = buf_page_address_fold(bpage->space, bpage->offset);
-	hashed_bpage = buf_page_hash_get_low(
-		buf_pool, bpage->space, bpage->offset, fold);
+	hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->space,
+					     bpage->offset, fold);
 
 	if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
 		fprintf(stderr,
@@ -2344,9 +2444,8 @@ buf_LRU_block_remove_hashed_page(
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 		mutex_exit(buf_page_get_mutex(bpage));
-		//buf_pool_mutex_exit(buf_pool);
+		rw_lock_x_unlock(hash_lock);
 		mutex_exit(&buf_pool->LRU_list_mutex);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
 		buf_print();
 		buf_LRU_print();
 		buf_validate();
@@ -2368,19 +2467,18 @@ buf_LRU_block_remove_hashed_page(
 		ut_a(buf_page_get_zip_size(bpage));
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-		UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage);
+		UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
 		mutex_exit(&buf_pool->zip_mutex);
-		//buf_pool_mutex_exit_forbid(buf_pool);
+		rw_lock_x_unlock(hash_lock);
 
 		buf_buddy_free(
 			buf_pool, bpage->zip.data,
-			page_zip_get_size(&bpage->zip), TRUE);
+			page_zip_get_size(&bpage->zip));
 
-		//buf_pool_mutex_exit_allow(buf_pool);
 		buf_page_free_descriptor(bpage);
-		return(BUF_BLOCK_ZIP_FREE);
+		return(false);
 
 	case BUF_BLOCK_FILE_PAGE:
 		memset(((buf_block_t*) bpage)->frame
@@ -2391,6 +2489,29 @@ buf_LRU_block_remove_hashed_page(
 				 UNIV_PAGE_SIZE);
 		buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
 
+		/* Question: If we release bpage and hash mutex here
+		then what protects us against:
+		1) Some other thread buffer fixing this page
+		2) Some other thread trying to read this page and
+		not finding it in buffer pool attempting to read it
+		from the disk.
+		Answer:
+		1) Cannot happen because the page is no longer in the
+		page_hash. The only possibility is that, while
+		invalidating a tablespace, we buffer-fix the prev_page
+		in the LRU to avoid relocation during the scan. But that
+		is not possible because we are holding the LRU list mutex.
+
+		2) Not possible, because buf_page_init_for_read()
+		looks up page_hash while holding the LRU list mutex.
+		We are holding the LRU list mutex here, and by the
+		time the caller releases it, the compressed-only
+		descriptor will have been inserted into the
+		page_hash. */
+		ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+		rw_lock_x_unlock(hash_lock);
+		mutex_exit(&((buf_block_t*) bpage)->mutex);
+
 		if (zip && bpage->zip.data) {
 			/* Free the compressed page. */
 			void*	data = bpage->zip.data;
@@ -2399,21 +2520,17 @@ buf_LRU_block_remove_hashed_page(
 			ut_ad(!bpage->in_free_list);
 			ut_ad(!bpage->in_flush_list);
 			ut_ad(!bpage->in_LRU_list);
-			mutex_exit(&((buf_block_t*) bpage)->mutex);
-			//buf_pool_mutex_exit_forbid(buf_pool);
 
 			buf_buddy_free(
 				buf_pool, data,
-				page_zip_get_size(&bpage->zip), TRUE);
+				page_zip_get_size(&bpage->zip));
 
-			//buf_pool_mutex_exit_allow(buf_pool);
-			mutex_enter(&((buf_block_t*) bpage)->mutex);
 			page_zip_set_size(&bpage->zip, 0);
 		}
 
-		return(BUF_BLOCK_REMOVE_HASH);
+		return(true);
 
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
@@ -2423,7 +2540,7 @@ buf_LRU_block_remove_hashed_page(
 	}
 
 	ut_error;
-	return(BUF_BLOCK_ZIP_FREE);
+	return(false);
 }
 
 /******************************************************************//**
@@ -2432,19 +2549,14 @@ static
 void
 buf_LRU_block_free_hashed_page(
 /*===========================*/
-	buf_block_t*	block,	/*!< in: block, must contain a file page and
+	buf_block_t*	block)	/*!< in: block, must contain a file page and
 				be in a state where it can be freed */
-	ibool		have_page_hash_mutex)
 {
-#ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_block(block);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
 	ut_ad(mutex_own(&block->mutex));
 
 	buf_block_set_state(block, BUF_BLOCK_MEMORY);
 
-	buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+	buf_LRU_block_free_non_file_page(block);
 }
 
 /******************************************************************//**
@@ -2457,24 +2569,34 @@ buf_LRU_free_one_page(
 				be in a state where it can be freed; there
 				may or may not be a hash index to the page */
 {
-#ifdef UNIV_DEBUG
+#if defined(UNIV_DEBUG) || defined(UNIV_SYNC_DEBUG)
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 #endif
-	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+#ifdef UNIV_SYNC_DEBUG
+	const ulint	fold = buf_page_address_fold(bpage->space,
+						     bpage->offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#endif
+	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(mutex_own(block_mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif
 
-	if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-	    != BUF_BLOCK_ZIP_FREE) {
-		buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE);
-	} else {
-		/* The block_mutex should have been released by
-		buf_LRU_block_remove_hashed_page() when it returns
-		BUF_BLOCK_ZIP_FREE. */
-		ut_ad(block_mutex == &buf_pool->zip_mutex);
+	if (buf_LRU_block_remove_hashed(bpage, true)) {
 		mutex_enter(block_mutex);
+		buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+		mutex_exit(block_mutex);
 	}
+
+	/* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+	      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!mutex_own(block_mutex));
 }
 
 /**********************************************************************//**
@@ -2501,8 +2623,6 @@ buf_LRU_old_ratio_update_instance(
 	}
 
 	if (adjust) {
-		//buf_pool_mutex_enter(buf_pool);
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 		mutex_enter(&buf_pool->LRU_list_mutex);
 
 		if (ratio != buf_pool->LRU_old_ratio) {
@@ -2515,12 +2635,11 @@ buf_LRU_old_ratio_update_instance(
 			}
 		}
 
-		//buf_pool_mutex_exit(buf_pool);
 		mutex_exit(&buf_pool->LRU_list_mutex);
 	} else {
 		buf_pool->LRU_old_ratio = ratio;
 	}
-	/* the reverse of 
+	/* the reverse of
 	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
 	return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
 }
@@ -2606,369 +2725,6 @@ func_exit:
 	memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
 }
 
-/********************************************************************//**
-Dump the LRU page list to the specific file. */
-#define LRU_DUMP_FILE "ib_lru_dump"
-#define LRU_DUMP_TEMP_FILE "ib_lru_dump.tmp"
-#define LRU_OS_FILE_WRITE() \
-	os_file_write(LRU_DUMP_FILE, dump_file, buffer, \
-		(buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, \
-		(buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), \
-		buffer_size)
-#define LRU_DUMP_PAGE_COUNT	1	/* Specifies how many dump pages
-					   should be filled for each hold
-					   of the LRU_list_mutex. */
-
-UNIV_INTERN
-ibool
-buf_LRU_file_dump(void)
-/*===================*/
-{
-	os_file_t	dump_file = (os_file_t) -1;
-	ibool		success;
-	byte*		buffer_base = NULL;
-	byte*		buffer = NULL;
-	const ulint	buffer_size = LRU_DUMP_PAGE_COUNT * UNIV_PAGE_SIZE;
-	buf_page_t*	bpage;
-	buf_page_t*	first_bpage;
-	ulint		buffers;
-	ulint		offset;
-	ulint		pages_written;
-	ulint		i;
-	ulint		total_pages;
-
-	/* Sanity test to make sure page size is a multiple of 
-	   assumed dump record size */
-	ut_a(UNIV_PAGE_SIZE % 8 == 0);
-
-	for (i = 0; i < srv_n_data_files; i++) {
-		if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) {
-			fprintf(stderr,
-				" InnoDB: The name '%s' seems to be used for"
-				" innodb_data_file_path. Dumping LRU list is"
-				"  not done for safeness.\n", LRU_DUMP_FILE);
-			goto end;
-		}
-	}
-
-	buffer_base = ut_malloc(UNIV_PAGE_SIZE + buffer_size);
-	buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
-	if (!buffer) {
-		fprintf(stderr,
-			" InnoDB: cannot allocate buffer.\n");
-		goto end;
-	}
-
-	dump_file = os_file_create(innodb_file_temp_key, LRU_DUMP_TEMP_FILE,
-			OS_FILE_OVERWRITE, OS_FILE_NORMAL, OS_DATA_FILE,
-			&success);
-
-	if (!success) {
-		os_file_get_last_error(TRUE);
-		fprintf(stderr,
-			" InnoDB: cannot open %s\n", LRU_DUMP_FILE);
-		goto end;
-	}
-
-	buffers = offset = 0;
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		bpage = first_bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
-		total_pages = UT_LIST_GET_LEN(buf_pool->LRU);
-
-		pages_written = 0;
-		while (bpage != NULL && (pages_written++ < total_pages)) {
-
-			buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
-
-			if (next_bpage == first_bpage) {
-				/* Do not release list mutex here, it will be
-				   released just outside this while loop */
-				fprintf(stderr,
-					"InnoDB: detected cycle in LRU for"
-					" buffer pool %lu, skipping to next"
-					" buffer pool.\n", i);
-				break;
-			}
-
-			mach_write_to_4(buffer + offset * 4, bpage->space);
-			offset++;
-			mach_write_to_4(buffer + offset * 4, bpage->offset);
-			offset++;
-
-			ut_a(offset <= buffer_size);
-			if (offset == buffer_size/4) {
-				mutex_t		*next_block_mutex = NULL;
-
-				if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
-					mutex_exit(&buf_pool->LRU_list_mutex);
-					success = FALSE;
-					fprintf(stderr,
-						" InnoDB: stopped dumping lru"
-						" pages because of server"
-						" shutdown.\n");
-					goto end;
-				}
-
-				/* While writing file, release buffer pool
-				   mutex but keep the next page fixed so we
-				   don't worry about our list iterator becoming
-				   invalid */
-				if (next_bpage) {
-					next_block_mutex = buf_page_get_mutex(
-								next_bpage);
-
-					mutex_enter(next_block_mutex);
-					next_bpage->buf_fix_count++;
-					mutex_exit(next_block_mutex);
-				}
-				mutex_exit(&buf_pool->LRU_list_mutex);
-
-				success = LRU_OS_FILE_WRITE();
-
-				/* Grab this here so that next_bpage can't
-				   be purged when we drop the fix_count */
-				ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
-				mutex_enter(&buf_pool->LRU_list_mutex);
-
-				if (next_bpage) {
-					mutex_enter(next_block_mutex);
-					next_bpage->buf_fix_count--;
-					mutex_exit(next_block_mutex);
-				}
-
-				if (!success) {
-					mutex_exit(&buf_pool->LRU_list_mutex);
-					fprintf(stderr,
-						" InnoDB: cannot write page"
-						" %lu of %s\n",
-						buffers, LRU_DUMP_FILE);
-					goto end;
-				}
-				buffers++;
-				offset = 0;
-
-				bpage = next_bpage;
-			} else {
-				bpage = UT_LIST_GET_NEXT(LRU, bpage);
-			}
-		} /* while(bpage ...) */
-		mutex_exit(&buf_pool->LRU_list_mutex);
-	} /* for(srv_buf_pool_instances ...) */
-
-	mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
-	offset++;
-	mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
-	offset++;
-
-	success = LRU_OS_FILE_WRITE();
-end:
-	if (dump_file != (os_file_t) -1) {
-		if (success) {
-			success = os_file_flush(dump_file, TRUE);
-		}
-		os_file_close(dump_file);
-	}
-	if (success) {
-		success = os_file_rename(innodb_file_temp_key,
-			LRU_DUMP_TEMP_FILE, LRU_DUMP_FILE);
-	}
-	if (buffer_base)
-		ut_free(buffer_base);
-
-	return(success);
-}
-
-typedef struct {
-	ib_uint32_t space_id;
-	ib_uint32_t page_no;
-} dump_record_t;
-
-static int dump_record_cmp(const void *a, const void *b)
-{
-	const dump_record_t *rec1 = (dump_record_t *) a;
-	const dump_record_t *rec2 = (dump_record_t *) b;
-
-	if (rec1->space_id < rec2->space_id)
-		return -1;
-	if (rec1->space_id > rec2->space_id)
-		return 1;
-	if (rec1->page_no < rec2->page_no)
-		return -1;
-	return rec1->page_no > rec2->page_no;
-}
-
-/********************************************************************//**
-Read the pages based on the specific file.*/
-UNIV_INTERN
-ibool
-buf_LRU_file_restore(void)
-/*======================*/
-{
-	os_file_t	dump_file = (os_file_t) -1;
-	ibool		success;
-	byte*		buffer_base = NULL;
-	byte*		buffer = NULL;
-	ulint		buffers;
-	ulint		offset;
-	ulint		reads = 0;
-	ulint		req = 0;
-	ibool		terminated = FALSE;
-	ibool		ret = FALSE;
-	dump_record_t*	records = NULL;
-	ulint		size;
-	ulint		size_high;
-	ulint		recsize = sizeof(dump_record_t);
-	ulint		length;
-
-	dump_file = os_file_create_simple_no_error_handling(innodb_file_temp_key,
-		LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
-	if (!success || !os_file_get_size(dump_file, &size, &size_high)) {
-		os_file_get_last_error(TRUE);
-		fprintf(stderr,
-			" InnoDB: cannot open %s, "
-			" buffer pool preload not done.\n", LRU_DUMP_FILE);
-		goto end;
-	}
-
-	if (size == 0 || size_high > 0 || size % recsize) {
-		fprintf(stderr, " InnoDB: broken LRU dump file,"
-			" buffer pool preload not done\n");
-		goto end;
-	}
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr, " InnoDB: Restoring buffer pool pages from %s\n",
-		LRU_DUMP_FILE);
-
-	if (size == 0 || size_high > 0 || size % 8) {
-		fprintf(stderr, " InnoDB: broken LRU dump file\n");
-		goto end;
-	}
-	buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
-	buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
-	records = ut_malloc(size);
-	if (!buffer || !records) {
-		fprintf(stderr,
-			" InnoDB: cannot allocate buffer.\n");
-		goto end;
-	}
-
-	buffers = 0;
-	length = 0;
-	while (!terminated) {
-		success = os_file_read(dump_file, buffer,
-				(buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
-				(buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
-				UNIV_PAGE_SIZE);
-		if (!success) {
-			fprintf(stderr,
-				" InnoDB: either could not read page %lu of %s,"
-				" or terminated unexpectedly.\n",
-				buffers, LRU_DUMP_FILE);
-			goto end;
-		}
-
-		for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) {
-			ulint	space_id;
-			ulint	page_no;
-
-			space_id = mach_read_from_4(buffer + offset * 4);
-			page_no = mach_read_from_4(buffer + (offset + 1) * 4);
-			if (space_id == 0xFFFFFFFFUL
-			    || page_no == 0xFFFFFFFFUL) {
-				terminated = TRUE;
-				break;
-			}
-
-			records[length].space_id = space_id;
-			records[length].page_no = page_no;
-			length++;
-			if (length * 8 >= size) {
-				fprintf(stderr,
-					" InnoDB: could not find the "
-					"end-of-file marker after reading "
-					"the expected %lu bytes from the "
-					"LRU dump file.\n"
-					" InnoDB: this could be caused by a "
-					"broken or incomplete file.\n"
-					" InnoDB: trying to process what has "
-					"been read so far.\n",
-					size);
-				terminated= TRUE;
-				break;
-			}
-		}
-		buffers++;
-	}
-
-	qsort(records, length, sizeof(dump_record_t), dump_record_cmp);
-
-	for (offset = 0; offset < length; offset++) {
-		ulint		space_id;
-		ulint		page_no;
-		ulint		zip_size;
-		ulint		err;
-		ib_int64_t	tablespace_version;
-
-		space_id = records[offset].space_id;
-		page_no = records[offset].page_no;
-
-		if (offset % 16 == 15) {
-			os_aio_simulated_wake_handler_threads();
-			buf_flush_free_margins(FALSE);
-			/* skip loading of the rest of the file if we are
- 			   terminating anyway */
-			if(srv_shutdown_state != SRV_SHUTDOWN_NONE) {
-				fprintf(stderr,
-					" InnoDB: stopped loading lru pages"
-					" because of server shutdown\n");
-				break;
-			}
-		}
-
-		zip_size = fil_space_get_zip_size(space_id);
-		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
-			continue;
-		}
-
-		if (fil_is_exist(space_id, page_no)) {
-
-			tablespace_version = fil_space_get_version(space_id);
-
-			req++;
-			reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
-						   | OS_AIO_SIMULATED_WAKE_LATER,
-						   space_id, zip_size, TRUE,
-						   tablespace_version, page_no, NULL);
-			buf_LRU_stat_inc_io();
-		}
-	}
-
-	os_aio_simulated_wake_handler_threads();
-	buf_flush_free_margins(FALSE);
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: Completed reading buffer pool pages"
-		" (requested: %lu, read: %lu)\n", req, reads);
-	ret = TRUE;
-end:
-	if (dump_file != (os_file_t) -1)
-		os_file_close(dump_file);
-	if (buffer_base)
-		ut_free(buffer_base);
-	if (records)
-		ut_free(records);
-
-	return(ret);
-}
-
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /**********************************************************************//**
 Validates the LRU list for one buffer pool instance. */
@@ -2984,8 +2740,6 @@ buf_LRU_validate_instance(
 	ulint		new_len;
 
 	ut_ad(buf_pool);
-	//buf_pool_mutex_enter(buf_pool);
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
 	if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
@@ -3002,17 +2756,16 @@ buf_LRU_validate_instance(
 		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
 	}
 
-	UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
-			 ut_ad(ut_list_node_313->in_LRU_list));
-
-	bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU, CheckInLRUList());
 
 	old_len = 0;
 
-	while (bpage != NULL) {
+	for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	     bpage != NULL;
+             bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
 
 		switch (buf_page_get_state(bpage)) {
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_NOT_USED:
 		case BUF_BLOCK_READY_FOR_USE:
 		case BUF_BLOCK_MEMORY:
@@ -3041,32 +2794,30 @@ buf_LRU_validate_instance(
 
 			ut_a(!next || buf_page_is_old(next));
 		}
-
-		bpage = UT_LIST_GET_NEXT(LRU, bpage);
 	}
 
 	ut_a(buf_pool->LRU_old_len == old_len);
 
 	mutex_exit(&buf_pool->LRU_list_mutex);
+
 	mutex_enter(&buf_pool->free_list_mutex);
 
-	UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free,
-			 ut_ad(ut_list_node_313->in_free_list));
+	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, CheckInFreeList());
 
 	for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
 	     bpage != NULL;
-	     bpage = UT_LIST_GET_NEXT(free, bpage)) {
+	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
 
 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
 	}
 
 	mutex_exit(&buf_pool->free_list_mutex);
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
-	UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
-			 ut_ad(ut_list_node_313->in_unzip_LRU_list
-			       && ut_list_node_313->page.in_LRU_list));
+	UT_LIST_VALIDATE(
+                unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+                CheckUnzipLRUAndLRUList());
 
 	for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
 	     block;
@@ -3077,7 +2828,6 @@ buf_LRU_validate_instance(
 		ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
 }
 
@@ -3114,8 +2864,6 @@ buf_LRU_print_instance(
 	const buf_page_t*	bpage;
 
 	ut_ad(buf_pool);
-	//buf_pool_mutex_enter(buf_pool);
-	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
 	bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
@@ -3173,7 +2921,6 @@ buf_LRU_print_instance(
 		bpage = UT_LIST_GET_NEXT(LRU, bpage);
 	}
 
-	//buf_pool_mutex_exit(buf_pool);
 	mutex_exit(&buf_pool->LRU_list_mutex);
 }
 
@@ -3193,3 +2940,4 @@ buf_LRU_print(void)
 	}
 }
 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.cc
index 44db27cf943..6e348bbf004 100644
--- a/storage/xtradb/buf/buf0rea.c
+++ b/storage/xtradb/buf/buf0rea.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0rea.c
+@file buf/buf0rea.cc
 The database buffer read
 
 Created 11/5/1995 Heikki Tuuri
@@ -31,6 +31,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0buf.h"
 #include "buf0flu.h"
 #include "buf0lru.h"
+#include "buf0dblwr.h"
 #include "ibuf0ibuf.h"
 #include "log0recv.h"
 #include "trx0sys.h"
@@ -60,12 +61,17 @@ buf_read_page_handle_error(
 	buf_page_t*	bpage)	/*!< in: pointer to the block */
 {
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	const ibool	uncompressed = (buf_page_get_state(bpage)
+	const bool	uncompressed = (buf_page_get_state(bpage)
 					== BUF_BLOCK_FILE_PAGE);
+	const ulint	fold = buf_page_address_fold(bpage->space,
+						     bpage->offset);
+	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
-	/* First unfix and release lock on the bpage */
 	mutex_enter(&buf_pool->LRU_list_mutex);
+	rw_lock_x_lock(hash_lock);
 	mutex_enter(buf_page_get_mutex(bpage));
+
+	/* First unfix and release lock on the bpage */
 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
 	ut_ad(bpage->buf_fix_count == 0);
 
@@ -81,11 +87,10 @@ buf_read_page_handle_error(
 	/* remove the block from LRU list */
 	buf_LRU_free_one_page(bpage);
 
-	ut_ad(buf_pool->n_pend_reads > 0);
-	buf_pool->n_pend_reads--;
-
-	mutex_exit(buf_page_get_mutex(bpage));
 	mutex_exit(&buf_pool->LRU_list_mutex);
+
+	ut_ad(buf_pool->n_pend_reads > 0);
+	os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
 }
 
 /********************************************************************//**
@@ -96,16 +101,16 @@ flag is cleared and the x-lock released by an i/o-handler thread.
 @return 1 if a read request was queued, 0 if the page already resided
 in buf_pool, or if the page is in the doublewrite buffer blocks in
 which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped 
+not exist or is being dropped
 @return 1 if read request is issued. 0 if it is not */
 UNIV_INTERN
 ulint
 buf_read_page_low(
 /*==============*/
-	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
 			trying to read from a non-existent tablespace, or a
 			tablespace which is just now being dropped */
-	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
+	bool	sync,	/*!< in: true if synchronous aio is desired */
 	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
 			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
 			at read-ahead functions) */
@@ -122,21 +127,17 @@ buf_read_page_low(
 {
 	buf_page_t*	bpage;
 	ulint		wake_later;
+	ibool		ignore_nonexistent_pages;
 
 	*err = DB_SUCCESS;
 
 	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
 	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
 
-	if (trx_doublewrite
-	    && (space == TRX_SYS_SPACE
-		|| (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
-	    && (   (offset >= trx_doublewrite->block1
-		    && offset < trx_doublewrite->block1
-		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-		   || (offset >= trx_doublewrite->block2
-		       && offset < trx_doublewrite->block2
-		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+	ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+	mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
+	if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Warning: trying to read"
@@ -155,7 +156,7 @@ buf_read_page_low(
 		syncronous i/o, to make sure they do not get involved in
 		thread deadlocks. */
 
-		sync = TRUE;
+		sync = true;
 	}
 
 	/* The following call will also check if the tablespace does not exist
@@ -178,7 +179,7 @@ buf_read_page_low(
 			}
 
 			/* recv_get_fil_addr_struct() */
-			recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+			recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
 					hash_calc_hash(ut_fold_ulint_pair(space, offset),
 						recv_sys->addr_hash));
 			while (recv_addr) {
@@ -186,7 +187,7 @@ buf_read_page_low(
 					&& (recv_addr->page_no == offset)) {
 					break;
 				}
-				recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+				recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
 			}
 
 			if ((recv_addr == NULL)
@@ -212,26 +213,28 @@ not_to_recover:
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr,
-			"Posting read request for page %lu, sync %lu\n",
-			(ulong) offset,
-			(ulong) sync);
+			"Posting read request for page %lu, sync %s\n",
+			(ulong) offset, sync ? "true" : "false");
 	}
 #endif
 
 	ut_ad(buf_page_in_file(bpage));
+	ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
 
 	if (sync) {
 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
 	}
 
 	if (zip_size) {
-		*err = _fil_io(OS_FILE_READ | wake_later,
-			      sync, space, zip_size, offset, 0, zip_size,
-			      bpage->zip.data, bpage, trx);
+		*err = _fil_io(OS_FILE_READ | wake_later
+			       | ignore_nonexistent_pages,
+			       sync, space, zip_size, offset, 0, zip_size,
+			       bpage->zip.data, bpage, trx);
 	} else {
 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
 
-		*err = _fil_io(OS_FILE_READ | wake_later,
+		*err = _fil_io(OS_FILE_READ | wake_later
+			      | ignore_nonexistent_pages,
 			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
 			      ((buf_block_t*) bpage)->frame, bpage, trx);
 	}
@@ -240,13 +243,15 @@ not_to_recover:
 		thd_wait_end(NULL);
 	}
 
-	if (*err == DB_TABLESPACE_DELETED) {
-		buf_read_page_handle_error(bpage);
-		return(0);
+	if (*err != DB_SUCCESS) {
+		if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) {
+			buf_read_page_handle_error(bpage);
+			return(0);
+		}
+		SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS,
+					bpage->is_corrupt = TRUE;);
 	}
 
-	SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS, bpage->is_corrupt = TRUE;);
-
 	if (sync) {
 		/* The i/o is already completed when we arrive from
 		fil_read */
@@ -291,7 +296,7 @@ buf_read_ahead_random(
 	ulint		ibuf_mode;
 	ulint		count;
 	ulint		low, high;
-	ulint		err;
+	dberr_t		err;
 	ulint		i;
 	const ulint	buf_read_ahead_random_area
 				= BUF_READ_AHEAD_AREA(buf_pool);
@@ -331,11 +336,8 @@ buf_read_ahead_random(
 		high = fil_space_get_size(space);
 	}
 
-	buf_pool_mutex_enter(buf_pool);
-
 	if (buf_pool->n_pend_reads
 	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		buf_pool_mutex_exit(buf_pool);
 
 		return(0);
 	}
@@ -344,8 +346,12 @@ buf_read_ahead_random(
 	that is, reside near the start of the LRU list. */
 
 	for (i = low; i < high; i++) {
+
+		prio_rw_lock_t*	hash_lock;
+
 		const buf_page_t* bpage =
-			buf_page_hash_get(buf_pool, space, i);
+			buf_page_hash_get_s_locked(buf_pool, space, i,
+						   &hash_lock);
 
 		if (bpage
 		    && buf_page_is_accessed(bpage)
@@ -356,13 +362,16 @@ buf_read_ahead_random(
 			if (recent_blocks
 			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
 
-				buf_pool_mutex_exit(buf_pool);
+				rw_lock_s_unlock(hash_lock);
 				goto read_ahead;
 			}
 		}
+
+		if (bpage) {
+			rw_lock_s_unlock(hash_lock);
+		}
 	}
 
-	buf_pool_mutex_exit(buf_pool);
 	/* Do nothing */
 	return(0);
 
@@ -383,7 +392,7 @@ read_ahead:
 
 		if (!ibuf_bitmap_page(zip_size, i)) {
 			count += buf_read_page_low(
-				&err, FALSE,
+				&err, false,
 				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
 				space, zip_size, FALSE,
 				tablespace_version, i, trx);
@@ -420,7 +429,7 @@ read_ahead:
 	buf_LRU_stat_inc_io();
 
 	buf_pool->stat.n_ra_pages_read_rnd += count;
-	srv_buf_pool_reads += count;
+	srv_stats.buf_pool_reads.add(count);
 	return(count);
 }
 
@@ -439,20 +448,19 @@ buf_read_page(
 	ulint	offset,	/*!< in: page number */
 	trx_t*	trx)
 {
-	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ib_int64_t	tablespace_version;
 	ulint		count;
-	ulint		err;
+	dberr_t		err;
 
 	tablespace_version = fil_space_get_version(space);
 
 	/* We do the i/o in the synchronous aio mode to save thread
 	switches: hence TRUE */
 
-	count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
 				  zip_size, FALSE,
 				  tablespace_version, offset, trx);
-	srv_buf_pool_reads += count;
+	srv_stats.buf_pool_reads.add(count);
 	if (err == DB_TABLESPACE_DELETED) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -463,9 +471,6 @@ buf_read_page(
 			(ulong) space, (ulong) offset);
 	}
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool, TRUE);
-
 	/* Increment number of I/O operations used for LRU policy. */
 	buf_LRU_stat_inc_io();
 
@@ -473,6 +478,49 @@ buf_read_page(
 }
 
 /********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if a read request was queued, FALSE otherwise */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	ulint		zip_size;
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	dberr_t		err;
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (zip_size == ULINT_UNDEFINED) {
+		return(FALSE);
+	}
+
+	tablespace_version = fil_space_get_version(space);
+
+	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
+				  | OS_AIO_SIMULATED_WAKE_LATER
+				  | BUF_READ_IGNORE_NONEXISTENT_PAGES,
+				  space, zip_size, FALSE,
+				  tablespace_version, offset, NULL);
+	srv_stats.buf_pool_reads.add(count);
+
+	/* NOTE(review): sync = true is passed above although this function
+	is documented as asynchronous; confirm that this is intended.
+	buf_LRU_stat_inc_io() is deliberately not called here: that counter
+	feeds the heuristics for evicting uncompressed copies of compressed
+	pages, and reads issued by buffer pool load are not part of the
+	normal workload. */
+
+	return(count > 0);
+}
+
+/********************************************************************//**
 Applies linear read-ahead if in the buf_pool the page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
 Does not read any page if the read-ahead mechanism is not activated. Note
@@ -511,6 +559,7 @@ buf_read_ahead_linear(
 	buf_page_t*	bpage;
 	buf_frame_t*	frame;
 	buf_page_t*	pred_bpage	= NULL;
+	unsigned	pred_bpage_is_accessed = 0;
 	ulint		pred_offset;
 	ulint		succ_offset;
 	ulint		count;
@@ -519,13 +568,14 @@ buf_read_ahead_linear(
 	ulint		fail_count;
 	ulint		ibuf_mode;
 	ulint		low, high;
-	ulint		err;
+	dberr_t		err;
 	ulint		i;
 	const ulint	buf_read_ahead_linear_area
 		= BUF_READ_AHEAD_AREA(buf_pool);
 	ulint		threshold;
 
-	if (!(srv_read_ahead & 2)) {
+	/* check if readahead is disabled */
+	if (!srv_read_ahead_threshold) {
 		return(0);
 	}
 
@@ -561,10 +611,7 @@ buf_read_ahead_linear(
 
 	tablespace_version = fil_space_get_version(space);
 
-	buf_pool_mutex_enter(buf_pool);
-
 	if (high > fil_space_get_size(space)) {
-		buf_pool_mutex_exit(buf_pool);
 		/* The area is not whole, return */
 
 		return(0);
@@ -572,11 +619,9 @@ buf_read_ahead_linear(
 
 	if (buf_pool->n_pend_reads
 	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		buf_pool_mutex_exit(buf_pool);
 
 		return(0);
 	}
-	buf_pool_mutex_exit(buf_pool);
 
 	/* Check that almost all pages in the area have been accessed; if
 	offset == low, the accesses must be in a descending order, otherwise,
@@ -595,9 +640,12 @@ buf_read_ahead_linear(
 
 	fail_count = 0;
 
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
 	for (i = low; i < high; i++) {
-		bpage = buf_page_hash_get(buf_pool, space, i);
+
+		prio_rw_lock_t*	hash_lock;
+
+		bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
+						   &hash_lock);
 
 		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
 			/* Not accessed */
@@ -614,7 +662,7 @@ buf_read_ahead_linear(
 			a little against this. */
 			int res = ut_ulint_cmp(
 				buf_page_is_accessed(bpage),
-				buf_page_is_accessed(pred_bpage));
+				pred_bpage_is_accessed);
 			/* Accesses not in the right order */
 			if (res != 0 && res != asc_or_desc) {
 				fail_count++;
@@ -623,13 +671,20 @@ buf_read_ahead_linear(
 
 		if (fail_count > threshold) {
 			/* Too many failures: return */
-			//buf_pool_mutex_exit(buf_pool);
-			rw_lock_s_unlock(&buf_pool->page_hash_latch);
+			if (bpage) {
+				rw_lock_s_unlock(hash_lock);
+			}
 			return(0);
 		}
 
-		if (bpage && buf_page_is_accessed(bpage)) {
-			pred_bpage = bpage;
+		if (bpage) {
+			if (buf_page_is_accessed(bpage)) {
+				pred_bpage = bpage;
+				pred_bpage_is_accessed
+					= buf_page_is_accessed(bpage);
+			}
+
+			rw_lock_s_unlock(hash_lock);
 		}
 	}
 
@@ -639,8 +694,6 @@ buf_read_ahead_linear(
 	bpage = buf_page_hash_get(buf_pool, space, offset);
 
 	if (bpage == NULL) {
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
 
 		return(0);
 	}
@@ -666,9 +719,6 @@ buf_read_ahead_linear(
 	pred_offset = fil_page_get_prev(frame);
 	succ_offset = fil_page_get_next(frame);
 
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
 	if ((offset == low) && (succ_offset == offset + 1)) {
 
 		/* This is ok, we can continue */
@@ -721,7 +771,7 @@ buf_read_ahead_linear(
 
 		if (!ibuf_bitmap_page(zip_size, i)) {
 			count += buf_read_page_low(
-				&err, FALSE,
+				&err, false,
 				ibuf_mode,
 				space, zip_size, FALSE, tablespace_version, i, trx);
 			if (err == DB_TABLESPACE_DELETED) {
@@ -743,9 +793,6 @@ buf_read_ahead_linear(
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool, TRUE);
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints && (count > 0)) {
 		fprintf(stderr,
@@ -770,7 +817,7 @@ UNIV_INTERN
 void
 buf_read_ibuf_merge_pages(
 /*======================*/
-	ibool		sync,		/*!< in: TRUE if the caller
+	bool		sync,		/*!< in: true if the caller
 					wants this function to wait
 					for the highest address page
 					to get read in, before this
@@ -797,7 +844,7 @@ buf_read_ibuf_merge_pages(
 #endif
 
 	for (i = 0; i < n_stored; i++) {
-		ulint		err;
+		dberr_t		err;
 		buf_pool_t*	buf_pool;
 		ulint		zip_size = fil_space_get_zip_size(space_ids[i]);
 
@@ -831,9 +878,6 @@ tablespace_deleted:
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of all the LRU lists if necessary */
-	buf_flush_free_margins(FALSE);
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr,
@@ -866,7 +910,7 @@ buf_read_recv_pages(
 {
 	ib_int64_t	tablespace_version;
 	ulint		count;
-	ulint		err;
+	dberr_t		err;
 	ulint		i;
 
 	zip_size = fil_space_get_zip_size(space);
@@ -890,7 +934,7 @@ buf_read_recv_pages(
 
 			for (i = 0; i < n_stored; i++) {
 				/* recv_get_fil_addr_struct() */
-				recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+				recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
 						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
 							recv_sys->addr_hash));
 				while (recv_addr) {
@@ -898,7 +942,7 @@ buf_read_recv_pages(
 						&& (recv_addr->page_no == page_nos[i])) {
 						break;
 					}
-					recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+					recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
 				}
 
 				if ((recv_addr == NULL)
@@ -931,7 +975,8 @@ not_to_recover:
 
 		os_aio_print_debug = FALSE;
 		buf_pool = buf_pool_get(space, page_nos[i]);
-		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+		while (buf_pool->n_pend_reads
+		       >= recv_n_pool_free_frames / 2) {
 
 			os_aio_simulated_wake_handler_threads();
 			os_thread_sleep(10000);
@@ -947,7 +992,7 @@ not_to_recover:
 					"InnoDB: Number of pending reads %lu,"
 					" pending pread calls %lu\n",
 					(ulong) buf_pool->n_pend_reads,
-					(ulong)os_file_n_pending_preads);
+					(ulong) os_file_n_pending_preads);
 
 				os_aio_print_debug = TRUE;
 			}
@@ -956,11 +1001,11 @@ not_to_recover:
 		os_aio_print_debug = FALSE;
 
 		if ((i + 1 == n_stored) && sync) {
-			buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+			buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
 					  zip_size, TRUE, tablespace_version,
 					  page_nos[i], NULL);
 		} else {
-			buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+			buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
 					  | OS_AIO_SIMULATED_WAKE_LATER,
 					  space, zip_size, TRUE,
 					  tablespace_version, page_nos[i], NULL);
@@ -969,9 +1014,6 @@ not_to_recover:
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of all the LRU lists if necessary */
-	buf_flush_free_margins(FALSE);
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr,
diff --git a/storage/xtradb/data/data0data.c b/storage/xtradb/data/data0data.cc
index 51054679762..179de79b69f 100644
--- a/storage/xtradb/data/data0data.c
+++ b/storage/xtradb/data/data0data.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file data/data0data.c
+@file data/data0data.cc
 SQL data field and tuple
 
 Created 5/30/1994 Heikki Tuuri
@@ -53,35 +53,6 @@ UNIV_INTERN ulint	data_dummy;
 #endif /* UNIV_DEBUG */
 
 #ifndef UNIV_HOTBACKUP
-/*********************************************************************//**
-Tests if dfield data length and content is equal to the given.
-@return	TRUE if equal */
-UNIV_INTERN
-ibool
-dfield_data_is_binary_equal(
-/*========================*/
-	const dfield_t*	field,	/*!< in: field */
-	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
-	const byte*	data)	/*!< in: data */
-{
-	if (len != dfield_get_len(field)) {
-
-		return(FALSE);
-	}
-
-	if (len == UNIV_SQL_NULL) {
-
-		return(TRUE);
-	}
-
-	if (0 != memcmp(dfield_get_data(field), data, len)) {
-
-		return(FALSE);
-	}
-
-	return(TRUE);
-}
-
 /************************************************************//**
 Compare two data tuples, respecting the collation of character fields.
 @return 1, 0 , -1 if tuple1 is greater, equal, less, respectively,
@@ -274,7 +245,9 @@ dtuple_validate(
 
 		if (!dfield_is_null(field)) {
 
-			const byte*	data = dfield_get_data(field);
+			const byte*	data;
+
+			data = static_cast<const byte*>(dfield_get_data(field));
 #ifndef UNIV_DEBUG_VALGRIND
 			ulint		j;
 
@@ -311,7 +284,7 @@ dfield_print(
 	ulint		i;
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (dfield_is_null(dfield)) {
 		fputs("NULL", stderr);
@@ -333,7 +306,7 @@ dfield_print(
 		break;
 	case DATA_INT:
 		ut_a(len == 4); /* only works for 32-bit integers */
-		fprintf(stderr, "%d", (int)mach_read_from_4(data));
+		fprintf(stderr, "%d", (int) mach_read_from_4(data));
 		break;
 	default:
 		ut_error;
@@ -356,7 +329,7 @@ dfield_print_also_hex(
 	ibool		print_also_hex;
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (dfield_is_null(dfield)) {
 		fputs("NULL", stderr);
@@ -438,25 +411,25 @@ dfield_print_also_hex(
 		case DATA_TRX_ID:
 			id = mach_read_from_6(data);
 
-			fprintf(stderr, "trx_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "trx_id " TRX_ID_FMT, id);
 			break;
 
 		case DATA_ROLL_PTR:
 			id = mach_read_from_7(data);
 
-			fprintf(stderr, "roll_ptr " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
 			break;
 
 		case DATA_ROW_ID:
 			id = mach_read_from_6(data);
 
-			fprintf(stderr, "row_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "row_id " TRX_ID_FMT, id);
 			break;
 
 		default:
 			id = mach_ull_read_compressed(data);
 
-			fprintf(stderr, "mix_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "mix_id " TRX_ID_FMT, id);
 		}
 		break;
 
@@ -484,7 +457,7 @@ dfield_print_also_hex(
 			break;
 		}
 
-		data = dfield_get_data(dfield);
+		data = static_cast<byte*>(dfield_get_data(dfield));
 		/* fall through */
 
 	case DATA_BINARY:
@@ -579,11 +552,11 @@ dtuple_convert_big_rec(
 	ulint		local_len;
 	ulint		local_prefix_len;
 
-	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+	if (!dict_index_is_clust(index)) {
 		return(NULL);
 	}
 
-	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
+	if (dict_table_get_format(index->table) < UNIV_FORMAT_B) {
 		/* up to MySQL 5.1: store a 768-byte prefix locally */
 		local_len = BTR_EXTERN_FIELD_REF_SIZE
 			+ DICT_ANTELOPE_MAX_INDEX_COL_LEN;
@@ -608,11 +581,15 @@ dtuple_convert_big_rec(
 	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
 			       * sizeof(big_rec_field_t) + 1000);
 
-	vector = mem_heap_alloc(heap, sizeof(big_rec_t));
+	vector = static_cast<big_rec_t*>(
+		mem_heap_alloc(heap, sizeof(big_rec_t)));
 
 	vector->heap = heap;
-	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
-					* sizeof(big_rec_field_t));
+
+	vector->fields = static_cast<big_rec_field_t*>(
+		mem_heap_alloc(
+			heap,
+			dtuple_get_n_fields(entry) * sizeof(big_rec_field_t)));
 
 	/* Decide which fields to shorten: the algorithm is to look for
 	a variable-length field that yields the biggest savings when
@@ -703,7 +680,7 @@ skip_field:
 		b->data = (char*) dfield_get_data(dfield) + local_prefix_len;
 
 		/* Allocate the locally stored part of the column. */
-		data = mem_heap_alloc(heap, local_len);
+		data = static_cast<byte*>(mem_heap_alloc(heap, local_len));
 
 		/* Copy the local prefix. */
 		memcpy(data, dfield_get_data(dfield), local_prefix_len);
diff --git a/storage/xtradb/data/data0type.c b/storage/xtradb/data/data0type.cc
index 9f855d58adf..0b9e08544a5 100644
--- a/storage/xtradb/data/data0type.c
+++ b/storage/xtradb/data/data0type.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file data/data0type.c
+@file data/data0type.cc
 Data types
 
 Created 1/16/1996 Heikki Tuuri
@@ -158,7 +158,7 @@ dtype_form_prtype(
 	ulint	charset_coll)	/*!< in: MySQL charset-collation code */
 {
 	ut_a(old_prtype < 256 * 256);
-	ut_a(charset_coll < 256);
+	ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
 
 	return(old_prtype + (charset_coll << 16));
 }
diff --git a/storage/xtradb/dict/dict0boot.c b/storage/xtradb/dict/dict0boot.cc
index 692c37cc5a2..94a3af2852b 100644
--- a/storage/xtradb/dict/dict0boot.c
+++ b/storage/xtradb/dict/dict0boot.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0boot.c
+@file dict/dict0boot.cc
 Data dictionary creation and booting
 
 Created 4/18/1996 Heikki Tuuri
@@ -33,7 +33,6 @@ Created 4/18/1996 Heikki Tuuri
 #include "btr0btr.h"
 #include "btr0sea.h"
 #include "dict0load.h"
-#include "dict0load.h"
 #include "trx0trx.h"
 #include "srv0srv.h"
 #include "ibuf0ibuf.h"
@@ -241,171 +240,11 @@ dict_hdr_create(
 }
 
 /*****************************************************************//**
-Verifies the SYS_STATS table by scanning its clustered index.  This
-function may only be called at InnoDB startup time.
-
-@return	TRUE if SYS_STATS was verified successfully */
-UNIV_INTERN
-ibool
-dict_verify_xtradb_sys_stats(void)
-/*==============================*/
-{
-	dict_index_t* sys_stats_index;
-	ulint	      saved_srv_pass_corrupt_table = srv_pass_corrupt_table;
-	ibool	      result;
-
-	sys_stats_index = dict_table_get_first_index(dict_sys->sys_stats);
-
-	/* Since this may be called only during server startup, avoid hitting
-	   various asserts by using XtraDB pass_corrupt_table option. */
-	srv_pass_corrupt_table = 1;
-	result = btr_validate_index(sys_stats_index, NULL);
-	srv_pass_corrupt_table = saved_srv_pass_corrupt_table;
-
-	return result;
-}
-
-/*****************************************************************//**
-Creates the B-tree for the SYS_STATS clustered index, adds the XtraDB
-mark and the id of the index to the dictionary header page.  Rewrites
-both passed args. */
-static
-void
-dict_create_xtradb_sys_stats(
-/*=========================*/
-	dict_hdr_t**	dict_hdr,	/*!< in/out: dictionary header */
-	mtr_t*		mtr)		/*!< in/out: mtr */
-{
-	ulint	root_page_no;
-
-	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
-				  DICT_HDR_SPACE, 0, DICT_STATS_ID,
-				  dict_ind_redundant, mtr);
-	if (root_page_no == FIL_NULL) {
-		fprintf(stderr, "InnoDB: Warning: failed to create SYS_STATS btr.\n");
-		srv_use_sys_stats_table = FALSE;
-	} else {
-		mlog_write_ulint(*dict_hdr + DICT_HDR_STATS, root_page_no,
-				 MLOG_4BYTES, mtr);
-		mlog_write_ull(*dict_hdr + DICT_HDR_XTRADB_MARK,
-			       DICT_HDR_XTRADB_FLAG, mtr);
-	}
-	mtr_commit(mtr);
-	/* restart mtr */
-	mtr_start(mtr);
-	*dict_hdr = dict_hdr_get(mtr);
-}
-
-/*****************************************************************//**
-Create the table and index structure of SYS_STATS for the dictionary
-cache and add it there.  If called for the first time, also support
-wrong root page id injection for testing purposes. */
-static
-void
-dict_add_to_cache_xtradb_sys_stats(
-/*===============================*/
-	ibool		first_time __attribute__((unused)),
-					/*!< in: first invocation flag. If
-					TRUE, optionally inject wrong root page
-					id */
-	mem_heap_t*	heap,		/*!< in: memory heap for table/index
-					allocation */
-	dict_hdr_t*	dict_hdr,	/*!< in: dictionary header */
-	mtr_t*		mtr)		/*!< in: mtr */
-{
-	dict_table_t*	table;
-	dict_index_t*	index;
-	ulint		root_page_id;
-	ulint		error;
-
-	table = dict_mem_table_create("SYS_STATS", DICT_HDR_SPACE, 4, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
-
-	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
-	dict_mem_table_add_col(table, heap, "KEY_COLS", DATA_INT, 0, 4);
-	dict_mem_table_add_col(table, heap, "DIFF_VALS", DATA_BINARY, 0, 0);
-	dict_mem_table_add_col(table, heap, "NON_NULL_VALS", DATA_BINARY, 0, 0);
-
-	/* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
-#if DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2
-#error "DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2"
-#endif
-#if DICT_SYS_STATS_NON_NULL_VALS_FIELD != 3 + 2
-#error "DICT_SYS_STATS_NON_NULL_VALS_FIELD != 3 + 2"
-#endif
-
-	table->id = DICT_STATS_ID;
-	dict_table_add_to_cache(table, heap);
-	dict_sys->sys_stats = table;
-	mem_heap_empty(heap);
-
-	index = dict_mem_index_create("SYS_STATS", "CLUST_IND",
-				      DICT_HDR_SPACE,
-				      DICT_UNIQUE | DICT_CLUSTERED, 2);
-
-	dict_mem_index_add_field(index, "INDEX_ID", 0);
-	dict_mem_index_add_field(index, "KEY_COLS", 0);
-
-	index->id = DICT_STATS_ID;
-	btr_search_index_init(index);
-
-	root_page_id = mtr_read_ulint(dict_hdr + DICT_HDR_STATS, MLOG_4BYTES,
-				      mtr);
-#ifdef UNIV_DEBUG
-	if ((srv_sys_stats_root_page != 0) && first_time)
-		root_page_id = srv_sys_stats_root_page;
-#endif
-	error = dict_index_add_to_cache(table, index, root_page_id, FALSE);
-	ut_a(error == DB_SUCCESS);
-
-	mem_heap_empty(heap);
-}
-
-/*****************************************************************//**
-Discard the existing dictionary cache SYS_STATS information, create and
-add it there anew.  Does not touch the old SYS_STATS tablespace page
-under the assumption that they are corrupted or overwritten for other
-purposes. */
-UNIV_INTERN
-void
-dict_recreate_xtradb_sys_stats(void)
-/*================================*/
-{
-	mtr_t		mtr;
-	dict_hdr_t*	dict_hdr;
-	dict_index_t*	sys_stats_clust_idx;
-	mem_heap_t*	heap;
-
-	heap = mem_heap_create(450);
-
-	mutex_enter(&(dict_sys->mutex));
-
-	sys_stats_clust_idx = dict_table_get_first_index(dict_sys->sys_stats);
-	dict_index_remove_from_cache(dict_sys->sys_stats, sys_stats_clust_idx);
-
-	dict_table_remove_from_cache(dict_sys->sys_stats);
-
-	dict_sys->sys_stats = NULL;
-
-	mtr_start(&mtr);
-
-	dict_hdr = dict_hdr_get(&mtr);
-
-	dict_create_xtradb_sys_stats(&dict_hdr, &mtr);
-	dict_add_to_cache_xtradb_sys_stats(FALSE, heap, dict_hdr, &mtr);
-
-	mem_heap_free(heap);
-
-	mtr_commit(&mtr);
-
-	mutex_exit(&(dict_sys->mutex));
-}
-
-/*****************************************************************//**
 Initializes the data dictionary memory structures when the database is
-started. This function is also called when the data dictionary is created. */
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
 UNIV_INTERN
-void
+dberr_t
 dict_boot(void)
 /*===========*/
 {
@@ -414,27 +253,38 @@ dict_boot(void)
 	dict_hdr_t*	dict_hdr;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
-	ulint		error;
-
-	heap = mem_heap_create(450);
+	dberr_t		error;
+
+	/* Be sure these constants do not ever change.  To avoid bloat,
+	only check the *NUM_FIELDS* in each table */
+
+	ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
+	ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
+	ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
+	ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
+	ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
+	ut_ad(DICT_NUM_COLS__SYS_INDEXES == 7);
+	ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 9);
+	ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
+	ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
+	ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
+	ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
 
 	mtr_start(&mtr);
 
 	/* Create the hash tables etc. */
 	dict_init();
 
+	heap = mem_heap_create(450);
+
 	mutex_enter(&(dict_sys->mutex));
 
 	/* Get the dictionary header */
 	dict_hdr = dict_hdr_get(&mtr);
 
-	if (mach_read_from_8(dict_hdr + DICT_HDR_XTRADB_MARK)
-	    != DICT_HDR_XTRADB_FLAG) {
-
-		/* not extended yet by XtraDB, need to be extended */
-		dict_create_xtradb_sys_stats(&dict_hdr, &mtr);
-	}
-
 	/* Because we only write new row ids to disk-based data structure
 	(dictionary header) when it is divisible by
 	DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
@@ -452,15 +302,14 @@ dict_boot(void)
 	/* Insert into the dictionary cache the descriptions of the basic
 	system tables */
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
+	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
 	/* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
 	dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
-	/* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
-	and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
+	/* The low order bit of TYPE is always set to 1.  If the format
+	is UNIV_FORMAT_B or higher, this field matches table->flags. */
 	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
 	/* MIX_LEN may contain additional table flags when
@@ -472,7 +321,7 @@ dict_boot(void)
 
 	table->id = DICT_TABLES_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_tables = table;
 	mem_heap_empty(heap);
 
@@ -499,7 +348,6 @@ dict_boot(void)
 
 	index->id = DICT_TABLE_IDS_ID;
 	btr_search_index_init(index);
-
 	error = dict_index_add_to_cache(table, index,
 					mtr_read_ulint(dict_hdr
 						       + DICT_HDR_TABLE_IDS,
@@ -508,8 +356,7 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
+	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
@@ -521,7 +368,7 @@ dict_boot(void)
 
 	table->id = DICT_COLUMNS_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_columns = table;
 	mem_heap_empty(heap);
 
@@ -542,8 +389,7 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
+	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
@@ -553,22 +399,9 @@ dict_boot(void)
 	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
 
-	/* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
-#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
-#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
-#endif
-#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
-#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
-#endif
-#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
-#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
-#endif
-#if DICT_SYS_INDEXES_NAME_FIELD != 2 + 2
-#error "DICT_SYS_INDEXES_NAME_FIELD != 2 + 2"
-#endif
-
 	table->id = DICT_INDEXES_ID;
-	dict_table_add_to_cache(table, heap);
+
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_indexes = table;
 	mem_heap_empty(heap);
 
@@ -589,17 +422,17 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
+	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
 
 	table->id = DICT_FIELDS_ID;
-	dict_table_add_to_cache(table, heap);
+
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_fields = table;
-	mem_heap_empty(heap);
+	mem_heap_free(heap);
 
 	index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
 				      DICT_HDR_SPACE,
@@ -617,26 +450,35 @@ dict_boot(void)
 					FALSE);
 	ut_a(error == DB_SUCCESS);
 
-	dict_add_to_cache_xtradb_sys_stats(TRUE, heap, dict_hdr, &mtr);
-
-	mem_heap_free(heap);
-
 	mtr_commit(&mtr);
+
 	/*-------------------------*/
 
 	/* Initialize the insert buffer table and index for each tablespace */
 
 	ibuf_init_at_db_start();
 
-	/* Load definitions of other indexes on system tables */
+	dberr_t	err = DB_SUCCESS;
 
-	dict_load_sys_table(dict_sys->sys_tables);
-	dict_load_sys_table(dict_sys->sys_columns);
-	dict_load_sys_table(dict_sys->sys_indexes);
-	dict_load_sys_table(dict_sys->sys_fields);
-	dict_load_sys_table(dict_sys->sys_stats);
+	if (srv_read_only_mode && !ibuf_is_empty()) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Change buffer must be empty when --innodb-read-only "
+			"is set!");
+
+		err = DB_ERROR;
+	} else {
+		/* Load definitions of other indexes on system tables */
+
+		dict_load_sys_table(dict_sys->sys_tables);
+		dict_load_sys_table(dict_sys->sys_columns);
+		dict_load_sys_table(dict_sys->sys_indexes);
+		dict_load_sys_table(dict_sys->sys_fields);
+	}
 
 	mutex_exit(&(dict_sys->mutex));
+
+	return(err);
 }
 
 /*****************************************************************//**
@@ -651,9 +493,10 @@ dict_insert_initial_data(void)
 }
 
 /*****************************************************************//**
-Creates and initializes the data dictionary at the database creation. */
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
 UNIV_INTERN
-void
+dberr_t
 dict_create(void)
 /*=============*/
 {
@@ -665,7 +508,11 @@ dict_create(void)
 
 	mtr_commit(&mtr);
 
-	dict_boot();
+	dberr_t	err = dict_boot();
+
+	if (err == DB_SUCCESS) {
+		dict_insert_initial_data();
+	}
 
-	dict_insert_initial_data();
+	return(err);
 }
diff --git a/storage/xtradb/dict/dict0crea.c b/storage/xtradb/dict/dict0crea.cc
index d87d2942fef..c7cb3aa21bb 100644
--- a/storage/xtradb/dict/dict0crea.c
+++ b/storage/xtradb/dict/dict0crea.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0crea.c
+@file dict/dict0crea.cc
 Database object creation
 
 Created 1/8/1996 Heikki Tuuri
@@ -42,23 +42,10 @@ Created 1/8/1996 Heikki Tuuri
 #include "trx0roll.h"
 #include "usr0sess.h"
 #include "ut0vec.h"
+#include "dict0priv.h"
+#include "fts0priv.h"
 #include "ha_prototypes.h"
 
-/*************************************************************************
-Checks if a table name contains the string TEMP_TABLE_PATH_PREFIX which
-denotes temporary tables in MySQL. */
-static
-ibool
-row_is_mysql_tmp_table_name(
-/*========================*/
-				/* out: TRUE if temporary table */
-	const char*     name)   /* in: table name in the form
-				'database/tablename' */
-{
-	return(strstr(name, TEMP_TABLE_PATH_PREFIX) != NULL);
-}
-
-
 /*****************************************************************//**
 Based on a table object, this function builds the entry to be inserted
 in the SYS_TABLES system table.
@@ -76,6 +63,7 @@ dict_create_sys_tables_tuple(
 	dtuple_t*	entry;
 	dfield_t*	dfield;
 	byte*		ptr;
+	ulint		type;
 
 	ut_ad(table);
 	ut_ad(heap);
@@ -87,65 +75,74 @@ dict_create_sys_tables_tuple(
 	dict_table_copy_types(entry, sys_tables);
 
 	/* 0: NAME -----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__NAME);
 
 	dfield_set_data(dfield, table->name, ut_strlen(table->name));
+
+	/* 1: DB_TRX_ID added later */
+	/* 2: DB_ROLL_PTR added later */
 	/* 3: ID -------------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 4: N_COLS ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
 
-#if DICT_TF_COMPACT != 1
-#error
-#endif
+	/* 4: N_COLS ---------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__N_COLS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, table->n_def
 			| ((table->flags & DICT_TF_COMPACT) << 31));
 	dfield_set_data(dfield, ptr, 4);
-	/* 5: TYPE -----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
-
-	ptr = mem_heap_alloc(heap, 4);
-	if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
-		ut_a(table->flags & DICT_TF_COMPACT);
-		ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
-		ut_a(((ulonglong) table->flags & DICT_TF_ZSSIZE_MASK)
-		     <= (ulonglong) (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
-		ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
-		mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
-	} else {
-		mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
-	}
+
+	/* 5: TYPE (table flags) -----------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__TYPE);
+
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	/* Validate the table flags and convert them to what is saved in
+	SYS_TABLES.TYPE.  Table flag values 0 and 1 are both written to
+	SYS_TABLES.TYPE as 1. */
+	type = dict_tf_to_sys_tables_type(table->flags);
+	mach_write_to_4(ptr, type);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: MIX_ID (obsolete) ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__MIX_ID);
 
-	ptr = mem_heap_zalloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8));
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 7: MIX_LEN (additional flags) --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
+	/* 7: MIX_LEN (additional flags) --------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__MIX_LEN);
 
-	ptr = mem_heap_alloc(heap, 4);
-	mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+	/* Be sure all non-used bits are zero. */
+	ut_a(!(table->flags2 & ~DICT_TF2_BIT_MASK));
+	mach_write_to_4(ptr, table->flags2);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 8: CLUSTER_NAME ---------------------*/
-	dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__CLUSTER_ID);
 	dfield_set_null(dfield); /* not supported */
 
 	/* 9: SPACE ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__SPACE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, table->space);
 
 	dfield_set_data(dfield, ptr, 4);
@@ -187,49 +184,57 @@ dict_create_sys_columns_tuple(
 	dict_table_copy_types(entry, sys_columns);
 
 	/* 0: TABLE_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
 	/* 1: POS ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, i);
 
 	dfield_set_data(dfield, ptr, 4);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: NAME ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME);
 
 	col_name = dict_table_get_col_name(table, i);
 	dfield_set_data(dfield, col_name, ut_strlen(col_name));
+
 	/* 5: MTYPE --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*MTYPE*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->mtype);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: PRTYPE -------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*PRTYPE*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->prtype);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 7: LEN ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 5/*LEN*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->len);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 8: PREC ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 6/*PREC*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, 0/* unused */);
 
 	dfield_set_data(dfield, ptr, 4);
@@ -241,8 +246,8 @@ dict_create_sys_columns_tuple(
 /***************************************************************//**
 Builds a table definition to insert.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 dict_build_table_def_step(
 /*======================*/
 	que_thr_t*	thr,	/*!< in: query thread */
@@ -250,35 +255,31 @@ dict_build_table_def_step(
 {
 	dict_table_t*	table;
 	dtuple_t*	row;
-	ulint		error;
-	ulint		flags;
-	const char*	path_or_name;
-	ibool		is_path;
+	dberr_t		error;
+	const char*	path;
 	mtr_t		mtr;
 	ulint		space = 0;
-	ibool		file_per_table;
+	bool		use_tablespace;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	table = node->table;
-
-	/* Cache the global variable "srv_file_per_table" to
-	a local variable before using it. Please note
-	"srv_file_per_table" is not under dict_sys mutex
-	protection, and could be changed while executing
-	this function. So better to cache the current value
-	to a local variable, and all future reference to
-	"srv_file_per_table" should use this local variable. */
-	file_per_table = srv_file_per_table;
+	use_tablespace = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE);
 
 	dict_hdr_get_new_id(&table->id, NULL, NULL);
 
 	thr_get_trx(thr)->table_id = table->id;
 
-	if (file_per_table) {
-		/* Get a new space id if srv_file_per_table is set */
+	if (use_tablespace) {
+		/* This table will not use the system tablespace.
+		Get a new space id. */
 		dict_hdr_get_new_id(NULL, NULL, &space);
 
+		DBUG_EXECUTE_IF(
+			"ib_create_table_fail_out_of_space_ids",
+			space = ULINT_UNDEFINED;
+		);
+
 		if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) {
 			return(DB_ERROR);
 		}
@@ -291,26 +292,19 @@ dict_build_table_def_step(
 		- page 3 will contain the root of the clustered index of the
 		table we create here. */
 
-		if (table->dir_path_of_temp_table) {
-			/* We place tables created with CREATE TEMPORARY
-			TABLE in the tmp dir of mysqld server */
-
-			path_or_name = table->dir_path_of_temp_table;
-			is_path = TRUE;
-		} else {
-			path_or_name = table->name;
-			is_path = FALSE;
-		}
+		path = table->data_dir_path ? table->data_dir_path
+					    : table->dir_path_of_temp_table;
 
-		ut_ad(dict_table_get_format(table) <= DICT_TF_FORMAT_MAX);
+		ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX);
 		ut_ad(!dict_table_zip_size(table)
-		      || dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+		      || dict_table_get_format(table) >= UNIV_FORMAT_B);
 
-		flags = table->flags & ~(~0 << DICT_TF_BITS);
 		error = fil_create_new_single_table_tablespace(
-			space, path_or_name, is_path,
-			flags == DICT_TF_COMPACT ? 0 : flags,
+			space, table->name, path,
+			dict_tf_to_fsp_flags(table->flags),
+			table->flags2,
 			FIL_IBD_FILE_INITIAL_SIZE);
+
 		table->space = (unsigned int) space;
 
 		if (error != DB_SUCCESS) {
@@ -324,8 +318,10 @@ dict_build_table_def_step(
 
 		mtr_commit(&mtr);
 	} else {
-		/* Create in the system tablespace: disallow new features */
-		table->flags &= (~0 << DICT_TF_BITS) | DICT_TF_COMPACT;
+		/* Create in the system tablespace: disallow Barracuda
+		features by keeping only the first bit which says whether
+		the row format is redundant or compact */
+		table->flags &= DICT_TF_COMPACT;
 	}
 
 	row = dict_create_sys_tables_tuple(table, node->heap);
@@ -336,10 +332,9 @@ dict_build_table_def_step(
 }
 
 /***************************************************************//**
-Builds a column definition to insert.
-@return	DB_SUCCESS */
+Builds a column definition to insert. */
 static
-ulint
+void
 dict_build_col_def_step(
 /*====================*/
 	tab_node_t*	node)	/*!< in: table create node */
@@ -349,8 +344,6 @@ dict_build_col_def_step(
 	row = dict_create_sys_columns_tuple(node->table, node->col_no,
 					    node->heap);
 	ins_node_set_new_row(node->col_def, row);
-
-	return(DB_SUCCESS);
 }
 
 /*****************************************************************//**
@@ -378,68 +371,76 @@ dict_create_sys_indexes_tuple(
 
 	sys_indexes = dict_sys->sys_indexes;
 
-	table = dict_table_get_low(index->table_name, DICT_ERR_IGNORE_NONE);
+	table = dict_table_get_low(index->table_name);
 
 	entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
 
 	dict_table_copy_types(entry, sys_indexes);
 
 	/* 0: TABLE_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__TABLE_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
 	/* 1: ID ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, index->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: NAME --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__NAME);
 
 	dfield_set_data(dfield, index->name, ut_strlen(index->name));
+
 	/* 5: N_FIELDS ----------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*N_FIELDS*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__N_FIELDS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->n_fields);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: TYPE --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*TYPE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__TYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->type);
 
 	dfield_set_data(dfield, ptr, 4);
-	/* 7: SPACE --------------------------*/
 
-#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 7
-#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
-#endif
+	/* 7: SPACE --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 5/*SPACE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__SPACE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->space);
 
 	dfield_set_data(dfield, ptr, 4);
-	/* 8: PAGE_NO --------------------------*/
 
-#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 8
-#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
-#endif
+	/* 8: PAGE_NO --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 6/*PAGE_NO*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__PAGE_NO);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, FIL_NULL);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/*--------------------------------*/
 
 	return(entry);
@@ -454,7 +455,7 @@ dtuple_t*
 dict_create_sys_fields_tuple(
 /*=========================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			i,	/*!< in: field number */
+	ulint			fld_no,	/*!< in: field number */
 	mem_heap_t*		heap)	/*!< in: memory heap from
 					which the memory for the built
 					tuple is allocated */
@@ -477,7 +478,7 @@ dict_create_sys_fields_tuple(
 		}
 	}
 
-	field = dict_index_get_nth_field(index, i);
+	field = dict_index_get_nth_field(index, fld_no);
 
 	sys_fields = dict_sys->sys_fields;
 
@@ -486,35 +487,39 @@ dict_create_sys_fields_tuple(
 	dict_table_copy_types(entry, sys_fields);
 
 	/* 0: INDEX_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, index->id);
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 1: POS + PREFIX LENGTH ----------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+	/* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/
+
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	if (index_contains_column_prefix_field) {
 		/* If there are column prefix fields in the index, then
 		we store the number of the field to the 2 HIGH bytes
 		and the prefix length to the 2 low bytes, */
 
-		mach_write_to_4(ptr, (i << 16) + field->prefix_len);
+		mach_write_to_4(ptr, (fld_no << 16) + field->prefix_len);
 	} else {
 		/* Else we store the number of the field to the 2 LOW bytes.
 		This is to keep the storage format compatible with
 		InnoDB versions < 4.0.14. */
 
-		mach_write_to_4(ptr, i);
+		mach_write_to_4(ptr, fld_no);
 	}
 
 	dfield_set_data(dfield, ptr, 4);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: COL_NAME -------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*COL_NAME*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME);
 
 	dfield_set_data(dfield, field->name,
 			ut_strlen(field->name));
@@ -524,56 +529,6 @@ dict_create_sys_fields_tuple(
 }
 
 /*****************************************************************//**
-Based on an index object, this function builds the entry to be inserted
-in the SYS_STATS system table.
-@return	the tuple which should be inserted */
-static
-dtuple_t*
-dict_create_sys_stats_tuple(
-/*========================*/
-	const dict_index_t*	index,
-	ulint			i,
-	mem_heap_t*		heap)
-{
-	dict_table_t*	sys_stats;
-	dtuple_t*	entry;
-	dfield_t*	dfield;
-	byte*		ptr;
-
-	ut_ad(index);
-	ut_ad(heap);
-
-	sys_stats = dict_sys->sys_stats;
-
-	entry = dtuple_create(heap, 4 + DATA_N_SYS_COLS);
-
-	dict_table_copy_types(entry, sys_stats);
-
-	/* 0: INDEX_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
-	ptr = mem_heap_alloc(heap, 8);
-	mach_write_to_8(ptr, index->id);
-	dfield_set_data(dfield, ptr, 8);
-	/* 1: KEY_COLS -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*KEY_COLS*/);
-	ptr = mem_heap_alloc(heap, 4);
-	mach_write_to_4(ptr, i);
-	dfield_set_data(dfield, ptr, 4);
-	/* 4: DIFF_VALS ----------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*DIFF_VALS*/);
-	ptr = mem_heap_alloc(heap, 8);
-	mach_write_to_8(ptr, 0); /* initial value is 0 */
-	dfield_set_data(dfield, ptr, 8);
-	/* 5: NON_NULL_VALS ------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*NON_NULL_VALS*/);
-	ptr = mem_heap_alloc(heap, 8);
-	mach_write_to_8(ptr, 0); /* initial value is 0 */
-	dfield_set_data(dfield, ptr, 8);
-
-	return(entry);
-}
-
-/*****************************************************************//**
 Creates the tuple with which the index entry is searched for writing the index
 tree root page number, if such a tree is created.
 @return	the tuple for search */
@@ -612,8 +567,8 @@ dict_create_search_tuple(
 /***************************************************************//**
 Builds an index definition row to insert.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 dict_build_index_def_step(
 /*======================*/
 	que_thr_t*	thr,	/*!< in: query thread */
@@ -630,13 +585,16 @@ dict_build_index_def_step(
 
 	index = node->index;
 
-	table = dict_table_get_low(index->table_name, DICT_ERR_IGNORE_NONE);
+	table = dict_table_get_low(index->table_name);
 
 	if (table == NULL) {
 		return(DB_TABLE_NOT_FOUND);
 	}
 
-	trx->table_id = table->id;
+	if (!trx->table_id) {
+		/* Record only the first table id. */
+		trx->table_id = table->id;
+	}
 
 	node->table = table;
 
@@ -657,15 +615,16 @@ dict_build_index_def_step(
 
 	/* Note that the index was created by this transaction. */
 	index->trx_id = trx->id;
+	ut_ad(table->def_trx_id <= trx->id);
+	table->def_trx_id = trx->id;
 
 	return(DB_SUCCESS);
 }
 
 /***************************************************************//**
-Builds a field definition row to insert.
-@return	DB_SUCCESS */
+Builds a field definition row to insert. */
 static
-ulint
+void
 dict_build_field_def_step(
 /*======================*/
 	ind_node_t*	node)	/*!< in: index create node */
@@ -678,36 +637,13 @@ dict_build_field_def_step(
 	row = dict_create_sys_fields_tuple(index, node->field_no, node->heap);
 
 	ins_node_set_new_row(node->field_def, row);
-
-	return(DB_SUCCESS);
-}
-
-/***************************************************************//**
-Builds a row for storing stats to insert.
-@return DB_SUCCESS */
-static
-ulint
-dict_build_stats_def_step(
-/*======================*/
-	ind_node_t*	node)
-{
-	dict_index_t*	index;
-	dtuple_t*	row;
-
-	index = node->index;
-
-	row = dict_create_sys_stats_tuple(index, node->stats_no, node->heap);
-
-	ins_node_set_new_row(node->stats_def, row);
-
-	return(DB_SUCCESS);
 }
 
 /***************************************************************//**
 Creates an index tree for the index if it is not a member of a cluster.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 dict_create_index_tree_step(
 /*========================*/
 	ind_node_t*	node)	/*!< in: index create node */
@@ -715,7 +651,6 @@ dict_create_index_tree_step(
 	dict_index_t*	index;
 	dict_table_t*	sys_indexes;
 	dtuple_t*	search_tuple;
-	ulint		zip_size;
 	btr_pcur_t	pcur;
 	mtr_t		mtr;
 
@@ -725,6 +660,11 @@ dict_create_index_tree_step(
 
 	sys_indexes = dict_sys->sys_indexes;
 
+	if (index->type == DICT_FTS) {
+		/* FTS index does not need an index tree */
+		return(DB_SUCCESS);
+	}
+
 	/* Run a mini-transaction in which the index tree is allocated for
 	the index and its root address is written to the index entry in
 	sys_indexes */
@@ -739,25 +679,37 @@ dict_create_index_tree_step(
 
 	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 
-	zip_size = dict_table_zip_size(index->table);
 
-	node->page_no = btr_create(index->type, index->space, zip_size,
-				   index->id, index, &mtr);
-	/* printf("Created a new index tree in space %lu root page %lu\n",
-	index->space, index->page_no); */
+	dberr_t		err = DB_SUCCESS;
+	ulint		zip_size = dict_table_zip_size(index->table);
 
-	page_rec_write_field(btr_pcur_get_rec(&pcur),
-			     DICT_SYS_INDEXES_PAGE_NO_FIELD,
-			     node->page_no, &mtr);
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
+	if (node->index->table->ibd_file_missing
+	    || dict_table_is_discarded(node->index->table)) {
 
-	if (node->page_no == FIL_NULL) {
+		node->page_no = FIL_NULL;
+	} else {
+		node->page_no = btr_create(
+			index->type, index->space, zip_size,
+			index->id, index, &mtr);
 
-		return(DB_OUT_OF_FILE_SPACE);
+		if (node->page_no == FIL_NULL) {
+			err = DB_OUT_OF_FILE_SPACE;
+		}
+
+		DBUG_EXECUTE_IF("ib_import_create_index_failure_1",
+				node->page_no = FIL_NULL;
+				err = DB_OUT_OF_FILE_SPACE; );
 	}
 
-	return(DB_SUCCESS);
+	page_rec_write_field(
+		btr_pcur_get_rec(&pcur), DICT_FLD__SYS_INDEXES__PAGE_NO,
+		node->page_no, &mtr);
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	return(err);
 }
 
 /*******************************************************************//**
@@ -778,7 +730,8 @@ dict_drop_index_tree(
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
-	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
 
 	ut_ad(len == 4);
 
@@ -790,8 +743,8 @@ dict_drop_index_tree(
 		return;
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
 
 	ut_ad(len == 4);
 
@@ -818,7 +771,7 @@ dict_drop_index_tree(
 	root_page_no); */
 	btr_free_root(space, zip_size, root_page_no, mtr);
 
-	page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+	page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 			     FIL_NULL, mtr);
 }
 
@@ -854,7 +807,8 @@ dict_truncate_index_tree(
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
 	rec = btr_pcur_get_rec(pcur);
-	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
 
 	ut_ad(len == 4);
 
@@ -869,8 +823,8 @@ dict_truncate_index_tree(
 		drop = FALSE;
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
 
 	ut_ad(len == 4);
 
@@ -890,12 +844,12 @@ dict_truncate_index_tree(
 		return(FIL_NULL);
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
 	ut_ad(len == 4);
 	type = mach_read_from_4(ptr);
 
-	ptr = rec_get_nth_field_old(rec, 1, &len);
+	ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len);
 	ut_ad(len == 8);
 	index_id = mach_read_from_8(ptr);
 
@@ -922,7 +876,7 @@ create:
 	in SYS_INDEXES, so that the database will not get into an
 	inconsistent state in case it crashes between the mtr_commit()
 	below and the following mtr_commit() call. */
-	page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+	page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 			     FIL_NULL, mtr);
 
 	/* We will need to commit the mini-transaction in order to avoid
@@ -939,10 +893,14 @@ create:
 	     index;
 	     index = UT_LIST_GET_NEXT(indexes, index)) {
 		if (index->id == index_id) {
-			root_page_no = btr_create(type, space, zip_size,
-						  index_id, index, mtr);
-			index->page = (unsigned int) root_page_no;
-			return(root_page_no);
+			if (index->type & DICT_FTS) {
+				return(FIL_NULL);
+			} else {
+				root_page_no = btr_create(type, space, zip_size,
+							  index_id, index, mtr);
+				index->page = (unsigned int) root_page_no;
+				return(root_page_no);
+			}
 		}
 	}
 
@@ -965,11 +923,14 @@ tab_create_graph_create(
 /*====================*/
 	dict_table_t*	table,	/*!< in: table to create, built as a memory data
 				structure */
-	mem_heap_t*	heap)	/*!< in: heap where created */
+	mem_heap_t*	heap,	/*!< in: heap where created */
+	bool		commit)	/*!< in: true if the commit node should be
+				added to the query graph */
 {
 	tab_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(tab_node_t));
+	node = static_cast<tab_node_t*>(
+		mem_heap_alloc(heap, sizeof(tab_node_t)));
 
 	node->common.type = QUE_NODE_CREATE_TABLE;
 
@@ -986,8 +947,12 @@ tab_create_graph_create(
 					heap);
 	node->col_def->common.parent = node;
 
-	node->commit_node = commit_node_create(heap);
-	node->commit_node->common.parent = node;
+	if (commit) {
+		node->commit_node = trx_commit_node_create(heap);
+		node->commit_node->common.parent = node;
+	} else {
+		node->commit_node = 0;
+	}
 
 	return(node);
 }
@@ -1001,11 +966,14 @@ ind_create_graph_create(
 /*====================*/
 	dict_index_t*	index,	/*!< in: index to create, built as a memory data
 				structure */
-	mem_heap_t*	heap)	/*!< in: heap where created */
+	mem_heap_t*	heap,	/*!< in: heap where created */
+	bool		commit)	/*!< in: true if the commit node should be
+				added to the query graph */
 {
 	ind_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(ind_node_t));
+	node = static_cast<ind_node_t*>(
+		mem_heap_alloc(heap, sizeof(ind_node_t)));
 
 	node->common.type = QUE_NODE_CREATE_INDEX;
 
@@ -1023,52 +991,13 @@ ind_create_graph_create(
 					  dict_sys->sys_fields, heap);
 	node->field_def->common.parent = node;
 
-	if (srv_use_sys_stats_table) {
-		node->stats_def = ins_node_create(INS_DIRECT,
-						  dict_sys->sys_stats, heap);
-		node->stats_def->common.parent = node;
+	if (commit) {
+		node->commit_node = trx_commit_node_create(heap);
+		node->commit_node->common.parent = node;
 	} else {
-		node->stats_def = NULL;
+		node->commit_node = 0;
 	}
 
-	node->commit_node = commit_node_create(heap);
-	node->commit_node->common.parent = node;
-
-	return(node);
-}
-
-/*********************************************************************//**
-*/
-UNIV_INTERN
-ind_node_t*
-ind_insert_stats_graph_create(
-/*==========================*/
-	dict_index_t*	index,
-	mem_heap_t*	heap)
-{
-	ind_node_t*	node;
-
-	node = mem_heap_alloc(heap, sizeof(ind_node_t));
-
-	node->common.type = QUE_NODE_INSERT_STATS;
-
-	node->index = index;
-
-	node->state = INDEX_BUILD_STATS_COLS;
-	node->page_no = FIL_NULL;
-	node->heap = mem_heap_create(256);
-
-	node->ind_def = NULL;
-	node->field_def = NULL;
-
-	node->stats_def = ins_node_create(INS_DIRECT,
-					  dict_sys->sys_stats, heap);
-	node->stats_def->common.parent = node;
-	node->stats_no = 0;
-
-	node->commit_node = commit_node_create(heap);
-	node->commit_node->common.parent = node;
-
 	return(node);
 }
 
@@ -1082,7 +1011,7 @@ dict_create_table_step(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	tab_node_t*	node;
-	ulint		err	= DB_ERROR;
+	dberr_t		err	= DB_ERROR;
 	trx_t*		trx;
 
 	ut_ad(thr);
@@ -1090,7 +1019,7 @@ dict_create_table_step(
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<tab_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
 
@@ -1121,12 +1050,7 @@ dict_create_table_step(
 
 		if (node->col_no < (node->table)->n_def) {
 
-			err = dict_build_col_def_step(node);
-
-			if (err != DB_SUCCESS) {
-
-				goto function_exit;
-			}
+			dict_build_col_def_step(node);
 
 			node->col_no++;
 
@@ -1153,7 +1077,7 @@ dict_create_table_step(
 
 	if (node->state == TABLE_ADD_TO_CACHE) {
 
-		dict_table_add_to_cache(node->table, node->heap);
+		dict_table_add_to_cache(node->table, TRUE, node->heap);
 
 		err = DB_SUCCESS;
 	}
@@ -1189,7 +1113,7 @@ dict_create_index_step(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	ind_node_t*	node;
-	ulint		err	= DB_ERROR;
+	dberr_t		err	= DB_ERROR;
 	trx_t*		trx;
 
 	ut_ad(thr);
@@ -1197,7 +1121,7 @@ dict_create_index_step(
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<ind_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
 
@@ -1216,7 +1140,6 @@ dict_create_index_step(
 
 		node->state = INDEX_BUILD_FIELD_DEF;
 		node->field_no = 0;
-		node->stats_no = 0;
 
 		thr->run_node = node->ind_def;
 
@@ -1227,12 +1150,7 @@ dict_create_index_step(
 
 		if (node->field_no < (node->index)->n_fields) {
 
-			err = dict_build_field_def_step(node);
-
-			if (err != DB_SUCCESS) {
-
-				goto function_exit;
-			}
+			dict_build_field_def_step(node);
 
 			node->field_no++;
 
@@ -1252,7 +1170,7 @@ dict_create_index_step(
 			node->table, node->index, FIL_NULL,
 			trx_is_strict(trx)
 			|| dict_table_get_format(node->table)
-			>= DICT_TF_FORMAT_ZIP);
+			>= UNIV_FORMAT_B);
 
 		node->index = dict_index_get_if_in_cache_low(index_id);
 		ut_a(!node->index == (err != DB_SUCCESS));
@@ -1262,38 +1180,44 @@ dict_create_index_step(
 			goto function_exit;
 		}
 
-		if (srv_use_sys_stats_table
-		    && !((node->table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
-			node->state = INDEX_BUILD_STATS_COLS;
-		} else {
-			node->state = INDEX_CREATE_INDEX_TREE;
-		}
-	}
-	if (node->state == INDEX_BUILD_STATS_COLS) {
-		if (node->stats_no <= dict_index_get_n_unique(node->index)) {
-
-			err = dict_build_stats_def_step(node);
-
-			if (err != DB_SUCCESS) {
-
-				goto function_exit;
-			}
-
-			node->stats_no++;
-
-			thr->run_node = node->stats_def;
-
-			return(thr);
-		} else {
-			node->state = INDEX_CREATE_INDEX_TREE;
-		}
+		node->state = INDEX_CREATE_INDEX_TREE;
 	}
 
 	if (node->state == INDEX_CREATE_INDEX_TREE) {
 
 		err = dict_create_index_tree_step(node);
 
+		DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail",
+				err = DB_OUT_OF_MEMORY;);
+
 		if (err != DB_SUCCESS) {
+			/* If this is a FTS index, we will need to remove
+			it from fts->cache->indexes list as well */
+			if ((node->index->type & DICT_FTS)
+			    && node->table->fts) {
+				fts_index_cache_t*	index_cache;
+
+				rw_lock_x_lock(
+					&node->table->fts->cache->init_lock);
+
+				index_cache = (fts_index_cache_t*)
+					 fts_find_index_cache(
+						node->table->fts->cache,
+						node->index);
+
+				if (index_cache->words) {
+					rbt_free(index_cache->words);
+					index_cache->words = 0;
+				}
+
+				ib_vector_remove(
+					node->table->fts->cache->indexes,
+					*reinterpret_cast<void**>(index_cache));
+
+				rw_lock_x_unlock(
+					&node->table->fts->cache->init_lock);
+			}
+
 			dict_index_remove_from_cache(node->table, node->index);
 			node->index = NULL;
 
@@ -1301,6 +1225,11 @@ dict_create_index_step(
 		}
 
 		node->index->page = node->page_no;
+		/* These should have been set in
+		dict_build_index_def_step() and
+		dict_index_add_to_cache(). */
+		ut_ad(node->index->trx_id == trx->id);
+		ut_ad(node->index->table->def_trx_id == trx->id);
 		node->state = INDEX_COMMIT_WORK;
 	}
 
@@ -1338,124 +1267,107 @@ function_exit:
 }
 
 /****************************************************************//**
-*/
-UNIV_INTERN
-que_thr_t*
-dict_insert_stats_step(
-/*===================*/
-	que_thr_t*	thr)	/*!< in: query thread */
+Check whether a system table exists.  Additionally, if it exists,
+move it to the non-LRU end of the table LRU list.  This is only used
+for system tables that can be upgraded or added to an older database,
+which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and
+SYS_DATAFILES.
+@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists
+but is not current, DB_TABLE_NOT_FOUND if it does not exist */
+static
+dberr_t
+dict_check_if_system_table_exists(
+/*==============================*/
+	const char*	tablename,	/*!< in: name of table */
+	ulint		num_fields,	/*!< in: number of fields */
+	ulint		num_indexes)	/*!< in: number of indexes */
 {
-	ind_node_t*	node;
-	ulint		err	= DB_ERROR;
-	trx_t*		trx;
+	dict_table_t*	sys_table;
+	dberr_t		error = DB_SUCCESS;
 
-	ut_ad(thr);
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
 
-	trx = thr_get_trx(thr);
+	mutex_enter(&dict_sys->mutex);
 
-	node = thr->run_node;
+	sys_table = dict_table_get_low(tablename);
 
-	if (thr->prev_node == que_node_get_parent(node)) {
-		node->state = INDEX_BUILD_STATS_COLS;
-	}
-
-	if (node->state == INDEX_BUILD_STATS_COLS) {
-		if (node->stats_no <= dict_index_get_n_unique(node->index)) {
-
-			err = dict_build_stats_def_step(node);
-
-			if (err != DB_SUCCESS) {
-
-				goto function_exit;
-			}
-
-			node->stats_no++;
+	if (sys_table == NULL) {
+		error = DB_TABLE_NOT_FOUND;
 
-			thr->run_node = node->stats_def;
+	} else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes
+		   || sys_table->n_cols != num_fields) {
+		error = DB_CORRUPTION;
 
-			return(thr);
-		} else {
-			node->state = INDEX_COMMIT_WORK;
-		}
-	}
-
-	if (node->state == INDEX_COMMIT_WORK) {
-
-		/* do not commit transaction here for now */
-	}
-
-function_exit:
-	trx->error_state = err;
-
-	if (err == DB_SUCCESS) {
 	} else {
-		return(NULL);
+		/* This table has already been created, and it is OK.
+		Ensure that it can't be evicted from the table LRU cache. */
+
+		dict_table_move_from_lru_to_non_lru(sys_table);
 	}
 
-	thr->run_node = que_node_get_parent(node);
+	mutex_exit(&dict_sys->mutex);
 
-	return(thr);
+	return(error);
 }
 
 /****************************************************************//**
 Creates the foreign key constraints system tables inside InnoDB
-at database creation or database start if they are not found or are
+at server bootstrap or server start if they are not found or are
 not of the right form.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_or_check_foreign_constraint_tables(void)
 /*================================================*/
 {
-	dict_table_t*	table1;
-	dict_table_t*	table2;
-	ulint		error;
 	trx_t*		trx;
+	my_bool		srv_file_per_table_backup;
+	dberr_t		err;
+	dberr_t		sys_foreign_err;
+	dberr_t		sys_foreign_cols_err;
 
-	mutex_enter(&(dict_sys->mutex));
-
-	table1 = dict_table_get_low("SYS_FOREIGN", DICT_ERR_IGNORE_NONE);
-	table2 = dict_table_get_low("SYS_FOREIGN_COLS", DICT_ERR_IGNORE_NONE);
-
-	if (table1 && table2
-	    && UT_LIST_GET_LEN(table1->indexes) == 3
-	    && UT_LIST_GET_LEN(table2->indexes) == 1) {
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
 
-		/* Foreign constraint system tables have already been
-		created, and they are ok */
+	/* Note: The master thread has not been started at this point. */
 
-		table1->n_mysql_handles_opened = 1; /* for pin */
-		table2->n_mysql_handles_opened = 1; /* for pin */
 
-		mutex_exit(&(dict_sys->mutex));
+	sys_foreign_err = dict_check_if_system_table_exists(
+		"SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+	sys_foreign_cols_err = dict_check_if_system_table_exists(
+		"SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
 
+	if (sys_foreign_err == DB_SUCCESS
+	    && sys_foreign_cols_err == DB_SUCCESS) {
 		return(DB_SUCCESS);
 	}
 
-	mutex_exit(&(dict_sys->mutex));
-
 	trx = trx_allocate_for_mysql();
 
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
 	trx->op_info = "creating foreign key sys tables";
 
 	row_mysql_lock_data_dictionary(trx);
 
-	if (table1) {
-		fprintf(stderr,
-			"InnoDB: dropping incompletely created"
-			" SYS_FOREIGN table\n");
+	/* Check which incomplete table definition to drop. */
+
+	if (sys_foreign_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_FOREIGN table.");
 		row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
 	}
 
-	if (table2) {
-		fprintf(stderr,
-			"InnoDB: dropping incompletely created"
-			" SYS_FOREIGN_COLS table\n");
+	if (sys_foreign_cols_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_FOREIGN_COLS table.");
+
 		row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
 	}
 
-	fprintf(stderr,
-		"InnoDB: Creating foreign key constraint system tables\n");
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"Creating foreign key constraint system tables.");
 
 	/* NOTE: in dict_load_foreigns we use the fact that
 	there are 2 secondary indexes on SYS_FOREIGN, and they
@@ -1467,80 +1379,93 @@ dict_create_or_check_foreign_constraint_tables(void)
 	VARBINARY, like in other InnoDB system tables, to get a clean
 	design. */
 
-	error = que_eval_sql(NULL,
-			     "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
-			     "BEGIN\n"
-			     "CREATE TABLE\n"
-			     "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
-			     " REF_NAME CHAR, N_COLS INT);\n"
-			     "CREATE UNIQUE CLUSTERED INDEX ID_IND"
-			     " ON SYS_FOREIGN (ID);\n"
-			     "CREATE INDEX FOR_IND"
-			     " ON SYS_FOREIGN (FOR_NAME);\n"
-			     "CREATE INDEX REF_IND"
-			     " ON SYS_FOREIGN (REF_NAME);\n"
-			     "CREATE TABLE\n"
-			     "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
-			     " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
-			     "CREATE UNIQUE CLUSTERED INDEX ID_IND"
-			     " ON SYS_FOREIGN_COLS (ID, POS);\n"
-			     "END;\n"
-			     , FALSE, trx);
-
-	if (error != DB_SUCCESS) {
-		fprintf(stderr, "InnoDB: error %lu in creation\n",
-			(ulong) error);
-
-		ut_a(error == DB_OUT_OF_FILE_SPACE
-		     || error == DB_TOO_MANY_CONCURRENT_TRXS);
-
-		fprintf(stderr,
-			"InnoDB: creation failed\n"
-			"InnoDB: tablespace is full\n"
-			"InnoDB: dropping incompletely created"
-			" SYS_FOREIGN tables\n");
+	srv_file_per_table_backup = srv_file_per_table;
+
+	/* We always want SYSTEM tables to be created inside the system
+	tablespace. */
+
+	srv_file_per_table = 0;
+
+	err = que_eval_sql(
+		NULL,
+		"PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
+		"BEGIN\n"
+		"CREATE TABLE\n"
+		"SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+		" REF_NAME CHAR, N_COLS INT);\n"
+		"CREATE UNIQUE CLUSTERED INDEX ID_IND"
+		" ON SYS_FOREIGN (ID);\n"
+		"CREATE INDEX FOR_IND"
+		" ON SYS_FOREIGN (FOR_NAME);\n"
+		"CREATE INDEX REF_IND"
+		" ON SYS_FOREIGN (REF_NAME);\n"
+		"CREATE TABLE\n"
+		"SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+		" FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+		"CREATE UNIQUE CLUSTERED INDEX ID_IND"
+		" ON SYS_FOREIGN_COLS (ID, POS);\n"
+		"END;\n",
+		FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Creation of SYS_FOREIGN and SYS_FOREIGN_COLS "
+			"has failed with error %lu.  Tablespace is full. "
+			"Dropping incompletely created tables.",
+			(ulong) err);
+
+		ut_ad(err == DB_OUT_OF_FILE_SPACE
+		      || err == DB_TOO_MANY_CONCURRENT_TRXS);
 
 		row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
 		row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
 
-		error = DB_MUST_GET_MORE_FILE_SPACE;
+		if (err == DB_OUT_OF_FILE_SPACE) {
+			err = DB_MUST_GET_MORE_FILE_SPACE;
+		}
 	}
 
 	trx_commit_for_mysql(trx);
 
-	table1 = dict_table_get_low("SYS_FOREIGN", DICT_ERR_IGNORE_NONE);
-	table2 = dict_table_get_low("SYS_FOREIGN_COLS", DICT_ERR_IGNORE_NONE);
-	table1->n_mysql_handles_opened = 1; /* for pin */
-	table2->n_mysql_handles_opened = 1; /* for pin */
-
 	row_mysql_unlock_data_dictionary(trx);
 
 	trx_free_for_mysql(trx);
 
-	if (error == DB_SUCCESS) {
-		fprintf(stderr,
-			"InnoDB: Foreign key constraint system tables"
-			" created\n");
+	srv_file_per_table = srv_file_per_table_backup;
+
+	if (err == DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Foreign key constraint system tables created");
 	}
 
-	return(error);
+	/* Note: The master thread has not been started at this point. */
+	/* Confirm and move to the non-LRU part of the table LRU list. */
+	sys_foreign_err = dict_check_if_system_table_exists(
+		"SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+	ut_a(sys_foreign_err == DB_SUCCESS);
+
+	sys_foreign_cols_err = dict_check_if_system_table_exists(
+		"SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+	ut_a(sys_foreign_cols_err == DB_SUCCESS);
+
+	return(err);
 }
 
 /****************************************************************//**
 Evaluate the given foreign key SQL statement.
 @return	error code or DB_SUCCESS */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 dict_foreign_eval_sql(
 /*==================*/
-	pars_info_t*	info,	/*!< in: info struct, or NULL */
+	pars_info_t*	info,	/*!< in: info struct */
 	const char*	sql,	/*!< in: SQL string to evaluate */
-	dict_table_t*	table,	/*!< in: table */
-	dict_foreign_t*	foreign,/*!< in: foreign */
-	trx_t*		trx)	/*!< in: transaction */
+	const char*	name,	/*!< in: table name (for diagnostics) */
+	const char*	id,	/*!< in: foreign key id */
+	trx_t*		trx)	/*!< in/out: transaction */
 {
-	ulint		error;
-	FILE*		ef	= dict_foreign_err_file;
+	dberr_t	error;
+	FILE*	ef	= dict_foreign_err_file;
 
 	error = que_eval_sql(info, sql, FALSE, trx);
 
@@ -1550,9 +1475,9 @@ dict_foreign_eval_sql(
 		ut_print_timestamp(ef);
 		fputs(" Error in foreign key constraint creation for table ",
 		      ef);
-		ut_print_name(ef, trx, TRUE, table->name);
+		ut_print_name(ef, trx, TRUE, name);
 		fputs(".\nA foreign key constraint of name ", ef);
-		ut_print_name(ef, trx, TRUE, foreign->id);
+		ut_print_name(ef, trx, TRUE, id);
 		fputs("\nalready exists."
 		      " (Note that internally InnoDB adds 'databasename'\n"
 		      "in front of the user-defined constraint name.)\n"
@@ -1579,7 +1504,7 @@ dict_foreign_eval_sql(
 		ut_print_timestamp(ef);
 		fputs(" Internal error in foreign key constraint creation"
 		      " for table ", ef);
-		ut_print_name(ef, trx, TRUE, table->name);
+		ut_print_name(ef, trx, TRUE, name);
 		fputs(".\n"
 		      "See the MySQL .err log in the datadir"
 		      " for more information.\n", ef);
@@ -1595,14 +1520,14 @@ dict_foreign_eval_sql(
 Add a single foreign key field definition to the data dictionary tables in
 the database.
 @return	error code or DB_SUCCESS */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 dict_create_add_foreign_field_to_dictionary(
 /*========================================*/
-	ulint		field_nr,	/*!< in: foreign field number */
-	dict_table_t*	table,		/*!< in: table */
-	dict_foreign_t*	foreign,	/*!< in: foreign */
-	trx_t*		trx)		/*!< in: transaction */
+	ulint			field_nr,	/*!< in: field number */
+	const char*		table_name,	/*!< in: table name */
+	const dict_foreign_t*	foreign,	/*!< in: foreign */
+	trx_t*			trx)		/*!< in/out: transaction */
 {
 	pars_info_t*	info = pars_info_create();
 
@@ -1623,72 +1548,26 @@ dict_create_add_foreign_field_to_dictionary(
 		       "INSERT INTO SYS_FOREIGN_COLS VALUES"
 		       "(:id, :pos, :for_col_name, :ref_col_name);\n"
 		       "END;\n",
-		       table, foreign, trx));
+		       table_name, foreign->id, trx));
 }
 
 /********************************************************************//**
-Add a single foreign key definition to the data dictionary tables in the
-database. We also generate names to constraints that were not named by the
-user. A generated constraint has a name of the format
-databasename/tablename_ibfk_NUMBER, where the numbers start from 1, and
-are given locally for this table, that is, the number is not global, as in
-the old format constraints < 4.0.18 it used to be.
+Add a foreign key definition to the data dictionary tables.
 @return	error code or DB_SUCCESS */
-static
-ulint
+UNIV_INTERN
+dberr_t
 dict_create_add_foreign_to_dictionary(
 /*==================================*/
-	ulint*		id_nr,	/*!< in/out: number to use in id generation;
-				incremented if used */
-	dict_table_t*	table,	/*!< in: table */
-	dict_foreign_t*	foreign,/*!< in: foreign */
-	trx_t*		trx)	/*!< in: transaction */
+	const char*		name,	/*!< in: table name */
+	const dict_foreign_t*	foreign,/*!< in: foreign key */
+	trx_t*			trx)	/*!< in/out: dictionary transaction */
 {
-	ulint		error;
-	ulint		i;
-	pars_info_t*	info;
-
-	if (foreign->id == NULL) {
-		/* Generate a new constraint id */
-		ulint	namelen	= strlen(table->name);
-		char*	id	= mem_heap_alloc(foreign->heap, namelen + 20);
-
-		if (row_is_mysql_tmp_table_name(table->name)) {
-			sprintf(id, "%s_ibfk_%lu", table->name,
-				(ulong) (*id_nr)++);
-		} else {
-			char	table_name[MAX_TABLE_NAME_LEN + 20] = "";
-			uint	errors = 0;
-
-			strncpy(table_name, table->name,
-				MAX_TABLE_NAME_LEN + 20);
-
-			innobase_convert_to_system_charset(
-				strchr(table_name, '/') + 1,
-				strchr(table->name, '/') + 1,
-				MAX_TABLE_NAME_LEN, &errors);
-
-			if (errors) {
-				strncpy(table_name, table->name,
-					MAX_TABLE_NAME_LEN + 20);
-			}
-
-			sprintf(id, "%s_ibfk_%lu", table_name,
-				(ulong) (*id_nr)++);
-
-			if (innobase_check_identifier_length(
-				strchr(id,'/') + 1)) {
-				return(DB_IDENTIFIER_TOO_LONG);
-			}
-		}
-		foreign->id = id;
-	}
-
-	info = pars_info_create();
+	dberr_t		error;
+	pars_info_t*	info = pars_info_create();
 
 	pars_info_add_str_literal(info, "id", foreign->id);
 
-	pars_info_add_str_literal(info, "for_name", table->name);
+	pars_info_add_str_literal(info, "for_name", name);
 
 	pars_info_add_str_literal(info, "ref_name",
 				  foreign->referenced_table_name);
@@ -1702,16 +1581,16 @@ dict_create_add_foreign_to_dictionary(
 				      "INSERT INTO SYS_FOREIGN VALUES"
 				      "(:id, :for_name, :ref_name, :n_cols);\n"
 				      "END;\n"
-				      , table, foreign, trx);
+				      , name, foreign->id, trx);
 
 	if (error != DB_SUCCESS) {
 
 		return(error);
 	}
 
-	for (i = 0; i < foreign->n_fields; i++) {
+	for (ulint i = 0; i < foreign->n_fields; i++) {
 		error = dict_create_add_foreign_field_to_dictionary(
-			i, table, foreign, trx);
+			i, name, foreign, trx);
 
 		if (error != DB_SUCCESS) {
 
@@ -1719,13 +1598,6 @@ dict_create_add_foreign_to_dictionary(
 		}
 	}
 
-	error = dict_foreign_eval_sql(NULL,
-				      "PROCEDURE P () IS\n"
-				      "BEGIN\n"
-				      "COMMIT WORK;\n"
-				      "END;\n"
-				      , table, foreign, trx);
-
 	return(error);
 }
 
@@ -1733,7 +1605,7 @@ dict_create_add_foreign_to_dictionary(
 Adds foreign key definitions to data dictionary tables in the database.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_add_foreigns_to_dictionary(
 /*===================================*/
 	ulint		start_id,/*!< in: if we are actually doing ALTER TABLE
@@ -1749,11 +1621,11 @@ dict_create_add_foreigns_to_dictionary(
 {
 	dict_foreign_t*	foreign;
 	ulint		number	= start_id + 1;
-	ulint		error;
+	dberr_t		error;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	if (NULL == dict_table_get_low("SYS_FOREIGN", DICT_ERR_IGNORE_NONE)) {
+	if (NULL == dict_table_get_low("SYS_FOREIGN")) {
 		fprintf(stderr,
 			"InnoDB: table SYS_FOREIGN not found"
 			" in internal data dictionary\n");
@@ -1765,7 +1637,15 @@ dict_create_add_foreigns_to_dictionary(
 	     foreign;
 	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
 
-		error = dict_create_add_foreign_to_dictionary(&number, table,
+		error = dict_create_add_foreign_id(&number, table->name,
+						   foreign);
+
+		if (error != DB_SUCCESS) {
+
+			return(error);
+		}
+
+		error = dict_create_add_foreign_to_dictionary(table->name,
 							      foreign, trx);
 
 		if (error != DB_SUCCESS) {
@@ -1774,5 +1654,188 @@ dict_create_add_foreigns_to_dictionary(
 		}
 	}
 
+	trx->op_info = "committing foreign key definitions";
+
+	trx_commit(trx);
+
+	trx->op_info = "";
+
 	return(DB_SUCCESS);
 }
+
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_tablespace(void)
+/*=====================================*/
+{
+	trx_t*		trx;
+	my_bool		srv_file_per_table_backup;
+	dberr_t		err;
+	dberr_t		sys_tablespaces_err;
+	dberr_t		sys_datafiles_err;
+
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+	/* Note: The master thread has not been started at this point. */
+
+	sys_tablespaces_err = dict_check_if_system_table_exists(
+		"SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+	sys_datafiles_err = dict_check_if_system_table_exists(
+		"SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+
+	if (sys_tablespaces_err == DB_SUCCESS
+	    && sys_datafiles_err == DB_SUCCESS) {
+		return(DB_SUCCESS);
+	}
+
+	trx = trx_allocate_for_mysql();
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	trx->op_info = "creating tablepace and datafile sys tables";
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Check which incomplete table definition to drop. */
+
+	if (sys_tablespaces_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_TABLESPACES table.");
+		row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE);
+	}
+
+	if (sys_datafiles_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_DATAFILES table.");
+
+		row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Creating tablespace and datafile system tables.");
+
+	/* We always want SYSTEM tables to be created inside the system
+	tablespace. */
+	srv_file_per_table_backup = srv_file_per_table;
+	srv_file_per_table = 0;
+
+	err = que_eval_sql(
+		NULL,
+		"PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n"
+		"BEGIN\n"
+		"CREATE TABLE SYS_TABLESPACES(\n"
+		" SPACE INT, NAME CHAR, FLAGS INT);\n"
+		"CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE"
+		" ON SYS_TABLESPACES (SPACE);\n"
+		"CREATE TABLE SYS_DATAFILES(\n"
+		" SPACE INT, PATH CHAR);\n"
+		"CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE"
+		" ON SYS_DATAFILES (SPACE);\n"
+		"END;\n",
+		FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Creation of SYS_TABLESPACES and SYS_DATAFILES "
+			"has failed with error %lu.  Tablespace is full. "
+			"Dropping incompletely created tables.",
+			(ulong) err);
+
+		ut_a(err == DB_OUT_OF_FILE_SPACE
+		     || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+		row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE);
+		row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE);
+
+		if (err == DB_OUT_OF_FILE_SPACE) {
+			err = DB_MUST_GET_MORE_FILE_SPACE;
+		}
+	}
+
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_mysql(trx);
+
+	srv_file_per_table = srv_file_per_table_backup;
+
+	if (err == DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Tablespace and datafile system tables created.");
+	}
+
+	/* Note: The master thread has not been started at this point. */
+	/* Confirm and move to the non-LRU part of the table LRU list. */
+
+	sys_tablespaces_err = dict_check_if_system_table_exists(
+		"SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+	ut_a(sys_tablespaces_err == DB_SUCCESS);
+
+	sys_datafiles_err = dict_check_if_system_table_exists(
+		"SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+	ut_a(sys_datafiles_err == DB_SUCCESS);
+
+	return(err);
+}
+
+/********************************************************************//**
+Add a single tablespace definition to the data dictionary tables in the
+database.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_tablespace_to_dictionary(
+/*=====================================*/
+	ulint		space,		/*!< in: tablespace id */
+	const char*	name,		/*!< in: tablespace name */
+	ulint		flags,		/*!< in: tablespace flags */
+	const char*	path,		/*!< in: tablespace path */
+	trx_t*		trx,		/*!< in/out: transaction */
+	bool		commit)		/*!< in: if true then commit the
+					transaction */
+{
+	dberr_t		error;
+
+	pars_info_t*	info = pars_info_create();
+
+	ut_a(space > TRX_SYS_SPACE);
+
+	pars_info_add_int4_literal(info, "space", space);
+
+	pars_info_add_str_literal(info, "name", name);
+
+	pars_info_add_int4_literal(info, "flags", flags);
+
+	pars_info_add_str_literal(info, "path", path);
+
+	error = que_eval_sql(info,
+			     "PROCEDURE P () IS\n"
+			     "BEGIN\n"
+			     "INSERT INTO SYS_TABLESPACES VALUES"
+			     "(:space, :name, :flags);\n"
+			     "INSERT INTO SYS_DATAFILES VALUES"
+			     "(:space, :path);\n"
+			     "END;\n",
+			     FALSE, trx);
+
+	if (error != DB_SUCCESS) {
+		return(error);
+	}
+
+	if (commit) {
+		trx->op_info = "committing tablespace and datafile definition";
+		trx_commit(trx);
+	}
+
+	trx->op_info = "";
+
+	return(error);
+}
diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.cc
index 50da1c97beb..a20456fe3cf 100644
--- a/storage/xtradb/dict/dict0dict.c
+++ b/storage/xtradb/dict/dict0dict.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,18 +18,19 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /******************************************************************//**
-@file dict/dict0dict.c
+@file dict/dict0dict.cc
 Data dictionary system
 
 Created 1/8/1996 Heikki Tuuri
 ***********************************************************************/
 
-#include <my_sys.h>
-
 #include "dict0dict.h"
+#include "fts0fts.h"
+#include "fil0fil.h"
 
 #ifdef UNIV_NONINL
 #include "dict0dict.ic"
+#include "dict0priv.ic"
 #endif
 
 /** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
@@ -43,6 +45,7 @@ UNIV_INTERN dict_index_t*	dict_ind_compact;
 #include "dict0boot.h"
 #include "dict0mem.h"
 #include "dict0crea.h"
+#include "dict0stats.h"
 #include "trx0undo.h"
 #include "btr0btr.h"
 #include "btr0cur.h"
@@ -53,13 +56,23 @@ UNIV_INTERN dict_index_t*	dict_ind_compact;
 #include "pars0sym.h"
 #include "que0que.h"
 #include "rem0cmp.h"
-#include "row0merge.h"
+#include "fts0fts.h"
+#include "fts0types.h"
 #include "m_ctype.h" /* my_isspace() */
-#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str()*/
+#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str() */
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "lock0lock.h"
+#include "dict0priv.h"
 #include "row0upd.h"
-#include "srv0start.h" /* SRV_LOG_SPACE_FIRST_ID */
+#include "row0mysql.h"
+#include "row0merge.h"
+#include "row0log.h"
+#include "ut0ut.h" /* ut_format_name() */
 #include "m_string.h"
 #include "my_sys.h"
+#include "mysqld.h" /* system_charset_info */
+#include "strfunc.h" /* strconvert() */
 
 #include <ctype.h>
 
@@ -73,17 +86,27 @@ backround operations purge, rollback, foreign key checks reserve this
 in S-mode; we cannot trust that MySQL protects implicit or background
 operations a table drop since MySQL does not know of them; therefore
 we need this; NOTE: a transaction which reserves this must keep book
-on the mode in trx_struct::dict_operation_lock_mode */
+on the mode in trx_t::dict_operation_lock_mode */
 UNIV_INTERN rw_lock_t	dict_operation_lock;
 
+/** Percentage of compression failures that are allowed in a single
+round */
+UNIV_INTERN ulong	zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+UNIV_INTERN ulong	zip_pad_max = 50;
+
 /* Keys to register rwlocks and mutexes with performance schema */
 #ifdef UNIV_PFS_RWLOCK
 UNIV_INTERN mysql_pfs_key_t	dict_operation_lock_key;
 UNIV_INTERN mysql_pfs_key_t	index_tree_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t	index_online_log_key;
 UNIV_INTERN mysql_pfs_key_t	dict_table_stats_latch_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 #ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	zip_pad_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	dict_sys_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	dict_foreign_err_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
@@ -144,12 +167,14 @@ dict_index_build_internal_non_clust(
 	dict_index_t*		index);	/*!< in: user representation of
 					a non-clustered index */
 /**********************************************************************//**
-Removes a foreign constraint struct from the dictionary cache. */
+Builds the internal dictionary cache representation for an FTS index.
+@return	own: the internal representation of the FTS index */
 static
-void
-dict_foreign_remove_from_cache(
-/*===========================*/
-	dict_foreign_t*	foreign);	/*!< in, own: foreign constraint */
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index);	/*!< in: user representation of an FTS index */
 /**********************************************************************//**
 Prints a column data. */
 static
@@ -172,21 +197,57 @@ void
 dict_field_print_low(
 /*=================*/
 	const dict_field_t*	field);	/*!< in: field */
-#ifndef UNIV_HOTBACKUP
-/*********************************************************************//**
-Frees a foreign key struct. */
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
 static
 void
-dict_foreign_free(
-/*==============*/
-	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
+dict_index_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	dict_index_t*	index,		/*!< in, own: index */
+	ibool		lru_evict);	/*!< in: TRUE if index being evicted
+					to make room in the table LRU list */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+static
+void
+dict_table_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in, own: table */
+	ibool		lru_evict);	/*!< in: TRUE if evicting from LRU */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if validate OK */
+static
+ibool
+dict_lru_validate(void);
+/*===================*/
+/**********************************************************************//**
+Check if table is in the dictionary table LRU list.
+@return TRUE if table found */
+static
+ibool
+dict_lru_find_table(
+/*================*/
+	const dict_table_t*	find_table);	/*!< in: table to find */
+/**********************************************************************//**
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+	const dict_table_t*	find_table);	/*!< in: table to find */
+#endif /* UNIV_DEBUG */
 
 /* Stream for storing detailed information about the latest foreign key
-and unique key errors */
+and unique key errors. Only created if !srv_read_only_mode */
 UNIV_INTERN FILE*	dict_foreign_err_file		= NULL;
 /* mutex protecting the foreign and unique error buffers */
-UNIV_INTERN mutex_t	dict_foreign_err_mutex;
-#endif /* !UNIV_HOTBACKUP */
+UNIV_INTERN ib_mutex_t	dict_foreign_err_mutex;
+
 /******************************************************************//**
 Makes all characters in a NUL-terminated UTF-8 string lower case. */
 UNIV_INTERN
@@ -273,7 +334,7 @@ dict_mutex_exit_for_mysql(void)
 
 /** Get the latch that protects the stats of a given table */
 #define GET_TABLE_STATS_LATCH(table) \
-	(&dict_table_stats_latches[ut_fold_ull(table->id) \
+	(&dict_table_stats_latches[ut_fold_ull((ib_uint64_t) table) \
 				   % DICT_TABLE_STATS_LATCHES_SIZE])
 
 /**********************************************************************//**
@@ -332,26 +393,135 @@ dict_table_stats_unlock(
 	}
 }
 
+/**********************************************************************//**
+Try to drop any indexes after an aborted index creation.
+This can also be after a server kill during DROP INDEX. */
+static
+void
+dict_table_try_drop_aborted(
+/*========================*/
+	dict_table_t*	table,		/*!< in: table, or NULL if it
+					needs to be looked up again */
+	table_id_t	table_id,	/*!< in: table identifier */
+	ulint		ref_count)	/*!< in: expected table->n_ref_count */
+{
+	trx_t*		trx;
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "try to drop any indexes after an aborted index creation";
+	row_mysql_lock_data_dictionary(trx);
+	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+	if (table == NULL) {
+		table = dict_table_open_on_id_low(
+			table_id, DICT_ERR_IGNORE_NONE);
+	} else {
+		ut_ad(table->id == table_id);
+	}
+
+	if (table && table->n_ref_count == ref_count && table->drop_aborted) {
+		/* Silence a debug assertion in row_merge_drop_indexes(). */
+		ut_d(table->n_ref_count++);
+		row_merge_drop_indexes(trx, table, TRUE);
+		ut_d(table->n_ref_count--);
+		ut_ad(table->n_ref_count == ref_count);
+		trx_commit_for_mysql(trx);
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_background(trx);
+}
+
+/**********************************************************************//**
+When opening a table,
+try to drop any indexes after an aborted index creation.
+Release the dict_sys->mutex. */
+static
+void
+dict_table_try_drop_aborted_and_mutex_exit(
+/*=======================================*/
+	dict_table_t*	table,		/*!< in: table (may be NULL) */
+	ibool		try_drop)	/*!< in: TRUE if should try to
+					drop indexes whose online creation
+					was aborted */
+{
+	if (try_drop
+	    && table != NULL
+	    && table->drop_aborted
+	    && table->n_ref_count == 1
+	    && dict_table_get_first_index(table)) {
+
+		/* Attempt to drop the indexes whose online creation
+		was aborted. */
+		table_id_t	table_id = table->id;
+
+		mutex_exit(&dict_sys->mutex);
+
+		dict_table_try_drop_aborted(table, table_id, 1);
+	} else {
+		mutex_exit(&dict_sys->mutex);
+	}
+}
+
 /********************************************************************//**
-Decrements the count of open MySQL handles to a table. */
+Decrements the count of open handles to a table. */
 UNIV_INTERN
 void
-dict_table_decrement_handle_count(
-/*==============================*/
+dict_table_close(
+/*=============*/
 	dict_table_t*	table,		/*!< in/out: table */
-	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop)	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
 {
 	if (!dict_locked) {
 		mutex_enter(&dict_sys->mutex);
 	}
 
 	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_a(table->n_mysql_handles_opened > 0);
+	ut_a(table->n_ref_count > 0);
+
+	--table->n_ref_count;
 
-	table->n_mysql_handles_opened--;
+	/* Force persistent stats re-read upon next open of the table
+	so that FLUSH TABLE can be used to forcibly fetch stats from disk
+	if they have been manually modified. We reset table->stat_initialized
+	only if table reference count is 0 because we do not want too frequent
+	stats re-reads (e.g. in other cases than FLUSH TABLE). */
+	if (strchr(table->name, '/') != NULL
+	    && table->n_ref_count == 0
+	    && dict_stats_is_persistent_enabled(table)) {
+
+		dict_stats_deinit(table);
+	}
+
+	MONITOR_DEC(MONITOR_TABLE_REFERENCE);
+
+	ut_ad(dict_lru_validate());
+
+#ifdef UNIV_DEBUG
+	if (table->can_be_evicted) {
+		ut_ad(dict_lru_find_table(table));
+	} else {
+		ut_ad(dict_non_lru_find_table(table));
+	}
+#endif /* UNIV_DEBUG */
 
 	if (!dict_locked) {
+		table_id_t	table_id	= table->id;
+		ibool		drop_aborted;
+
+		drop_aborted = try_drop
+			&& table->drop_aborted
+			&& table->n_ref_count == 1
+			&& dict_table_get_first_index(table);
+
 		mutex_exit(&dict_sys->mutex);
+
+		if (drop_aborted) {
+			dict_table_try_drop_aborted(NULL, table_id, 0);
+		}
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
@@ -410,6 +580,33 @@ dict_table_autoinc_initialize(
 	table->autoinc = value;
 }
 
+/********************************************************************//**
+Get all the FTS indexes on a table.
+@return	number of FTS indexes */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+	dict_table_t*   table,          /*!< in: table */
+	ib_vector_t*    indexes)        /*!< out: all FTS indexes on this
+					table */
+{
+	dict_index_t* index;
+
+	ut_a(ib_vector_size(indexes) == 0);
+
+	for (index = dict_table_get_first_index(table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index->type == DICT_FTS) {
+			ib_vector_push(indexes, &index);
+		}
+	}
+
+	return(ib_vector_size(indexes));
+}
+
 /********************************************************************//**
 Reads the next autoinc value (== autoinc counter value), 0 if not yet
 initialized.
@@ -454,34 +651,6 @@ dict_table_autoinc_unlock(
 {
 	mutex_exit(&table->autoinc_mutex);
 }
-
-/**********************************************************************//**
-Looks for an index with the given table and index id.
-NOTE that we do not reserve the dictionary mutex.
-@return	index or NULL if not found from cache */
-UNIV_INTERN
-dict_index_t*
-dict_index_get_on_id_low(
-/*=====================*/
-	dict_table_t*	table,	/*!< in: table */
-	index_id_t	id)	/*!< in: index id */
-{
-	dict_index_t*	index;
-
-	index = dict_table_get_first_index(table);
-
-	while (index) {
-		if (id == index->id) {
-			/* Found */
-
-			return(index);
-		}
-
-		index = dict_table_get_next_index(index);
-	}
-
-	return(NULL);
-}
 #endif /* !UNIV_HOTBACKUP */
 
 /********************************************************************//**
@@ -527,20 +696,6 @@ dict_index_get_nth_col_or_prefix_pos(
 	return(ULINT_UNDEFINED);
 }
 
-/********************************************************************//**
-Looks for column n in an index.
-@return position in internal representation of the index;
-ULINT_UNDEFINED if not contained */
-UNIV_INTERN
-ulint
-dict_index_get_nth_col_pos(
-/*=======================*/
-	const dict_index_t*	index,	/*!< in: index */
-	ulint			n)	/*!< in: column number */
-{
-	return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
-}
-
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
 Returns TRUE if the index contains a column or a prefix of that column.
@@ -628,30 +783,41 @@ Returns a table object based on table id.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get_on_id(
-/*=================*/
+dict_table_open_on_id(
+/*==================*/
 	table_id_t	table_id,	/*!< in: table id */
-	trx_t*		trx)		/*!< in: transaction handle */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	dict_table_op_t	table_op)	/*!< in: operation to perform */
 {
 	dict_table_t*	table;
 
-	if (trx->dict_operation_lock_mode == RW_X_LATCH) {
+	if (!dict_locked) {
+		mutex_enter(&dict_sys->mutex);
+	}
 
-		/* Note: An X latch implies that the transaction
-		already owns the dictionary mutex. */
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-		ut_ad(mutex_own(&dict_sys->mutex));
+	table = dict_table_open_on_id_low(
+		table_id,
+		table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+		? DICT_ERR_IGNORE_RECOVER_LOCK
+		: DICT_ERR_IGNORE_NONE);
 
-		return(dict_table_get_on_id_low(table_id));
-	}
+	if (table != NULL) {
 
-	mutex_enter(&(dict_sys->mutex));
+		if (table->can_be_evicted) {
+			dict_move_to_mru(table);
+		}
 
-	table = dict_table_get_on_id_low(table_id);
+		++table->n_ref_count;
 
-	dict_table_LRU_trim(table);
+		MONITOR_INC(MONITOR_TABLE_REFERENCE);
+	}
 
-	mutex_exit(&(dict_sys->mutex));
+	if (!dict_locked) {
+		dict_table_try_drop_aborted_and_mutex_exit(
+			table, table_op == DICT_TABLE_OP_DROP_ORPHAN);
+	}
 
 	return(table);
 }
@@ -716,7 +882,7 @@ dict_init(void)
 {
 	int	i;
 
-	dict_sys = mem_alloc(sizeof(dict_sys_t));
+	dict_sys = static_cast<dict_sys_t*>(mem_zalloc(sizeof(*dict_sys)));
 
 	mutex_create(dict_sys_mutex_key, &dict_sys->mutex, SYNC_DICT);
 
@@ -726,18 +892,16 @@ dict_init(void)
 	dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size()
 					      / (DICT_POOL_PER_TABLE_HASH
 						 * UNIV_WORD_SIZE));
-	dict_sys->size = 0;
-
-	UT_LIST_INIT(dict_sys->table_LRU);
-
 	rw_lock_create(dict_operation_lock_key,
 		       &dict_operation_lock, SYNC_DICT_OPERATION);
 
-	dict_foreign_err_file = os_file_create_tmpfile();
-	ut_a(dict_foreign_err_file);
+	if (!srv_read_only_mode) {
+		dict_foreign_err_file = os_file_create_tmpfile();
+		ut_a(dict_foreign_err_file);
 
-	mutex_create(dict_foreign_err_mutex_key,
-		     &dict_foreign_err_mutex, SYNC_ANY_LATCH);
+		mutex_create(dict_foreign_err_mutex_key,
+			     &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK);
+	}
 
 	for (i = 0; i < DICT_TABLE_STATS_LATCHES_SIZE; i++) {
 		rw_lock_create(dict_table_stats_latch_key,
@@ -746,45 +910,100 @@ dict_init(void)
 }
 
 /**********************************************************************//**
-Returns a table object and optionally increment its MySQL open handle count.
+Move to the most recently used segment of the LRU list. */
+UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+	dict_table_t*	table)		/*!< in: table to move to MRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_lru_validate());
+	ut_ad(dict_lru_find_table(table));
+
+	ut_a(table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+	UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+	ut_ad(dict_lru_validate());
+}
+
+/**********************************************************************//**
+Returns a table object and increment its open handle count.
 NOTE! This is a high-level function to be used mainly from outside the
-'dict' directory. Inside this directory dict_table_get_low is usually the
-appropriate function.
+'dict' module. Inside this module dict_table_get_low
+is usually the appropriate function.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get(
-/*===========*/
-	const char*		table_name,	/*!< in: table name */
-	ibool			inc_mysql_count,/*!< in: whether to increment
-						the open handle count on the
-						table */
-	dict_err_ignore_t	ignore_err)	/*!< in: errors to ignore when
-						loading the table */
+dict_table_open_on_name(
+/*====================*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop,	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
+	dict_err_ignore_t
+			ignore_err)	/*!< in: error to be ignored when
+					loading a table definition */
 {
 	dict_table_t*	table;
 
-	mutex_enter(&(dict_sys->mutex));
+	if (!dict_locked) {
+		mutex_enter(&(dict_sys->mutex));
+	}
 
-	table = dict_table_get_low(table_name, ignore_err);
+	ut_ad(table_name);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	table = dict_table_check_if_in_cache_low(table_name);
 
-	if (inc_mysql_count && table) {
-		table->n_mysql_handles_opened++;
+	if (table == NULL) {
+		table = dict_load_table(table_name, TRUE, ignore_err);
 	}
 
-	dict_table_LRU_trim(table);
+	ut_ad(!table || table->cached);
 
-	mutex_exit(&(dict_sys->mutex));
+	if (table != NULL) {
+
+		/* If table is corrupted, return NULL */
+		if (ignore_err == DICT_ERR_IGNORE_NONE
+		    && table->corrupted) {
+
+			/* Make life easy for drop table. */
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			if (!dict_locked) {
+				mutex_exit(&dict_sys->mutex);
+			}
+
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr, "  InnoDB: table ");
+			ut_print_name(stderr, NULL, TRUE, table->name);
+			fprintf(stderr, "is corrupted. Please drop the table "
+				"and recreate\n");
+
+			return(NULL);
+		}
+
+		if (table->can_be_evicted) {
+			dict_move_to_mru(table);
+		}
+
+		++table->n_ref_count;
+
+		MONITOR_INC(MONITOR_TABLE_REFERENCE);
+	}
+
+	ut_ad(dict_lru_validate());
 
-	if (table != NULL && !table->is_corrupt) {
-		/* If table->ibd_file_missing == TRUE, this will
-		print an error message and return without doing
-		anything. */
-		dict_update_statistics(
-			table,
-			TRUE, /* only update stats if not initialized */
-			FALSE,
-			FALSE /* update even if not changed too much */);
+	if (!dict_locked) {
+		dict_table_try_drop_aborted_and_mutex_exit(table, try_drop);
 	}
 
 	return(table);
@@ -844,14 +1063,17 @@ UNIV_INTERN
 void
 dict_table_add_to_cache(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	mem_heap_t*	heap)	/*!< in: temporary heap */
+	dict_table_t*	table,		/*!< in: table */
+	ibool		can_be_evicted,	/*!< in: TRUE if can be evicted */
+	mem_heap_t*	heap)		/*!< in: temporary heap */
 {
 	ulint	fold;
 	ulint	id_fold;
 	ulint	i;
 	ulint	row_len;
 
+	ut_ad(dict_lru_validate());
+
 	/* The lower limit for what we consider a "big" row */
 #define BIG_ROW_SIZE 1024
 
@@ -923,18 +1145,215 @@ dict_table_add_to_cache(
 	/* Add table to hash table of tables based on table id */
 	HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold,
 		    table);
-	/* Add table to LRU list of tables */
-	UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+	table->can_be_evicted = can_be_evicted;
+
+	if (table->can_be_evicted) {
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+	} else {
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_non_LRU, table);
+	}
+
+	ut_ad(dict_lru_validate());
 
 	dict_sys->size += mem_heap_get_size(table->heap)
 		+ strlen(table->name) + 1;
 }
 
 /**********************************************************************//**
+Test whether a table can be evicted from the LRU cache.
+@return TRUE if table can be evicted. */
+static
+ibool
+dict_table_can_be_evicted(
+/*======================*/
+	const dict_table_t*	table)		/*!< in: table to test */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_a(table->can_be_evicted);
+	ut_a(UT_LIST_GET_LEN(table->foreign_list) == 0);
+	ut_a(UT_LIST_GET_LEN(table->referenced_list) == 0);
+
+	if (table->n_ref_count == 0) {
+		dict_index_t*	index;
+
+		/* The transaction commit and rollback are called from
+		outside the handler interface. This means that there is
+		a window where the table->n_ref_count can be zero but
+		the table instance is in "use". */
+
+		if (lock_table_has_locks(table)) {
+			return(FALSE);
+		}
+
+		for (index = dict_table_get_first_index(table);
+		     index != NULL;
+		     index = dict_table_get_next_index(index)) {
+
+			btr_search_t*	info = btr_search_get_info(index);
+
+			/* We are not allowed to free the in-memory index
+			struct dict_index_t until all entries in the adaptive
+			hash index that point to any of the pages belonging to
+			this b-tree index are dropped. This is so because
+			dropping of these entries requires access to
+			dict_index_t struct. To avoid such scenario we keep
+			a count of number of such pages in the search_info and
+			only free the dict_index_t struct when this count
+			drops to zero.
+
+			See also: dict_index_remove_from_cache_low() */
+
+			if (btr_search_info_get_ref_count(info, index) > 0) {
+				return(FALSE);
+			}
+		}
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of FK relationship and currently not used in any user
+transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. If the number of tables in
+dict_sys->table_LRU is less than max_tables, nothing is evicted. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+	ulint		max_tables,	/*!< in: max tables allowed in cache */
+	ulint		pct_check)	/*!< in: max percent to check */
+{
+	ulint		i;
+	ulint		len;
+	dict_table_t*	table;
+	ulint		check_up_to;
+	ulint		n_evicted = 0;
+
+	ut_a(pct_check > 0);
+	ut_a(pct_check <= 100);
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(dict_lru_validate());
+
+	i = len = UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	if (len < max_tables) {
+		return(0);
+	}
+
+	check_up_to = len - ((len * pct_check) / 100);
+
+	/* Check for overflow */
+	ut_a(i == 0 || check_up_to <= i);
+
+	/* Find a suitable candidate to evict from the cache. Don't scan the
+	entire LRU list. Only scan pct_check list entries. */
+
+	for (table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+	     table != NULL
+	     && i > check_up_to
+	     && (len - n_evicted) > max_tables;
+	     --i) {
+
+		dict_table_t*	prev_table;
+
+	        prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+		if (dict_table_can_be_evicted(table)) {
+
+			dict_table_remove_from_cache_low(table, TRUE);
+
+			++n_evicted;
+		}
+
+		table = prev_table;
+	}
+
+	return(n_evicted);
+}
+
+/**********************************************************************//**
+Move a table to the non-LRU list from the LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from LRU to non-LRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_lru_find_table(table));
+
+	ut_a(table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+	UT_LIST_ADD_LAST(table_LRU, dict_sys->table_non_LRU, table);
+
+	table->can_be_evicted = FALSE;
+}
+
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from non-LRU to LRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_non_lru_find_table(table));
+
+	ut_a(!table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+
+	UT_LIST_ADD_LAST(table_LRU, dict_sys->table_LRU, table);
+
+	table->can_be_evicted = TRUE;
+}
+
+/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return	index or NULL */
+static
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+	const dict_table_t*	table,	/*!< in: table instance */
+	index_id_t		id)	/*!< in: index id */
+{
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (id == index->id) {
+			/* Found */
+
+			return(index);
+		}
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
 Looks for an index with the given id. NOTE that we do not reserve
 the dictionary mutex: this function is for emergency purposes like
 printing info of a corrupt database page!
-@return	index or NULL if not found from cache */
+@return	index or NULL if not found in cache */
 UNIV_INTERN
 dict_index_t*
 dict_index_find_on_id_low(
@@ -942,29 +1361,32 @@ dict_index_find_on_id_low(
 	index_id_t	id)	/*!< in: index id */
 {
 	dict_table_t*	table;
-	dict_index_t*	index;
 
 	/* This can happen if the system tablespace is the wrong page size */
 	if (dict_sys == NULL) {
 		return(NULL);
 	}
 
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	while (table) {
-		index = dict_table_get_first_index(table);
+		dict_index_t*	index = dict_table_find_index_on_id(table, id);
 
-		while (index) {
-			if (id == index->id) {
-				/* Found */
+		if (index != NULL) {
+			return(index);
+		}
+	}
 
-				return(index);
-			}
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-			index = dict_table_get_next_index(index);
-		}
+		dict_index_t*	index = dict_table_find_index_on_id(table, id);
 
-		table = UT_LIST_GET_NEXT(table_LRU, table);
+		if (index != NULL) {
+			return(index);
+		}
 	}
 
 	return(NULL);
@@ -974,7 +1396,7 @@ dict_index_find_on_id_low(
 Renames a table object.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
+dberr_t
 dict_table_rename_in_cache(
 /*=======================*/
 	dict_table_t*	table,		/*!< in/out: table */
@@ -988,7 +1410,6 @@ dict_table_rename_in_cache(
 	ulint		fold;
 	char		old_name[MAX_FULL_NAME_LEN + 1];
 
-	ut_ad(table);
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	/* store the old/current name to an automatic variable */
@@ -1005,28 +1426,60 @@ dict_table_rename_in_cache(
 	fold = ut_fold_string(new_name);
 
 	/* Look for a table with the same name: error if such exists */
-	{
-		dict_table_t*	table2;
-		HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
-			    dict_table_t*, table2, ut_ad(table2->cached),
-			    (ut_strcmp(table2->name, new_name) == 0));
-		if (UNIV_LIKELY_NULL(table2)) {
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: dictionary cache"
-			      " already contains a table ", stderr);
-			ut_print_name(stderr, NULL, TRUE, new_name);
-			fputs("\n"
-			      "InnoDB: cannot rename table ", stderr);
-			ut_print_name(stderr, NULL, TRUE, old_name);
-			putc('\n', stderr);
-			return(FALSE);
-		}
+	dict_table_t*	table2;
+	HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+			dict_table_t*, table2, ut_ad(table2->cached),
+			(ut_strcmp(table2->name, new_name) == 0));
+	DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure",
+		if (table2 == NULL) {
+			table2 = (dict_table_t*) -1;
+		} );
+	if (table2) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot rename table '%s' to '%s' since the "
+			"dictionary cache already contains '%s'.",
+			old_name, new_name, new_name);
+		return(DB_ERROR);
 	}
 
 	/* If the table is stored in a single-table tablespace, rename the
-	.ibd file */
+	.ibd file and rebuild the .isl file if needed. */
+
+	if (dict_table_is_discarded(table)) {
+		os_file_type_t	type;
+		ibool		exists;
+		char*		filepath;
+
+		ut_ad(table->space != TRX_SYS_SPACE);
+
+		if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+
+			dict_get_and_save_data_dir_path(table, true);
+			ut_a(table->data_dir_path);
+
+			filepath = os_file_make_remote_pathname(
+				table->data_dir_path, table->name, "ibd");
+		} else {
+			filepath = fil_make_ibd_name(table->name, false);
+		}
+
+		fil_delete_tablespace(table->space, BUF_REMOVE_ALL_NO_WRITE);
+
+		/* Delete any temp file hanging around. */
+		if (os_file_status(filepath, &exists, &type)
+		    && exists
+		    && !os_file_delete_if_exists(innodb_file_temp_key,
+						 filepath)) {
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Delete of %s failed.", filepath);
+		}
+
+		mem_free(filepath);
+
+	} else if (table->space != TRX_SYS_SPACE) {
+		char*	new_path = NULL;
 
-	if (table->space != 0) {
 		if (table->dir_path_of_temp_table != NULL) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Error: trying to rename a"
@@ -1036,10 +1489,40 @@ dict_table_rename_in_cache(
 			ut_print_filename(stderr,
 					  table->dir_path_of_temp_table);
 			fputs(" )\n", stderr);
-			return(FALSE);
-		} else if (!fil_rename_tablespace(old_name, table->space,
-						  new_name)) {
-			return(FALSE);
+			return(DB_ERROR);
+
+		} else if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+			char*		old_path;
+
+			old_path = fil_space_get_first_path(table->space);
+
+			new_path = os_file_make_new_pathname(
+				old_path, new_name);
+
+			mem_free(old_path);
+
+			dberr_t	err = fil_create_link_file(
+				new_name, new_path);
+
+			if (err != DB_SUCCESS) {
+				mem_free(new_path);
+				return(DB_TABLESPACE_EXISTS);
+			}
+		}
+
+		ibool	success = fil_rename_tablespace(
+			old_name, table->space, new_name, new_path);
+
+		/* If the tablespace is remote, a new .isl file was created.
+		On success, delete the old one. If not, delete the new one. */
+		if (new_path) {
+
+			mem_free(new_path);
+			fil_delete_link_file(success ? old_name : new_name);
+		}
+
+		if (!success) {
+			return(DB_ERROR);
 		}
 	}
 
@@ -1052,7 +1535,9 @@ dict_table_rename_in_cache(
 		memory fragmentation, we assume a repeated calls of
 		ut_realloc() with the same size do not cause fragmentation */
 		ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN);
-		table->name = ut_realloc(table->name, MAX_FULL_NAME_LEN + 1);
+
+		table->name = static_cast<char*>(
+			ut_realloc(table->name, MAX_FULL_NAME_LEN + 1));
 	}
 	memcpy(table->name, new_name, strlen(new_name) + 1);
 
@@ -1064,12 +1549,11 @@ dict_table_rename_in_cache(
 	ut_a(dict_sys->size > 0);
 
 	/* Update the table_name field in indexes */
-	index = dict_table_get_first_index(table);
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
 
-	while (index != NULL) {
 		index->table_name = table->name;
-
-		index = dict_table_get_next_index(index);
 	}
 
 	if (!rename_also_foreigns) {
@@ -1104,7 +1588,7 @@ dict_table_rename_in_cache(
 
 		UT_LIST_INIT(table->referenced_list);
 
-		return(TRUE);
+		return(DB_SUCCESS);
 	}
 
 	/* Update the table name fields in foreign constraints, and update also
@@ -1200,10 +1684,11 @@ dict_table_rename_in_cache(
 				uint	errors = 0;
 
 				if (strlen(table->name) > strlen(old_name)) {
-					foreign->id = mem_heap_alloc(
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
 						foreign->heap,
 						strlen(table->name)
-						+ strlen(old_id) + 1);
+						+ strlen(old_id) + 1));
 				}
 
 				/* Convert the table name to UTF-8 */
@@ -1244,9 +1729,10 @@ dict_table_rename_in_cache(
 				if (dict_get_db_name_len(table->name)
 				    > dict_get_db_name_len(foreign->id)) {
 
-					foreign->id = mem_heap_alloc(
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
 						foreign->heap,
-						db_len + strlen(old_id) + 1);
+						db_len + strlen(old_id) + 1));
 				}
 
 				/* Replace the database prefix in id with the
@@ -1264,9 +1750,10 @@ dict_table_rename_in_cache(
 		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
 	}
 
-	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-	while (foreign != NULL) {
 		if (ut_strlen(foreign->referenced_table_name)
 		    < ut_strlen(table->name)) {
 			/* Allocate a longer name buffer;
@@ -1274,16 +1761,19 @@ dict_table_rename_in_cache(
 
 			foreign->referenced_table_name = mem_heap_strdup(
 				foreign->heap, table->name);
-			dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, TRUE);
 		} else {
 			/* Use the same buffer */
 			strcpy(foreign->referenced_table_name, table->name);
-			dict_mem_referenced_table_name_lookup_set(foreign, FALSE);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, FALSE);
 		}
-		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
 	}
 
-	return(TRUE);
+	return(DB_SUCCESS);
 }
 
 /**********************************************************************//**
@@ -1313,17 +1803,22 @@ dict_table_change_id_in_cache(
 
 /**********************************************************************//**
 Removes a table object from the dictionary cache. */
-UNIV_INTERN
+static
 void
-dict_table_remove_from_cache(
-/*=========================*/
-	dict_table_t*	table)	/*!< in, own: table */
+dict_table_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in, own: table */
+	ibool		lru_evict)	/*!< in: TRUE if table being evicted
+					to make room in the table LRU list */
 {
 	dict_foreign_t*	foreign;
 	dict_index_t*	index;
 	ulint		size;
 
 	ut_ad(table);
+	ut_ad(dict_lru_validate());
+	ut_a(table->n_ref_count == 0);
+	ut_a(table->n_rec_locks == 0);
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 
@@ -1334,40 +1829,75 @@ dict_table_remove_from_cache(
 #endif
 
 	/* Remove the foreign constraints from the cache */
-	foreign = UT_LIST_GET_LAST(table->foreign_list);
 
-	while (foreign != NULL) {
+	for (foreign = UT_LIST_GET_LAST(table->foreign_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_LAST(table->foreign_list)) {
+
 		dict_foreign_remove_from_cache(foreign);
-		foreign = UT_LIST_GET_LAST(table->foreign_list);
 	}
 
 	/* Reset table field in referencing constraints */
 
-	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-	while (foreign != NULL) {
 		foreign->referenced_table = NULL;
 		foreign->referenced_index = NULL;
-
-		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
 	}
 
 	/* Remove the indexes from the cache */
-	index = UT_LIST_GET_LAST(table->indexes);
 
-	while (index != NULL) {
-		dict_index_remove_from_cache(table, index);
-		index = UT_LIST_GET_LAST(table->indexes);
+	for (index = UT_LIST_GET_LAST(table->indexes);
+	     index != NULL;
+	     index = UT_LIST_GET_LAST(table->indexes)) {
+
+		dict_index_remove_from_cache_low(table, index, lru_evict);
 	}
 
 	/* Remove table from the hash tables of tables */
+
 	HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
 		    ut_fold_string(table->name), table);
+
 	HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
 		    ut_fold_ull(table->id), table);
 
-	/* Remove table from LRU list of tables */
-	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+	/* Remove table from LRU or non-LRU list. */
+	if (table->can_be_evicted) {
+		ut_ad(dict_lru_find_table(table));
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+	} else {
+		ut_ad(dict_non_lru_find_table(table));
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+	}
+
+	ut_ad(dict_lru_validate());
+
+	if (lru_evict && table->drop_aborted) {
+		/* Do as dict_table_try_drop_aborted() does. */
+
+		trx_t* trx = trx_allocate_for_background();
+
+		ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+		/* Mimic row_mysql_lock_data_dictionary(). */
+		trx->dict_operation_lock_mode = RW_X_LATCH;
+
+		trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+		/* Silence a debug assertion in row_merge_drop_indexes(). */
+		ut_d(table->n_ref_count++);
+		row_merge_drop_indexes(trx, table, TRUE);
+		ut_d(table->n_ref_count--);
+		ut_ad(table->n_ref_count == 0);
+		trx_commit_for_mysql(trx);
+		trx->dict_operation_lock_mode = 0;
+		trx_free_for_background(trx);
+	}
 
 	size = mem_heap_get_size(table->heap) + strlen(table->name) + 1;
 
@@ -1378,62 +1908,15 @@ dict_table_remove_from_cache(
 	dict_mem_table_free(table);
 }
 
-/**************************************************************************
-Frees tables from the end of table_LRU if the dictionary cache occupies
-too much space. */
+/**********************************************************************//**
+Removes a table object from the dictionary cache (with lru_evict = FALSE). */
 UNIV_INTERN
 void
-dict_table_LRU_trim(
-/*================*/
-	dict_table_t*	self)
+dict_table_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table)	/*!< in, own: table */
 {
-	dict_table_t*	table;
-	dict_table_t*	prev_table;
-	dict_foreign_t*	foreign;
-	ulint		n_removed;
-	ulint		n_have_parent;
-	ulint		cached_foreign_tables;
-
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(mutex_own(&(dict_sys->mutex)));
-#endif /* UNIV_SYNC_DEBUG */
-
-retry:
-	n_removed = n_have_parent = 0;
-	table = UT_LIST_GET_LAST(dict_sys->table_LRU);
-
-	while ( srv_dict_size_limit && table
-		&& ((dict_sys->table_hash->n_cells
-		     + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
-		    + dict_sys->size) > srv_dict_size_limit ) {
-		prev_table = UT_LIST_GET_PREV(table_LRU, table);
-
-		if (table == self || table->n_mysql_handles_opened || table->is_corrupt)
-			goto next_loop;
-
-		cached_foreign_tables = 0;
-		foreign = UT_LIST_GET_FIRST(table->foreign_list);
-		while (foreign != NULL) {
-			if (foreign->referenced_table)
-				cached_foreign_tables++;
-			foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
-		}
-
-		if (cached_foreign_tables == 0) {
-			dict_table_remove_from_cache(table);
-			n_removed++;
-		} else {
-			n_have_parent++;
-		}
-next_loop:
-		table = prev_table;
-	}
-
-	if ( srv_dict_size_limit && n_have_parent && n_removed
-		&& ((dict_sys->table_hash->n_cells
-		     + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
-		    + dict_sys->size) > srv_dict_size_limit )
-		goto retry;
+	dict_table_remove_from_cache_low(table, FALSE);
 }
 
 /****************************************************************//**
@@ -1468,6 +1951,11 @@ dict_col_name_is_reserved(
 	return(FALSE);
 }
 
+#if 1	/* This function is not very accurate at determining
+	whether an UNDO record will be too big. See innodb_4k.test,
+	Bug 13336585, for a testcase that shows an index that can
+	be created but cannot be updated. */
+
 /****************************************************************//**
 If an undo log record for this table might not fit on a single page,
 return TRUE.
@@ -1496,7 +1984,13 @@ dict_index_too_big_for_undo(
 		+ 10 + FIL_PAGE_DATA_END /* trx_undo_left() */
 		+ 2/* pointer to previous undo log record */;
 
-	if (UNIV_UNLIKELY(!clust_index)) {
+	/* FTS index consists of auxiliary tables, they shall be excluded from
+	index row size check */
+	if (new_index->type & DICT_FTS) {
+		return(false);
+	}
+
+	if (!clust_index) {
 		ut_a(dict_index_is_clust(new_index));
 		clust_index = new_index;
 	}
@@ -1583,7 +2077,7 @@ is_ord_part:
 			/* We only store the needed prefix length in undo log */
 			if (max_prefix) {
 			     ut_ad(dict_table_get_format(table)
-				   >= DICT_TF_FORMAT_ZIP);
+				   >= UNIV_FORMAT_B);
 
 				max_size = ut_min(max_prefix, max_size);
 			}
@@ -1596,6 +2090,7 @@ is_ord_part:
 
 	return(undo_page_len >= UNIV_PAGE_SIZE);
 }
+#endif
 
 /****************************************************************//**
 If a record of this index might not fit on a single B-tree page,
@@ -1618,6 +2113,12 @@ dict_index_too_big_for_tree(
 	/* maximum allowed size of a node pointer record */
 	ulint	page_ptr_max;
 
+	/* FTS index consists of auxiliary tables, they shall be excluded from
+	index row size check */
+	if (new_index->type & DICT_FTS) {
+		return(false);
+	}
+
 	DBUG_EXECUTE_IF(
 		"ib_force_create_table",
 		return(FALSE););
@@ -1757,7 +2258,7 @@ add_field_size:
 Adds an index to the dictionary cache.
 @return	DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
 UNIV_INTERN
-ulint
+dberr_t
 dict_index_add_to_cache(
 /*====================*/
 	dict_table_t*	table,	/*!< in: table on which the index is */
@@ -1776,6 +2277,7 @@ dict_index_add_to_cache(
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_ad(index->n_def == index->n_fields);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(!dict_index_is_online_ddl(index));
 
 	ut_ad(mem_heap_validate(index->heap));
 	ut_a(!dict_index_is_clust(index)
@@ -1790,7 +2292,9 @@ dict_index_add_to_cache(
 	/* Build the cache internal representation of the index,
 	containing also the added system fields */
 
-	if (dict_index_is_clust(index)) {
+	if (index->type == DICT_FTS) {
+		new_index = dict_index_build_internal_fts(table, index);
+	} else if (dict_index_is_clust(index)) {
 		new_index = dict_index_build_internal_clust(table, index);
 	} else {
 		new_index = dict_index_build_internal_non_clust(table, index);
@@ -1800,6 +2304,7 @@ dict_index_add_to_cache(
 	number of fields in the cache internal representation */
 
 	new_index->n_fields = new_index->n_def;
+	new_index->trx_id = index->trx_id;
 
 	if (strict && dict_index_too_big_for_tree(table, new_index)) {
 too_big:
@@ -1808,21 +2313,27 @@ too_big:
 		return(DB_TOO_BIG_RECORD);
 	}
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		n_ord = new_index->n_fields;
 	} else {
 		n_ord = new_index->n_uniq;
 	}
 
+#if 1	/* The following code predetermines whether to call
+	dict_index_too_big_for_undo().  This function is not
+	accurate. See innodb_4k.test, Bug 13336585, for a
+	testcase that shows an index that can be created but
+	cannot be updated. */
+
 	switch (dict_table_get_format(table)) {
-	case DICT_TF_FORMAT_51:
+	case UNIV_FORMAT_A:
 		/* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
 		prefixes of externally stored columns locally within
 		the record.  There are no special considerations for
 		the undo log record size. */
 		goto undo_size_ok;
 
-	case DICT_TF_FORMAT_ZIP:
+	case UNIV_FORMAT_B:
 		/* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
 		column prefix indexes require that prefixes of
 		externally stored columns are written to the undo log.
@@ -1832,8 +2343,8 @@ too_big:
 		checked for below. */
 		break;
 
-#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
-# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#if UNIV_FORMAT_B != UNIV_FORMAT_MAX
+# error "UNIV_FORMAT_B != UNIV_FORMAT_MAX"
 #endif
 	}
 
@@ -1872,6 +2383,7 @@ too_big:
 	}
 
 undo_size_ok:
+#endif
 	/* Flag the ordering columns and also set column max_prefix */
 
 	for (i = 0; i < n_ord; i++) {
@@ -1885,48 +2397,42 @@ undo_size_ok:
 		}
 	}
 
+	if (!dict_index_is_univ(new_index)) {
+
+		new_index->stat_n_diff_key_vals =
+			static_cast<ib_uint64_t*>(mem_heap_zalloc(
+			new_index->heap,
+			dict_index_get_n_unique(new_index)
+			* sizeof(*new_index->stat_n_diff_key_vals)));
+
+		new_index->stat_n_sample_sizes =
+			static_cast<ib_uint64_t*>(mem_heap_zalloc(
+			new_index->heap,
+			dict_index_get_n_unique(new_index)
+			* sizeof(*new_index->stat_n_sample_sizes)));
+
+		new_index->stat_n_non_null_key_vals =
+			static_cast<ib_uint64_t*>(mem_heap_zalloc(
+			new_index->heap,
+			dict_index_get_n_unique(new_index)
+			* sizeof(*new_index->stat_n_non_null_key_vals)));
+	}
+
+	new_index->stat_index_size = 1;
+	new_index->stat_n_leaf_pages = 1;
+
 	/* Add the new index as the last index for the table */
 
 	UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
 	new_index->table = table;
 	new_index->table_name = table->name;
-
 	new_index->search_info = btr_search_info_create(new_index->heap);
 
-	new_index->stat_index_size = 1;
-	new_index->stat_n_leaf_pages = 1;
-
 	new_index->page = page_no;
 	rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
 		       dict_index_is_ibuf(index)
 		       ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE);
 
-	DBUG_EXECUTE_IF(
-		"index_partially_created_should_kick",
-		DEBUG_SYNC_C("index_partially_created");
-	);
-
-	if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
-
-		new_index->stat_n_diff_key_vals = mem_heap_alloc(
-			new_index->heap,
-			(1 + dict_index_get_n_unique(new_index))
-			* sizeof(ib_int64_t));
-
-		new_index->stat_n_non_null_key_vals = mem_heap_zalloc(
-			new_index->heap,
-			(1 + dict_index_get_n_unique(new_index))
-			* sizeof(*new_index->stat_n_non_null_key_vals));
-
-		/* Give some sensible values to stat_n_... in case we do
-		not calculate statistics quickly enough */
-
-		for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
-
-			new_index->stat_n_diff_key_vals[i] = 100;
-		}
-	}
-
 	dict_sys->size += mem_heap_get_size(new_index->heap);
 
 	dict_mem_index_free(index);
@@ -1936,12 +2442,14 @@ undo_size_ok:
 
 /**********************************************************************//**
 Removes an index from the dictionary cache. */
-UNIV_INTERN
+static
 void
-dict_index_remove_from_cache(
-/*=========================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	dict_index_t*	index)	/*!< in, own: index */
+dict_index_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	dict_index_t*	index,		/*!< in, own: index */
+	ibool		lru_evict)	/*!< in: TRUE if index being evicted
+					to make room in the table LRU list */
 {
 	ulint		size;
 	ulint		retries = 0;
@@ -1952,27 +2460,32 @@ dict_index_remove_from_cache(
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	/* remove all entry of the index from adaptive hash index,
-	because removing from adaptive hash index needs dict_index */
-	if (btr_search_enabled && srv_dict_size_limit)
-		btr_search_drop_page_hash_index_on_index(index);
+	/* No need to acquire the dict_index_t::lock here because
+	there can't be any active operations on this index (or table). */
+
+	if (index->online_log) {
+		ut_ad(index->online_status == ONLINE_INDEX_CREATION);
+		row_log_free(index->online_log);
+	}
 
 	/* We always create search info whether or not adaptive
 	hash index is enabled or not. */
-	info = index->search_info;
+	info = btr_search_get_info(index);
 	ut_ad(info);
 
 	/* We are not allowed to free the in-memory index struct
- 	dict_index_t until all entries in the adaptive hash index
+	dict_index_t until all entries in the adaptive hash index
 	that point to any of the page belonging to his b-tree index
 	are dropped. This is so because dropping of these entries
 	require access to dict_index_t struct. To avoid such scenario
 	We keep a count of number of such pages in the search_info and
 	only free the dict_index_t struct when this count drops to
-	zero. */
+	zero. See also: dict_table_can_be_evicted() */
+
+	do {
+		ulint ref_count = btr_search_info_get_ref_count(info,
+								index);
 
-	for (;;) {
-		ulint ref_count = btr_search_info_get_ref_count(info, index);
 		if (ref_count == 0) {
 			break;
 		}
@@ -1984,15 +2497,15 @@ dict_index_remove_from_cache(
 		if (retries % 500 == 0) {
 			/* No luck after 5 seconds of wait. */
 			fprintf(stderr, "InnoDB: Error: Waited for"
-					" %lu secs for hash index"
-					" ref_count (%lu) to drop"
-					" to 0.\n"
-					"index: \"%s\""
-					" table: \"%s\"\n",
-					retries/100,
-					ref_count,
-					index->name,
-					table->name);
+				" %lu secs for hash index"
+				" ref_count (%lu) to drop"
+				" to 0.\n"
+				"index: \"%s\""
+				" table: \"%s\"\n",
+				retries/100,
+				ref_count,
+				index->name,
+				table->name);
 		}
 
 		/* To avoid a hang here we commit suicide if the
@@ -2000,7 +2513,7 @@ dict_index_remove_from_cache(
 		if (retries >= 60000) {
 			ut_error;
 		}
-	}
+	} while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict);
 
 	rw_lock_free(&index->lock);
 
@@ -2016,6 +2529,18 @@ dict_index_remove_from_cache(
 	dict_mem_index_free(index);
 }
 
+/**********************************************************************//**
+Removes an index from the dictionary cache (with lru_evict = FALSE). */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+{
+	dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
 /*******************************************************************//**
 Tries to find column names for the index and sets the col field of the
 index.
@@ -2150,7 +2675,7 @@ dict_index_copy_types(
 {
 	ulint		i;
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		dtuple_set_types_binary(tuple, n_fields);
 
 		return;
@@ -2189,6 +2714,33 @@ dict_table_copy_types(
 	}
 }
 
+/****************************************************************//**
+Wait until all the background threads of the given table have exited, i.e.,
+table->fts->bg_threads == 0. Note: fts->bg_threads_mutex must be held when
+calling this; it is released during each sleep and held again on return. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/*!< in: table */
+	ulint		delay)	/*!< in: time in microseconds to wait between
+				checks of bg_threads. */
+{
+	fts_t*		fts = table->fts;
+	/* NOTE(review): table->fts is not NULL-checked; confirm callers. */
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&fts->bg_threads_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+	/* Poll bg_threads, dropping the mutex while sleeping. */
+	while (fts->bg_threads > 0) {
+		mutex_exit(&fts->bg_threads_mutex);
+
+		os_thread_sleep(delay);
+
+		mutex_enter(&fts->bg_threads_mutex);
+	}
+}
+
 /*******************************************************************//**
 Builds the internal dictionary cache representation for a clustered
 index, containing also system fields not defined by the user.
@@ -2229,7 +2781,7 @@ dict_index_build_internal_clust(
 	/* Copy the fields of index */
 	dict_index_copy(new_index, index, table, 0, index->n_fields);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* No fixed number of fields determines an entry uniquely */
 
 		new_index->n_uniq = REC_MAX_N_FIELDS;
@@ -2316,7 +2868,8 @@ dict_index_build_internal_clust(
 	}
 
 	/* Remember the table columns already contained in new_index */
-	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+	indexed = static_cast<ibool*>(
+		mem_zalloc(table->n_cols * sizeof *indexed));
 
 	/* Mark the table columns already contained in new_index */
 	for (i = 0; i < new_index->n_def; i++) {
@@ -2382,7 +2935,7 @@ dict_index_build_internal_non_clust(
 
 	ut_ad(clust_index);
 	ut_ad(dict_index_is_clust(clust_index));
-	ut_ad(!(clust_index->type & DICT_UNIVERSAL));
+	ut_ad(!dict_index_is_univ(clust_index));
 
 	/* Create a new index */
 	new_index = dict_mem_index_create(
@@ -2401,7 +2954,8 @@ dict_index_build_internal_non_clust(
 	dict_index_copy(new_index, index, table, 0, index->n_fields);
 
 	/* Remember the table columns already contained in new_index */
-	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+	indexed = static_cast<ibool*>(
+		mem_zalloc(table->n_cols * sizeof *indexed));
 
 	/* Mark the table columns already contained in new_index */
 	for (i = 0; i < new_index->n_def; i++) {
@@ -2448,7 +3002,55 @@ dict_index_build_internal_non_clust(
 	return(new_index);
 }
 
-#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return	own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index)	/*!< in: user representation of an FTS index */
+{
+	dict_index_t*	new_index;
+
+	ut_ad(table && index);
+	ut_ad(index->type == DICT_FTS);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* Create a new index */
+	new_index = dict_mem_index_create(
+		table->name, index->name, index->space, index->type,
+		index->n_fields);
+
+	/* Copy other relevant data from the old index struct to the new
+	struct: it inherits the values */
+
+	new_index->n_user_defined_cols = index->n_fields;
+
+	new_index->id = index->id;
+	btr_search_index_init(new_index);
+
+	/* Copy fields from index to new_index */
+	dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+	new_index->n_uniq = 0;
+	new_index->cached = TRUE;
+	/* Create the FTS cache lazily; table->fts is assumed non-NULL. */
+	if (table->fts->cache == NULL) {
+		table->fts->cache = fts_cache_create(table);
+	}
+	/* The index is registered under init_lock held in X mode. */
+	rw_lock_x_lock(&table->fts->cache->init_lock);
+	/* Notify the FTS cache about this index. */
+	fts_cache_index_cache_create(table, new_index);
+	rw_lock_x_unlock(&table->fts->cache->init_lock);
+
+	return(new_index);
+}
 /*====================== FOREIGN KEY PROCESSING ========================*/
 
 /*********************************************************************//**
@@ -2515,8 +3117,7 @@ dict_table_get_foreign_constraint(
 	     foreign;
 	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
 
-		if (foreign->foreign_index == index
-		    || foreign->referenced_index == index) {
+		if (foreign->foreign_index == index) {
 
 			return(foreign);
 		}
@@ -2527,20 +3128,18 @@ dict_table_get_foreign_constraint(
 
 /*********************************************************************//**
 Frees a foreign key struct. */
-static
+UNIV_INTERN
 void
 dict_foreign_free(
 /*==============*/
 	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
 {
-	ut_a(foreign->foreign_table->n_foreign_key_checks_running == 0);
-
 	mem_heap_free(foreign->heap);
 }
 
 /**********************************************************************//**
 Removes a foreign constraint struct from the dictionary cache. */
-static
+UNIV_INTERN
 void
 dict_foreign_remove_from_cache(
 /*===========================*/
@@ -2604,176 +3203,58 @@ dict_foreign_find(
 	return(NULL);
 }
 
+
 /*********************************************************************//**
 Tries to find an index whose first fields are the columns in the array,
 in the same order and is not marked for deletion and is not the same
 as types_idx.
 @return	matching index, NULL if not found */
-static
+UNIV_INTERN
 dict_index_t*
 dict_foreign_find_index(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	const char**	columns,/*!< in: array of column names */
-	ulint		n_cols,	/*!< in: number of columns */
-	dict_index_t*	types_idx, /*!< in: NULL or an index to whose types the
-				   column types must match */
-	ibool		check_charsets,
-				/*!< in: whether to check charsets.
-				only has an effect if types_idx != NULL */
-	ulint		check_null)
-				/*!< in: nonzero if none of the columns must
-				be declared NOT NULL */
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
 {
 	dict_index_t*	index;
 
-	index = dict_table_get_first_index(table);
-
-	while (index != NULL) {
-		/* Ignore matches that refer to the same instance
-		or the index is to be dropped */
-		if (index->to_be_dropped || types_idx == index) {
-
-			goto next_rec;
-
-		} else if (dict_index_get_n_fields(index) >= n_cols) {
-			ulint		i;
-
-			for (i = 0; i < n_cols; i++) {
-				dict_field_t*	field;
-				const char*	col_name;
-
-				field = dict_index_get_nth_field(index, i);
-
-				col_name = dict_table_get_col_name(
-					table, dict_col_get_no(field->col));
-
-				if (field->prefix_len != 0) {
-					/* We do not accept column prefix
-					indexes here */
-
-					break;
-				}
-
-				if (0 != innobase_strcasecmp(columns[i],
-							     col_name)) {
-					break;
-				}
-
-				if (check_null
-				    && (field->col->prtype & DATA_NOT_NULL)) {
-
-					return(NULL);
-				}
-
-				if (types_idx && !cmp_cols_are_equal(
-					    dict_index_get_nth_col(index, i),
-					    dict_index_get_nth_col(types_idx,
-								   i),
-					    check_charsets)) {
-
-					break;
-				}
-			}
-
-			if (i == n_cols) {
-				/* We found a matching index */
-
-				return(index);
-			}
-		}
-
-next_rec:
-		index = dict_table_get_next_index(index);
-	}
-
-	return(NULL);
-}
-
-/**********************************************************************//**
-Find an index that is equivalent to the one passed in and is not marked
-for deletion.
-@return	index equivalent to foreign->foreign_index, or NULL */
-UNIV_INTERN
-dict_index_t*
-dict_foreign_find_equiv_index(
-/*==========================*/
-	dict_foreign_t*	foreign)/*!< in: foreign key */
-{
-	ut_a(foreign != NULL);
-
-	/* Try to find an index which contains the columns as the
-	first fields and in the right order, and the types are the
-	same as in foreign->foreign_index */
-
-	return(dict_foreign_find_index(
-		       foreign->foreign_table,
-		       foreign->foreign_col_names, foreign->n_fields,
-		       foreign->foreign_index, TRUE, /* check types */
-		       FALSE/* allow columns to be NULL */));
-}
-
-#endif /* !UNIV_HOTBACKUP */
-/**********************************************************************//**
-Returns an index object by matching on the name and column names and
-if more than one index matches return the index with the max id
-@return	matching index, NULL if not found */
-UNIV_INTERN
-dict_index_t*
-dict_table_get_index_by_max_id(
-/*===========================*/
-	dict_table_t*	table,	/*!< in: table */
-	const char*	name,	/*!< in: the index name to find */
-	const char**	columns,/*!< in: array of column names */
-	ulint		n_cols)	/*!< in: number of columns */
-{
-	dict_index_t*	index;
-	dict_index_t*	found;
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-	found = NULL;
 	index = dict_table_get_first_index(table);
 
 	while (index != NULL) {
-		if (ut_strcmp(index->name, name) == 0
-		    && dict_index_get_n_ordering_defined_by_user(index)
-		    == n_cols) {
-
-			ulint		i;
-
-			for (i = 0; i < n_cols; i++) {
-				dict_field_t*	field;
-				const char*	col_name;
-
-				field = dict_index_get_nth_field(index, i);
-
-				col_name = dict_table_get_col_name(
-					table, dict_col_get_no(field->col));
-
-				if (0 != innobase_strcasecmp(
-					    columns[i], col_name)) {
-
-					break;
-				}
-			}
-
-			if (i == n_cols) {
-				/* We found a matching index, select
-				the index with the higher id*/
-
-				if (!found || index->id > found->id) {
-
-					found = index;
-				}
-			}
+		if (types_idx != index
+		    && !(index->type & DICT_FTS)
+		    && !index->to_be_dropped
+		    && dict_foreign_qualify_index(
+			    table, col_names, columns, n_cols,
+			    index, types_idx,
+			    check_charsets, check_null)) {
+			return(index);
 		}
 
 		index = dict_table_get_next_index(index);
 	}
 
-	return(found);
+	return(NULL);
 }
 
-#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Report an error in a foreign key definition. */
 static
@@ -2823,14 +3304,19 @@ At least one of the foreign table and the referenced table must already
 be in the dictionary cache!
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_foreign_add_to_cache(
 /*======================*/
-	dict_foreign_t*		foreign,	/*!< in, own: foreign key
-						constraint */
-	ibool			check_charsets,	/*!< in: TRUE=check charset
-						compatibility */
-	dict_err_ignore_t	ignore_err)	/*!< in: error to be ignored */
+	dict_foreign_t*		foreign,
+				/*!< in, own: foreign key constraint */
+	const char**		col_names,
+				/*!< in: column names, or NULL to use
+				foreign->foreign_table->col_names */
+	bool			check_charsets,
+				/*!< in: whether to check charset
+				compatibility */
+	dict_err_ignore_t	ignore_err)
+				/*!< in: error to be ignored */
 {
 	dict_table_t*	for_table;
 	dict_table_t*	ref_table;
@@ -2863,12 +3349,12 @@ dict_foreign_add_to_cache(
 		for_in_cache = foreign;
 	}
 
-	if (for_in_cache->referenced_table == NULL && ref_table) {
+	if (ref_table && !for_in_cache->referenced_table) {
 		index = dict_foreign_find_index(
-			ref_table,
+			ref_table, NULL,
 			for_in_cache->referenced_col_names,
 			for_in_cache->n_fields, for_in_cache->foreign_index,
-			check_charsets, FALSE);
+			check_charsets, false);
 
 		if (index == NULL
 		    && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
@@ -2896,9 +3382,9 @@ dict_foreign_add_to_cache(
 		added_to_referenced_list = TRUE;
 	}
 
-	if (for_in_cache->foreign_table == NULL && for_table) {
+	if (for_table && !for_in_cache->foreign_table) {
 		index = dict_foreign_find_index(
-			for_table,
+			for_table, col_names,
 			for_in_cache->foreign_col_names,
 			for_in_cache->n_fields,
 			for_in_cache->referenced_index, check_charsets,
@@ -2940,10 +3426,22 @@ dict_foreign_add_to_cache(
 				 for_in_cache);
 	}
 
+	/* We need to move the tables to the non-LRU end of the table LRU
+	list. Otherwise they will be evicted from the cache. */
+
+	if (ref_table != NULL && ref_table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(ref_table);
+	}
+
+	if (for_table != NULL && for_table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(for_table);
+	}
+
+	ut_ad(dict_lru_validate());
+
 	return(DB_SUCCESS);
 }
 
-#endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Scans from pointer onwards. Stops if is at the start of a copy of
 'string' where characters are compared without case sensitivity, and
@@ -2957,7 +3455,7 @@ dict_scan_to(
 	const char*	string)	/*!< in: look for this */
 {
 	char	quote	= '\0';
-	ibool	escape	= FALSE;
+	bool	escape	= false;
 
 	for (; *ptr; ptr++) {
 		if (*ptr == quote) {
@@ -2967,16 +3465,16 @@ dict_scan_to(
 			/* If the quote character is escaped by a
 			backslash, ignore it. */
 			if (escape) {
-				escape = FALSE;
+				escape = false;
 			} else {
 				quote = '\0';
 			}
 		} else if (quote) {
 			/* Within quotes: do nothing. */
 			if (escape) {
-				escape = FALSE;
+				escape = false;
 			} else if (*ptr == '\\') {
-				escape = TRUE;
+				escape = true;
 			}
 		} else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') {
 			/* Starting quote: remember the quote character. */
@@ -3115,7 +3613,10 @@ dict_scan_id(
 
 	if (quote) {
 		char*	d;
-		str = d = mem_heap_alloc(heap, len + 1);
+
+		str = d = static_cast<char*>(
+			mem_heap_alloc(heap, len + 1));
+
 		while (len--) {
 			if ((*d++ = *s++) == quote) {
 				s++;
@@ -3134,21 +3635,21 @@ convert_id:
 		/* Convert the identifier from connection character set
 		to UTF-8. */
 		len = 3 * len + 1;
-		*id = dst = mem_heap_alloc(heap, len);
+		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
 
 		innobase_convert_from_id(cs, dst, str, len);
 	} else if (!strncmp(str, srv_mysql50_table_name_prefix,
-			    sizeof srv_mysql50_table_name_prefix)) {
+			    sizeof(srv_mysql50_table_name_prefix) - 1)) {
 		/* This is a pre-5.1 table name
 		containing chars other than [A-Za-z0-9].
 		Discard the prefix and use raw UTF-8 encoding. */
-		str += sizeof srv_mysql50_table_name_prefix;
-		len -= sizeof srv_mysql50_table_name_prefix;
+		str += sizeof(srv_mysql50_table_name_prefix) - 1;
+		len -= sizeof(srv_mysql50_table_name_prefix) - 1;
 		goto convert_id;
 	} else {
 		/* Encode using filename-safe characters. */
 		len = 5 * len + 1;
-		*id = dst = mem_heap_alloc(heap, len);
+		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
 
 		innobase_convert_from_table_id(cs, dst, str, len);
 	}
@@ -3207,6 +3708,67 @@ dict_scan_col(
 	return(ptr);
 }
 
+
+/*********************************************************************//**
+Opens a table given its database and table name. This is currently used by
+the foreign constraint parser to get the referenced table.
+@return complete table name (database name, '/', table name), allocated
+from the heap memory passed in */
+UNIV_INTERN
+char*
+dict_get_referenced_table(
+/*======================*/
+	const char*	name,		/*!< in: foreign key table name */
+	const char*	database_name,	/*!< in: table db name */
+	ulint		database_name_len, /*!< in: db name length */
+	const char*	table_name,	/*!< in: table name */
+	ulint		table_name_len, /*!< in: table name length */
+	dict_table_t**	table,		/*!< out: table object or NULL */
+	mem_heap_t*	heap)		/*!< in/out: heap memory */
+{
+	char*		ref;
+	const char*	db_name;
+
+	if (!database_name) {
+		/* Use the database name of the foreign key table */
+
+		db_name = name;
+		database_name_len = dict_get_db_name_len(name);
+	} else {
+		db_name = database_name;
+	}
+
+	/* Copy database_name, '/', table_name, '\0' */
+	ref = static_cast<char*>(
+		mem_heap_alloc(heap, database_name_len + table_name_len + 2));
+
+	memcpy(ref, db_name, database_name_len);
+	ref[database_name_len] = '/';
+	memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+	/* Values:  0 = Store and compare as given; case sensitive
+	            1 = Store and compare in lower; case insensitive
+	            2 = Store as given, compare in lower; case semi-sensitive */
+	if (innobase_get_lower_case_table_names() == 2) {
+		innobase_casedn_str(ref);
+		*table = dict_table_get_low(ref);
+		memcpy(ref, db_name, database_name_len);
+		ref[database_name_len] = '/';
+		memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+	} else {
+#ifndef __WIN__
+		if (innobase_get_lower_case_table_names() == 1) {
+			innobase_casedn_str(ref);
+		}
+#else
+		innobase_casedn_str(ref);
+#endif /* !__WIN__ */
+		*table = dict_table_get_low(ref);
+	}
+
+	return(ref);
+}
 /*********************************************************************//**
 Scans a table name from an SQL string.
 @return	scanned to */
@@ -3226,9 +3788,7 @@ dict_scan_table_name(
 	const char*	database_name	= NULL;
 	ulint		database_name_len = 0;
 	const char*	table_name	= NULL;
-	ulint		table_name_len;
 	const char*	scan_name;
-	char*		ref;
 
 	*success = FALSE;
 	*table = NULL;
@@ -3276,44 +3836,11 @@ dict_scan_table_name(
 		table_name = scan_name;
 	}
 
-	if (database_name == NULL) {
-		/* Use the database name of the foreign key table */
-
-		database_name = name;
-		database_name_len = dict_get_db_name_len(name);
-	}
-
-	table_name_len = strlen(table_name);
-
-	/* Copy database_name, '/', table_name, '\0' */
-	ref = mem_heap_alloc(heap, database_name_len + table_name_len + 2);
-	memcpy(ref, database_name, database_name_len);
-	ref[database_name_len] = '/';
-	memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
-
-	/* Values;  0 = Store and compare as given; case sensitive
-	            1 = Store and compare in lower; case insensitive
-	            2 = Store as given, compare in lower; case semi-sensitive */
-	if (innobase_get_lower_case_table_names() == 2) {
-		innobase_casedn_str(ref);
-		*table = dict_table_get_low(ref, DICT_ERR_IGNORE_NONE);
-		memcpy(ref, database_name, database_name_len);
-		ref[database_name_len] = '/';
-		memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
-
-	} else {
-#ifndef __WIN__
-		if (innobase_get_lower_case_table_names() == 1) {
-			innobase_casedn_str(ref);
-		}
-#else
-		innobase_casedn_str(ref);
-#endif /* !__WIN__ */
-		*table = dict_table_get_low(ref, DICT_ERR_IGNORE_NONE);
-	}
+	*ref_name = dict_get_referenced_table(
+		name, database_name, database_name_len,
+		table_name, strlen(table_name), table, heap);
 
 	*success = TRUE;
-	*ref_name = ref;
 	return(ptr);
 }
 
@@ -3363,13 +3890,13 @@ dict_strip_comments(
 	char*		ptr;
 	/* unclosed quote character (0 if none) */
 	char		quote	= 0;
-	ibool		escape = FALSE;
+	bool		escape = false;
 
 	DBUG_ENTER("dict_strip_comments");
 
 	DBUG_PRINT("dict_strip_comments", ("%s", sql_string));
 
-	str = mem_alloc(sql_length + 1);
+	str = static_cast<char*>(mem_alloc(sql_length + 1));
 
 	sptr = sql_string;
 	ptr = str;
@@ -3393,7 +3920,7 @@ end_of_string:
 			/* If the quote character is escaped by a
 			backslash, ignore it. */
 			if (escape) {
-				escape = FALSE;
+				escape = false;
 			} else {
 				quote = 0;
 			}
@@ -3401,9 +3928,9 @@ end_of_string:
 			/* Within quotes: do not look for
 			starting quotes or comments. */
 			if (escape) {
-				escape = FALSE;
+				escape = false;
 			} else if (*sptr == '\\') {
-				escape = TRUE;
+				escape = true;
 			}
 		} else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
 			/* Starting quote: remember the quote character. */
@@ -3454,13 +3981,12 @@ end_of_string:
 	}
 }
 
-#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Finds the highest [number] for foreign key constraints of the table. Looks
 only at the >= 4.0.18-format id's, which are of the form
 databasename/tablename_ibfk_[number].
 @return	highest number, 0 if table has no new format foreign key constraints */
-static
+UNIV_INTERN
 ulint
 dict_table_get_highest_foreign_id(
 /*==============================*/
@@ -3515,6 +4041,8 @@ dict_foreign_report_syntax_err(
 					in the SQL string */
 	const char*	ptr)		/*!< in: place of the syntax error */
 {
+	ut_ad(!srv_read_only_mode);
+
 	FILE*	ef = dict_foreign_err_file;
 
 	mutex_enter(&dict_foreign_err_mutex);
@@ -3532,7 +4060,7 @@ be accompanied with indexes in both participating tables. The indexes are
 allowed to contain more fields than mentioned in the constraint.
 @return	error code or DB_SUCCESS */
 static
-ulint
+dberr_t
 dict_create_foreign_constraints_low(
 /*================================*/
 	trx_t*		trx,	/*!< in: transaction */
@@ -3563,7 +4091,7 @@ dict_create_foreign_constraints_low(
 	FILE*		ef			= dict_foreign_err_file;
 	const char*	constraint_name;
 	ibool		success;
-	ulint		error;
+	dberr_t		error;
 	const char*	ptr1;
 	const char*	ptr2;
 	ulint		i;
@@ -3575,9 +4103,10 @@ dict_create_foreign_constraints_low(
 	const char*	column_names[500];
 	const char*	referenced_table_name;
 
+	ut_ad(!srv_read_only_mode);
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
+	table = dict_table_get_low(name);
 
 	if (table == NULL) {
 		mutex_enter(&dict_foreign_err_mutex);
@@ -3771,10 +4300,13 @@ col_loop1:
 	}
 
 	/* Try to find an index which contains the columns
-	as the first fields and in the right order */
+	as the first fields and in the right order. There is
+	no need to check column type match (on types_idx), since
+	the referenced table can be NULL if foreign_key_checks is
+	set to 0 */
 
-	index = dict_foreign_find_index(table, column_names, i,
-					NULL, TRUE, FALSE);
+	index = dict_foreign_find_index(
+		table, NULL, column_names, i, NULL, TRUE, FALSE);
 
 	if (!index) {
 		mutex_enter(&dict_foreign_err_mutex);
@@ -3812,8 +4344,8 @@ col_loop1:
 
 		db_len = dict_get_db_name_len(table->name);
 
-		foreign->id = mem_heap_alloc(
-			foreign->heap, db_len + strlen(constraint_name) + 2);
+		foreign->id = static_cast<char*>(mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2));
 
 		ut_memcpy(foreign->id, table->name, db_len);
 		foreign->id[db_len] = '/';
@@ -3827,8 +4359,10 @@ col_loop1:
 
 	foreign->foreign_index = index;
 	foreign->n_fields = (unsigned int) i;
-	foreign->foreign_col_names = mem_heap_alloc(foreign->heap,
-						    i * sizeof(void*));
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
 	for (i = 0; i < foreign->n_fields; i++) {
 		foreign->foreign_col_names[i] = mem_heap_strdup(
 			foreign->heap,
@@ -4044,7 +4578,7 @@ try_find_index:
 	foreign->foreign_index */
 
 	if (referenced_table) {
-		index = dict_foreign_find_index(referenced_table,
+		index = dict_foreign_find_index(referenced_table, NULL,
 						column_names, i,
 						foreign->foreign_index,
 						TRUE, FALSE);
@@ -4085,8 +4619,9 @@ try_find_index:
 		foreign->heap, referenced_table_name);
 	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
 
-	foreign->referenced_col_names = mem_heap_alloc(foreign->heap,
-						       i * sizeof(void*));
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
 	for (i = 0; i < foreign->n_fields; i++) {
 		foreign->referenced_col_names[i]
 			= mem_heap_strdup(foreign->heap, column_names[i]);
@@ -4104,6 +4639,23 @@ try_find_index:
 
 	goto loop;
 }
+/**********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return	TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+	THD*		thd,		/*!< in: MySQL thread handle */
+	const char*	str,		/*!< in: string to scan for keyword */
+	const char*	keyword)	/*!< in: keyword to look for */
+{
+	struct charset_info_st*	cs = innobase_get_charset(thd);
+	ibool			success;
+
+	dict_accept(cs, str, keyword, &success);
+	return(success);
+}
 
 /*********************************************************************//**
 Scans a table create SQL string and adds to the data dictionary the foreign
@@ -4113,7 +4665,7 @@ be accompanied with indexes in both participating tables. The indexes are
 allowed to contain more fields than mentioned in the constraint.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_foreign_constraints(
 /*============================*/
 	trx_t*		trx,		/*!< in: transaction */
@@ -4133,9 +4685,9 @@ dict_create_foreign_constraints(
 					code DB_CANNOT_ADD_CONSTRAINT if
 					any foreign keys are found. */
 {
-	char*			str;
-	ulint			err;
-	mem_heap_t*		heap;
+	char*		str;
+	dberr_t		err;
+	mem_heap_t*	heap;
 
 	ut_a(trx);
 	ut_a(trx->mysql_thd);
@@ -4158,7 +4710,7 @@ Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
 @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
 constraint id does not match */
 UNIV_INTERN
-ulint
+dberr_t
 dict_foreign_parse_drop_constraints(
 /*================================*/
 	mem_heap_t*	heap,			/*!< in: heap from which we can
@@ -4176,7 +4728,6 @@ dict_foreign_parse_drop_constraints(
 	size_t			len;
 	const char*		ptr;
 	const char*		id;
-	FILE*			ef	= dict_foreign_err_file;
 	struct charset_info_st*	cs;
 
 	ut_a(trx);
@@ -4186,7 +4737,8 @@ dict_foreign_parse_drop_constraints(
 
 	*n = 0;
 
-	*constraints_to_drop = mem_heap_alloc(heap, 1000 * sizeof(char*));
+	*constraints_to_drop = static_cast<const char**>(
+		mem_heap_alloc(heap, 1000 * sizeof(char*)));
 
 	ptr = innobase_get_stmt(trx->mysql_thd, &len);
 
@@ -4241,10 +4793,11 @@ loop:
 	foreign = UT_LIST_GET_FIRST(table->foreign_list);
 
 	while (foreign != NULL) {
-		if (0 == strcmp(foreign->id, id)
+		if (0 == innobase_strcasecmp(foreign->id, id)
 		    || (strchr(foreign->id, '/')
-			&& 0 == strcmp(id,
-				       dict_remove_db_name(foreign->id)))) {
+			&& 0 == innobase_strcasecmp(
+				id,
+				dict_remove_db_name(foreign->id)))) {
 			/* Found */
 			break;
 		}
@@ -4252,20 +4805,26 @@ loop:
 		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
 	}
 
+
 	if (foreign == NULL) {
-		mutex_enter(&dict_foreign_err_mutex);
-		rewind(ef);
-		ut_print_timestamp(ef);
-		fputs(" Error in dropping of a foreign key constraint"
-		      " of table ", ef);
-		ut_print_name(ef, NULL, TRUE, table->name);
-		fputs(",\n"
-		      "in SQL command\n", ef);
-		fputs(str, ef);
-		fputs("\nCannot find a constraint with the given id ", ef);
-		ut_print_name(ef, NULL, FALSE, id);
-		fputs(".\n", ef);
-		mutex_exit(&dict_foreign_err_mutex);
+
+		if (!srv_read_only_mode) {
+			FILE*	ef = dict_foreign_err_file;
+
+			mutex_enter(&dict_foreign_err_mutex);
+			rewind(ef);
+			ut_print_timestamp(ef);
+			fputs(" Error in dropping of a foreign key "
+			      "constraint of table ", ef);
+			ut_print_name(ef, NULL, TRUE, table->name);
+			fputs(",\nin SQL command\n", ef);
+			fputs(str, ef);
+			fputs("\nCannot find a constraint with the "
+			      "given id ", ef);
+			ut_print_name(ef, NULL, FALSE, id);
+			fputs(".\n", ef);
+			mutex_exit(&dict_foreign_err_mutex);
+		}
 
 		mem_free(str);
 
@@ -4275,15 +4834,19 @@ loop:
 	goto loop;
 
 syntax_error:
-	mutex_enter(&dict_foreign_err_mutex);
-	rewind(ef);
-	ut_print_timestamp(ef);
-	fputs(" Syntax error in dropping of a"
-	      " foreign key constraint of table ", ef);
-	ut_print_name(ef, NULL, TRUE, table->name);
-	fprintf(ef, ",\n"
-		"close to:\n%s\n in SQL command\n%s\n", ptr, str);
-	mutex_exit(&dict_foreign_err_mutex);
+	if (!srv_read_only_mode) {
+		FILE*	ef = dict_foreign_err_file;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+		fputs(" Syntax error in dropping of a"
+		      " foreign key constraint of table ", ef);
+		ut_print_name(ef, NULL, TRUE, table->name);
+		fprintf(ef, ",\n"
+			"close to:\n%s\n in SQL command\n%s\n", ptr, str);
+		mutex_exit(&dict_foreign_err_mutex);
+	}
 
 	mem_free(str);
 
@@ -4291,7 +4854,7 @@ syntax_error:
 }
 
 /*==================== END OF FOREIGN KEY PROCESSING ====================*/
-#endif /* !UNIV_HOTBACKUP */
+
 /**********************************************************************//**
 Returns an index object if it is found in the dictionary cache.
 Assumes that dict_sys->mutex is already being held.
@@ -4374,7 +4937,7 @@ dict_index_build_node_ptr(
 	byte*		buf;
 	ulint		n_unique;
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* In a universal index tree, we take the whole record as
 		the node pointer if the record is on the leaf level,
 		on non-leaf levels we remove the last field, which
@@ -4403,7 +4966,7 @@ dict_index_build_node_ptr(
 
 	dict_index_copy_types(tuple, index, n_unique);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -4441,7 +5004,7 @@ dict_index_copy_rec_order_prefix(
 
 	UNIV_PREFETCH_R(rec);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		ut_a(!dict_table_is_comp(index->table));
 		n = rec_get_n_fields_old(rec);
 	} else {
@@ -4531,547 +5094,6 @@ dict_index_calc_min_rec_len(
 	return(sum);
 }
 
-/*********************************************************************//**
-functions to use SYS_STATS system table. */
-static
-ibool
-dict_reload_statistics(
-/*===================*/
-	dict_table_t*	table,
-	ulint*		sum_of_index_sizes)
-{
-	dict_index_t*	index;
-	ulint		size;
-	mem_heap_t*	heap;
-
-	index = dict_table_get_first_index(table);
-
-	if (index == NULL) {
-		/* Table definition is corrupt */
-
-		return(FALSE);
-	}
-
-	heap = mem_heap_create(1000);
-
-	while (index) {
-		mtr_t mtr;
-
-		if (UNIV_UNLIKELY(table->is_corrupt)) {
-			ut_a(srv_pass_corrupt_table);
-			mem_heap_free(heap);
-			return(FALSE);
-		}
-
-		mtr_start(&mtr);
-		mtr_s_lock(dict_index_get_lock(index), &mtr);
-
-		size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-		index->stat_index_size = size;
-
-		*sum_of_index_sizes += size;
-
-		size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
-
-		if (size == 0) {
-			/* The root node of the tree is a leaf */
-			size = 1;
-		}
-
-		mtr_commit(&mtr);
-
-		index->stat_n_leaf_pages = size;
-
-/*===========================================*/
-{
-	dict_table_t*	sys_stats;
-	dict_index_t*	sys_index;
-	btr_pcur_t	pcur;
-	dtuple_t*	tuple;
-	dfield_t*	dfield;
-	ulint		key_cols;
-	ulint		n_cols;
-	const rec_t*	rec;
-	ulint		n_fields;
-	const byte*	field;
-	ulint		len;
-	ib_int64_t*	stat_n_diff_key_vals_tmp;
-	ib_int64_t*	stat_n_non_null_key_vals_tmp;
-	byte*		buf;
-	ulint		i;
-	mtr_t		mtr;
-
-	n_cols = dict_index_get_n_unique(index);
-	stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
-	stat_n_non_null_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
-
-	sys_stats = dict_sys->sys_stats;
-	sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
-	ut_a(!dict_table_is_comp(sys_stats));
-
-	tuple = dtuple_create(heap, 1);
-	dfield = dtuple_get_nth_field(tuple, 0);
-
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->id);
-
-	dfield_set_data(dfield, buf, 8);
-	dict_index_copy_types(tuple, sys_index, 1);
-
-	mtr_start(&mtr);
-
-	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
-				  BTR_SEARCH_LEAF, &pcur, &mtr);
-	for (i = 0; i <= n_cols; i++) {
-		rec = btr_pcur_get_rec(&pcur);
-
-		if (!btr_pcur_is_on_user_rec(&pcur)
-		    || mach_read_from_8(rec_get_nth_field_old(rec, 0, &len))
-			!= index->id) {
-			/* not found: even 1 if not found should not be alowed */
-			fprintf(stderr, "InnoDB: Warning: stats for %s/%s (%lu/%lu)"
-				        " not found in SYS_STATS\n",
-					index->table_name, index->name, i, n_cols);
-			btr_pcur_close(&pcur);
-			mtr_commit(&mtr);
-			mem_heap_free(heap);
-			return(FALSE);
-		}
-
-		if (rec_get_deleted_flag(rec, 0)) {
-			/* don't count */
-			i--;
-			goto next_rec;
-		}
-
-		n_fields = rec_get_n_fields_old(rec);
-
-		field = rec_get_nth_field_old(rec, 1, &len);
-		ut_a(len == 4);
-
-		key_cols = mach_read_from_4(field);
-
-		ut_a(i == key_cols);
-
-		field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
-		ut_a(len == 8);
-
-		stat_n_diff_key_vals_tmp[i] = mach_read_from_8(field);
-
-		if (n_fields > DICT_SYS_STATS_NON_NULL_VALS_FIELD) {
-			field = rec_get_nth_field_old(rec, DICT_SYS_STATS_NON_NULL_VALS_FIELD, &len);
-			ut_a(len == 8);
-
-			stat_n_non_null_key_vals_tmp[i] = mach_read_from_8(field);
-		} else {
-			/* not enough fields: should be older */
-			fprintf(stderr, "InnoDB: Notice: stats for %s/%s (%lu/%lu)"
-					" in SYS_STATS seems older format. "
-					"Please execute ANALYZE TABLE for it.\n",
-					index->table_name, index->name, i, n_cols);
-
-			stat_n_non_null_key_vals_tmp[i] = ((ib_int64_t)(-1));
-		}
-next_rec:
-		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-	}
-
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-
-	for (i = 0; i <= n_cols; i++) {
-		index->stat_n_diff_key_vals[i] = stat_n_diff_key_vals_tmp[i];
-		if (stat_n_non_null_key_vals_tmp[i] == ((ib_int64_t)(-1))) {
-			/* approximate value */
-			index->stat_n_non_null_key_vals[i] = stat_n_diff_key_vals_tmp[n_cols];
-		} else {
-			index->stat_n_non_null_key_vals[i] = stat_n_non_null_key_vals_tmp[i];
-		}
-	}
-}
-/*===========================================*/
-
-		index = dict_table_get_next_index(index);
-	}
-
-	mem_heap_free(heap);
-	return(TRUE);
-}
-
-static
-void
-dict_store_statistics(
-/*==================*/
-	dict_table_t*	table)
-{
-	dict_index_t*	index;
-	mem_heap_t*	heap;
-
-	index = dict_table_get_first_index(table);
-
-	ut_a(index);
-
-	heap = mem_heap_create(1000);
-
-	while (index) {
-		if (UNIV_UNLIKELY(table->is_corrupt)) {
-			ut_a(srv_pass_corrupt_table);
-			mem_heap_free(heap);
-			return;
-		}
-
-/*===========================================*/
-{
-	dict_table_t*	sys_stats;
-	dict_index_t*	sys_index;
-	btr_pcur_t	pcur;
-	dtuple_t*	tuple;
-	dfield_t*	dfield;
-	ulint		key_cols;
-	ulint		n_cols;
-	ulint		rests;
-	const rec_t*	rec;
-	ulint		n_fields;
-	const byte*	field;
-	ulint		len;
-	ib_int64_t*	stat_n_diff_key_vals_tmp;
-	ib_int64_t*	stat_n_non_null_key_vals_tmp;
-	byte*		buf;
-	ulint		i;
-	mtr_t		mtr;
-
-	n_cols = dict_index_get_n_unique(index);
-	stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
-	stat_n_non_null_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
-
-	for (i = 0; i <= n_cols; i++) {
-		stat_n_diff_key_vals_tmp[i] = index->stat_n_diff_key_vals[i];
-		stat_n_non_null_key_vals_tmp[i] = index->stat_n_non_null_key_vals[i];
-	}
-
-	sys_stats = dict_sys->sys_stats;
-	sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
-	ut_a(!dict_table_is_comp(sys_stats));
-
-	tuple = dtuple_create(heap, 1);
-	dfield = dtuple_get_nth_field(tuple, 0);
-
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->id);
-
-	dfield_set_data(dfield, buf, 8);
-	dict_index_copy_types(tuple, sys_index, 1);
-
-	mtr_start(&mtr);
-
-	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
-				  BTR_MODIFY_LEAF, &pcur, &mtr);
-	rests = n_cols + 1;
-	for (i = 0; i <= n_cols; i++) {
-		rec = btr_pcur_get_rec(&pcur);
-
-		if (!btr_pcur_is_on_user_rec(&pcur)
-		    || mach_read_from_8(rec_get_nth_field_old(rec, 0, &len))
-			!= index->id) {
-			/* not found */
-
-
-			break;
-		}
-
-		btr_pcur_store_position(&pcur, &mtr);
-
-		if (rec_get_deleted_flag(rec, 0)) {
-			/* don't count */
-			i--;
-			goto next_rec;
-		}
-
-		n_fields = rec_get_n_fields_old(rec);
-
-		if (n_fields <= DICT_SYS_STATS_NON_NULL_VALS_FIELD) {
-			/* not update for the older smaller format */
-			fprintf(stderr, "InnoDB: Notice: stats for %s/%s (%lu/%lu)"
-					" in SYS_STATS seems older format. Please ANALYZE TABLE it.\n",
-					index->table_name, index->name, i, n_cols);
-			goto next_rec;
-		}
-
-		field = rec_get_nth_field_old(rec, 1, &len);
-		ut_a(len == 4);
-
-		key_cols = mach_read_from_4(field);
-
-		field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
-		ut_a(len == 8);
-
-		mlog_write_ull((byte*)field, stat_n_diff_key_vals_tmp[key_cols], &mtr);
-
-		field = rec_get_nth_field_old(rec, DICT_SYS_STATS_NON_NULL_VALS_FIELD, &len);
-		ut_a(len == 8);
-
-		mlog_write_ull((byte*)field, stat_n_non_null_key_vals_tmp[key_cols], &mtr);
-
-		rests--;
-
-next_rec:
-		mtr_commit(&mtr);
-		mtr_start(&mtr);
-		btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
-
-		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-	}
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-}
-/*===========================================*/
-
-		index = dict_table_get_next_index(index);
-	}
-
-	mem_heap_free(heap);
-}
-
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		only_calc_if_missing_stats,/*!< in: only
-					update/recalc the stats if they have
-					not been initialized yet, otherwise
-					do nothing */
-	ibool		sync,		/*!< in: TRUE if must update
-					SYS_STATS */
-	ibool		only_calc_if_changed_too_much)/*!< in: only
-					update/recalc the stats if the table
-					has been changed too much since the
-					last stats update/recalc */
-{
-	dict_index_t*	index;
-	ulint		sum_of_index_sizes	= 0;
-
-	if (table->ibd_file_missing) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: cannot calculate statistics for table %s\n"
-			"InnoDB: because the .ibd file is missing.  For help,"
-			" please refer to\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n",
-			table->name);
-
-		return;
-	}
-
-	if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) && !sync) {
-		dict_table_stats_lock(table, RW_X_LATCH);
-
-		/* reload statistics from SYS_STATS table */
-		if (dict_reload_statistics(table, &sum_of_index_sizes)) {
-			/* success */
-#ifdef UNIV_DEBUG
-			fprintf(stderr, "InnoDB: DEBUG: reload_statistics succeeded for %s.\n",
-					table->name);
-#endif
-			goto end;
-		}
-
-		dict_table_stats_unlock(table, RW_X_LATCH);
-	}
-#ifdef UNIV_DEBUG
-	fprintf(stderr, "InnoDB: DEBUG: update_statistics for %s.\n",
-			table->name);
-#endif
-	sum_of_index_sizes = 0;
-
-	/* Find out the sizes of the indexes and how many different values
-	for the key they approximately have */
-
-	index = dict_table_get_first_index(table);
-
-	if (index == NULL) {
-		/* Table definition is corrupt */
-
-		return;
-	}
-
-	dict_table_stats_lock(table, RW_X_LATCH);
-
-	if ((only_calc_if_missing_stats && table->stat_initialized)
-	    || (only_calc_if_changed_too_much
-		&& !DICT_TABLE_CHANGED_TOO_MUCH(table))) {
-
-		dict_table_stats_unlock(table, RW_X_LATCH);
-		return;
-	}
-
-	for (; index != NULL; index = dict_table_get_next_index(index)) {
-
-		/* Skip incomplete indexes. */
-		if (index->name[0] == TEMP_INDEX_PREFIX) {
-			continue;
-		}
-
-		if (UNIV_LIKELY
-		    (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
-		     || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
-			 && dict_index_is_clust(index)))) {
-			mtr_t	mtr;
-			ulint	size;
-
-			if (UNIV_UNLIKELY(table->is_corrupt)) {
-				ut_a(srv_pass_corrupt_table);
-				dict_table_stats_unlock(table, RW_X_LATCH);
-				return;
-			}
-
-			mtr_start(&mtr);
-			mtr_s_lock(dict_index_get_lock(index), &mtr);
-
-			size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-			if (size != ULINT_UNDEFINED) {
-				sum_of_index_sizes += size;
-				index->stat_index_size = size;
-				size = btr_get_size(
-					index, BTR_N_LEAF_PAGES, &mtr);
-			}
-
-			mtr_commit(&mtr);
-
-			switch (size) {
-			case ULINT_UNDEFINED:
-				goto fake_statistics;
-			case 0:
-				/* The root node of the tree is a leaf */
-				size = 1;
-			}
-
-			index->stat_n_leaf_pages = size;
-
-			btr_estimate_number_of_different_key_vals(index);
-		} else {
-			/* If we have set a high innodb_force_recovery
-			level, do not calculate statistics, as a badly
-			corrupted index can cause a crash in it.
-			Initialize some bogus index cardinality
-			statistics, so that the data can be queried in
-			various means, also via secondary indexes. */
-			ulint	i;
-
-fake_statistics:
-			sum_of_index_sizes++;
-			index->stat_index_size = index->stat_n_leaf_pages = 1;
-
-			for (i = dict_index_get_n_unique(index); i; ) {
-				index->stat_n_diff_key_vals[i--] = 1;
-			}
-
-			memset(index->stat_n_non_null_key_vals, 0,
-			       (1 + dict_index_get_n_unique(index))
-                               * sizeof(*index->stat_n_non_null_key_vals));
-		}
-	}
-
-	if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
-		/* store statistics to SYS_STATS table */
-		dict_store_statistics(table);
-	}
-end:
-	index = dict_table_get_first_index(table);
-
-	table->stat_n_rows = index->stat_n_diff_key_vals[
-		dict_index_get_n_unique(index)];
-
-	table->stat_clustered_index_size = index->stat_index_size;
-
-	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
-		- index->stat_index_size;
-
-	table->stat_initialized = TRUE;
-
-	table->stat_modified_counter = 0;
-
-	dict_table_stats_unlock(table, RW_X_LATCH);
-}
-
-/*********************************************************************//**
-*/
-UNIV_INTERN
-ibool
-dict_is_older_statistics(
-/*=====================*/
-	dict_index_t*	index)
-{
-	mem_heap_t*	heap;
-	dict_table_t*	sys_stats;
-	dict_index_t*	sys_index;
-	btr_pcur_t	pcur;
-	dtuple_t*	tuple;
-	dfield_t*	dfield;
-	const rec_t*	rec;
-	ulint		n_fields;
-	ulint		len;
-	byte*		buf;
-	mtr_t		mtr;
-
-	heap = mem_heap_create(100);
-
-	sys_stats = dict_sys->sys_stats;
-	sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
-	ut_a(!dict_table_is_comp(sys_stats));
-
-	tuple = dtuple_create(heap, 1);
-	dfield = dtuple_get_nth_field(tuple, 0);
-
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->id);
-
-	dfield_set_data(dfield, buf, 8);
-	dict_index_copy_types(tuple, sys_index, 1);
-
-	mtr_start(&mtr);
-
-	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
-				  BTR_SEARCH_LEAF, &pcur, &mtr);
-
-next_rec:
-	rec = btr_pcur_get_rec(&pcur);
-
-	if (!btr_pcur_is_on_user_rec(&pcur)
-	    || mach_read_from_8(rec_get_nth_field_old(rec, 0, &len))
-		!= index->id) {
-		/* not found */
-		btr_pcur_close(&pcur);
-		mtr_commit(&mtr);
-		mem_heap_free(heap);
-		/* no statistics == not older statistics */
-		return(FALSE);
-	}
-
-	if (rec_get_deleted_flag(rec, 0)) {
-		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-		goto next_rec;
-	}
-
-	n_fields = rec_get_n_fields_old(rec);
-
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-	mem_heap_free(heap);
-
-	if (n_fields > DICT_SYS_STATS_NON_NULL_VALS_FIELD) {
-		return(FALSE);
-	} else {
-		return(TRUE);
-	}
-}
-
-#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Prints info of a foreign key constraint. */
 static
@@ -5102,7 +5124,6 @@ dict_foreign_print_low(
 	fputs(" )\n", stderr);
 }
 
-#endif /* !UNIV_HOTBACKUP */
 /**********************************************************************//**
 Prints a table data. */
 UNIV_INTERN
@@ -5111,67 +5132,29 @@ dict_table_print(
 /*=============*/
 	dict_table_t*	table)	/*!< in: table */
 {
-	mutex_enter(&(dict_sys->mutex));
-	dict_table_print_low(table);
-	mutex_exit(&(dict_sys->mutex));
-}
-
-/**********************************************************************//**
-Prints a table data when we know the table name. */
-UNIV_INTERN
-void
-dict_table_print_by_name(
-/*=====================*/
-	const char*	name)	/*!< in: table name */
-{
-	dict_table_t*	table;
-
-	mutex_enter(&(dict_sys->mutex));
-
-	table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
-
-	ut_a(table);
-
-	dict_table_print_low(table);
-	mutex_exit(&(dict_sys->mutex));
-}
-
-/**********************************************************************//**
-Prints a table data. */
-UNIV_INTERN
-void
-dict_table_print_low(
-/*=================*/
-	dict_table_t*	table)	/*!< in: table */
-{
 	dict_index_t*	index;
 	dict_foreign_t*	foreign;
 	ulint		i;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	if (srv_stats_auto_update) {
+	dict_table_stats_lock(table, RW_X_LATCH);
 
-		dict_update_statistics(
-			table,
-			FALSE /* update even if initialized */,
-			FALSE,
-			FALSE /* update even if not changed too much */);
+	if (!table->stat_initialized) {
+		dict_stats_update_transient(table);
 	}
 
-	dict_table_stats_lock(table, RW_S_LATCH);
-
 	fprintf(stderr,
 		"--------------------------------------\n"
 		"TABLE: name %s, id %llu, flags %lx, columns %lu,"
-		" indexes %lu, appr.rows %lu\n"
+		" indexes %lu, appr.rows " UINT64PF "\n"
 		"  COLUMNS: ",
 		table->name,
 		(ullint) table->id,
 		(ulong) table->flags,
 		(ulong) table->n_cols,
 		(ulong) UT_LIST_GET_LEN(table->indexes),
-		(ulong) table->stat_n_rows);
+		table->stat_n_rows);
 
 	for (i = 0; i < (ulint) table->n_cols; i++) {
 		dict_col_print_low(table, dict_table_get_nth_col(table, i));
@@ -5187,7 +5170,9 @@ dict_table_print_low(
 		index = UT_LIST_GET_NEXT(indexes, index);
 	}
 
-	dict_table_stats_unlock(table, RW_S_LATCH);
+	table->stat_initialized = FALSE;
+
+	dict_table_stats_unlock(table, RW_X_LATCH);
 
 	foreign = UT_LIST_GET_FIRST(table->foreign_list);
 
@@ -5235,13 +5220,15 @@ dict_index_print_low(
 	ib_int64_t	n_vals;
 	ulint		i;
 
+	ut_a(index->table->stat_initialized);
+
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	if (index->n_user_defined_cols > 0) {
 		n_vals = index->stat_n_diff_key_vals[
-			index->n_user_defined_cols];
+			index->n_user_defined_cols - 1];
 	} else {
-		n_vals = index->stat_n_diff_key_vals[1];
+		n_vals = index->stat_n_diff_key_vals[0];
 	}
 
 	fprintf(stderr,
@@ -5291,7 +5278,6 @@ dict_field_print_low(
 	}
 }
 
-#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Outputs info on a foreign key of a table in a format suitable for
 CREATE TABLE. */
@@ -5480,7 +5466,6 @@ dict_print_info_on_foreign_keys(
 	mutex_exit(&(dict_sys->mutex));
 }
 
-#endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Displays the names of the index and the table. */
 UNIV_INTERN
@@ -5488,7 +5473,7 @@ void
 dict_index_name_print(
 /*==================*/
 	FILE*			file,	/*!< in: output stream */
-	trx_t*			trx,	/*!< in: transaction */
+	const trx_t*		trx,	/*!< in: transaction */
 	const dict_index_t*	index)	/*!< in: index to print */
 {
 	fputs("index ", file);
@@ -5496,6 +5481,245 @@ dict_index_name_print(
 	fputs(" of table ", file);
 	ut_print_name(file, trx, TRUE, index->table_name);
 }
+
+/**********************************************************************//**
+Find a table in dict_sys->table_LRU list with specified space id
+@return table if found, NULL if not */
+static
+dict_table_t*
+dict_find_table_by_space(
+/*=====================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+	ulint		num_item;
+	ulint		count = 0;
+
+	ut_ad(space_id > 0);
+
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	num_item =  UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	/* This function intentionally does not acquire the mutex, as it is
+	used by error handling code deep in the call stack as a last means
+	to avoid killing the server, so it is worth risking some
+	consequences for the action. */
+	while (table && count < num_item) {
+		if (table->space == space_id) {
+			return(table);
+		}
+
+		table = UT_LIST_GET_NEXT(table_LRU, table);
+		count++;
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Flags a table with specified space_id corrupted in the data dictionary
+cache
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+dict_set_corrupted_by_space(
+/*========================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+
+	table = dict_find_table_by_space(space_id);
+
+	if (!table) {
+		return(FALSE);
+	}
+
+	/* mark the table->corrupted bit only, since the caller
+	could be too deep in the stack for SYS_INDEXES update */
+	table->corrupted = TRUE;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Flags an index corrupted both in the data dictionary cache
+and in the SYS_INDEXES */
+UNIV_INTERN
+void
+dict_set_corrupted(
+/*===============*/
+	dict_index_t*	index,	/*!< in/out: index */
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	ctx)	/*!< in: context */
+{
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	byte*		buf;
+	char*		table_name;
+	const char*	status;
+	btr_cur_t	cursor;
+	bool		locked	= RW_X_LATCH == trx->dict_operation_lock_mode;
+
+	if (!locked) {
+		row_mysql_lock_data_dictionary(trx);
+	}
+
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(sync_thread_levels_empty_except_dict());
+#endif
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		index->table->corrupted = TRUE;
+	}
+
+	if (index->type & DICT_CORRUPT) {
+		/* The index was already flagged corrupted. */
+		ut_ad(!dict_index_is_clust(index) || index->table->corrupted);
+		goto func_exit;
+	}
+
+	heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+			       + sizeof(que_fork_t) + sizeof(upd_node_t)
+			       + sizeof(upd_t) + 12));
+	mtr_start(&mtr);
+	index->type |= DICT_CORRUPT;
+
+	sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
+
+	/* Find the index row in SYS_INDEXES */
+	tuple = dtuple_create(heap, 2);
+
+	dfield = dtuple_get_nth_field(tuple, 0);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+	mach_write_to_8(buf, index->table->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dfield = dtuple_get_nth_field(tuple, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+	mach_write_to_8(buf, index->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dict_index_copy_types(tuple, sys_index, 2);
+
+	btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
+				    BTR_MODIFY_LEAF,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	if (cursor.up_match == dtuple_get_n_fields(tuple)) {
+		/* UPDATE SYS_INDEXES SET TYPE=index->type
+		WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
+		ulint	len;
+		byte*	field	= rec_get_nth_field_old(
+			btr_cur_get_rec(&cursor),
+			DICT_FLD__SYS_INDEXES__TYPE, &len);
+		if (len != 4) {
+			goto fail;
+		}
+		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
+		status = "Flagged";
+	} else {
+fail:
+		status = "Unable to flag";
+	}
+
+	mtr_commit(&mtr);
+	mem_heap_empty(heap);
+	table_name = static_cast<char*>(mem_heap_alloc(heap, FN_REFLEN + 1));
+	*innobase_convert_name(
+		table_name, FN_REFLEN,
+		index->table_name, strlen(index->table_name),
+		NULL, TRUE) = 0;
+
+	ib_logf(IB_LOG_LEVEL_ERROR, "%s corruption of %s in table %s in %s",
+		status, index->name, table_name, ctx);
+
+	mem_heap_free(heap);
+
+func_exit:
+	if (!locked) {
+		row_mysql_unlock_data_dictionary(trx);
+	}
+}
+
+/**********************************************************************//**
+Flags an index corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when the index's own dictionary
+is corrupted, and we force such an index to be loaded for repair purposes */
+UNIV_INTERN
+void
+dict_set_corrupted_index_cache_only(
+/*================================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		dict_table_t*	corrupt_table;
+
+		corrupt_table = table ? table : index->table;
+		ut_ad(!index->table || !table || index->table  == table);
+
+		if (corrupt_table) {
+			corrupt_table->corrupted = TRUE;
+		}
+	}
+
+	index->type |= DICT_CORRUPT;
+}
+
+/*************************************************************************
+Sets is_corrupt on each table in dict_sys->table_LRU whose space is space_id */
+
+void
+dict_table_set_corrupt_by_space(
+/*============================*/
+	ulint	space_id,
+	ibool	need_mutex)
+{
+	dict_table_t*	table;
+	ibool		found = FALSE;
+
+	ut_a(space_id != 0 && space_id < SRV_LOG_SPACE_FIRST_ID);
+
+	if (need_mutex)
+		mutex_enter(&(dict_sys->mutex));
+
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
+	while (table) {
+		if (table->space == space_id) {
+			table->is_corrupt = TRUE;
+			found = TRUE;
+		}
+
+		table = UT_LIST_GET_NEXT(table_LRU, table);
+	}
+
+	if (need_mutex)
+		mutex_exit(&(dict_sys->mutex));
+
+	if (!found) {
+		fprintf(stderr, "InnoDB: space to be marked as "
+			"crashed was not found for id " ULINTPF ".\n",
+			space_id);
+	}
+}
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************************//**
@@ -5508,7 +5732,7 @@ dict_ind_init(void)
 	dict_table_t*		table;
 
 	/* create dummy table and index for REDUNDANT infimum and supremum */
-	table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0);
+	table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0, 0);
 	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
 			       DATA_ENGLISH | DATA_NOT_NULL, 8);
 
@@ -5517,9 +5741,11 @@ dict_ind_init(void)
 	dict_index_add_col(dict_ind_redundant, table,
 			   dict_table_get_nth_col(table, 0), 0);
 	dict_ind_redundant->table = table;
+
 	/* create dummy table and index for COMPACT infimum and supremum */
 	table = dict_mem_table_create("SYS_DUMMY2",
-				      DICT_HDR_SPACE, 1, DICT_TF_COMPACT);
+				      DICT_HDR_SPACE, 1,
+				      DICT_TF_COMPACT, 0);
 	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
 			       DATA_ENGLISH | DATA_NOT_NULL, 8);
 	dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2",
@@ -5532,6 +5758,7 @@ dict_ind_init(void)
 	dict_ind_redundant->cached = dict_ind_compact->cached = TRUE;
 }
 
+#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Frees dict_ind_redundant and dict_ind_compact. */
 static
@@ -5552,7 +5779,6 @@ dict_ind_free(void)
 	dict_mem_table_free(table);
 }
 
-#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Get index by name
 @return	index, NULL if does not exist */
@@ -5565,10 +5791,15 @@ dict_table_get_index_on_name(
 {
 	dict_index_t*	index;
 
+	/* If name is NULL, just return */
+	if (!name) {
+		return(NULL);
+	}
+
 	index = dict_table_get_first_index(table);
 
 	while (index != NULL) {
-		if (ut_strcmp(index->name, name) == 0) {
+		if (innobase_strcasecmp(index->name, name) == 0) {
 
 			return(index);
 		}
@@ -5577,62 +5808,77 @@ dict_table_get_index_on_name(
 	}
 
 	return(NULL);
-
 }
 
 /**********************************************************************//**
-Replace the index passed in with another equivalent index in the tables
-foreign key list. */
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
 UNIV_INTERN
-void
-dict_table_replace_index_in_foreign_list(
-/*=====================================*/
-	dict_table_t*	table,  /*!< in/out: table */
-	dict_index_t*	index,	/*!< in: index to be replaced */
-	const trx_t*	trx)	/*!< in: transaction handle */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+	dict_table_t*		table,  /*!< in/out: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const dict_index_t*	index)	/*!< in: index to be replaced */
 {
+	bool		found	= true;
 	dict_foreign_t*	foreign;
 
+	ut_ad(index->to_be_dropped);
+	ut_ad(index->table == table);
+
 	for (foreign = UT_LIST_GET_FIRST(table->foreign_list);
 	     foreign;
 	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
 
 		if (foreign->foreign_index == index) {
-			dict_index_t*	new_index
-				= dict_foreign_find_equiv_index(foreign);
+			ut_ad(foreign->foreign_table == index->table);
 
-			/* There must exist an alternative index if
-			check_foreigns (FOREIGN_KEY_CHECKS) is on, 
-			since ha_innobase::prepare_drop_index had done
-			the check before we reach here. */
-
-			ut_a(new_index || !trx->check_foreigns);
+			dict_index_t* new_index = dict_foreign_find_index(
+				foreign->foreign_table, col_names,
+				foreign->foreign_col_names,
+				foreign->n_fields, index,
+				/*check_charsets=*/TRUE, /*check_null=*/FALSE);
+			if (new_index) {
+				ut_ad(new_index->table == index->table);
+				ut_ad(!new_index->to_be_dropped);
+			} else {
+				found = false;
+			}
 
 			foreign->foreign_index = new_index;
 		}
 	}
 
-
 	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
 	     foreign;
 	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-		dict_index_t*	new_index;
-
 		if (foreign->referenced_index == index) {
 			ut_ad(foreign->referenced_table == index->table);
 
-			new_index = dict_foreign_find_index(
-				foreign->referenced_table,
+			dict_index_t* new_index = dict_foreign_find_index(
+				foreign->referenced_table, NULL,
 				foreign->referenced_col_names,
 				foreign->n_fields, index,
 				/*check_charsets=*/TRUE, /*check_null=*/FALSE);
-			ut_ad(new_index || !trx->check_foreigns);
-			ut_ad(!new_index || new_index->table == index->table);
+			/* There must exist an alternative index,
+			since this must have been checked earlier. */
+			if (new_index) {
+				ut_ad(new_index->table == index->table);
+				ut_ad(!new_index->to_be_dropped);
+			} else {
+				found = false;
+			}
 
 			foreign->referenced_index = new_index;
 		}
 	}
+
+	return(found);
 }
 
 /**********************************************************************//**
@@ -5676,8 +5922,8 @@ dict_table_check_for_dup_indexes(
 /*=============================*/
 	const dict_table_t*	table,	/*!< in: Check for dup indexes
 					in this table */
-	ibool			tmp_ok)	/*!< in: TRUE=allow temporary
-					index names */
+	enum check_name		check)	/*!< in: whether and when to allow
+					temporary index names */
 {
 	/* Check for duplicates, ignoring indexes that are marked
 	as to be dropped */
@@ -5693,17 +5939,32 @@ dict_table_check_for_dup_indexes(
 	index1 = UT_LIST_GET_FIRST(table->indexes);
 
 	do {
-		ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX);
-
-		index2 = UT_LIST_GET_NEXT(indexes, index1);
-
-		while (index2) {
-
-			if (!index2->to_be_dropped) {
-				ut_ad(ut_strcmp(index1->name, index2->name));
+		if (*index1->name == TEMP_INDEX_PREFIX) {
+			ut_a(!dict_index_is_clust(index1));
+
+			switch (check) {
+			case CHECK_ALL_COMPLETE:
+				ut_error;
+			case CHECK_ABORTED_OK:
+				switch (dict_index_get_online_status(index1)) {
+				case ONLINE_INDEX_COMPLETE:
+				case ONLINE_INDEX_CREATION:
+					ut_error;
+					break;
+				case ONLINE_INDEX_ABORTED:
+				case ONLINE_INDEX_ABORTED_DROPPED:
+					break;
+				}
+				/* fall through */
+			case CHECK_PARTIAL_OK:
+				break;
 			}
+		}
 
-			index2 = UT_LIST_GET_NEXT(indexes, index2);
+		for (index2 = UT_LIST_GET_NEXT(indexes, index1);
+		     index2 != NULL;
+		     index2 = UT_LIST_GET_NEXT(indexes, index2)) {
+			ut_ad(ut_strcmp(index1->name, index2->name));
 		}
 
 		index1 = UT_LIST_GET_NEXT(indexes, index1);
@@ -5711,7 +5972,270 @@ dict_table_check_for_dup_indexes(
 }
 #endif /* UNIV_DEBUG */
 
-/**************************************************************************
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+dberr_t
+dict_table_schema_check(
+/*====================*/
+	dict_table_schema_t*	req_schema,	/*!< in/out: required table
+						schema */
+	char*			errstr,		/*!< out: human readable error
+						message if != DB_SUCCESS is
+						returned */
+	size_t			errstr_sz)	/*!< in: errstr size */
+{
+	char		buf[MAX_FULL_NAME_LEN];
+	dict_table_t*	table;
+	ulint		i;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	table = dict_table_get_low(req_schema->table_name);
+
+	if (table == NULL) {
+		/* no such table */
+
+		ut_snprintf(errstr, errstr_sz,
+			    "Table %s not found.",
+			    ut_format_name(req_schema->table_name,
+					   TRUE, buf, sizeof(buf)));
+
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	if (table->ibd_file_missing) {
+		/* missing tablespace */
+
+		ut_snprintf(errstr, errstr_sz,
+			    "Tablespace for table %s is missing.",
+			    ut_format_name(req_schema->table_name,
+					   TRUE, buf, sizeof(buf)));
+
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	if ((ulint) table->n_def - DATA_N_SYS_COLS != req_schema->n_cols) {
+		/* the table has a different number of columns than
+		required */
+
+		ut_snprintf(errstr, errstr_sz,
+			    "%s has %d columns but should have %lu.",
+			    ut_format_name(req_schema->table_name,
+					   TRUE, buf, sizeof(buf)),
+			    table->n_def - DATA_N_SYS_COLS,
+			    req_schema->n_cols);
+
+		return(DB_ERROR);
+	}
+
+	/* For each column from req_schema->columns[] search
+	whether it is present in table->cols[].
+	The following algorithm is O(n_cols^2), but is optimized to
+	be O(n_cols) if the columns are in the same order in both arrays. */
+
+	for (i = 0; i < req_schema->n_cols; i++) {
+		ulint	j;
+
+		char	req_type[64];
+		char	actual_type[64];
+
+		/* check if i'th column is the same in both arrays */
+		if (innobase_strcasecmp(req_schema->columns[i].name,
+			       dict_table_get_col_name(table, i)) == 0) {
+
+			/* we found the column in table->cols[] quickly */
+			j = i;
+		} else {
+
+			/* columns in both arrays are not in the same order,
+			do a full scan of the second array */
+			for (j = 0; j < table->n_def; j++) {
+				const char*	name;
+
+				name = dict_table_get_col_name(table, j);
+
+				if (innobase_strcasecmp(name,
+					req_schema->columns[i].name) == 0) {
+
+					/* found the column on j'th
+					position */
+					break;
+				}
+			}
+
+			if (j == table->n_def) {
+
+				ut_snprintf(errstr, errstr_sz,
+					    "required column %s "
+					    "not found in table %s.",
+					    req_schema->columns[i].name,
+					    ut_format_name(
+						    req_schema->table_name,
+						    TRUE, buf, sizeof(buf)));
+
+				return(DB_ERROR);
+			}
+		}
+
+		/* we found a column with the same name on j'th position,
+		compare column types and flags */
+
+		dtype_sql_name(req_schema->columns[i].mtype,
+			       req_schema->columns[i].prtype_mask,
+			       req_schema->columns[i].len,
+			       req_type, sizeof(req_type));
+
+		dtype_sql_name(table->cols[j].mtype,
+			       table->cols[j].prtype,
+			       table->cols[j].len,
+			       actual_type, sizeof(actual_type));
+
+		/* check length for exact match */
+		if (req_schema->columns[i].len != table->cols[j].len) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s in table %s is %s "
+				    "but should be %s (length mismatch).",
+				    req_schema->columns[i].name,
+				    ut_format_name(req_schema->table_name,
+						   TRUE, buf, sizeof(buf)),
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+
+		/* check mtype for exact match */
+		if (req_schema->columns[i].mtype != table->cols[j].mtype) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s in table %s is %s "
+				    "but should be %s (type mismatch).",
+				    req_schema->columns[i].name,
+				    ut_format_name(req_schema->table_name,
+						   TRUE, buf, sizeof(buf)),
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+
+		/* check whether required prtype mask is set */
+		if (req_schema->columns[i].prtype_mask != 0
+		    && (table->cols[j].prtype
+			& req_schema->columns[i].prtype_mask)
+		       != req_schema->columns[i].prtype_mask) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s in table %s is %s "
+				    "but should be %s (flags mismatch).",
+				    req_schema->columns[i].name,
+				    ut_format_name(req_schema->table_name,
+						   TRUE, buf, sizeof(buf)),
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+	}
+
+	if (req_schema->n_foreign != UT_LIST_GET_LEN(table->foreign_list)) {
+		ut_snprintf(
+			errstr, errstr_sz,
+			"Table %s has %lu foreign key(s) pointing to other "
+			"tables, but it must have %lu.",
+			ut_format_name(req_schema->table_name,
+				       TRUE, buf, sizeof(buf)),
+			UT_LIST_GET_LEN(table->foreign_list),
+			req_schema->n_foreign);
+		return(DB_ERROR);
+	}
+
+	if (req_schema->n_referenced != UT_LIST_GET_LEN(table->referenced_list)) {
+		ut_snprintf(
+			errstr, errstr_sz,
+			"There are %lu foreign key(s) pointing to %s, "
+			"but there must be %lu.",
+			UT_LIST_GET_LEN(table->referenced_list),
+			ut_format_name(req_schema->table_name,
+				       TRUE, buf, sizeof(buf)),
+			req_schema->n_referenced);
+		return(DB_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two
+strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+UNIV_INTERN
+void
+dict_fs2utf8(
+/*=========*/
+	const char*	db_and_table,	/*!< in: database and table names,
+					e.g. d@i1b/a@q1b@1Kc */
+	char*		db_utf8,	/*!< out: database name, e.g. dцb */
+	size_t		db_utf8_size,	/*!< in: db_utf8 size */
+	char*		table_utf8,	/*!< out: table name, e.g. aюbØc */
+	size_t		table_utf8_size)/*!< in: table_utf8 size */
+{
+	char	db[MAX_DATABASE_NAME_LEN + 1];
+	ulint	db_len;
+	uint	errors;
+
+	db_len = dict_get_db_name_len(db_and_table);
+
+	ut_a(db_len <= sizeof(db));
+
+	memcpy(db, db_and_table, db_len);
+	db[db_len] = '\0';
+
+	strconvert(
+		&my_charset_filename, db, db_len,
+		system_charset_info, db_utf8, db_utf8_size,
+		&errors);
+
+	/* convert each # to @0023 in table name and store the result in buf */
+	const char*	table = dict_remove_db_name(db_and_table);
+	const char*	table_p;
+	char		buf[MAX_TABLE_NAME_LEN * 5 + 1];
+	char*		buf_p;
+	for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) {
+		if (table_p[0] != '#') {
+			buf_p[0] = table_p[0];
+			buf_p++;
+		} else {
+			buf_p[0] = '@';
+			buf_p[1] = '0';
+			buf_p[2] = '0';
+			buf_p[3] = '2';
+			buf_p[4] = '3';
+			buf_p += 5;
+		}
+		ut_a((size_t) (buf_p - buf) < sizeof(buf));
+	}
+	buf_p[0] = '\0';
+
+	errors = 0;
+	strconvert(
+		&my_charset_filename, buf, buf_p - buf,
+		system_charset_info, table_utf8, table_utf8_size,
+		&errors);
+
+	if (errors != 0) {
+		ut_snprintf(table_utf8, table_utf8_size, "%s%s",
+			    srv_mysql50_table_name_prefix, table);
+	}
+}
+
+/**********************************************************************//**
 Closes the data dictionary module. */
 UNIV_INTERN
 void
@@ -5725,12 +6249,14 @@ dict_close(void)
 	for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) {
 		dict_table_t*	table;
 
-		table = HASH_GET_FIRST(dict_sys->table_hash, i);
+		table = static_cast<dict_table_t*>(
+			HASH_GET_FIRST(dict_sys->table_hash, i));
 
 		while (table) {
 			dict_table_t*	prev_table = table;
 
-			table = HASH_GET_NEXT(name_hash, prev_table);
+			table = static_cast<dict_table_t*>(
+				HASH_GET_NEXT(name_hash, prev_table));
 #ifdef UNIV_DEBUG
 			ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N);
 #endif
@@ -5756,7 +6282,9 @@ dict_close(void)
 	rw_lock_free(&dict_operation_lock);
 	memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock));
 
-	mutex_free(&dict_foreign_err_mutex);
+	if (!srv_read_only_mode) {
+		mutex_free(&dict_foreign_err_mutex);
+	}
 
 	mem_free(dict_sys);
 	dict_sys = NULL;
@@ -5766,224 +6294,364 @@ dict_close(void)
 	}
 }
 
+#ifdef UNIV_DEBUG
 /**********************************************************************//**
-Find a table in dict_sys->table_LRU list with specified space id
-@return table if found, NULL if not */
+Validate the dictionary table LRU list.
+@return TRUE if valid  */
 static
-dict_table_t*
-dict_find_table_by_space(
-/*=====================*/
-	ulint	space_id)		/*!< in: space ID */
+ibool
+dict_lru_validate(void)
+/*===================*/
 {
-	dict_table_t*   table;
-	ulint		num_item;
-	ulint		count = 0;
+	dict_table_t*	table;
 
-	ut_ad(space_id > 0);
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
-	num_item =  UT_LIST_GET_LEN(dict_sys->table_LRU);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	/* This function intentionally does not acquire mutex as it is used
-	by error handling code in deep call stack as last means to avoid
-	killing the server, so it worth to risk some consequencies for
-	the action. */
-	while (table && count < num_item) {
-		if (table->space == space_id) {
-			return(table);
-		}
+		ut_a(table->can_be_evicted);
+	}
 
-		table = UT_LIST_GET_NEXT(table_LRU, table);
-		count++;
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+		ut_a(!table->can_be_evicted);
 	}
 
-	return(NULL);
+	return(TRUE);
 }
 
 /**********************************************************************//**
-Flags a table with specified space_id corrupted in the data dictionary
-cache
-@return TRUE if successful */
-UNIV_INTERN
+Check if a table exists in the dict table LRU list.
+@return TRUE if table found in LRU list */
+static
 ibool
-dict_set_corrupted_by_space(
-/*========================*/
-	ulint	space_id)		/*!< in: space ID */
+dict_lru_find_table(
+/*================*/
+	const dict_table_t*	find_table)	/*!< in: table to find */
 {
-	dict_table_t*   table;
+	dict_table_t*		table;
 
-	table = dict_find_table_by_space(space_id);
+	ut_ad(find_table != NULL);
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-	if (!table) {
-		return(FALSE);
-	}
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	/* mark the table->corrupted bit only, since the caller
-	could be too deep in the stack for SYS_INDEXES update */
-	table->corrupted = TRUE;
+		ut_a(table->can_be_evicted);
 
-	return(TRUE);
+		if (table == find_table) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
 }
 
 /**********************************************************************//**
-Flags an index corrupted both in the data dictionary cache
-and in the SYS_INDEXES */
-UNIV_INTERN
-void
-dict_set_corrupted(
-/*===============*/
-	dict_index_t*	index)		/*!< in/out: index */
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found in non-LRU list */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+	const dict_table_t*	find_table)	/*!< in: table to find */
 {
-	mem_heap_t*	heap;
-	mtr_t		mtr;
-	dict_index_t*	sys_index;
-	dtuple_t*	tuple;
-	dfield_t*	dfield;
-	byte*		buf;
-	const char*	status;
-	btr_cur_t	cursor;
+	dict_table_t*		table;
 
-	ut_ad(index);
+	ut_ad(find_table != NULL);
 	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
 
-#ifdef UNIV_SYNC_DEBUG
-        ut_ad(sync_thread_levels_empty_except_dict());
-#endif
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	/* Mark the table as corrupted only if the clustered index
-	is corrupted */
-	if (dict_index_is_clust(index)) {
-		index->table->corrupted = TRUE;
+		ut_a(!table->can_be_evicted);
+
+		if (table == find_table) {
+			return(TRUE);
+		}
 	}
 
-	if (UNIV_UNLIKELY(dict_index_is_corrupted(index))) {
-		/* The index was already flagged corrupted. */
-		ut_ad(index->table->corrupted);
-		return;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Check whether the first n_cols fields of an index are the columns in the
+array, in the same order, with no column prefixes and, if types_idx is
+given, with column types equal to those of the same fields in types_idx.
+@return true if the index qualifies, otherwise false */
+UNIV_INTERN
+bool
+dict_foreign_qualify_index(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	index,	/*!< in: index to check */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+{
+	if (dict_index_get_n_fields(index) < n_cols) {
+		return(false);
 	}
 
-	heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
-			       + sizeof(que_fork_t) + sizeof(upd_node_t)
-			       + sizeof(upd_t) + 12));
-	mtr_start(&mtr);
-	index->type |= DICT_CORRUPT;
+	for (ulint i = 0; i < n_cols; i++) {
+		dict_field_t*	field;
+		const char*	col_name;
+		ulint		col_no;
 
-	sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
+		field = dict_index_get_nth_field(index, i);
+		col_no = dict_col_get_no(field->col);
 
-	/* Find the index row in SYS_INDEXES */
-	tuple = dtuple_create(heap, 2);
-
-	dfield = dtuple_get_nth_field(tuple, 0);
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->table->id);
-	dfield_set_data(dfield, buf, 8);
+		if (field->prefix_len != 0) {
+			/* We do not accept column prefix
+			indexes here */
+			return(false);
+		}
 
-	dfield = dtuple_get_nth_field(tuple, 1);
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->id);
-	dfield_set_data(dfield, buf, 8);
+		if (check_null
+		    && (field->col->prtype & DATA_NOT_NULL)) {
+			return(false);
+		}
 
-	dict_index_copy_types(tuple, sys_index, 2);
+		col_name = col_names
+			? col_names[col_no]
+			: dict_table_get_col_name(table, col_no);
 
-	btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
-				    BTR_MODIFY_LEAF,
-				    &cursor, 0, __FILE__, __LINE__, &mtr);
+		if (0 != innobase_strcasecmp(columns[i], col_name)) {
+			return(false);
+		}
 
-	if (cursor.up_match == dtuple_get_n_fields(tuple)) {
-		/* UPDATE SYS_INDEXES SET TYPE=index->type
-		WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
-		ulint	len;
-		byte*	field	= rec_get_nth_field_old(
-			btr_cur_get_rec(&cursor),
-			DICT_SYS_INDEXES_TYPE_FIELD, &len);
-		if (len != 4) {
-			goto fail;
+		if (types_idx && !cmp_cols_are_equal(
+			    dict_index_get_nth_col(index, i),
+			    dict_index_get_nth_col(types_idx, i),
+			    check_charsets)) {
+			return(false);
 		}
-		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
-		status = "  InnoDB: Flagged corruption of ";
-	} else {
-fail:
-		status = "  InnoDB: Unable to flag corruption of ";
 	}
 
-	mtr_commit(&mtr);
-	mem_heap_free(heap);
+	return(true);
+}
 
-	ut_print_timestamp(stderr);
-	fputs(status, stderr);
-	dict_index_name_print(stderr, NULL, index);
-	putc('\n', stderr);
+/*********************************************************************//**
+Update the state of compression failure padding heuristics. This is
+called whenever a compression operation succeeds or fails.
+The caller must be holding info->mutex */
+static
+void
+dict_index_zip_pad_update(
+/*======================*/
+	zip_pad_info_t*	info,	/*!< in/out: info to be updated */
+	ulint	zip_threshold)	/*!< in: zip threshold value */
+{
+	ulint	total;
+	ulint	fail_pct;
+
+	ut_ad(info);
+
+	total = info->success + info->failure;
+
+	ut_ad(total > 0);
+
+	if(zip_threshold == 0) {
+		/* User has just disabled the padding. */
+		return;
+	}
+
+	if (total < ZIP_PAD_ROUND_LEN) {
+		/* We are in middle of a round. Do nothing. */
+		return;
+	}
+
+	/* We are at a 'round' boundary. Reset the values but first
+	calculate fail rate for our heuristic. */
+	fail_pct = (info->failure * 100) / total;
+	info->failure = 0;
+	info->success = 0;
+
+	if (fail_pct > zip_threshold) {
+		/* Compression failures are more than user defined
+		threshold. Increase the pad size to reduce chances of
+		compression failures. */
+		ut_ad(info->pad % ZIP_PAD_INCR == 0);
+
+		/* Only do increment if it won't increase padding
+		beyond max pad size. */
+		if (info->pad + ZIP_PAD_INCR
+		    < (UNIV_PAGE_SIZE * zip_pad_max) / 100) {
+#ifdef HAVE_ATOMIC_BUILTINS
+			/* Use atomics even though we have the mutex.
+			This is to ensure that we are able to read
+			info->pad atomically where atomics are
+			supported. */
+			os_atomic_increment_ulint(&info->pad, ZIP_PAD_INCR);
+#else /* HAVE_ATOMIC_BUILTINS */
+			info->pad += ZIP_PAD_INCR;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+			MONITOR_INC(MONITOR_PAD_INCREMENTS);
+		}
+
+		info->n_rounds = 0;
+
+	} else {
+		/* Failure rate was OK. Another successful round
+		completed. */
+		++info->n_rounds;
+
+		/* If enough successful rounds are completed with
+		compression failure rate in control, decrease the
+		padding. */
+		if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT
+		    && info->pad > 0) {
+
+			ut_ad(info->pad % ZIP_PAD_INCR == 0);
+#ifdef HAVE_ATOMIC_BUILTINS
+			/* Use atomics even though we have the mutex.
+			This is to ensure that we are able to read
+			info->pad atomically where atomics are
+			supported. */
+			os_atomic_decrement_ulint(&info->pad, ZIP_PAD_INCR);
+#else /* HAVE_ATOMIC_BUILTINS */
+			info->pad -= ZIP_PAD_INCR;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+			info->n_rounds = 0;
+
+			MONITOR_INC(MONITOR_PAD_DECREMENTS);
+		}
+	}
 }
 
-/**********************************************************************//**
-Flags an index corrupted in the data dictionary cache only. This
-is used mostly to mark a corrupted index when index's own dictionary
-is corrupted, and we force to load such index for repair purpose */
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
 UNIV_INTERN
 void
-dict_set_corrupted_index_cache_only(
-/*================================*/
-	dict_index_t*	index,		/*!< in/out: index */
-	dict_table_t*	table)		/*!< in/out: table */
+dict_index_zip_success(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
 {
 	ut_ad(index);
-	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
 
-	/* Mark the table as corrupted only if the clustered index
-	is corrupted */
-	if (dict_index_is_clust(index)) {
-		dict_table_t*	corrupt_table;
+	ulint zip_threshold = zip_failure_threshold_pct;
+	if (!zip_threshold) {
+		/* Disabled by user. */
+		return;
+	}
 
-		corrupt_table = table ? table : index->table;
-		ut_ad(!index->table || !table || index->table  == table);
+	os_fast_mutex_lock(&index->zip_pad.mutex);
+	++index->zip_pad.success;
+	dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+	os_fast_mutex_unlock(&index->zip_pad.mutex);
+}
 
-		if (corrupt_table) {
-			corrupt_table->corrupted = TRUE;
-		}
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_failure(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
+{
+	ut_ad(index);
+
+	ulint zip_threshold = zip_failure_threshold_pct;
+	if (!zip_threshold) {
+		/* Disabled by user. */
+		return;
 	}
 
-	index->type |= DICT_CORRUPT;
+	os_fast_mutex_lock(&index->zip_pad.mutex);
+	++index->zip_pad.failure;
+	dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+	os_fast_mutex_unlock(&index->zip_pad.mutex);
 }
 
-/*************************************************************************
-set is_corrupt flag by space_id*/
 
-void
-dict_table_set_corrupt_by_space(
-/*============================*/
-	ulint	space_id,
-	ibool	need_mutex)
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which page might not compress */
+UNIV_INTERN
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+	dict_index_t*	index)	/*!< in: index for which page size
+				is requested */
 {
-	dict_table_t*	table;
-	ibool		found = FALSE;
+	ulint	pad;
+	ulint	min_sz;
+	ulint	sz;
 
-	ut_a(!trx_sys_sys_space(space_id) && space_id < SRV_LOG_SPACE_FIRST_ID);
+	ut_ad(index);
 
-	if (need_mutex)
-		mutex_enter(&(dict_sys->mutex));
+	if (!zip_failure_threshold_pct) {
+		/* Disabled by user. */
+		return(UNIV_PAGE_SIZE);
+	}
 
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	/* We use atomics to read index->zip_pad.pad. Here we use zero
+	as increment as we are not changing the value of the 'pad'. On
+	platforms where atomics are not available we grab the mutex. */
 
-	while (table) {
-		if (table->space == space_id) {
-			table->is_corrupt = TRUE;
-			found = TRUE;
-		}
+#ifdef HAVE_ATOMIC_BUILTINS
+	pad = os_atomic_increment_ulint(&index->zip_pad.pad, 0);
+#else /* HAVE_ATOMIC_BUILTINS */
+	os_fast_mutex_lock(&index->zip_pad.mutex);
+	pad = index->zip_pad.pad;
+	os_fast_mutex_unlock(&index->zip_pad.mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
-		table = UT_LIST_GET_NEXT(table_LRU, table);
-	}
+	ut_ad(pad < UNIV_PAGE_SIZE);
+	sz = UNIV_PAGE_SIZE - pad;
 
-	if (need_mutex)
-		mutex_exit(&(dict_sys->mutex));
+	/* Min size allowed by user. */
+	ut_ad(zip_pad_max < 100);
+	min_sz = (UNIV_PAGE_SIZE * (100 - zip_pad_max)) / 100;
 
-	if (!found) {
-		fprintf(stderr, "InnoDB: space to be marked as "
-			"crashed was not found for id %lu.\n",
-			(ulong) space_id);
-	}
+	return(ut_max(sz, min_sz));
+}
+
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name. */
+UNIV_INTERN
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+	ulint	table_flag)		/*!< in: row format setting */
+{
+	switch (dict_tf_get_rec_format(table_flag)) {
+	case REC_FORMAT_REDUNDANT:
+		return("ROW_TYPE_REDUNDANT");
+	case REC_FORMAT_COMPACT:
+		return("ROW_TYPE_COMPACT");
+	case REC_FORMAT_COMPRESSED:
+		return("ROW_TYPE_COMPRESSED");
+	case REC_FORMAT_DYNAMIC:
+		return("ROW_TYPE_DYNAMIC");
+	}
+
+	ut_error;
+	return(0);
 }
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.cc
index 0de6698f86c..d731aeca55f 100644
--- a/storage/xtradb/dict/dict0load.c
+++ b/storage/xtradb/dict/dict0load.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0load.c
+@file dict/dict0load.cc
 Loads to the memory cache database object definitions
 from dictionary tables
 
@@ -33,18 +33,21 @@ Created 4/24/1996 Heikki Tuuri
 
 #include "btr0pcur.h"
 #include "btr0btr.h"
-#include "btr0sea.h"
 #include "page0page.h"
 #include "mach0data.h"
 #include "dict0dict.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "rem0cmp.h"
 #include "srv0start.h"
 #include "srv0srv.h"
+#include "dict0crea.h"
+#include "dict0priv.h"
 #include "ha_prototypes.h" /* innobase_casedn_str() */
-#include "trx0sys.h"
+#include "fts0priv.h"
 
-/** Following are six InnoDB system tables */
+/** Following are the InnoDB system tables. The positions in
+this array are referenced by enum dict_system_table_id. */
 static const char* SYSTEM_TABLE_NAME[] = {
 	"SYS_TABLES",
 	"SYS_INDEXES",
@@ -52,13 +55,15 @@ static const char* SYSTEM_TABLE_NAME[] = {
 	"SYS_FIELDS",
 	"SYS_FOREIGN",
 	"SYS_FOREIGN_COLS",
-	"SYS_STATS"
+	"SYS_TABLESPACES",
+	"SYS_DATAFILES"
 };
 
 /* If this flag is TRUE, then we will load the cluster index's (and tables')
 metadata even if it is marked as "corrupted". */
 UNIV_INTERN my_bool     srv_load_corrupted = FALSE;
 
+#ifdef UNIV_DEBUG
 /****************************************************************//**
 Compare the name of an index column.
 @return	TRUE if the i'th column of index is 'name'. */
@@ -77,6 +82,7 @@ name_of_col_is(
 
 	return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0);
 }
+#endif /* UNIV_DEBUG */
 
 /********************************************************************//**
 Finds the first table name in the given database.
@@ -105,9 +111,9 @@ dict_get_first_table_name_in_db(
 
 	mtr_start(&mtr);
 
-	sys_tables = dict_table_get_low("SYS_TABLES", DICT_ERR_IGNORE_NONE);
+	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -130,7 +136,8 @@ loop:
 		return(NULL);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	if (len < strlen(name)
 	    || ut_memcmp(name, field, strlen(name)) != 0) {
@@ -178,9 +185,10 @@ dict_print(void)
 	/* Enlarge the fatal semaphore wait timeout during the InnoDB table
 	monitor printout */
 
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold += SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_increment_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold,
+		SRV_SEMAPHORE_WAIT_EXTENSION);
 
 	heap = mem_heap_create(1000);
 	mutex_enter(&(dict_sys->mutex));
@@ -191,14 +199,13 @@ dict_print(void)
 	while (rec) {
 		const char* err_msg;
 
-		err_msg = dict_process_sys_tables_rec(
-			heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE
-			| DICT_TABLE_UPDATE_STATS);
-
-		mtr_commit(&mtr);
+		err_msg = static_cast<const char*>(
+			dict_process_sys_tables_rec_and_mtr_commit(
+				heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE,
+				&mtr));
 
 		if (!err_msg) {
-			dict_table_print_low(table);
+			dict_table_print(table);
 		} else {
 			ut_print_timestamp(stderr);
 			fprintf(stderr, "  InnoDB: %s\n", err_msg);
@@ -215,12 +222,12 @@ dict_print(void)
 	mem_heap_free(heap);
 
 	/* Restore the fatal semaphore wait timeout */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold -= SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_decrement_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold,
+		SRV_SEMAPHORE_WAIT_EXTENSION);
 }
 
-
 /********************************************************************//**
 This function gets the next system table record as it scans the table.
 @return	the next record if found, NULL if end of scan */
@@ -254,7 +261,7 @@ dict_getnext_system_low(
 }
 
 /********************************************************************//**
-This function opens a system table, and return the first record.
+This function opens a system table, and returns the first record.
 @return	first record of the system table */
 UNIV_INTERN
 const rec_t*
@@ -271,13 +278,12 @@ dict_startscan_system(
 
 	ut_a(system_id < SYS_NUM_SYSTEM_TABLES);
 
-	system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id],
-					  DICT_ERR_IGNORE_NONE);
+	system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id]);
 
 	clust_index = UT_LIST_GET_FIRST(system_table->indexes);
 
-	btr_pcur_open_at_index_side(TRUE, clust_index, BTR_SEARCH_LEAF, pcur,
-				    TRUE, mtr);
+	btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur,
+				    true, 0, mtr);
 
 	rec = dict_getnext_system_low(pcur, mtr);
 
@@ -295,16 +301,17 @@ dict_getnext_system(
 					to the record */
 	mtr_t*		mtr)		/*!< in: the mini-transaction */
 {
-        const rec_t*	rec;
+	const rec_t*	rec;
 
 	/* Restore the position */
-        btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
 
 	/* Get the next record */
-        rec = dict_getnext_system_low(pcur, mtr);
+	rec = dict_getnext_system_low(pcur, mtr);
 
 	return(rec);
 }
+
 /********************************************************************//**
 This function processes one SYS_TABLES record and populate the dict_table_t
 struct for the table. Extracted out of dict_print() to be used by
@@ -312,57 +319,54 @@ both monitor table output and information schema innodb_sys_tables output.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_tables_rec(
-/*========================*/
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
 	mem_heap_t*	heap,		/*!< in/out: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_TABLES record */
 	dict_table_t**	table,		/*!< out: dict_table_t to fill */
-	dict_table_info_t status)	/*!< in: status bit controls
+	dict_table_info_t status,	/*!< in: status bit controls
 					options such as whether we shall
 					look for dict_table_t from cache
 					first */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction,
+					will be committed */
 {
 	ulint		len;
 	const char*	field;
 	const char*	err_msg = NULL;
 	char*		table_name;
 
-	field = (const char*) rec_get_nth_field_old(rec, 0, &len);
+	field = (const char*) rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	ut_a(!rec_get_deleted_flag(rec, 0));
 
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+
 	/* Get the table name */
 	table_name = mem_heap_strdupl(heap, field, len);
 
 	/* If DICT_TABLE_LOAD_FROM_CACHE is set, first check
-	whether there is cached dict_table_t struct first */
+	whether there is cached dict_table_t struct */
 	if (status & DICT_TABLE_LOAD_FROM_CACHE) {
-		*table = dict_table_get_low(table_name, DICT_ERR_IGNORE_NONE);
+
+		/* Commit before loading the table again */
+		mtr_commit(mtr);
+
+		*table = dict_table_get_low(table_name);
 
 		if (!(*table)) {
 			err_msg = "Table not found in cache";
 		}
 	} else {
 		err_msg = dict_load_table_low(table_name, rec, table);
+		mtr_commit(mtr);
 	}
 
 	if (err_msg) {
 		return(err_msg);
 	}
 
-	if ((status & DICT_TABLE_UPDATE_STATS)
-	    && srv_stats_auto_update
-	    && dict_table_get_first_index(*table)) {
-
-		/* Update statistics if DICT_TABLE_UPDATE_STATS
-		is set */
-		dict_update_statistics(
-			*table,
-			FALSE, /* update even if initialized */
-			FALSE,
-			FALSE /* update even if not changed too much */);
-	}
-
 	return(NULL);
 }
 
@@ -383,7 +387,7 @@ dict_process_sys_indexes_rec(
 	const char*	err_msg;
 	byte*		buf;
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 
 	/* Parse the record, and get "dict_index_t" struct filled */
 	err_msg = dict_load_index_low(buf, NULL,
@@ -393,6 +397,7 @@ dict_process_sys_indexes_rec(
 
 	return(err_msg);
 }
+
 /********************************************************************//**
 This function parses a SYS_COLUMNS record and populate a dict_column_t
 structure with the information from the record.
@@ -415,6 +420,7 @@ dict_process_sys_columns_rec(
 
 	return(err_msg);
 }
+
 /********************************************************************//**
 This function parses a SYS_FIELDS record and populates a dict_field_t
 structure with the information from the record.
@@ -435,13 +441,13 @@ dict_process_sys_fields_rec(
 	byte*		last_index_id;
 	const char*	err_msg;
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 
-	last_index_id = mem_heap_alloc(heap, 8);
+	last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(last_index_id, last_id);
 
 	err_msg = dict_load_field_low(buf, NULL, sys_field,
-				      pos, last_index_id, heap, rec, NULL, 0);
+				      pos, last_index_id, heap, rec);
 
 	*index_id = mach_read_from_8(buf);
 
@@ -449,7 +455,6 @@ dict_process_sys_fields_rec(
 
 }
 
-//#ifdef FOREIGN_NOT_USED
 /********************************************************************//**
 This function parses a SYS_FOREIGN record and populate a dict_foreign_t
 structure with the information from the record. For detail information
@@ -468,54 +473,60 @@ dict_process_sys_foreign_rec(
 	const byte*	field;
 	ulint		n_fields_and_type;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_FOREIGN");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 6)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) {
 		return("wrong number of columns in SYS_FOREIGN record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*ID*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__ID, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 err_len:
 		return("incorrect column length in SYS_FOREIGN");
 	}
-	
+
 	/* This recieves a dict_foreign_t* that points to a stack variable.
 	So mem_heap_free(foreign->heap) is not used as elsewhere.
 	Since the heap used here is freed elsewhere, foreign->heap
 	is not assigned. */
 	foreign->id = mem_heap_strdupl(heap, (const char*) field, len);
 
-	rec_get_nth_field_offs_old(rec, 1/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 2/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
 	/* The _lookup versions of the referenced and foreign table names
 	 are not assigned since they are not used in this dict_foreign_t */
 
-	field = rec_get_nth_field_old(rec, 3/*FOR_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	foreign->foreign_table_name = mem_heap_strdupl(
 		heap, (const char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 4/*REF_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	foreign->referenced_table_name = mem_heap_strdupl(
 		heap, (const char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 5/*N_COLS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	n_fields_and_type = mach_read_from_4(field);
@@ -525,9 +536,7 @@ err_len:
 
 	return(NULL);
 }
-//#endif  /* FOREIGN_NOT_USED */
 
-//#ifdef FOREIGN_NOT_USED
 /********************************************************************//**
 This function parses a SYS_FOREIGN_COLS record and extract necessary
 information from the record and return to caller.
@@ -547,125 +556,181 @@ dict_process_sys_foreign_col_rec(
 	ulint		len;
 	const byte*	field;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_FOREIGN_COLS");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 6)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) {
 		return("wrong number of columns in SYS_FOREIGN_COLS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*ID*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 err_len:
 		return("incorrect column length in SYS_FOREIGN_COLS");
 	}
 	*name = mem_heap_strdupl(heap, (char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	*pos = mach_read_from_4(field);
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*FOR_COL_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	*for_col_name = mem_heap_strdupl(heap, (char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 5/*REF_COL_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	*ref_col_name = mem_heap_strdupl(heap, (char*) field, len);
 
 	return(NULL);
 }
-//#endif  /* FOREIGN_NOT_USED */
 
 /********************************************************************//**
-This function parses a SYS_STATS record and extract necessary
-information from the record and return to caller.
+This function parses a SYS_TABLESPACES record, extracts necessary
+information from the record and returns it to the caller.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_stats_rec(
-/*=============================*/
-	mem_heap_t*	heap __attribute__((unused)),		/*!< in/out: heap memory */
-	const rec_t*	rec,		/*!< in: current SYS_STATS rec */
-	index_id_t*	index_id,	/*!< out: INDEX_ID */
-	ulint*		key_cols,	/*!< out: KEY_COLS */
-	ib_uint64_t*	diff_vals,	/*!< out: DIFF_VALS */
-	ib_uint64_t*	non_null_vals)	/*!< out: NON_NULL_VALS */
+dict_process_sys_tablespaces(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_TABLESPACES rec */
+	ulint*		space,		/*!< out: space id */
+	const char**	name,		/*!< out: tablespace name */
+	ulint*		flags)		/*!< out: tablespace flags */
 {
 	ulint		len;
 	const byte*	field;
-	ulint		n_fields;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
-		return("delete-marked record in SYS_STATS");
-	}
+	/* Initialize the output values */
+	*space = ULINT_UNDEFINED;
+	*name = NULL;
+	*flags = ULINT_UNDEFINED;
 
-	n_fields = rec_get_n_fields_old(rec);
+	if (rec_get_deleted_flag(rec, 0)) {
+		return("delete-marked record in SYS_TABLESPACES");
+	}
 
-	if (UNIV_UNLIKELY(n_fields < 5)) {
-		return("wrong number of columns in SYS_STATS record");
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) {
+		return("wrong number of columns in SYS_TABLESPACES record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*INDEX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
+	if (len != DICT_FLD_LEN_SPACE) {
 err_len:
-		return("incorrect column length in SYS_STATS");
+		return("incorrect column length in SYS_TABLESPACES");
 	}
-	*index_id = mach_read_from_8(field);
+	*space = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 1/*KEY_COLS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	*key_cols = mach_read_from_4(field);
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
+	*name = mem_heap_strdupl(heap, (char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 4/*DIFF_VALS*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
+	if (len != DICT_FLD_LEN_FLAGS) {
 		goto err_len;
 	}
-	*diff_vals = mach_read_from_8(field);
+	*flags = mach_read_from_4(field);
 
-	if (n_fields < 6) {
-		*non_null_vals = ((ib_uint64_t)(-1));
-	} else {
-		field = rec_get_nth_field_old(rec, 5/*NON_NULL_VALS*/, &len);
-		if (UNIV_UNLIKELY(len != 8)) {
-			goto err_len;
-		}
-		*non_null_vals = mach_read_from_8(field);
+	return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_DATAFILES rec */
+	ulint*		space,		/*!< out: space id */
+	const char**	path)		/*!< out: datafile paths */
+{
+	ulint		len;
+	const byte*	field;
+
+	if (rec_get_deleted_flag(rec, 0)) {
+		return("delete-marked record in SYS_DATAFILES");
+	}
+
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) {
+		return("wrong number of columns in SYS_DATAFILES record");
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
+	if (len != DICT_FLD_LEN_SPACE) {
+err_len:
+		return("incorrect column length in SYS_DATAFILES");
+	}
+	*space = mach_read_from_4(field);
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+		goto err_len;
+	}
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+		goto err_len;
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
+		goto err_len;
 	}
+	*path = mem_heap_strdupl(heap, (char*) field, len);
 
 	return(NULL);
 }
+
 /********************************************************************//**
-Determine the flags of a table described in SYS_TABLES.
-@return compressed page size in kilobytes; or 0 if the tablespace is
-uncompressed, ULINT_UNDEFINED on error */
+Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS.
+@return  ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */
 static
 ulint
 dict_sys_tables_get_flags(
@@ -674,57 +739,216 @@ dict_sys_tables_get_flags(
 {
 	const byte*	field;
 	ulint		len;
+	ulint		type;
 	ulint		n_cols;
-	ulint		flags;
 
-	field = rec_get_nth_field_old(rec, 5, &len);
+	/* read the 4 byte flags from the TYPE field */
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__TYPE, &len);
 	ut_a(len == 4);
+	type = mach_read_from_4(field);
 
-	flags = mach_read_from_4(field);
-
-	if (UNIV_LIKELY(flags == DICT_TABLE_ORDINARY)) {
-		return(0);
-	}
-
-	field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+	/* The low order bit of SYS_TABLES.TYPE is always set to 1. But in
+	dict_table_t::flags the low order bit is used to determine if the
+	row format is Redundant or Compact when the format is Antelope.
+	Read the 4 byte N_COLS field and look at the high order bit.  It
+	should be set for COMPACT and later.  It should not be set for
+	REDUNDANT. */
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+	ut_a(len == 4);
 	n_cols = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
-		/* New file formats require ROW_FORMAT=COMPACT. */
+	/* This validation function also combines the DICT_N_COLS_COMPACT
+	flag in n_cols into the type field to effectively make it a
+	dict_table_t::flags. */
+
+	if (ULINT_UNDEFINED == dict_sys_tables_type_validate(type, n_cols)) {
 		return(ULINT_UNDEFINED);
 	}
 
-	switch (flags & (DICT_TF_FORMAT_MASK | DICT_TF_COMPACT)) {
-	default:
-	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
-	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
-		/* flags should be DICT_TABLE_ORDINARY,
-		or DICT_TF_FORMAT_MASK should be nonzero. */
-		return(ULINT_UNDEFINED);
+	return(dict_sys_tables_type_to_tf(type, n_cols));
+}
 
-	case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
-#if DICT_TF_FORMAT_MAX > DICT_TF_FORMAT_ZIP
-# error "missing case labels for DICT_TF_FORMAT_ZIP .. DICT_TF_FORMAT_MAX"
-#endif
-		/* We support this format. */
-		break;
-	}
+/********************************************************************//**
+Gets the filepath for a space ID from SYS_DATAFILES. Only the dictionary
+table is read here; comparing with a link file is presumably left to the
+caller. This function is meant for the case where there is no fil_node_t
+entry for this space ID, so both durable locations on disk are consulted.
+We use a temporary heap here for the table lookup, but not for the path
+returned, which is allocated with mem_strdupl() and owned by the caller.
+This function can return NULL if the space ID is not found in SYS_DATAFILES,
+then the caller will assume that the ibd file is in the normal datadir.
+@return	own: A copy of the first datafile found in SYS_DATAFILES.PATH for
+the given space ID. NULL if space ID is zero or not found. */
+UNIV_INTERN
+char*
+dict_get_first_path(
+/*================*/
+	ulint		space,	/*!< in: space id */
+	const char*	name)	/*!< in: tablespace name (unused here) */
+{
+	mtr_t		mtr;
+	dict_table_t*	sys_datafiles;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	byte*		buf;
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	char*		dict_filepath = NULL;
+	mem_heap_t*	heap = mem_heap_create(1024);
 
-	if (UNIV_UNLIKELY((flags & DICT_TF_ZSSIZE_MASK)
-			  > (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT))) {
-		/* Unsupported compressed page size. */
-		return(ULINT_UNDEFINED);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	mtr_start(&mtr);
+
+	sys_datafiles = dict_table_get_low("SYS_DATAFILES");
+	sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes);
+	ut_ad(!dict_table_is_comp(sys_datafiles));
+	ut_ad(name_of_col_is(sys_datafiles, sys_index,
+			     DICT_FLD__SYS_DATAFILES__SPACE, "SPACE"));
+	ut_ad(name_of_col_is(sys_datafiles, sys_index,
+			     DICT_FLD__SYS_DATAFILES__PATH, "PATH"));
+
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+	mach_write_to_4(buf, space);
+
+	dfield_set_data(dfield, buf, 4);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	rec = btr_pcur_get_rec(&pcur);
+
+	/* If the file-per-table tablespace was created with
+	an earlier version of InnoDB, then this record is not
+	in SYS_DATAFILES.  But a link file still might exist. */
+
+	if (btr_pcur_is_on_user_rec(&pcur)) {
+		/* Found a rec >= space (PAGE_CUR_GE); SPACE is not compared. */
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+		ut_a(len > 0 || len == UNIV_SQL_NULL);
+		ut_a(len < OS_FILE_MAX_PATH);
+		dict_filepath = mem_strdupl((char*) field, len);
+		ut_a(dict_filepath);
 	}
 
-	if (UNIV_UNLIKELY(flags & (~0 << DICT_TF_BITS))) {
-		/* Some unused bits are set. */
-		return(ULINT_UNDEFINED);
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	return(dict_filepath);
+}
+
+/********************************************************************//**
+Update the record for space_id in SYS_DATAFILES to this filepath.
+@return	DB_SUCCESS if OK, dberr_t if the update failed */
+UNIV_INTERN
+dberr_t
+dict_update_filepath(
+/*=================*/
+	ulint		space_id,	/*!< in: space id */
+	const char*	filepath)	/*!< in: filepath */
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "update filepath";
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+	trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "space", space_id);
+	pars_info_add_str_literal(info, "path", filepath);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE UPDATE_FILEPATH () IS\n"
+			   "BEGIN\n"
+			   "UPDATE SYS_DATAFILES"
+			   " SET PATH = :path\n"
+			   " WHERE SPACE = :space;\n"
+			   "END;\n", FALSE, trx);
+
+	trx_commit_for_mysql(trx);	/* committed whatever err is */
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	if (err == DB_SUCCESS) {
+		/* We just updated SYS_DATAFILES due to the contents in
+		a link file.  Make a note that we did this. */
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"The InnoDB data dictionary table SYS_DATAFILES "
+			"for tablespace ID %lu was updated to use file %s.",
+			(ulong) space_id, filepath);
+	} else {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Problem updating InnoDB data dictionary table "
+			"SYS_DATAFILES for tablespace ID %lu to file %s.",
+			(ulong) space_id, filepath);
 	}
 
-	return(flags);
+	return(err);
+}
+
+/********************************************************************//**
+Insert records into SYS_TABLESPACES and SYS_DATAFILES.
+@return	DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_insert_tablespace_and_filepath(
+/*================================*/
+	ulint		space,		/*!< in: space id */
+	const char*	name,		/*!< in: tablespace name */
+	const char*	filepath,	/*!< in: filepath, ut_ad() non-NULL */
+	ulint		fsp_flags)	/*!< in: tablespace flags */
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(filepath);
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "insert tablespace and filepath";
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+	trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+	/* A record for this space ID was not found in
+	SYS_DATAFILES. Assume the record is also missing in
+	SYS_TABLESPACES.  Insert records into both of them. */
+	err = dict_create_add_tablespace_to_dictionary(
+		space, name, fsp_flags, filepath, trx, false);
+
+	trx_commit_for_mysql(trx);	/* committed whatever err is */
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	return(err);
 }
 
 /********************************************************************//**
+This function looks at each table defined in SYS_TABLES.  It checks the
+tablespace for any table with a space_id > 0.  It looks up the tablespace
+in SYS_DATAFILES to ensure the correct path.
+
 In a crash recovery we already have all the tablespace objects created.
 This function compares the space id information in the InnoDB data dictionary
 to what we already read with fil_load_single_table_tablespaces().
@@ -736,7 +960,7 @@ UNIV_INTERN
 void
 dict_check_tablespaces_and_store_max_id(
 /*====================================*/
-	ibool	in_crash_recovery)	/*!< in: are we doing a crash recovery */
+	dict_check_t	dict_check)	/*!< in: how to check */
 {
 	dict_table_t*	sys_tables;
 	dict_index_t*	sys_index;
@@ -745,21 +969,22 @@ dict_check_tablespaces_and_store_max_id(
 	ulint		max_space_id;
 	mtr_t		mtr;
 
+	rw_lock_x_lock(&dict_operation_lock);
 	mutex_enter(&(dict_sys->mutex));
 
 	mtr_start(&mtr);
 
-	sys_tables = dict_table_get_low("SYS_TABLES", DICT_ERR_IGNORE_NONE);
+	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
 
 	max_space_id = mtr_read_ulint(dict_hdr_get(&mtr)
 				      + DICT_HDR_MAX_SPACE_ID,
 				      MLOG_4BYTES, &mtr);
 	fil_set_max_space_id_if_bigger(max_space_id);
 
-	btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
-				    TRUE, &mtr);
+	btr_pcur_open_at_index_side(true, sys_index, BTR_SEARCH_LEAF, &pcur,
+				    true, 0, &mtr);
 loop:
 	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 
@@ -779,6 +1004,7 @@ loop:
 		fil_set_max_space_id_if_bigger(max_space_id);
 
 		mutex_exit(&(dict_sys->mutex));
+		rw_lock_x_unlock(&dict_operation_lock);
 
 		return;
 	}
@@ -792,27 +1018,33 @@ loop:
 		ulint		flags;
 		char*		name;
 
-		field = rec_get_nth_field_old(rec, 0, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
 		name = mem_strdupl((char*) field, len);
 
+		char	table_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			table_name, sizeof(table_name), name, FALSE);
+
 		flags = dict_sys_tables_get_flags(rec);
 		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
-
-			field = rec_get_nth_field_old(rec, 5, &len);
+			/* Read again the 4 bytes from rec. */
+			field = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+			ut_ad(len == 4); /* this was checked earlier */
 			flags = mach_read_from_4(field);
 
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: table ", stderr);
-			ut_print_filename(stderr, name);
-			fprintf(stderr, "\n"
-				"InnoDB: in InnoDB data dictionary"
-				" has unknown type %lx.\n",
-				(ulong) flags);
-
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Table '%s' in InnoDB data dictionary"
+				" has unknown type %lx", table_name, flags);
+			mem_free(name);
 			goto loop;
 		}
 
-		field = rec_get_nth_field_old(rec, 9, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__SPACE, &len);
 		ut_a(len == 4);
 
 		space_id = mach_read_from_4(field);
@@ -821,48 +1053,111 @@ loop:
 
 		mtr_commit(&mtr);
 
-		if (trx_sys_sys_space(space_id)) {
+		/* For tables created with old versions of InnoDB,
+		SYS_TABLES.MIX_LEN may contain garbage.  Such tables
+		would always be in ROW_FORMAT=REDUNDANT. Pretend that
+		all such tables are non-temporary. That is, do not
+		suppress error printouts about temporary or discarded
+		tablespaces not being found. */
+
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+
+		bool		is_temp = false;
+		bool		discarded = false;
+		ib_uint32_t	flags2 = mach_read_from_4(field);
+
+		/* Check that the tablespace (the .ibd file) really
+		exists; print a warning to the .err log if not.
+		Do not print warnings for temporary tables or for
+		tablespaces that have been discarded. */
+
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+
+		/* MIX_LEN valid only for ROW_FORMAT > REDUNDANT. */
+		if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) {
+
+			is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
+			discarded = !!(flags2 & DICT_TF2_DISCARDED);
+		}
+
+		if (space_id == 0) {
 			/* The system tablespace always exists. */
-		} else if (in_crash_recovery) {
-			/* Check that the tablespace (the .ibd file) really
-			exists; print a warning to the .err log if not.
-			Do not print warnings for temporary tables. */
-			ibool	is_temp;
-
-			field = rec_get_nth_field_old(rec, 4, &len);
-			if (0x80000000UL &  mach_read_from_4(field)) {
-				/* ROW_FORMAT=COMPACT: read the is_temp
-				flag from SYS_TABLES.MIX_LEN. */
-				field = rec_get_nth_field_old(rec, 7, &len);
-				is_temp = mach_read_from_4(field)
-					& DICT_TF2_TEMPORARY;
-			} else {
-				/* For tables created with old versions
-				of InnoDB, SYS_TABLES.MIX_LEN may contain
-				garbage.  Such tables would always be
-				in ROW_FORMAT=REDUNDANT.  Pretend that
-				all such tables are non-temporary.  That is,
-				do not suppress error printouts about
-				temporary tables not being found. */
-				is_temp = FALSE;
-			}
+			ut_ad(!discarded);
+			goto next_tablespace;
+		}
+
+		switch (dict_check) {
+		case DICT_CHECK_ALL_LOADED:
+			/* All tablespaces should have been found in
+			fil_load_single_table_tablespaces(). */
 
 			fil_space_for_table_exists_in_mem(
-				space_id, name, is_temp, TRUE, !is_temp);
-		} else {
-			/* It is a normal database startup: create the space
-			object and check that the .ibd file exists. */
+				space_id, name, TRUE, !(is_temp || discarded),
+				false, NULL, 0);
+			break;
 
-			fil_open_single_table_tablespace(FALSE, space_id,
-							 flags, name, NULL);
-		}
+		case DICT_CHECK_SOME_LOADED:
+			/* Some tablespaces may have been opened in
+			trx_resurrect_table_locks(). */
+			if (fil_space_for_table_exists_in_mem(
+				    space_id, name, FALSE, FALSE,
+				    false, NULL, 0)) {
+				break;
+			}
+			/* fall through */
+		case DICT_CHECK_NONE_LOADED:
+			if (discarded) {
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"DISCARD flag set for table '%s',"
+					" ignored.",
+					table_name);
+				break;
+			}
 
-		mem_free(name);
+			/* It is a normal database startup: create the
+			space object and check that the .ibd file exists.
+			If the table uses a remote tablespace, look for the
+			space_id in SYS_DATAFILES to find the filepath */
+
+			/* Use the remote filepath if known. */
+			char*	filepath = NULL;
+			if (DICT_TF_HAS_DATA_DIR(flags)) {
+				filepath = dict_get_first_path(
+					space_id, name);
+			}
+
+			/* We set the 2nd param (fix_dict) to true,
+			except in read-only mode, because we hold an x-lock on
+			dict_operation_lock and dict_sys->mutex. Besides,
+			this is at startup and we are now single threaded.
+			If the filepath is not known, it will need to
+			be discovered. */
+			dberr_t	err = fil_open_single_table_tablespace(
+				false, srv_read_only_mode ? false : true,
+				space_id, dict_tf_to_fsp_flags(flags),
+				name, filepath);
+
+			if (err != DB_SUCCESS) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Tablespace open failed for '%s', "
+					"ignored.", table_name);
+			}
+
+			if (filepath) {
+				mem_free(filepath);
+			}
+
+			break;
+		}
 
 		if (space_id > max_space_id) {
 			max_space_id = space_id;
 		}
 
+next_tablespace:
+		mem_free(name);
 		mtr_start(&mtr);
 
 		btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
@@ -901,49 +1196,54 @@ dict_load_column_low(
 
 	ut_ad(table || column);
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_COLUMNS");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 9)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) {
 		return("wrong number of columns in SYS_COLUMNS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_COLUMNS");
 	}
 
 	if (table_id) {
 		*table_id = mach_read_from_8(field);
-	} else if (UNIV_UNLIKELY(table->id != mach_read_from_8(field))) {
+	} else if (table->id != mach_read_from_8(field)) {
 		return("SYS_COLUMNS.TABLE_ID mismatch");
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__POS, &len);
+	if (len != 4) {
 
 		goto err_len;
 	}
 
 	pos = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(table && table->n_def != pos)) {
+	if (table && table->n_def != pos) {
 		return("SYS_COLUMNS.POS mismatch");
 	}
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
@@ -953,15 +1253,17 @@ err_len:
 		*col_name = name;
 	}
 
-	field = rec_get_nth_field_old(rec, 5/*MTYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	mtype = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 6/*PRTYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	prtype = mach_read_from_4(field);
@@ -987,13 +1289,15 @@ err_len:
 		}
 	}
 
-	field = rec_get_nth_field_old(rec, 7/*LEN*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__LEN, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	col_len = mach_read_from_4(field);
-	field = rec_get_nth_field_old(rec, 8/*PREC*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__PREC, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
@@ -1032,17 +1336,19 @@ dict_load_columns(
 
 	mtr_start(&mtr);
 
-	sys_columns = dict_table_get_low("SYS_COLUMNS", DICT_ERR_IGNORE_NONE);
+	sys_columns = dict_table_get_low("SYS_COLUMNS");
 	sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
-	ut_a(!dict_table_is_comp(sys_columns));
+	ut_ad(!dict_table_is_comp(sys_columns));
 
-	ut_a(name_of_col_is(sys_columns, sys_index, 4, "NAME"));
-	ut_a(name_of_col_is(sys_columns, sys_index, 8, "PREC"));
+	ut_ad(name_of_col_is(sys_columns, sys_index,
+			     DICT_FLD__SYS_COLUMNS__NAME, "NAME"));
+	ut_ad(name_of_col_is(sys_columns, sys_index,
+			     DICT_FLD__SYS_COLUMNS__PREC, "PREC"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -1051,20 +1357,55 @@ dict_load_columns(
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
 	for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
-		const char* err_msg;
+		const char*	err_msg;
+		const char*	name;
 
 		rec = btr_pcur_get_rec(&pcur);
 
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 
 		err_msg = dict_load_column_low(table, heap, NULL, NULL,
-					       NULL, rec);
+					       &name, rec);
 
 		if (err_msg) {
 			fprintf(stderr, "InnoDB: %s\n", err_msg);
 			ut_error;
 		}
 
+		/* Note: Currently we have one DOC_ID column that is
+		shared by all FTS indexes on a table. */
+		if (innobase_strcasecmp(name,
+					FTS_DOC_ID_COL_NAME) == 0) {
+			dict_col_t*	col;
+			/* As part of normal loading of tables the
+			table->flag is not set for tables with FTS
+			till after the FTS indexes are loaded. So we
+			create the fts_t instance here if there isn't
+			one already created.
+
+			This case does not arise for table create as
+			the flag is set before the table is created. */
+			if (table->fts == NULL) {
+				table->fts = fts_create(table);
+				fts_optimize_add_table(table);
+			}
+
+			ut_a(table->fts->doc_col == ULINT_UNDEFINED);
+
+			col = dict_table_get_nth_col(table, i);
+
+			ut_ad(col->len == sizeof(doc_id_t));
+
+			if (col->prtype & DATA_FTS_DOC_ID) {
+				DICT_TF2_FLAG_SET(
+					table, DICT_TF2_FTS_HAS_DOC_ID);
+				DICT_TF2_FLAG_UNSET(
+					table, DICT_TF2_FTS_ADD_DOC_ID);
+			}
+
+			table->fts->doc_col = i;
+		}
+
 		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 	}
 
@@ -1075,9 +1416,6 @@ dict_load_columns(
 /** Error message for a delete-marked record in dict_load_field_low() */
 static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS";
 
-static const char* dict_load_field_too_big = "column prefix exceeds maximum"
-					     " limit";
-
 /********************************************************************//**
 Loads an index field definition from a SYS_FIELDS record to
 dict_index_t.
@@ -1088,23 +1426,18 @@ dict_load_field_low(
 /*================*/
 	byte*		index_id,	/*!< in/out: index id (8 bytes)
 					an "in" value if index != NULL
-                                        and "out" if index == NULL */
+					and "out" if index == NULL */
 	dict_index_t*	index,		/*!< in/out: index, could be NULL
 					if we just populate a dict_field_t
 					struct with information from
-					a SYS_FIELDSS record */
+					a SYS_FIELDS record */
 	dict_field_t*	sys_field,	/*!< out: dict_field_t to be
 					filled */
 	ulint*		pos,		/*!< out: Field position */
 	byte*		last_index_id,	/*!< in: last index id */
 	mem_heap_t*	heap,		/*!< in/out: memory heap
 					for temporary storage */
-	const rec_t*	rec,		/*!< in: SYS_FIELDS record */
-	char*		addition_err_str,/*!< out: additional error message
-					that requires information to be
-					filled, or NULL */
-	ulint		err_str_len)	/*!< in: length of addition_err_str
-					in bytes */
+	const rec_t*	rec)		/*!< in: SYS_FIELDS record */
 {
 	const byte*	field;
 	ulint		len;
@@ -1116,23 +1449,24 @@ dict_load_field_low(
 	/* Either index or sys_field is supplied, not both */
 	ut_a((!index) || (!sys_field));
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return(dict_load_field_del);
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 5)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) {
 		return("wrong number of columns in SYS_FIELDS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*INDEX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_FIELDS");
 	}
 
 	if (!index) {
 		ut_a(last_index_id);
-		memcpy(index_id, (const char*)field, 8);
+		memcpy(index_id, (const char*) field, 8);
 		first_field = memcmp(index_id, last_index_id, 8);
 	} else {
 		first_field = (index->n_def == 0);
@@ -1141,20 +1475,6 @@ err_len:
 		}
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
-		goto err_len;
-	}
-
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
-		goto err_len;
-	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
-		goto err_len;
-	}
-
 	/* The next field stores the field position in the index and a
 	possible column prefix length if the index field does not
 	contain the whole column. The storage format is like this: if
@@ -1163,6 +1483,12 @@ err_len:
 	bytes the prefix length for the field. Otherwise the field
 	number (index->n_def) is contained in the 2 LOW bytes. */
 
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__POS, &len);
+	if (len != 4) {
+		goto err_len;
+	}
+
 	pos_and_prefix_len = mach_read_from_4(field);
 
 	if (index && UNIV_UNLIKELY
@@ -1179,22 +1505,21 @@ err_len:
 		position = pos_and_prefix_len & 0xFFFFUL;
 	}
 
-	field = rec_get_nth_field_old(rec, 4, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+		goto err_len;
+	}
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	if (prefix_len > REC_VERSION_56_MAX_INDEX_COL_LEN) {
-		if (addition_err_str) {
-			ut_snprintf(addition_err_str, err_str_len,
-				    "index field '%s' has a prefix length"
-				    " of %lu bytes",
-				    mem_heap_strdupl(
-						heap, (const char*) field, len),
-				    (ulong) prefix_len);
-		}
-
-		return(dict_load_field_too_big);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
+		goto err_len;
 	}
 
 	if (index) {
@@ -1233,21 +1558,22 @@ dict_load_fields(
 	byte*		buf;
 	ulint		i;
 	mtr_t		mtr;
-	ulint		error;
+	dberr_t		error;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	mtr_start(&mtr);
 
-	sys_fields = dict_table_get_low("SYS_FIELDS", DICT_ERR_IGNORE_NONE);
+	sys_fields = dict_table_get_low("SYS_FIELDS");
 	sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
-	ut_a(!dict_table_is_comp(sys_fields));
-	ut_a(name_of_col_is(sys_fields, sys_index, 4, "COL_NAME"));
+	ut_ad(!dict_table_is_comp(sys_fields));
+	ut_ad(name_of_col_is(sys_fields, sys_index,
+			     DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, index->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -1256,16 +1582,14 @@ dict_load_fields(
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
 	for (i = 0; i < index->n_fields; i++) {
-		const char*	err_msg;
-		char		addition_err_str[1024];
+		const char* err_msg;
 
 		rec = btr_pcur_get_rec(&pcur);
 
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 
 		err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL,
-					      heap, rec, addition_err_str,
-					      sizeof(addition_err_str));
+					      heap, rec);
 
 		if (err_msg == dict_load_field_del) {
 			/* There could be delete marked records in
@@ -1274,24 +1598,7 @@ dict_load_fields(
 
 			goto next_rec;
 		} else if (err_msg) {
-			if (err_msg == dict_load_field_too_big) {
-				fprintf(stderr, "InnoDB: Error: load index"
-					" '%s' failed.\n"
-					"InnoDB: %s,\n"
-					"InnoDB: which exceeds the"
-					" maximum limit of %lu bytes.\n"
-					"InnoDB: Please use server that"
-					" supports long index prefix\n"
-					"InnoDB: or turn on"
-					" innodb_force_recovery to load"
-					" the table\n",
-					index->name, addition_err_str,
-					(ulong) (REC_VERSION_56_MAX_INDEX_COL_LEN));
-
-			} else {
-				fprintf(stderr, "InnoDB: %s\n", err_msg);
-			}
-
+			fprintf(stderr, "InnoDB: %s\n", err_msg);
 			error = DB_CORRUPTION;
 			goto func_exit;
 		}
@@ -1347,76 +1654,85 @@ dict_load_index_low(
 		*index = NULL;
 	}
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return(dict_load_index_del);
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 9)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_INDEXES) {
 		return("wrong number of columns in SYS_INDEXES record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_INDEXES");
 	}
 
 	if (!allocate) {
 		/* We are reading a SYS_INDEXES record. Copy the table_id */
-		memcpy(table_id, (const char*)field, 8);
+		memcpy(table_id, (const char*) field, 8);
 	} else if (memcmp(field, table_id, 8)) {
 		/* Caller supplied table_id, verify it is the same
 		id as on the index record */
 		return(dict_load_index_id_err);
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
 	id = mach_read_from_8(field);
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*NAME*/, &name_len);
-	if (UNIV_UNLIKELY(name_len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__NAME, &name_len);
+	if (name_len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
 	name_buf = mem_heap_strdupl(heap, (const char*) field,
 				    name_len);
 
-	field = rec_get_nth_field_old(rec, 5/*N_FIELDS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	n_fields = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 6/*TYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	type = mach_read_from_4(field);
-	if (UNIV_UNLIKELY(type & (~0 << DICT_IT_BITS))) {
+	if (type & (~0 << DICT_IT_BITS)) {
 		return("unknown SYS_INDEXES.TYPE bits");
 	}
 
-	field = rec_get_nth_field_old(rec, 7/*SPACE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	space = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 8/*PAGE_NO*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
@@ -1443,8 +1759,8 @@ Loads definitions for table indexes. Adds them to the data dictionary
 cache.
 @return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
 table or DB_UNSUPPORTED if table has unknown index type */
-static
-ulint
+static __attribute__((nonnull))
+dberr_t
 dict_load_indexes(
 /*==============*/
 	dict_table_t*	table,	/*!< in/out: table */
@@ -1461,22 +1777,24 @@ dict_load_indexes(
 	const rec_t*	rec;
 	byte*		buf;
 	mtr_t		mtr;
-	ulint		error = DB_SUCCESS;
+	dberr_t		error = DB_SUCCESS;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	mtr_start(&mtr);
 
-	sys_indexes = dict_table_get_low("SYS_INDEXES", DICT_ERR_IGNORE_NONE);
+	sys_indexes = dict_table_get_low("SYS_INDEXES");
 	sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
-	ut_a(!dict_table_is_comp(sys_indexes));
-	ut_a(name_of_col_is(sys_indexes, sys_index, 4, "NAME"));
-	ut_a(name_of_col_is(sys_indexes, sys_index, 8, "PAGE_NO"));
+	ut_ad(!dict_table_is_comp(sys_indexes));
+	ut_ad(name_of_col_is(sys_indexes, sys_index,
+			     DICT_FLD__SYS_INDEXES__NAME, "NAME"));
+	ut_ad(name_of_col_is(sys_indexes, sys_index,
+			     DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -1490,18 +1808,65 @@ dict_load_indexes(
 
 		if (!btr_pcur_is_on_user_rec(&pcur)) {
 
+			/* We should allow the table to open even
+			without index when DICT_ERR_IGNORE_CORRUPT is set.
+			DICT_ERR_IGNORE_CORRUPT is currently only set
+			for drop table */
+			if (dict_table_get_first_index(table) == NULL
+			    && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Cannot load table %s "
+					"because it has no indexes in "
+					"InnoDB internal data dictionary.",
+					table->name);
+				error = DB_CORRUPTION;
+				goto func_exit;
+			}
+
 			break;
 		}
 
 		rec = btr_pcur_get_rec(&pcur);
 
+		if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+		    && rec_get_n_fields_old(rec)
+		    == DICT_NUM_FIELDS__SYS_INDEXES) {
+			const byte*	field;
+			ulint		len;
+			field = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_INDEXES__NAME, &len);
+
+			if (len != UNIV_SQL_NULL
+			    && char(*field) == char(TEMP_INDEX_PREFIX)) {
+				/* Skip indexes whose name starts with
+				TEMP_INDEX_PREFIX, because they will
+				be dropped during crash recovery. */
+				goto next_rec;
+			}
+		}
+
 		err_msg = dict_load_index_low(buf, table->name, heap, rec,
 					      TRUE, &index);
-		ut_ad((index == NULL) == (err_msg != NULL));
+		ut_ad((index == NULL && err_msg != NULL)
+		      || (index != NULL && err_msg == NULL));
 
 		if (err_msg == dict_load_index_id_err) {
 			/* TABLE_ID mismatch means that we have
 			run out of index definitions for the table. */
+
+			if (dict_table_get_first_index(table) == NULL
+			    && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Failed to load the "
+					"clustered index for table %s "
+					"because of the following error: %s. "
+					"Refusing to load the rest of the "
+					"indexes (if any) and the whole table "
+					"altogether.", table->name, err_msg);
+				error = DB_CORRUPTION;
+				goto func_exit;
+			}
+
 			break;
 		} else if (err_msg == dict_load_index_del) {
 			/* Skip delete-marked records. */
@@ -1545,19 +1910,27 @@ dict_load_indexes(
 			}
 		}
 
+		if (index->type & DICT_FTS
+		    && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+			/* This should have been created by now. */
+			ut_a(table->fts != NULL);
+			DICT_TF2_FLAG_SET(table, DICT_TF2_FTS);
+		}
+
 		/* We check for unsupported types first, so that the
 		subsequent checks are relevant for the supported types. */
 		if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE
-				    | DICT_CORRUPT)) {
-			fprintf(stderr,
-				"InnoDB: Error: unknown type %lu"
-				" of index %s of table %s\n",
+				    | DICT_CORRUPT | DICT_FTS)) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unknown type %lu of index %s of table %s",
 				(ulong) index->type, index->name, table->name);
 
 			error = DB_UNSUPPORTED;
 			dict_mem_index_free(index);
 			goto func_exit;
-		} else if (index->page == FIL_NULL) {
+		} else if (index->page == FIL_NULL
+			   && !table->ibd_file_missing
+			   && (!(index->type & DICT_FTS))) {
 
 			fprintf(stderr,
 				"InnoDB: Error: trying to load index %s"
@@ -1598,7 +1971,7 @@ corrupted:
 			      " is not clustered!\n", stderr);
 
 			goto corrupted;
-		} else if (table->id < DICT_HDR_FIRST_ID
+		} else if (dict_is_sys_table(table->id)
 			   && (dict_index_is_clust(index)
 			       || ((table == dict_sys->sys_tables)
 				   && !strcmp("ID_IND", index->name)))) {
@@ -1607,29 +1980,11 @@ corrupted:
 			of the database server */
 			dict_mem_index_free(index);
 		} else {
-			error = dict_load_fields(index, heap);
-
-			if (error != DB_SUCCESS) {
-
-				fprintf(stderr, "InnoDB: Error: load index '%s'"
-					" for table '%s' failed\n",
-					index->name, table->name);
-
-				/* If the force recovery flag is set, and
-				if the failed index is not the clustered index,
-				we will continue and open other indexes */
-				if ((srv_force_recovery
-				     || srv_load_corrupted)
-				    && !dict_index_is_clust(index)) {
-					error = DB_SUCCESS;
-					goto next_rec;
-				} else {
-					goto func_exit;
-				}
-			}
+			dict_load_fields(index, heap);
+
+			error = dict_index_add_to_cache(
+				table, index, index->page, FALSE);
 
-			error = dict_index_add_to_cache(table, index,
-							index->page, FALSE);
 			/* The data dictionary tables should never contain
 			invalid index definitions.  If we ignored this error
 			and simply did not load this index definition, the
@@ -1640,11 +1995,17 @@ corrupted:
 				goto func_exit;
 			}
 		}
-
 next_rec:
 		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 	}
 
+	/* If the table contains FTS indexes, populate table->fts->indexes */
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+		/* table->fts->indexes should have been created. */
+		ut_a(table->fts->indexes != NULL);
+		dict_table_get_all_fts_indexes(table, table->fts->indexes);
+	}
+
 func_exit:
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
@@ -1668,109 +2029,109 @@ dict_load_table_low(
 	ulint		len;
 	ulint		space;
 	ulint		n_cols;
-	ulint		flags;
+	ulint		flags = 0;
+	ulint		flags2;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_TABLES");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 10)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) {
 		return("wrong number of columns in SYS_TABLES record");
 	}
 
-	rec_get_nth_field_offs_old(rec, 0/*NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
+	if (len == 0 || len == UNIV_SQL_NULL) {
 err_len:
 		return("incorrect column length in SYS_TABLES");
 	}
-	rec_get_nth_field_offs_old(rec, 1/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 2/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 3/*ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	n_cols = mach_read_from_4(field);
 
-	rec_get_nth_field_offs_old(rec, 5/*TYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 6/*MIX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__MIX_ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 7/*MIX_LEN*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 8/*CLUSTER_ID*/, &len);
-	if (UNIV_UNLIKELY(len != UNIV_SQL_NULL)) {
+	/* MIX_LEN may hold additional flags in post-antelope file formats. */
+	flags2 = mach_read_from_4(field);
+
+	/* DICT_TF2_FTS will be set when the indexes are being loaded */
+	flags2 &= ~DICT_TF2_FTS;
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len);
+	if (len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 9/*SPACE*/, &len);
-
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	space = mach_read_from_4(field);
 
 	/* Check if the tablespace exists and has the right name */
-	if (!trx_sys_sys_space(space)) {
-		flags = dict_sys_tables_get_flags(rec);
+	flags = dict_sys_tables_get_flags(rec);
 
-		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
-			field = rec_get_nth_field_old(rec, 5/*TYPE*/, &len);
-			ut_ad(len == 4); /* this was checked earlier */
-			flags = mach_read_from_4(field);
+	if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+		ut_ad(len == 4); /* this was checked earlier */
+		flags = mach_read_from_4(field);
 
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: table ", stderr);
-			ut_print_filename(stderr, name);
-			fprintf(stderr, "\n"
-				"InnoDB: in InnoDB data dictionary"
-				" has unknown type %lx.\n",
-				(ulong) flags);
-			return("incorrect flags in SYS_TABLES");
-		}
-	} else {
-		flags = 0;
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_filename(stderr, name);
+		fprintf(stderr, "\n"
+			"InnoDB: in InnoDB data dictionary"
+			" has unknown type %lx.\n",
+			(ulong) flags);
+		return("incorrect flags in SYS_TABLES");
 	}
 
 	/* The high-order bit of N_COLS is the "compact format" flag.
 	For tables in that format, MIX_LEN may hold additional flags. */
-	if (n_cols & 0x80000000UL) {
-		ulint	flags2;
-
-		flags |= DICT_TF_COMPACT;
-
-		field = rec_get_nth_field_old(rec, 7, &len);
-
-		if (UNIV_UNLIKELY(len != 4)) {
-
-			goto err_len;
-		}
+	if (n_cols & DICT_N_COLS_COMPACT) {
+		ut_ad(flags & DICT_TF_COMPACT);
 
-		flags2 = mach_read_from_4(field);
-
-		if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+		if (flags2 & ~DICT_TF2_BIT_MASK) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Warning: table ", stderr);
 			ut_print_filename(stderr, name);
@@ -1779,17 +2140,20 @@ err_len:
 				" has unknown flags %lx.\n",
 				(ulong) flags2);
 
-			flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+			/* Clean it up and keep going */
+			flags2 &= DICT_TF2_BIT_MASK;
 		}
-
-		flags |= flags2 << DICT_TF2_SHIFT;
+	} else {
+		/* Do not trust the MIX_LEN field when the
+		row format is Redundant. */
+		flags2 = 0;
 	}
 
 	/* See if the tablespace is available. */
-	*table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
-				       flags);
+	*table = dict_mem_table_create(
+		name, space, n_cols & ~DICT_N_COLS_COMPACT, flags, flags2);
 
-	field = rec_get_nth_field_old(rec, 3/*ID*/, &len);
+	field = rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
 	ut_ad(len == 8); /* this was checked earlier */
 
 	(*table)->id = mach_read_from_8(field);
@@ -1800,6 +2164,77 @@ err_len:
 }
 
 /********************************************************************//**
+Copy the null-terminated filepath into table->data_dir_path (allocated
+from table->heap) with the 'databasename/tablename.ibd' portion replaced
+by 'tablename', so that SHOW CREATE TABLE can report DATA DIRECTORY.
+If filepath is the default .ibd path, clear the DATA DIRECTORY flag instead.
+The caller must hold dict_sys->mutex; data_dir_path must not be set yet. */
+UNIV_INTERN
+void
+dict_save_data_dir_path(
+/*====================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	char*		filepath)	/*!< in: filepath of tablespace; not NULL */
+{
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_a(DICT_TF_HAS_DATA_DIR(table->flags));
+
+	ut_a(!table->data_dir_path);
+	ut_a(filepath);
+
+	/* Save the path only if it differs from the default .ibd path. */
+	char*	default_filepath = fil_make_ibd_name(table->name, false);
+	if (strcmp(filepath, default_filepath)) {
+		ulint pathlen = strlen(filepath);
+		ut_a(pathlen < OS_FILE_MAX_PATH);
+		ut_a(0 == strcmp(filepath + pathlen - 4, ".ibd"));
+
+		table->data_dir_path = mem_heap_strdup(table->heap, filepath);
+		os_file_make_data_dir_path(table->data_dir_path);
+	} else {
+		/* This does not change SYS_DATAFILES or SYS_TABLES
+		or FSP_FLAGS on the header page of the tablespace,
+		but it makes dict_table_t consistent with the default path. */
+		table->flags &= ~DICT_TF_MASK_DATA_DIR;
+	}
+	mem_free(default_filepath);
+}
+
+/*****************************************************************//**
+Make sure table->data_dir_path is saved if the table uses DATA DIRECTORY.
+Look the path up via fil_space_get_first_path() first, then SYS_DATAFILES. */
+UNIV_INTERN
+void
+dict_get_and_save_data_dir_path(
+/*============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	bool		dict_mutex_own)	/*!< in: true if dict_sys->mutex
+					is owned already */
+{
+	if (DICT_TF_HAS_DATA_DIR(table->flags)
+	    && (!table->data_dir_path)) {
+		char*	path = fil_space_get_first_path(table->space);
+
+		if (!dict_mutex_own) {
+			dict_mutex_enter_for_mysql();
+		}
+		if (!path) {
+			path = dict_get_first_path(
+				table->space, table->name);
+		}
+
+		if (path) {
+			dict_save_data_dir_path(table, path);
+			mem_free(path);	/* the local copy is no longer needed */
+		}
+
+		if (!dict_mutex_own) {
+			dict_mutex_exit_for_mysql();
+		}
+	}
+}
+
+/********************************************************************//**
 Loads a table definition and also all its index definitions, and also
 the cluster definition if the table is a member in a cluster. Also loads
 all foreign key constraints where the foreign key is in the table or where
@@ -1819,6 +2254,7 @@ dict_load_table(
 				/*!< in: error to be ignored when loading
 				table and its indexes' definition */
 {
+	dberr_t		err;
 	dict_table_t*	table;
 	dict_table_t*	sys_tables;
 	btr_pcur_t	pcur;
@@ -1829,7 +2265,7 @@ dict_load_table(
 	const rec_t*	rec;
 	const byte*	field;
 	ulint		len;
-	ulint		err;
+	char*		filepath = NULL;
 	const char*	err_msg;
 	mtr_t		mtr;
 
@@ -1839,14 +2275,19 @@ dict_load_table(
 
 	mtr_start(&mtr);
 
-	sys_tables = dict_table_get_low("SYS_TABLES", DICT_ERR_IGNORE_NONE);
+	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
-	ut_a(name_of_col_is(sys_tables, sys_index, 3, "ID"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 5, "TYPE"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 9, "SPACE"));
+	ut_ad(!dict_table_is_comp(sys_tables));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__ID, "ID"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__N_COLS, "N_COLS"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__TYPE, "TYPE"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__SPACE, "SPACE"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -1869,7 +2310,8 @@ err_exit:
 		return(NULL);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	/* Check if the table name in record is the searched one */
 	if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) {
@@ -1886,53 +2328,95 @@ err_exit:
 		goto err_exit;
 	}
 
-	if (trx_sys_sys_space(table->space)) {
+	char	table_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(table_name, sizeof(table_name), name, FALSE);
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	if (table->space == 0) {
 		/* The system tablespace is always available. */
+	} else if (table->flags2 & DICT_TF2_DISCARDED) {
+
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Table '%s' tablespace is set as discarded.",
+			table_name);
+
+		table->ibd_file_missing = TRUE;
+
 	} else if (!fil_space_for_table_exists_in_mem(
-			   table->space, name,
-			   (table->flags >> DICT_TF2_SHIFT)
-			   & DICT_TF2_TEMPORARY,
-			   FALSE, FALSE)) {
+			table->space, name, FALSE, FALSE, true, heap,
+			table->id)) {
 
-		if (table->flags & (DICT_TF2_TEMPORARY << DICT_TF2_SHIFT)) {
+		if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) {
 			/* Do not bother to retry opening temporary tables. */
 			table->ibd_file_missing = TRUE;
+
 		} else {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: error: space object of table ");
-			ut_print_filename(stderr, name);
-			fprintf(stderr, ",\n"
-				"InnoDB: space id %lu did not exist in memory."
-				" Retrying an open.\n",
-				(ulong) table->space);
-			/* Try to open the tablespace */
-			if (!fil_open_single_table_tablespace(
-				TRUE, table->space,
-				table->flags == DICT_TF_COMPACT ? 0 :
-				table->flags & ~(~0 << DICT_TF_BITS), name, NULL)) {
+			if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Failed to find tablespace for "
+					"table '%s' in the cache. "
+					"Attempting to load the tablespace "
+					"with space id %lu.",
+					table_name, (ulong) table->space);
+			}
+
+			/* Use the remote filepath if needed. */
+			if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+				/* This needs to be added to the table
+				from SYS_DATAFILES */
+				dict_get_and_save_data_dir_path(table, true);
+
+				if (table->data_dir_path) {
+					filepath = os_file_make_remote_pathname(
+						table->data_dir_path,
+						table->name, "ibd");
+				}
+			}
+
+			/* Try to open the tablespace.  We set the
+			2nd param (fix_dict = false) here because we
+			do not have an x-lock on dict_operation_lock */
+			err = fil_open_single_table_tablespace(
+				true, false, table->space,
+				dict_tf_to_fsp_flags(table->flags),
+				name, filepath);
+
+			if (err != DB_SUCCESS) {
 				/* We failed to find a sensible
 				tablespace file */
 
 				table->ibd_file_missing = TRUE;
 			}
+			if (filepath) {
+				mem_free(filepath);
+			}
 		}
 	}
 
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-
 	dict_load_columns(table, heap);
 
 	if (cached) {
-		dict_table_add_to_cache(table, heap);
+		dict_table_add_to_cache(table, TRUE, heap);
 	} else {
 		dict_table_add_system_columns(table, heap);
 	}
 
 	mem_heap_empty(heap);
 
-	err = dict_load_indexes(table, heap, ignore_err);
+	/* If there is no tablespace for the table then we only need to
+	load the index definitions. So that we can IMPORT the tablespace
+	later. When recovering table locks for resurrected incomplete
+	transactions, the tablespace should exist, because DDL operations
+	were not allowed while the table is being locked by a transaction. */
+	dict_err_ignore_t index_load_err =
+		!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+		&& table->ibd_file_missing
+		? DICT_ERR_IGNORE_ALL
+		: ignore_err;
+	err = dict_load_indexes(table, heap, index_load_err);
 
 	if (err == DB_INDEX_CORRUPT) {
 		/* Refuse to load the table if the table has a corrupted
@@ -1966,15 +2450,16 @@ err_exit:
 	of the error condition, since the user may want to dump data from the
 	clustered index. However we load the foreign key information only if
 	all indexes were loaded. */
-	if (!cached) {
+	if (!cached || table->ibd_file_missing) {
+		/* Don't attempt to load the indexes from disk. */
 	} else if (err == DB_SUCCESS) {
-		err = dict_load_foreigns(table->name, TRUE, TRUE,
+		err = dict_load_foreigns(table->name, NULL, true, true,
 					 ignore_err);
 
 		if (err != DB_SUCCESS) {
-			fprintf(stderr,
-				"InnoDB: Load table '%s' failed, the table "
-				"has missing foreign key indexes. Turn off "
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Load table '%s' failed, the table has missing "
+				"foreign key indexes. Turn off "
 				"'foreign_key_checks' and try again.",
 				table->name);
 
@@ -1984,17 +2469,21 @@ err_exit:
 			table->fk_max_recusive_level = 0;
 		}
 	} else {
-		dict_index_t*	index;
+		dict_index_t*   index;
 
 		/* Make sure that at least the clustered index was loaded.
 		Otherwise refuse to load the table */
 		index = dict_table_get_first_index(table);
 
-		if (!srv_force_recovery || !index
+		if (!srv_force_recovery
+		    || !index
 		    || !dict_index_is_clust(index)) {
+
 			dict_table_remove_from_cache(table);
 			table = NULL;
-		} else if (dict_index_is_corrupted(index)) {
+
+		} else if (dict_index_is_corrupted(index)
+			   && !table->ibd_file_missing) {
 
 			/* It is possible we force to load a corrupted
 			clustered index if srv_load_corrupted is set.
@@ -2002,33 +2491,28 @@ err_exit:
 			table->corrupted = TRUE;
 		}
 	}
-#if 0
-	if (err != DB_SUCCESS && table != NULL) {
-
-		mutex_enter(&dict_foreign_err_mutex);
 
-		ut_print_timestamp(stderr);
-
-		fprintf(stderr,
-			"  InnoDB: Error: could not make a foreign key"
-			" definition to match\n"
-			"InnoDB: the foreign key table"
-			" or the referenced table!\n"
-			"InnoDB: The data dictionary of InnoDB is corrupt."
-			" You may need to drop\n"
-			"InnoDB: and recreate the foreign key table"
-			" or the referenced table.\n"
-			"InnoDB: Submit a detailed bug report"
-			" to http://bugs.mysql.com\n"
-			"InnoDB: Latest foreign key error printout:\n%s\n",
-			dict_foreign_err_buf);
-
-		mutex_exit(&dict_foreign_err_mutex);
-	}
-#endif /* 0 */
 func_exit:
 	mem_heap_free(heap);
 
+	ut_ad(!table
+	      || ignore_err != DICT_ERR_IGNORE_NONE
+	      || table->ibd_file_missing
+	      || !table->corrupted);
+
+	if (table && table->fts) {
+		if (!(dict_table_has_fts_index(table)
+		      || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+		      || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) {
+			/* the table->fts could be created in dict_load_column
+			when a user defined FTS_DOC_ID is present, but no
+			FTS */
+			fts_free(table);
+		} else {
+			fts_optimize_add_table(table);
+		}
+	}
+
 	return(table);
 }
 
@@ -2039,7 +2523,9 @@ UNIV_INTERN
 dict_table_t*
 dict_load_table_on_id(
 /*==================*/
-	table_id_t	table_id)	/*!< in: table id */
+	table_id_t		table_id,	/*!< in: table id */
+	dict_err_ignore_t	ignore_err)	/*!< in: errors to ignore
+						when loading the table */
 {
 	byte		id_buf[8];
 	btr_pcur_t	pcur;
@@ -2068,7 +2554,8 @@ dict_load_table_on_id(
 	sys_tables = dict_sys->sys_tables;
 	sys_table_ids = dict_table_get_next_index(
 		dict_table_get_first_index(sys_tables));
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_index_is_clust(sys_table_ids));
 	heap = mem_heap_create(256);
 
 	tuple  = dtuple_create(heap, 1);
@@ -2082,40 +2569,44 @@ dict_load_table_on_id(
 
 	btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
-	rec = btr_pcur_get_rec(&pcur);
-
-	if (!btr_pcur_is_on_user_rec(&pcur)) {
-		/* Not found */
-		goto func_exit;
-	}
-
-	/* Find the first record that is not delete marked */
-	while (rec_get_deleted_flag(rec, 0)) {
-		if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
-			goto func_exit;
-		}
-		rec = btr_pcur_get_rec(&pcur);
-	}
-
-	/*---------------------------------------------------*/
-	/* Now we have the record in the secondary index containing the
-	table ID and NAME */
 
+check_rec:
 	rec = btr_pcur_get_rec(&pcur);
-	field = rec_get_nth_field_old(rec, 0, &len);
-	ut_ad(len == 8);
 
-	/* Check if the table id in record is the one searched for */
-	if (table_id != mach_read_from_8(field)) {
-		goto func_exit;
+	if (page_rec_is_user_rec(rec)) {
+		/*---------------------------------------------------*/
+		/* Now we have the record in the secondary index
+		containing the table ID and NAME */
+
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLE_IDS__ID, &len);
+		ut_ad(len == 8);
+
+		/* Check if the table id in record is the one searched for */
+		if (table_id == mach_read_from_8(field)) {
+			if (rec_get_deleted_flag(rec, 0)) {
+				/* Until purge has completed, there
+				may be delete-marked duplicate records
+				for the same SYS_TABLES.ID.
+				Due to Bug #60049, some delete-marked
+				records may survive the purge forever. */
+				if (btr_pcur_move_to_next(&pcur, &mtr)) {
+
+					goto check_rec;
+				}
+			} else {
+				/* Now we get the table name from the record */
+				field = rec_get_nth_field_old(rec,
+					DICT_FLD__SYS_TABLE_IDS__NAME, &len);
+				/* Load the table definition to memory */
+				table = dict_load_table(
+					mem_heap_strdupl(
+						heap, (char*) field, len),
+					TRUE, ignore_err);
+			}
+		}
 	}
 
-	/* Now we get the table name from the record */
-	field = rec_get_nth_field_old(rec, 1, &len);
-	/* Load the table definition to memory */
-	table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len),
-				TRUE, DICT_ERR_IGNORE_NONE);
-func_exit:
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
 	mem_heap_free(heap);
@@ -2145,15 +2636,20 @@ dict_load_sys_table(
 }
 
 /********************************************************************//**
-Loads foreign key constraint col names (also for the referenced table). */
+Loads foreign key constraint col names (also for the referenced table).
+Members that must be set (and valid) in foreign:
+foreign->heap
+foreign->n_fields
+foreign->id ('\0'-terminated)
+Members that will be created and set by this function:
+foreign->foreign_col_names[i]
+foreign->referenced_col_names[i]
+(for i=0..foreign->n_fields-1) */
 static
 void
 dict_load_foreign_cols(
 /*===================*/
-	const char*	id,	/*!< in: foreign constraint id, not
-				necessary '\0'-terminated */
-	ulint		id_len,	/*!< in: id length */
-	dict_foreign_t*	foreign)/*!< in: foreign constraint object */
+	dict_foreign_t*	foreign)/*!< in/out: foreign constraint object */
 {
 	dict_table_t*	sys_foreign_cols;
 	dict_index_t*	sys_index;
@@ -2165,25 +2661,31 @@ dict_load_foreign_cols(
 	ulint		len;
 	ulint		i;
 	mtr_t		mtr;
+	size_t		id_len;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	foreign->foreign_col_names = mem_heap_alloc(
-		foreign->heap, foreign->n_fields * sizeof(void*));
+	id_len = strlen(foreign->id);
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       foreign->n_fields * sizeof(void*)));
+
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       foreign->n_fields * sizeof(void*)));
 
-	foreign->referenced_col_names = mem_heap_alloc(
-		foreign->heap, foreign->n_fields * sizeof(void*));
 	mtr_start(&mtr);
 
-	sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS",
-					      DICT_ERR_IGNORE_NONE);
+	sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+
 	sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
-	ut_a(!dict_table_is_comp(sys_foreign_cols));
+	ut_ad(!dict_table_is_comp(sys_foreign_cols));
 
 	tuple = dtuple_create(foreign->heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	dfield_set_data(dfield, id, id_len);
+	dfield_set_data(dfield, foreign->id, id_len);
 	dict_index_copy_types(tuple, sys_index, 1);
 
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
@@ -2195,19 +2697,57 @@ dict_load_foreign_cols(
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 		ut_a(!rec_get_deleted_flag(rec, 0));
 
-		field = rec_get_nth_field_old(rec, 0, &len);
-		ut_a(len == id_len);
-		ut_a(ut_memcmp(id, field, len) == 0);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+
+		if (len != id_len || ut_memcmp(foreign->id, field, len) != 0) {
+			const rec_t*	pos;
+			ulint		pos_len;
+			const rec_t*	for_col_name;
+			ulint		for_col_name_len;
+			const rec_t*	ref_col_name;
+			ulint		ref_col_name_len;
+
+			pos = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_FOREIGN_COLS__POS,
+				&pos_len);
+
+			for_col_name = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME,
+				&for_col_name_len);
+
+			ref_col_name = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME,
+				&ref_col_name_len);
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to load columns names for foreign "
+				"key '%s' because it was not found in "
+				"InnoDB internal table SYS_FOREIGN_COLS. The "
+				"closest entry we found is: "
+				"(ID='%.*s', POS=%lu, FOR_COL_NAME='%.*s', "
+				"REF_COL_NAME='%.*s')",
+				foreign->id,
+				(int) len, field,
+				mach_read_from_4(pos),
+				(int) for_col_name_len, for_col_name,
+				(int) ref_col_name_len, ref_col_name);
+
+			ut_error;
+		}
 
-		field = rec_get_nth_field_old(rec, 1, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
 		ut_a(len == 4);
 		ut_a(i == mach_read_from_4(field));
 
-		field = rec_get_nth_field_old(rec, 4, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
 		foreign->foreign_col_names[i] = mem_heap_strdupl(
 			foreign->heap, (char*) field, len);
 
-		field = rec_get_nth_field_old(rec, 5, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
 		foreign->referenced_col_names[i] = mem_heap_strdupl(
 			foreign->heap, (char*) field, len);
 
@@ -2221,21 +2761,23 @@ dict_load_foreign_cols(
 /***********************************************************************//**
 Loads a foreign key constraint to the dictionary cache.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull(1), warn_unused_result))
+dberr_t
 dict_load_foreign(
 /*==============*/
 	const char*		id,
-				/*!< in: foreign constraint id, not
-				necessary '\0'-terminated */
-	ulint			id_len,
-				/*!< in: id length */
-	ibool			check_charsets,
-				/*!< in: TRUE=check charset compatibility */
-	ibool			check_recursive,
-				/*!< in: Whether to record the foreign table
+				/*!< in: foreign constraint id, must be
+				'\0'-terminated */
+	const char**		col_names,
+				/*!< in: column names, or NULL
+				to use foreign->foreign_table->col_names */
+	bool			check_recursive,
+				/*!< in: whether to record the foreign table
 				parent count to avoid unlimited recursive
 				load of chained foreign tables */
+	bool			check_charsets,
+				/*!< in: whether to check charset
+				compatibility */
 	dict_err_ignore_t	ignore_err)
 				/*!< in: error to be ignored */
 {
@@ -2253,16 +2795,20 @@ dict_load_foreign(
 	mtr_t		mtr;
 	dict_table_t*	for_table;
 	dict_table_t*	ref_table;
+	size_t		id_len;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
+	id_len = strlen(id);
+
 	heap2 = mem_heap_create(1000);
 
 	mtr_start(&mtr);
 
-	sys_foreign = dict_table_get_low("SYS_FOREIGN", DICT_ERR_IGNORE_NONE);
+	sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
 	sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
-	ut_a(!dict_table_is_comp(sys_foreign));
+	ut_ad(!dict_table_is_comp(sys_foreign));
 
 	tuple = dtuple_create(heap2, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -2279,8 +2825,9 @@ dict_load_foreign(
 		/* Not found */
 
 		fprintf(stderr,
-			"InnoDB: Error A: cannot load foreign constraint "
-			"%.*s\n", (int) id_len, id);
+			"InnoDB: Error: cannot load foreign constraint "
+			"%s: could not find the relevant record in "
+			"SYS_FOREIGN\n", id);
 
 		btr_pcur_close(&pcur);
 		mtr_commit(&mtr);
@@ -2289,14 +2836,15 @@ dict_load_foreign(
 		return(DB_ERROR);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
 
 	/* Check if the id in record is the searched one */
 	if (len != id_len || ut_memcmp(id, field, len) != 0) {
 
 		fprintf(stderr,
-			"InnoDB: Error B: cannot load foreign constraint "
-			"%.*s\n", (int) id_len, id);
+			"InnoDB: Error: cannot load foreign constraint "
+			"%s: found %.*s instead in SYS_FOREIGN\n",
+			id, (int) len, field);
 
 		btr_pcur_close(&pcur);
 		mtr_commit(&mtr);
@@ -2313,7 +2861,8 @@ dict_load_foreign(
 	foreign = dict_mem_foreign_create();
 
 	n_fields_and_type = mach_read_from_4(
-		rec_get_nth_field_old(rec, 5, &len));
+		rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len));
 
 	ut_a(len == 4);
 
@@ -2324,13 +2873,15 @@ dict_load_foreign(
 
 	foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
 
-	field = rec_get_nth_field_old(rec, 3, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
 
 	foreign->foreign_table_name = mem_heap_strdupl(
 		foreign->heap, (char*) field, len);
 	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
 
-	field = rec_get_nth_field_old(rec, 4, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
 	foreign->referenced_table_name = mem_heap_strdupl(
 		foreign->heap, (char*) field, len);
 	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
@@ -2338,7 +2889,7 @@ dict_load_foreign(
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
 
-	dict_load_foreign_cols(id, id_len, foreign);
+	dict_load_foreign_cols(foreign);
 
 	ref_table = dict_table_check_if_in_cache_low(
 			foreign->referenced_table_name_lookup);
@@ -2374,9 +2925,7 @@ dict_load_foreign(
 		have to load it so that we are able to make type comparisons
 		in the next function call. */
 
-		for_table = dict_table_get_low(
-				foreign->foreign_table_name_lookup,
-				DICT_ERR_IGNORE_NONE);
+		for_table = dict_table_get_low(foreign->foreign_table_name_lookup);
 
 		if (for_table && ref_table && check_recursive) {
 			/* This is to record the longest chain of ancesters
@@ -2399,7 +2948,8 @@ dict_load_foreign(
 	a new foreign key constraint but loading one from the data
 	dictionary. */
 
-	return(dict_foreign_add_to_cache(foreign, check_charsets, ignore_err));
+	return(dict_foreign_add_to_cache(foreign, col_names, check_charsets,
+					 ignore_err));
 }
 
 /***********************************************************************//**
@@ -2410,15 +2960,17 @@ cache already contains all constraints where the other relevant table is
 already in the dictionary cache.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_load_foreigns(
 /*===============*/
 	const char*		table_name,	/*!< in: table name */
-	ibool			check_recursive,/*!< in: Whether to check
+	const char**		col_names,	/*!< in: column names, or NULL
+						to use table->col_names */
+	bool			check_recursive,/*!< in: Whether to check
 						recursive load of tables
 						chained by FK */
-	ibool			check_charsets,	/*!< in: TRUE=check charset
-						compatibility */
+	bool			check_charsets,	/*!< in: whether to check
+						charset compatibility */
 	dict_err_ignore_t	ignore_err)	/*!< in: error to be ignored */
 {
 	ulint		tuple_buf[(DTUPLE_EST_ALLOC(1) + sizeof(ulint) - 1)
@@ -2431,12 +2983,12 @@ dict_load_foreigns(
 	const rec_t*	rec;
 	const byte*	field;
 	ulint		len;
-	ulint		err;
+	dberr_t		err;
 	mtr_t		mtr;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	sys_foreign = dict_table_get_low("SYS_FOREIGN", DICT_ERR_IGNORE_NONE);
+	sys_foreign = dict_table_get_low("SYS_FOREIGN");
 
 	if (sys_foreign == NULL) {
 		/* No foreign keys defined yet in this database */
@@ -2448,7 +3000,7 @@ dict_load_foreigns(
 		return(DB_ERROR);
 	}
 
-	ut_a(!dict_table_is_comp(sys_foreign));
+	ut_ad(!dict_table_is_comp(sys_foreign));
 	mtr_start(&mtr);
 
 	/* Get the secondary index based on FOR_NAME from table
@@ -2456,6 +3008,7 @@ dict_load_foreigns(
 
 	sec_index = dict_table_get_next_index(
 		dict_table_get_first_index(sys_foreign));
+	ut_ad(!dict_index_is_clust(sec_index));
 start_load:
 
 	tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1);
@@ -2478,8 +3031,8 @@ loop:
 	/* Now we have the record in the secondary index containing a table
 	name and a foreign constraint ID */
 
-	rec = btr_pcur_get_rec(&pcur);
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len);
 
 	/* Check if the table name in the record is the one searched for; the
 	following call does the comparison in the latin1_swedish_ci
@@ -2487,7 +3040,9 @@ loop:
 
 	if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
 			       dfield_get_type(dfield)->prtype,
-			       dfield_get_data(dfield), dfield_get_len(dfield),
+			       static_cast<const byte*>(
+				       dfield_get_data(dfield)),
+			       dfield_get_len(dfield),
 			       field, len)) {
 
 		goto load_next_index;
@@ -2500,18 +3055,26 @@ loop:
 	may not be the same case, but the previous comparison showed that they
 	match with no-case.  */
 
-	if ((innobase_get_lower_case_table_names() != 2)
-	    && (0 != ut_memcmp(field, table_name, len))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		goto next_rec;
 	}
 
-	if (rec_get_deleted_flag(rec, 0)) {
-
+	if ((innobase_get_lower_case_table_names() != 2)
+	    && (0 != ut_memcmp(field, table_name, len))) {
 		goto next_rec;
 	}
 
 	/* Now we get a foreign key constraint id */
-	field = rec_get_nth_field_old(rec, 1, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len);
+
+	/* Copy the string because the page may be modified or evicted
+	after mtr_commit() below. */
+	char	fk_id[MAX_TABLE_NAME_LEN + 1];
+
+	ut_a(len <= MAX_TABLE_NAME_LEN);
+	memcpy(fk_id, field, len);
+	fk_id[len] = '\0';
 
 	btr_pcur_store_position(&pcur, &mtr);
 
@@ -2519,8 +3082,8 @@ loop:
 
 	/* Load the foreign constraint definition to the dictionary cache */
 
-	err = dict_load_foreign((char*) field, len, check_charsets,
-				check_recursive, ignore_err);
+	err = dict_load_foreign(fk_id, col_names,
+				check_recursive, check_charsets, ignore_err);
 
 	if (err != DB_SUCCESS) {
 		btr_pcur_close(&pcur);
diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.cc
index 002b55afbd7..9214a6e6e5a 100644
--- a/storage/xtradb/dict/dict0mem.c
+++ b/storage/xtradb/dict/dict0mem.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +12,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file dict/dict0mem.c
+@file dict/dict0mem.cc
 Data dictionary memory object creation
 
 Created 1/8/1996 Heikki Tuuri
@@ -33,8 +34,11 @@ Created 1/8/1996 Heikki Tuuri
 #include "data0type.h"
 #include "mach0data.h"
 #include "dict0dict.h"
+#include "fts0priv.h"
 #ifndef UNIV_HOTBACKUP
-# include "ha_prototypes.h" /* innobase_casedn_str()*/
+# include "ha_prototypes.h"	/* innobase_casedn_str(),
+				innobase_get_lower_case_table_names */
+# include "mysql_com.h"		/* NAME_LEN */
 # include "lock0lock.h"
 #endif /* !UNIV_HOTBACKUP */
 #ifdef UNIV_BLOB_DEBUG
@@ -58,35 +62,42 @@ dict_mem_table_create(
 /*==================*/
 	const char*	name,	/*!< in: table name */
 	ulint		space,	/*!< in: space where the clustered index of
-				the table is placed; this parameter is
-				ignored if the table is made a member of
-				a cluster */
+				the table is placed */
 	ulint		n_cols,	/*!< in: number of columns */
-	ulint		flags)	/*!< in: table flags */
+	ulint		flags,	/*!< in: table flags */
+	ulint		flags2)	/*!< in: table flags2 */
 {
 	dict_table_t*	table;
 	mem_heap_t*	heap;
 
 	ut_ad(name);
-	ut_a(!(flags & (~0 << DICT_TF2_BITS)));
+	ut_a(dict_tf_is_valid(flags));
+	ut_a(!(flags2 & ~DICT_TF2_BIT_MASK));
 
 	heap = mem_heap_create(DICT_HEAP_SIZE);
 
-	table = mem_heap_zalloc(heap, sizeof(dict_table_t));
+	table = static_cast<dict_table_t*>(
+		mem_heap_zalloc(heap, sizeof(dict_table_t)));
 
 	table->heap = heap;
 
 	table->flags = (unsigned int) flags;
-	table->name = ut_malloc(strlen(name) + 1);
+	table->flags2 = (unsigned int) flags2;
+	table->name = static_cast<char*>(ut_malloc(strlen(name) + 1));
 	memcpy(table->name, name, strlen(name) + 1);
 	table->space = (unsigned int) space;
 	table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS);
 
-	table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
-				     * sizeof(dict_col_t));
+	table->cols = static_cast<dict_col_t*>(
+		mem_heap_alloc(heap,
+			       (n_cols + DATA_N_SYS_COLS)
+			       * sizeof(dict_col_t)));
+
+	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
 
 #ifndef UNIV_HOTBACKUP
-	table->autoinc_lock = mem_heap_alloc(heap, lock_get_size());
+	table->autoinc_lock = static_cast<ib_lock_t*>(
+		mem_heap_alloc(heap, lock_get_size()));
 
 	mutex_create(autoinc_mutex_key,
 		     &table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
@@ -97,10 +108,21 @@ dict_mem_table_create(
 	AUTOINC lock or have been granted the lock. */
 	table->n_waiting_or_granted_auto_inc_locks = 0;
 
+	/* If the table has an FTS index or we are in the process
+	of building one, create the table->fts */
+	if (dict_table_has_fts_index(table)
+	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		table->fts = fts_create(table);
+		table->fts->cache = fts_cache_create(table);
+	} else {
+		table->fts = NULL;
+	}
+
 	table->is_corrupt = FALSE;
+
 #endif /* !UNIV_HOTBACKUP */
 
-	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
 	return(table);
 }
 
@@ -116,6 +138,15 @@ dict_mem_table_free(
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 	ut_d(table->cached = FALSE);
 
+        if (dict_table_has_fts_index(table)
+            || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+            || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		if (table->fts) {
+			fts_free(table);
+		}
+
+		fts_optimize_remove_table(table);
+	}
 #ifndef UNIV_HOTBACKUP
 	mutex_free(&(table->autoinc_mutex));
 #endif /* UNIV_HOTBACKUP */
@@ -160,7 +191,7 @@ dict_add_col_name(
 	new_len = strlen(name) + 1;
 	total_len = old_len + new_len;
 
-	res = mem_heap_alloc(heap, total_len);
+	res = static_cast<char*>(mem_heap_alloc(heap, total_len));
 
 	if (old_len > 0) {
 		memcpy(res, col_names, old_len);
@@ -199,7 +230,9 @@ dict_mem_table_add_col(
 		}
 		if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
 			/* All preceding column names are empty. */
-			char* s = mem_heap_zalloc(heap, table->n_def);
+			char* s = static_cast<char*>(
+				mem_heap_zalloc(heap, table->n_def));
+
 			table->col_names = s;
 		}
 
@@ -212,6 +245,156 @@ dict_mem_table_add_col(
 	dict_mem_fill_column_struct(col, i, mtype, prtype, len);
 }
 
+/**********************************************************************//**
+Renames a column in the dictionary cache, fixing index and FK column names. */
+static __attribute__((nonnull))
+void
+dict_mem_table_col_rename_low(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	unsigned	i,	/*!< in: column offset corresponding to s */
+	const char*	to,	/*!< in: new column name */
+	const char*	s)	/*!< in: old name, inside table->col_names */
+{
+	size_t from_len = strlen(s), to_len = strlen(to);
+
+	ut_ad(i < table->n_def);
+	ut_ad(from_len <= NAME_LEN);
+	ut_ad(to_len <= NAME_LEN);
+
+	if (from_len == to_len) {
+		/* The easy case: simply replace the column name in
+		table->col_names. */
+		strcpy(const_cast<char*>(s), to);
+	} else {
+		/* We need to adjust all affected index->field
+		pointers, as in dict_index_add_col(). First, copy
+		table->col_names (edited in place if it shrinks). */
+		ulint	prefix_len	= s - table->col_names;
+
+		for (; i < table->n_def; i++) {
+			s += strlen(s) + 1;
+		}
+
+		ulint	full_len	= s - table->col_names;
+		char*	col_names;
+
+		if (to_len > from_len) {
+			col_names = static_cast<char*>(
+				mem_heap_alloc(
+					table->heap,
+					full_len + to_len - from_len));
+
+			memcpy(col_names, table->col_names, prefix_len);
+		} else {
+			col_names = const_cast<char*>(table->col_names);
+		}
+
+		memcpy(col_names + prefix_len, to, to_len);
+		memmove(col_names + prefix_len + to_len,
+			table->col_names + (prefix_len + from_len),
+			full_len - (prefix_len + from_len));
+
+		/* Replace the field names in every index. */
+		for (dict_index_t* index = dict_table_get_first_index(table);
+		     index != NULL;
+		     index = dict_table_get_next_index(index)) {
+			ulint	n_fields = dict_index_get_n_fields(index);
+
+			for (ulint i = 0; i < n_fields; i++) {
+				dict_field_t*	field
+					= dict_index_get_nth_field(
+						index, i);
+				ulint		name_ofs
+					= field->name - table->col_names;
+				if (name_ofs <= prefix_len) {
+					field->name = col_names + name_ofs;
+				} else {
+					ut_a(name_ofs < full_len);
+					field->name = col_names
+						+ name_ofs + to_len - from_len;
+				}
+			}
+		}
+
+		table->col_names = col_names;
+	}
+
+	/* Replace the field names in every foreign key constraint. */
+	for (dict_foreign_t* foreign = UT_LIST_GET_FIRST(table->foreign_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+		for (unsigned f = 0; f < foreign->n_fields; f++) {
+			/* These can point straight to
+			table->col_names, because the foreign key
+			constraints will be freed at the same time
+			when the table object is freed. */
+			foreign->foreign_col_names[f]
+				= dict_index_get_nth_field(
+					foreign->foreign_index, f)->name;
+		}
+	}
+
+	for (dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+		     table->referenced_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+		for (unsigned f = 0; f < foreign->n_fields; f++) {
+			/* foreign->referenced_col_names[] need to be
+			copies, because the constraint may become
+			orphan when foreign_key_checks=0 and the
+			parent table is dropped. */
+
+			const char* col_name = dict_index_get_nth_field(
+				foreign->referenced_index, f)->name;
+
+			if (strcmp(foreign->referenced_col_names[f],
+				   col_name)) {
+				char**	rc = const_cast<char**>(
+					foreign->referenced_col_names + f);
+				size_t	col_name_len_1 = strlen(col_name) + 1;
+
+				if (col_name_len_1 <= strlen(*rc) + 1) {
+					memcpy(*rc, col_name, col_name_len_1);
+				} else {
+					*rc = static_cast<char*>(
+						mem_heap_dup(
+							foreign->heap,
+							col_name,
+							col_name_len_1));
+				}
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Renames the nth_col'th column of a table in the data dictionary cache. */
+UNIV_INTERN
+void
+dict_mem_table_col_rename(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	unsigned	nth_col,/*!< in: column index */
+	const char*	from,	/*!< in: old column name (only asserted) */
+	const char*	to)	/*!< in: new column name */
+{
+	const char*	s = table->col_names;
+
+	ut_ad(nth_col < table->n_def);
+
+	for (unsigned i = 0; i < nth_col; i++) {
+		size_t	len = strlen(s);
+		ut_ad(len > 0);
+		s += len + 1;
+	}
+
+	/* This could fail if the data dictionaries are out of sync.
+	Proceed with the renaming anyway. */
+	ut_ad(!strcmp(from, s));
+
+	dict_mem_table_col_rename_low(table, nth_col, to, s);
+}
 
 /**********************************************************************//**
 This function populates a dict_col_t memory structure with
@@ -266,11 +449,15 @@ dict_mem_index_create(
 	ut_ad(table_name && index_name);
 
 	heap = mem_heap_create(DICT_HEAP_SIZE);
-	index = mem_heap_zalloc(heap, sizeof(dict_index_t));
+
+	index = static_cast<dict_index_t*>(
+		mem_heap_zalloc(heap, sizeof(*index)));
 
 	dict_mem_fill_index_struct(index, heap, table_name, index_name,
 				   space, type, n_fields);
 
+	os_fast_mutex_init(zip_pad_mutex_key, &index->zip_pad.mutex);
+
 	return(index);
 }
 
@@ -288,7 +475,8 @@ dict_mem_foreign_create(void)
 
 	heap = mem_heap_create(100);
 
-	foreign = mem_heap_zalloc(heap, sizeof(dict_foreign_t));
+	foreign = static_cast<dict_foreign_t*>(
+		mem_heap_zalloc(heap, sizeof(dict_foreign_t)));
 
 	foreign->heap = heap;
 
@@ -309,9 +497,13 @@ dict_mem_foreign_table_name_lookup_set(
 {
 	if (innobase_get_lower_case_table_names() == 2) {
 		if (do_alloc) {
-			foreign->foreign_table_name_lookup = mem_heap_alloc(
-				foreign->heap,
-				strlen(foreign->foreign_table_name) + 1);
+			ulint	len;
+
+			len = strlen(foreign->foreign_table_name) + 1;
+
+			foreign->foreign_table_name_lookup =
+				static_cast<char*>(
+					mem_heap_alloc(foreign->heap, len));
 		}
 		strcpy(foreign->foreign_table_name_lookup,
 		       foreign->foreign_table_name);
@@ -336,9 +528,13 @@ dict_mem_referenced_table_name_lookup_set(
 {
 	if (innobase_get_lower_case_table_names() == 2) {
 		if (do_alloc) {
-			foreign->referenced_table_name_lookup = mem_heap_alloc(
-				foreign->heap,
-				strlen(foreign->referenced_table_name) + 1);
+			ulint	len;
+
+			len = strlen(foreign->referenced_table_name) + 1;
+
+			foreign->referenced_table_name_lookup =
+				static_cast<char*>(
+					mem_heap_alloc(foreign->heap, len));
 		}
 		strcpy(foreign->referenced_table_name_lookup,
 		       foreign->referenced_table_name);
@@ -348,8 +544,8 @@ dict_mem_referenced_table_name_lookup_set(
 			= foreign->referenced_table_name;
 	}
 }
-
 #endif /* !UNIV_HOTBACKUP */
+
 /**********************************************************************//**
 Adds a field definition to an index. NOTE: does not take a copy
 of the column name if the field is a column. The memory occupied
@@ -394,5 +590,31 @@ dict_mem_index_free(
 	}
 #endif /* UNIV_BLOB_DEBUG */
 
+	os_fast_mutex_free(&index->zip_pad.mutex);
+
 	mem_heap_free(index->heap);
 }
+
+/*******************************************************************//**
+Create a temporary tablename: "<db>/" tmp_file_prefix "-ib" <table id>.
+@return temporary tablename suitable for InnoDB use, allocated from heap */
+UNIV_INTERN
+char*
+dict_mem_create_temporary_tablename(
+/*================================*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	dbtab,	/*!< in: database/table name */
+	table_id_t	id)	/*!< in: InnoDB table id */
+{
+	const char*	dbend   = strchr(dbtab, '/');
+	ut_ad(dbend);
+	size_t		dblen   = dbend - dbtab + 1;
+	size_t		size = tmp_file_prefix_length + 4 + 9 + 9 + dblen;
+
+	char*	name = static_cast<char*>(mem_heap_alloc(heap, size));
+	memcpy(name, dbtab, dblen);
+	ut_snprintf(name + dblen, size - dblen,
+		    tmp_file_prefix "-ib" UINT64PF, id);
+	return(name);
+}
+
diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc
new file mode 100644
index 00000000000..bfd0542b8e2
--- /dev/null
+++ b/storage/xtradb/dict/dict0stats.cc
@@ -0,0 +1,4004 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+
+#include "univ.i"
+
+#include "btr0btr.h" /* btr_get_size() */
+#include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */
+#include "dict0dict.h" /* dict_table_get_first_index(), dict_fs2utf8() */
+#include "dict0mem.h" /* DICT_TABLE_MAGIC_N */
+#include "dict0stats.h"
+#include "data0type.h" /* dtype_t */
+#include "db0err.h" /* dberr_t */
+#include "page0page.h" /* page_align() */
+#include "pars0pars.h" /* pars_info_create() */
+#include "pars0types.h" /* pars_info_t */
+#include "que0que.h" /* que_eval_sql() */
+#include "rem0cmp.h" /* REC_MAX_N_FIELDS,cmp_rec_rec_with_match() */
+#include "row0sel.h" /* sel_node_t */
+#include "row0types.h" /* sel_node_t */
+#include "trx0trx.h" /* trx_create() */
+#include "trx0roll.h" /* trx_rollback_to_savepoint() */
+#include "ut0rnd.h" /* ut_rnd_interval() */
+#include "ut0ut.h" /* ut_format_name(), ut_time() */
+
+#include <vector>
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - N_SAMPLE_PAGES(index),
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: N-prefix-boring record is a record on a non-leaf page that equals
+the next (to the right, cross page boundaries, skipping the supremum and
+infimum) record on the same level when looking at the first n-prefix columns.
+The last (user) record on a level is not boring (it does not match the
+non-existent user record to the right). We call the records boring because all
+the records on the page below a boring record are equal to that boring record.
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and full scan subsequent lower
+levels until a level that contains at least A*10 distinct records is found.
+Let's call this level LA.
+As an optimization the search is canceled if it has reached level 1 (never
+descend to the level 0 (leaf)) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or less in
+the exceptions described above), divide it into groups of equal records and
+pick A such groups. Then pick the last record from each group. For example,
+let the level be:
+
+index:  0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME	"mysql/innodb_table_stats"
+#define TABLE_STATS_NAME_PRINT	"mysql.innodb_table_stats"
+#define INDEX_STATS_NAME	"mysql/innodb_index_stats"
+#define INDEX_STATS_NAME_PRINT	"mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...)	printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...)	/* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index)				\
+	((index)->table->stats_sample_pages != 0 ?	\
+	 (index)->table->stats_sample_pages :		\
+	 srv_stats_persistent_sample_pages)
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch N_SAMPLE_PAGES(index) records
+from that level */
+#define N_DIFF_REQUIRED(index)	(N_SAMPLE_PAGES(index) * 10)
+
+/* A dynamic array where we store the boundaries of each distinct group
+of keys. For example if a btree level is:
+index: 0,1,2,3,4,5,6,7,8,9,10,11,12
+data:  b,b,b,b,b,b,g,g,j,j,j, x, y
+then we would store 5,7,10,11,12 in the array. */
+typedef std::vector<ib_uint64_t>	boundaries_t;
+
+/*********************************************************************//**
+Checks whether an index should be ignored in stats manipulations:
+* stats fetch
+* stats recalc
+* stats save
+@return true if the index should be ignored */
+UNIV_INLINE
+bool
+dict_stats_should_ignore_index(
+/*===========================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	return((index->type & DICT_FTS)
+	       || dict_index_is_corrupted(index)
+	       || index->to_be_dropped
+	       || *index->name == TEMP_INDEX_PREFIX);
+}
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure; prints the reason to stderr if not.
+@return true if exists and all tables are ok */
+static
+bool
+dict_stats_persistent_storage_check(
+/*================================*/
+	bool	caller_has_dict_sys_mutex)	/*!< in: true if the caller
+						owns dict_sys->mutex */
+{
+	/* definition for the table TABLE_STATS_NAME */
+	dict_col_meta_t	table_stats_columns[] = {
+		{"database_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192},
+
+		{"table_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192},
+
+		{"last_update", DATA_INT,
+			DATA_NOT_NULL, 4},
+
+		{"n_rows", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"clustered_index_size", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sum_of_other_index_sizes", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8}
+	};
+	dict_table_schema_t	table_stats_schema = {
+		TABLE_STATS_NAME,
+		UT_ARR_SIZE(table_stats_columns),
+		table_stats_columns,
+		0 /* n_foreign */,
+		0 /* n_referenced */
+	};
+
+	/* definition for the table INDEX_STATS_NAME */
+	dict_col_meta_t	index_stats_columns[] = {
+		{"database_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192},
+
+		{"table_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192},
+
+		{"index_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192},
+
+		{"last_update", DATA_INT,
+			DATA_NOT_NULL, 4},
+
+		{"stat_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 64*3},
+
+		{"stat_value", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sample_size", DATA_INT,
+			DATA_UNSIGNED, 8},
+
+		{"stat_description", DATA_VARMYSQL,
+			DATA_NOT_NULL, 1024*3}
+	};
+	dict_table_schema_t	index_stats_schema = {
+		INDEX_STATS_NAME,
+		UT_ARR_SIZE(index_stats_columns),
+		index_stats_columns,
+		0 /* n_foreign */,
+		0 /* n_referenced */
+	};
+
+	char		errstr[512];
+	dberr_t		ret;
+
+	if (!caller_has_dict_sys_mutex) {
+		mutex_enter(&(dict_sys->mutex));
+	}
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* first check table_stats */
+	ret = dict_table_schema_check(&table_stats_schema, errstr,
+				      sizeof(errstr));
+	if (ret == DB_SUCCESS) {
+		/* if it is ok, then check index_stats */
+		ret = dict_table_schema_check(&index_stats_schema, errstr,
+					      sizeof(errstr));
+	}
+
+	if (!caller_has_dict_sys_mutex) {
+		mutex_exit(&(dict_sys->mutex));
+	}
+
+	if (ret != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: %s\n", errstr);
+		return(false);
+	}
+	/* else */
+
+	return(true);
+}
+
+/*********************************************************************//**
+Executes a given SQL statement using the InnoDB internal SQL parser
+in its own transaction; commits it on success, rolls it back on error.
+This function will free the pinfo object.
+@return DB_SUCCESS or error code (DB_STATS_DO_NOT_EXIST if storage is bad) */
+static
+dberr_t
+dict_stats_exec_sql(
+/*================*/
+	pars_info_t*	pinfo,	/*!< in/out: pinfo to pass to que_eval_sql()
+				must already have any literals bound to it */
+	const char*	sql)	/*!< in: SQL string to execute */
+{
+	trx_t*	trx;
+	dberr_t	err;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	if (!dict_stats_persistent_storage_check(true)) {
+		pars_info_free(pinfo);
+		return(DB_STATS_DO_NOT_EXIST);
+	}
+
+	trx = trx_allocate_for_background();
+	trx_start_if_not_started(trx);
+
+	err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */
+
+	if (err == DB_SUCCESS) {
+		trx_commit_for_mysql(trx);
+	} else {
+		trx->op_info = "rollback of internal trx on stats tables";
+		trx->dict_operation_lock_mode = RW_X_LATCH;
+		trx_rollback_to_savepoint(trx, NULL);
+		trx->dict_operation_lock_mode = 0;
+		trx->op_info = "";
+		ut_a(trx->error_state == DB_SUCCESS);
+	}
+
+	trx_free_for_background(trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Duplicate a table object and its indexes.
+This function creates a dummy dict_table_t object and initializes the
+following table and index members:
+dict_table_t::id (copied)
+dict_table_t::heap (newly created)
+dict_table_t::name (copied)
+dict_table_t::corrupted (copied)
+dict_table_t::indexes<> (newly created)
+dict_table_t::magic_n
+for each entry in dict_table_t::indexes, the following are initialized:
+(indexes for which dict_stats_should_ignore_index() is true are skipped)
+dict_index_t::id (copied)
+dict_index_t::name (copied)
+dict_index_t::table_name (points to the copied table name)
+dict_index_t::table (points to the above semi-initialized object)
+dict_index_t::type (copied)
+dict_index_t::to_be_dropped (set to 0)
+dict_index_t::online_status (set to ONLINE_INDEX_COMPLETE)
+dict_index_t::n_uniq (copied)
+dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name)
+dict_index_t::indexes<> (newly created)
+dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized)
+dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized)
+dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized)
+dict_index_t::magic_n
+The returned object should be freed with dict_stats_table_clone_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_table_clone_create(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table whose stats to copy */
+{
+	size_t		heap_size;
+	dict_index_t*	index;
+
+	/* Estimate the size needed for the table and all of its indexes */
+
+	heap_size = 0;
+	heap_size += sizeof(dict_table_t);
+	heap_size += strlen(table->name) + 1;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (dict_stats_should_ignore_index(index)) {
+			continue;
+		}
+
+		ut_ad(!dict_index_is_univ(index));
+
+		ulint	n_uniq = dict_index_get_n_unique(index);
+
+		heap_size += sizeof(dict_index_t);
+		heap_size += strlen(index->name) + 1;
+		heap_size += n_uniq * sizeof(index->fields[0]);
+		for (ulint i = 0; i < n_uniq; i++) {
+			heap_size += strlen(index->fields[i].name) + 1;
+		}
+		heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]);
+		heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]);
+		heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]);
+	}
+
+	/* Allocate the memory and copy the members */
+
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(heap_size);
+
+	dict_table_t*	t;
+
+	t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->id, sizeof(table->id));
+	t->id = table->id;
+
+	t->heap = heap;
+
+	UNIV_MEM_ASSERT_RW_ABORT(table->name, strlen(table->name) + 1);
+	t->name = (char*) mem_heap_strdup(heap, table->name);
+
+	t->corrupted = table->corrupted;
+
+	UT_LIST_INIT(t->indexes);
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (dict_stats_should_ignore_index(index)) {
+			continue;
+		}
+
+		ut_ad(!dict_index_is_univ(index));
+
+		dict_index_t*	idx;
+
+		idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx));
+
+		UNIV_MEM_ASSERT_RW_ABORT(&index->id, sizeof(index->id));
+		idx->id = index->id;
+
+		UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name) + 1);
+		idx->name = (char*) mem_heap_strdup(heap, index->name);
+
+		idx->table_name = t->name;
+
+		idx->table = t;
+
+		idx->type = index->type;
+
+		idx->to_be_dropped = 0;
+
+		idx->online_status = ONLINE_INDEX_COMPLETE;
+
+		idx->n_uniq = index->n_uniq;
+
+		idx->fields = (dict_field_t*) mem_heap_alloc(
+			heap, idx->n_uniq * sizeof(idx->fields[0]));
+
+		for (ulint i = 0; i < idx->n_uniq; i++) {
+			UNIV_MEM_ASSERT_RW_ABORT(index->fields[i].name, strlen(index->fields[i].name) + 1);
+			idx->fields[i].name = (char*) mem_heap_strdup(
+				heap, index->fields[i].name);
+		}
+
+		/* hook idx into t->indexes */
+		UT_LIST_ADD_LAST(indexes, t->indexes, idx);
+
+		idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc(
+			heap,
+			idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
+
+		idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc(
+			heap,
+			idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
+
+		idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc(
+			heap,
+			idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
+		ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+	}
+
+	ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
+
+	return(t);
+}
+
+/*********************************************************************//**
+Free an object returned by dict_stats_table_clone_create() by freeing
+the memory heap that the object points to (t->heap). */
+static
+void
+dict_stats_table_clone_free(
+/*========================*/
+	dict_table_t*	t)	/*!< in: dummy table object to free */
+{
+	mem_heap_free(t->heap);
+}
+
+/*********************************************************************//**
+Reset the statistics members of an index to the values that describe an
+empty index: 0 (or 1 where it makes sense) for every unique prefix.
+The caller must own index's table stats latch in X mode
+(dict_table_stats_lock(table, RW_X_LATCH)) */
+static
+void
+dict_stats_empty_index(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index */
+{
+	ut_ad(!dict_index_is_univ(index));
+	ut_ad((index->type & DICT_FTS) == 0);
+
+	index->stat_index_size = 1;
+	index->stat_n_leaf_pages = 1;
+
+	const ulint	n_prefixes = index->n_uniq;
+
+	for (ulint p = 0; p != n_prefixes; ++p) {
+		index->stat_n_non_null_key_vals[p] = 0;
+		index->stat_n_sample_sizes[p] = 1;
+		index->stat_n_diff_key_vals[p] = 0;
+	}
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into a table and its indexes'
+statistics members, under the table stats latch taken here in X mode.
+The resulting stats correspond to an empty table; FTS indexes are skipped. */
+static
+void
+dict_stats_empty_table(
+/*===================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_table_stats_lock(table, RW_X_LATCH);
+
+	const ulint	n_indexes = UT_LIST_GET_LEN(table->indexes);
+
+	table->stat_n_rows = 0;
+	table->stat_clustered_index_size = 1;
+	/* 1 page for each index, not counting the clustered. A table
+	without any index (this function is also used for corrupt table
+	definitions) must yield 0 instead of wrapping around below zero. */
+	table->stat_sum_of_other_index_sizes
+		= (n_indexes > 0) ? n_indexes - 1 : 0;
+	table->stat_modified_counter = 0;
+
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		/* dict_stats_empty_index() must not be given FTS indexes */
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		ut_ad(!dict_index_is_univ(index));
+		dict_stats_empty_index(index);
+	}
+
+	table->stat_initialized = TRUE;
+	dict_table_stats_unlock(table, RW_X_LATCH);
+}
+
+/*********************************************************************//**
+Check whether an index's stats members are initialized (assert if not). */
+static
+void
+dict_stats_assert_initialized_index(
+/*================================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	UNIV_MEM_ASSERT_RW_ABORT(
+		index->stat_n_diff_key_vals,
+		index->n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+	/* the other two stats arrays also have n_uniq elements each */
+	UNIV_MEM_ASSERT_RW_ABORT(
+		index->stat_n_sample_sizes,
+		index->n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+
+	UNIV_MEM_ASSERT_RW_ABORT(
+		index->stat_n_non_null_key_vals,
+		index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0]));
+	/* the scalar members are checked as single objects */
+	UNIV_MEM_ASSERT_RW_ABORT(
+		&index->stat_index_size,
+		sizeof(index->stat_index_size));
+
+	UNIV_MEM_ASSERT_RW_ABORT(
+		&index->stat_n_leaf_pages,
+		sizeof(index->stat_n_leaf_pages));
+}
+
+/*********************************************************************//**
+Check whether the stats of a table and its indexes are initialized (asserts). */
+static
+void
+dict_stats_assert_initialized(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_a(table->stat_initialized);
+	/* table-level stats settings and counters, one member at a time */
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stats_last_recalc,
+			   sizeof(table->stats_last_recalc));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stat_persistent,
+			   sizeof(table->stat_persistent));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stats_auto_recalc,
+			   sizeof(table->stats_auto_recalc));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stats_sample_pages,
+			   sizeof(table->stats_sample_pages));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stat_n_rows,
+			   sizeof(table->stat_n_rows));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stat_clustered_index_size,
+			   sizeof(table->stat_clustered_index_size));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stat_sum_of_other_index_sizes,
+			   sizeof(table->stat_sum_of_other_index_sizes));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stat_modified_counter,
+			   sizeof(table->stat_modified_counter));
+
+	UNIV_MEM_ASSERT_RW_ABORT(&table->stats_bg_flag,
+			   sizeof(table->stats_bg_flag));
+	/* per-index stats; indexes that stats ignore are not checked */
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (!dict_stats_should_ignore_index(index)) {
+			dict_stats_assert_initialized_index(index);
+		}
+	}
+}
+
+#define INDEX_EQ(i1, i2) /* both non-NULL, same id and same name */ \
+	((i1) != NULL \
+	 && (i2) != NULL \
+	 && (i1)->id == (i2)->id \
+	 && strcmp((i1)->name, (i2)->name) == 0)
+
+/*********************************************************************//**
+Copy table and index statistics from src to dst. Indexes are paired by id
+and name; extra indexes in src are ignored and indexes in dst without a
+counterpart in src are initialized to correspond to an empty index. */
+static
+void
+dict_stats_copy(
+/*============*/
+	dict_table_t*		dst,	/*!< in/out: destination table */
+	const dict_table_t*	src)	/*!< in: source table */
+{
+	dst->stats_last_recalc = src->stats_last_recalc;
+	dst->stat_n_rows = src->stat_n_rows;
+	dst->stat_clustered_index_size = src->stat_clustered_index_size;
+	dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes;
+	dst->stat_modified_counter = src->stat_modified_counter;
+	/* walk both index lists in step for as long as src has indexes */
+	dict_index_t*	dst_idx;
+	dict_index_t*	src_idx;
+
+	for (dst_idx = dict_table_get_first_index(dst),
+	     src_idx = dict_table_get_first_index(src);
+	     dst_idx != NULL;
+	     dst_idx = dict_table_get_next_index(dst_idx),
+	     (src_idx != NULL
+	      && (src_idx = dict_table_get_next_index(src_idx)))) {
+		/* indexes that stats ignore are left untouched in dst */
+		if (dict_stats_should_ignore_index(dst_idx)) {
+			continue;
+		}
+
+		ut_ad(!dict_index_is_univ(dst_idx));
+		/* not the same index at this position: search all of src */
+		if (!INDEX_EQ(src_idx, dst_idx)) {
+			for (src_idx = dict_table_get_first_index(src);
+			     src_idx != NULL;
+			     src_idx = dict_table_get_next_index(src_idx)) {
+
+				if (INDEX_EQ(src_idx, dst_idx)) {
+					break;
+				}
+			}
+		}
+		/* still no match (src_idx is NULL here): dst_idx is extra */
+		if (!INDEX_EQ(src_idx, dst_idx)) {
+			dict_stats_empty_index(dst_idx);
+			continue;
+		}
+
+		ulint	n_copy_el;
+
+		if (dst_idx->n_uniq > src_idx->n_uniq) {
+			n_copy_el = src_idx->n_uniq;
+			/* Since src is smaller some elements in dst
+			will remain untouched by the following memmove(),
+			thus we init all of them here. */
+			dict_stats_empty_index(dst_idx);
+		} else {
+			n_copy_el = dst_idx->n_uniq;
+		}
+		/* copy the first n_copy_el elements of each stats array */
+		memmove(dst_idx->stat_n_diff_key_vals,
+			src_idx->stat_n_diff_key_vals,
+			n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0]));
+
+		memmove(dst_idx->stat_n_sample_sizes,
+			src_idx->stat_n_sample_sizes,
+			n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0]));
+
+		memmove(dst_idx->stat_n_non_null_key_vals,
+			src_idx->stat_n_non_null_key_vals,
+			n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0]));
+
+		dst_idx->stat_index_size = src_idx->stat_index_size;
+
+		dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+	}
+
+	dst->stat_initialized = TRUE;
+}
+
+/*********************************************************************//**
+Take a private snapshot of the statistics of a table and its indexes.
+A dummy dict_table_t object is created and the stats of the input table
+are copied into it while dict_sys->mutex and the table's stats latch
+(S mode) are held. In addition to the members set by
+dict_stats_table_clone_create() this function initializes:
+dict_table_t::stat_initialized
+dict_table_t::stat_persistent
+dict_table_t::stats_auto_recalc
+dict_table_t::stats_sample_pages
+dict_table_t::stats_bg_flag
+dict_table_t::stats_last_recalc
+dict_table_t::stat_n_rows
+dict_table_t::stat_clustered_index_size
+dict_table_t::stat_sum_of_other_index_sizes
+dict_table_t::stat_modified_counter
+dict_index_t::stat_n_diff_key_vals[], stat_n_sample_sizes[],
+stat_n_non_null_key_vals[], stat_index_size, stat_n_leaf_pages
+The returned object should be freed with dict_stats_snapshot_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_snapshot_create(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table whose stats to copy */
+{
+	mutex_enter(&dict_sys->mutex);
+
+	dict_table_stats_lock(table, RW_S_LATCH);
+
+	dict_stats_assert_initialized(table);
+
+	dict_table_t*	snapshot = dict_stats_table_clone_create(table);
+
+	/* table-level counters and the per-index stats */
+	dict_stats_copy(snapshot, table);
+
+	/* stats settings that dict_stats_copy() does not transfer */
+	snapshot->stats_bg_flag = table->stats_bg_flag;
+	snapshot->stats_sample_pages = table->stats_sample_pages;
+	snapshot->stats_auto_recalc = table->stats_auto_recalc;
+	snapshot->stat_persistent = table->stat_persistent;
+
+	dict_table_stats_unlock(table, RW_S_LATCH);
+
+	mutex_exit(&dict_sys->mutex);
+
+	return(snapshot);
+}
+
+/*********************************************************************//**
+Free a snapshot returned by dict_stats_snapshot_create(); this simply
+hands the object over to dict_stats_table_clone_free(). */
+static
+void
+dict_stats_snapshot_free(
+/*=====================*/
+	dict_table_t*	t)	/*!< in: dummy table object to free */
+{
+	dict_stats_table_clone_free(t);
+}
+
+/*********************************************************************//**
+Compute transient (not saved on disk) statistics for one index: its size
+and number of leaf pages as reported by btr_get_size(), and key
+cardinality via btr_estimate_number_of_different_key_vals(). The stats
+are set to those of an empty index if the tree must not or cannot be read. */
+static
+void
+dict_stats_update_transient_for_index(
+/*==================================*/
+	dict_index_t*	index)	/*!< in/out: index */
+{
+	/* With a high innodb_force_recovery level, do not calculate
+	statistics, as a badly corrupted index can cause a crash in it.
+	The clustered index is still read at levels below
+	SRV_FORCE_NO_LOG_REDO. */
+	const bool	may_read_tree
+		= srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
+		|| (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
+		    && dict_index_is_clust(index));
+
+	if (!UNIV_LIKELY(may_read_tree)) {
+		/* Initialize some bogus index cardinality statistics,
+		so that the data can be queried in various means, also
+		via secondary indexes. */
+		dict_stats_empty_index(index);
+		return;
+	}
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	ulint	n_pages = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+	if (n_pages != ULINT_UNDEFINED) {
+		index->stat_index_size = n_pages;
+		n_pages = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	if (n_pages == ULINT_UNDEFINED) {
+		/* at least one of the two sizes could not be obtained */
+		dict_stats_empty_index(index);
+		return;
+	}
+
+	/* 0 leaf pages means that the root node of the tree is a leaf */
+	index->stat_n_leaf_pages = (n_pages == 0) ? 1 : n_pages;
+
+	btr_estimate_number_of_different_key_vals(index);
+}
+
+/*********************************************************************//**
+Recalculate the transient statistics (those that are not saved on disk)
+of a table and of its indexes. Every non-FTS index is reset to empty
+stats and then, unless it is ignored, re-estimated; the table-level
+numbers are derived from the first index. A discarded table or a table
+without indexes just gets the stats of an empty table. */
+UNIV_INTERN
+void
+dict_stats_update_transient(
+/*========================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_index_t*	index = dict_table_get_first_index(table);
+
+	if (dict_table_is_discarded(table)) {
+		/* Nothing to do. */
+		dict_stats_empty_table(table);
+		return;
+	}
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+		char	buf[MAX_FULL_NAME_LEN];
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: table %s has no indexes. "
+			"Cannot calculate statistics.\n",
+			ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+		dict_stats_empty_table(table);
+		return;
+	}
+
+	/* Find out the sizes of the indexes and how many different values
+	for the key they approximately have */
+
+	ulint	total_index_size = 0;
+
+	for (; index != NULL; index = dict_table_get_next_index(index)) {
+
+		ut_ad(!dict_index_is_univ(index));
+
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		/* every non-FTS index gets empty stats first, so that
+		ignored indexes end up with initialized values too */
+		dict_stats_empty_index(index);
+
+		if (!dict_stats_should_ignore_index(index)) {
+			dict_stats_update_transient_for_index(index);
+
+			total_index_size += index->stat_index_size;
+		}
+	}
+
+	/* the table-level numbers are derived from the first index */
+	const dict_index_t*	first = dict_table_get_first_index(table);
+	const ulint		last_prefix = dict_index_get_n_unique(first) - 1;
+
+	table->stat_n_rows = first->stat_n_diff_key_vals[last_prefix];
+
+	table->stat_clustered_index_size = first->stat_index_size;
+
+	table->stat_sum_of_other_index_sizes
+		= total_index_size - first->stat_index_size;
+
+	table->stats_last_recalc = ut_time();
+	table->stat_modified_counter = 0;
+	table->stat_initialized = TRUE;
+}
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = N_SAMPLE_PAGES(index)
+
+dict_stats_analyze_index()
+  for each n_prefix
+    search for good enough level:
+      dict_stats_analyze_index_level() // only called if level has <= N pages
+        // full scan of the level in one mtr
+        collect statistics about the given level
+      if we are not satisfied with the level, search next lower level
+    we have found a good enough level here
+    dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+      // full scan of the level in one mtr
+      dive below some records and analyze the leaf page there:
+      dict_stats_analyze_index_below_cur()
+@} */
+
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes are looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs and of pages in total_pages.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1], records indexing starts from the leftmost
+record on the level and continues across page boundaries, counting from 0. */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		level,		/*!< in: level */
+	ib_uint64_t*	n_diff,		/*!< out: array for number of
+					distinct keys for all prefixes */
+	ib_uint64_t*	total_recs,	/*!< out: total number of records */
+	ib_uint64_t*	total_pages,	/*!< out: total number of pages */
+	boundaries_t*	n_diff_boundaries,/*!< out: boundaries of the groups
+					of distinct keys, or NULL */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	ulint		n_uniq;
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	const page_t*	page;
+	const rec_t*	rec;
+	const rec_t*	prev_rec;
+	bool		prev_rec_is_copied;
+	byte*		prev_rec_buf = NULL;
+	ulint		prev_rec_buf_size = 0;
+	ulint*		rec_offsets;
+	ulint*		prev_rec_offsets;
+	ulint		i;
+
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu)\n", __func__,
+		     index->table->name, index->name, level);
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_S_LOCK));
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+	memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+	/* Allocate space for the offsets header (the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+	so that this will never be less than the size calculated in
+	rec_get_offsets_func(). */
+	i = (REC_OFFS_HEADER_SIZE + 1 + 1) + index->n_fields;
+
+	heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+	rec_offsets = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof *rec_offsets));
+	prev_rec_offsets = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+	rec_offs_set_n_alloc(rec_offsets, i);
+	rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+	/* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+	if (n_diff_boundaries != NULL) {
+		for (i = 0; i < n_uniq; i++) {
+			n_diff_boundaries[i].erase(
+				n_diff_boundaries[i].begin(),
+				n_diff_boundaries[i].end());
+		}
+	}
+
+	/* Position pcur on the leftmost record on the leftmost page
+	on the desired level. */
+
+	btr_pcur_open_at_index_side(
+		true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
+		&pcur, true, level, mtr);
+	btr_pcur_move_to_next_on_page(&pcur);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* The page must not be empty, except when
+	it is the root page (and the whole index is empty). */
+	ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+	ut_ad(btr_pcur_get_rec(&pcur)
+	      == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a((level == 0)
+	     == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			  btr_pcur_get_rec(&pcur), page_is_comp(page))));
+
+	prev_rec = NULL;
+	prev_rec_is_copied = false;
+
+	/* no records by default */
+	*total_recs = 0;
+
+	*total_pages = 0;
+
+	/* iterate over all user records on this level
+	and compare each two adjacent ones, even the last on page
+	X and the first on page X+1 */
+	for (;
+	     btr_pcur_is_on_user_rec(&pcur);
+	     btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+		bool	rec_is_last_on_page;
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* If rec and prev_rec are on different pages, then prev_rec
+		must have been copied, because we hold latch only on the page
+		where rec resides. */
+		if (prev_rec != NULL
+		    && page_align(rec) != page_align(prev_rec)) {
+
+			ut_a(prev_rec_is_copied);
+		}
+
+		rec_is_last_on_page =
+			page_rec_is_supremum(page_rec_get_next_const(rec));
+
+		/* increment the pages counter at the end of each page */
+		if (rec_is_last_on_page) {
+
+			(*total_pages)++;
+		}
+
+		/* Skip delete-marked records on the leaf level. If we
+		do not skip them, then ANALYZE quickly after DELETE
+		could count them or not (purge may have already wiped
+		them away) which brings non-determinism. We skip only
+		leaf-level delete marks because delete marks on
+		non-leaf level do not make sense. */
+		if (level == 0 &&
+		    rec_get_deleted_flag(
+			    rec,
+			    page_is_comp(btr_pcur_get_page(&pcur)))) {
+
+			if (rec_is_last_on_page
+			    && !prev_rec_is_copied
+			    && prev_rec != NULL) {
+				/* copy prev_rec */
+
+				prev_rec_offsets = rec_get_offsets(
+					prev_rec, index, prev_rec_offsets,
+					n_uniq, &heap);
+
+				prev_rec = rec_copy_prefix_to_buf(
+					prev_rec, index,
+					rec_offs_n_fields(prev_rec_offsets),
+					&prev_rec_buf, &prev_rec_buf_size);
+
+				prev_rec_is_copied = true;
+			}
+
+			continue;
+		}
+
+		rec_offsets = rec_get_offsets(
+			rec, index, rec_offsets, n_uniq, &heap);
+
+		(*total_recs)++;
+
+		if (prev_rec != NULL) {
+			prev_rec_offsets = rec_get_offsets(
+				prev_rec, index, prev_rec_offsets,
+				n_uniq, &heap);
+
+			cmp_rec_rec_with_match(rec,
+					       prev_rec,
+					       rec_offsets,
+					       prev_rec_offsets,
+					       index,
+					       FALSE,
+					       &matched_fields,
+					       &matched_bytes);
+
+			for (i = matched_fields; i < n_uniq; i++) {
+
+				if (n_diff_boundaries != NULL) {
+					/* push the index of the previous
+					record, that is - the last one from
+					a group of equal keys */
+
+					ib_uint64_t	idx;
+
+					/* the index of the current record
+					is total_recs - 1, the index of the
+					previous record is total_recs - 2;
+					we know that idx is not going to
+					become negative here because if we
+					are in this branch then there is a
+					previous record and thus
+					total_recs >= 2 */
+					idx = *total_recs - 2;
+
+					n_diff_boundaries[i].push_back(idx);
+				}
+
+				/* increment the number of different keys
+				for n_prefix=i+1 (e.g. if i=0 then we increment
+				for n_prefix=1 which is stored in n_diff[0]) */
+				n_diff[i]++;
+			}
+		} else {
+			/* this is the first non-delete marked record */
+			for (i = 0; i < n_uniq; i++) {
+				n_diff[i] = 1;
+			}
+		}
+
+		if (rec_is_last_on_page) {
+			/* end of a page has been reached */
+
+			/* we need to copy the record instead of assigning
+			like prev_rec = rec; because when we traverse the
+			records on this level at some point we will jump from
+			one page to the next and then rec and prev_rec will
+			be on different pages and
+			btr_pcur_move_to_next_user_rec() will release the
+			latch on the page that prev_rec is on */
+			prev_rec = rec_copy_prefix_to_buf(
+				rec, index, rec_offs_n_fields(rec_offsets),
+				&prev_rec_buf, &prev_rec_buf_size);
+			prev_rec_is_copied = true;
+
+		} else {
+			/* still on the same page, the next call to
+			btr_pcur_move_to_next_user_rec() will not jump
+			on the next page, we can simply assign pointers
+			instead of copying the records like above */
+
+			prev_rec = rec;
+			prev_rec_is_copied = false;
+		}
+	}
+
+	/* if *total_pages is left untouched then the above loop was not
+	entered at all and there is one page in the whole tree which is
+	empty or the loop was entered but this is level 0, contains one page
+	and all records are delete-marked */
+	if (*total_pages == 0) {
+
+		ut_ad(level == 0);
+		ut_ad(*total_recs == 0);
+
+		*total_pages = 1;
+	}
+
+	/* if there are records on this level and boundaries
+	should be saved */
+	if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+		/* remember the index of the last record on the level as the
+		last one from the last group of equal keys; this holds for
+		all possible prefixes */
+		for (i = 0; i < n_uniq; i++) {
+			ib_uint64_t	idx;
+
+			idx = *total_recs - 1;
+
+			n_diff_boundaries[i].push_back(idx);
+		}
+	}
+
+	/* now, if boundaries were requested, n_diff_boundaries[i] holds
+	exactly n_diff[i] integers, for i=0..n_uniq-1 */
+
+#ifdef UNIV_STATS_DEBUG
+	for (i = 0; i < n_uniq; i++) {
+
+		DEBUG_PRINTF("    %s(): total recs: " UINT64PF
+			     ", total pages: " UINT64PF
+			     ", n_diff[%lu]: " UINT64PF "\n",
+			     __func__, *total_recs,
+			     *total_pages,
+			     i, n_diff[i]);
+
+#if 0
+		if (n_diff_boundaries != NULL) {
+			ib_uint64_t	j;
+
+			DEBUG_PRINTF("    %s(): boundaries[%lu]: ",
+				     __func__, i);
+
+			for (j = 0; j < n_diff[i]; j++) {
+				ib_uint64_t	idx;
+
+				idx = n_diff_boundaries[i][j];
+
+				DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+					     j, idx);
+			}
+			DEBUG_PRINTF("\n");
+		}
+#endif
+	}
+#endif /* UNIV_STATS_DEBUG */
+
+	/* Release the latch on the last page, because that is not done by
+	btr_pcur_close(). This function works also for non-leaf pages. */
+	btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr);
+
+	btr_pcur_close(&pcur);
+
+	if (prev_rec_buf != NULL) {
+
+		mem_free(prev_rec_buf);
+	}
+
+	mem_heap_free(heap);
+}
+
+/* aux enum for controlling the behavior of dict_stats_scan_page() @{ */
+enum page_scan_method_t {
+	COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED,/* scan all records on
+				the given page and count the number of
+				distinct ones, stepping over delete-marked
+				records */
+	QUIT_ON_FIRST_NON_BORING/* stop at the first record that differs
+				from its right neighbor ("non-boring") */
+};
+/* @} */
+
+/*********************************************************************//**
+Scan a page, reading records from left to right and counting the number
+of distinct records on that page (looking only at the first n_prefix
+columns). If scan_method is QUIT_ON_FIRST_NON_BORING then the function
+will return as soon as it finds a record that does not match its neighbor
+to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the
+returned n_diff can either be 0 (empty page), 1 (the whole page has all keys
+equal) or 2 (the function found a non-boring record and returned).
+@return offsets1 or offsets2 (the offsets of *out_rec),
+or NULL if the page is empty and does not contain user records. */
+UNIV_INLINE __attribute__((nonnull))
+ulint*
+dict_stats_scan_page(
+/*=================*/
+	const rec_t**		out_rec,	/*!< out: record, or NULL */
+	ulint*			offsets1,	/*!< out: rec_get_offsets()
+						working space (must be big
+						enough) */
+	ulint*			offsets2,	/*!< out: rec_get_offsets()
+						working space (must be big
+						enough) */
+	dict_index_t*		index,		/*!< in: index of the page */
+	const page_t*		page,		/*!< in: the page to scan */
+	ulint			n_prefix,	/*!< in: look at the first
+						n_prefix columns */
+	page_scan_method_t	scan_method,	/*!< in: scan to the end of
+						the page or not */
+	ib_uint64_t*		n_diff)		/*!< out: number of distinct
+						records encountered */
+{
+	ulint*		offsets_rec		= offsets1;
+	ulint*		offsets_next_rec	= offsets2;
+	const rec_t*	rec;
+	const rec_t*	next_rec;
+	/* A dummy heap, to be passed to rec_get_offsets().
+	Because offsets1,offsets2 should be big enough,
+	this memory heap should never be used. */
+	mem_heap_t*	heap			= NULL;
+	const rec_t*	(*get_next)(const rec_t*);
+
+	if (scan_method == COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED) {
+		get_next = page_rec_get_next_non_del_marked;
+	} else {
+		get_next = page_rec_get_next_const;
+	}
+
+	rec = get_next(page_get_infimum_rec(page));
+
+	if (page_rec_is_supremum(rec)) {
+		/* the page is empty or contains only delete-marked records */
+		*n_diff = 0;
+		*out_rec = NULL;
+		return(NULL);
+	}
+
+	offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+				      ULINT_UNDEFINED, &heap);
+
+	next_rec = get_next(rec);
+	/* the first record found counts as the first distinct one */
+	*n_diff = 1;
+
+	while (!page_rec_is_supremum(next_rec)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+
+		offsets_next_rec = rec_get_offsets(next_rec, index,
+						   offsets_next_rec,
+						   ULINT_UNDEFINED,
+						   &heap);
+
+		/* check whether rec != next_rec when looking at
+		the first n_prefix fields */
+		cmp_rec_rec_with_match(rec, next_rec,
+				       offsets_rec, offsets_next_rec,
+				       index, FALSE, &matched_fields,
+				       &matched_bytes);
+
+		if (matched_fields < n_prefix) {
+			/* rec != next_rec, => rec is non-boring */
+
+			(*n_diff)++;
+
+			if (scan_method == QUIT_ON_FIRST_NON_BORING) {
+				goto func_exit;
+			}
+		}
+
+		rec = next_rec;
+		{
+			/* Assign offsets_rec = offsets_next_rec
+			so that offsets_rec matches with rec which
+			was just assigned rec = next_rec above.
+			Also need to point offsets_next_rec to the
+			place where offsets_rec was pointing before
+			because we have just 2 placeholders where
+			data is actually stored:
+			offsets1 and offsets2 and we
+			are using them in circular fashion
+			(offsets[_next]_rec are just pointers to
+			those placeholders). */
+			ulint*	offsets_tmp;
+			offsets_tmp = offsets_rec;
+			offsets_rec = offsets_next_rec;
+			offsets_next_rec = offsets_tmp;
+		}
+
+		next_rec = get_next(next_rec);
+	}
+
+func_exit:
+	/* offsets1,offsets2 should have been big enough */
+	ut_a(heap == NULL);
+	*out_rec = rec;
+	return(offsets_rec);
+}
+
+/*********************************************************************//**
+Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns.
+@return number of distinct records on the leaf page */
+static
+ib_uint64_t
+dict_stats_analyze_index_below_cur(
+/*===============================*/
+	const btr_cur_t*cur,		/*!< in: cursor */
+	ulint		n_prefix,	/*!< in: look at the first n_prefix
+					columns when comparing records */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	dict_index_t*	index;
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	block;
+	ulint		page_no;
+	const page_t*	page;
+	mem_heap_t*	heap;
+	const rec_t*	rec;
+	ulint*		offsets1;
+	ulint*		offsets2;
+	ulint*		offsets_rec;
+	ib_uint64_t	n_diff; /* the result */
+	ulint		size;
+
+	index = btr_cur_get_index(cur);
+
+	/* Allocate offsets for the record and the node pointer, for
+	node pointer records. In a secondary index, the node pointer
+	record will consist of all index fields followed by a child
+	page number.
+	Allocate space for the offsets header (the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+	so that this will never be less than the size calculated in
+	rec_get_offsets_func(). */
+	size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+	heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+	offsets1 = static_cast<ulint*>(mem_heap_alloc(
+			heap, size * sizeof *offsets1));
+
+	offsets2 = static_cast<ulint*>(mem_heap_alloc(
+			heap, size * sizeof *offsets2));
+
+	rec_offs_set_n_alloc(offsets1, size);
+	rec_offs_set_n_alloc(offsets2, size);
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	rec = btr_cur_get_rec(cur);
+
+	offsets_rec = rec_get_offsets(rec, index, offsets1,
+				      ULINT_UNDEFINED, &heap);
+
+	page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+
+	/* descend to the leaf level on the B-tree */
+	for (;;) {
+
+		block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+					 NULL /* no guessed block */,
+					 BUF_GET, __FILE__, __LINE__, mtr);
+
+		page = buf_block_get_frame(block);
+
+		if (btr_page_get_level(page, mtr) == 0) {
+			/* leaf level */
+			break;
+		}
+		/* else */
+
+		/* search for the first non-boring record on the page */
+		offsets_rec = dict_stats_scan_page(
+			&rec, offsets1, offsets2, index, page, n_prefix,
+			QUIT_ON_FIRST_NON_BORING, &n_diff);
+
+		/* pages on level > 0 are not allowed to be empty */
+		ut_a(offsets_rec != NULL);
+		/* if page is not empty (offsets_rec != NULL) then n_diff must
+		be > 0, otherwise there is a bug in dict_stats_scan_page() */
+		ut_a(n_diff > 0);
+
+		if (n_diff == 1) {
+			/* page has all keys equal and the end of the page
+			was reached by dict_stats_scan_page(), no need to
+			descend to the leaf level */
+			mem_heap_free(heap);
+			return(1);
+		}
+		/* else */
+
+		/* when we instruct dict_stats_scan_page() to quit on the
+		first non-boring record it finds, then the returned n_diff
+		can either be 0 (empty page), 1 (page has all keys equal) or
+		2 (non-boring record was found) */
+		ut_a(n_diff == 2);
+
+		/* we have a non-boring record in rec, descend below it */
+
+		page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+	}
+
+	/* make sure we got a leaf page as a result from the above loop */
+	ut_ad(btr_page_get_level(page, mtr) == 0);
+
+	/* scan the leaf page and find the number of distinct keys,
+	when looking only at the first n_prefix columns */
+
+	offsets_rec = dict_stats_scan_page(
+		&rec, offsets1, offsets2, index, page, n_prefix,
+		COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, &n_diff);
+
+#if 0
+	DEBUG_PRINTF("      %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+		     __func__, page_no, n_diff);
+#endif
+
+	mem_heap_free(heap);
+
+	return(n_diff);
+}
+
+/*********************************************************************//**
+For a given level in an index select N_SAMPLE_PAGES(index)
+(or less) records from that level and dive below them to the corresponding
+leaf pages, then scan those leaf pages and save the sampling results in
+index->stat_n_diff_key_vals[n_prefix - 1] and the number of pages scanned in
+index->stat_n_sample_sizes[n_prefix - 1].
+The records to dive below are chosen by splitting 'boundaries' into
+n_recs_to_dive_below = min(N_SAMPLE_PAGES(index), n_diff_for_this_prefix)
+segments and picking one random entry from each segment.
+Note that index->stat_n_sample_sizes[n_prefix - 1] is always set to
+n_recs_to_dive_below, also when the sampling loop is left early because
+the level ended before the record that was being sought. */
+static
+void
+dict_stats_analyze_index_for_n_prefix(
+/*==================================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	ulint		level,		/*!< in: level, must be >= 1 */
+	ib_uint64_t	total_recs_on_level,
+					/*!< in: total number of
+					records on the given level */
+	ulint		n_prefix,	/*!< in: look at first
+					n_prefix columns when
+					comparing records */
+	ib_uint64_t	n_diff_for_this_prefix,
+					/*!< in: number of distinct
+					records on the given level,
+					when looking at the first
+					n_prefix columns */
+	boundaries_t*	boundaries,	/*!< in: array that contains
+					n_diff_for_this_prefix
+					integers each of which
+					represents the index (on the
+					level, counting from
+					left/smallest to right/biggest
+					from 0) of the last record
+					from each group of distinct
+					keys */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	btr_pcur_t	pcur;
+	const page_t*	page;
+	ib_uint64_t	rec_idx;
+	ib_uint64_t	last_idx_on_level;
+	ib_uint64_t	n_recs_to_dive_below;
+	ib_uint64_t	n_diff_sum_of_all_analyzed_pages;
+	ib_uint64_t	i;
+
+#if 0
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu, n_prefix=%lu, "
+		     "n_diff_for_this_prefix=" UINT64PF ")\n",
+		     __func__, index->table->name, index->name, level,
+		     n_prefix, n_diff_for_this_prefix);
+#endif
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_S_LOCK));
+
+	/* if either of these is 0 then this means that there is exactly one
+	page in the B-tree and it is empty; in that case a full scan should
+	have been done instead and we should not be here */
+	ut_ad(total_recs_on_level > 0);
+	ut_ad(n_diff_for_this_prefix > 0);
+
+	/* this must be at least 1 */
+	ut_ad(N_SAMPLE_PAGES(index) > 0);
+
+	/* Position pcur on the leftmost record on the leftmost page
+	on the desired level. */
+
+	btr_pcur_open_at_index_side(
+		true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
+		&pcur, true, level, mtr);
+	btr_pcur_move_to_next_on_page(&pcur);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* The page must not be empty, except when
+	it is the root page (and the whole index is empty). */
+	ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+	ut_ad(btr_pcur_get_rec(&pcur)
+	      == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level: the REC_INFO_MIN_REC_FLAG
+	bit must be set exactly when level != 0 */
+	ut_a((level == 0)
+	     == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			  btr_pcur_get_rec(&pcur), page_is_comp(page))));
+
+	last_idx_on_level = boundaries->at(n_diff_for_this_prefix - 1);
+
+	rec_idx = 0;
+
+	n_diff_sum_of_all_analyzed_pages = 0;
+
+	n_recs_to_dive_below = ut_min(N_SAMPLE_PAGES(index),
+				      n_diff_for_this_prefix);
+
+	for (i = 0; i < n_recs_to_dive_below; i++) {
+		ib_uint64_t	left;
+		ib_uint64_t	right;
+		ulint		rnd;
+		ib_uint64_t	dive_below_idx;
+
+		/* there are n_diff_for_this_prefix elements
+		in 'boundaries' and we divide those elements
+		into n_recs_to_dive_below segments, for example:
+
+		let n_diff_for_this_prefix=100, n_recs_to_dive_below=4, then:
+		segment i=0:  [0, 24]
+		segment i=1: [25, 49]
+		segment i=2: [50, 74]
+		segment i=3: [75, 99] or
+
+		let n_diff_for_this_prefix=1, n_recs_to_dive_below=1, then:
+		segment i=0: [0, 0] or
+
+		let n_diff_for_this_prefix=2, n_recs_to_dive_below=2, then:
+		segment i=0: [0, 0]
+		segment i=1: [1, 1] or
+
+		let n_diff_for_this_prefix=13, n_recs_to_dive_below=7, then:
+		segment i=0:  [0,  0]
+		segment i=1:  [1,  2]
+		segment i=2:  [3,  4]
+		segment i=3:  [5,  6]
+		segment i=4:  [7,  8]
+		segment i=5:  [9, 10]
+		segment i=6: [11, 12]
+
+		then we select a random record from each segment and dive
+		below it */
+		left = n_diff_for_this_prefix * i / n_recs_to_dive_below;
+		right = n_diff_for_this_prefix * (i + 1)
+			/ n_recs_to_dive_below - 1;
+
+		ut_a(left <= right);
+		ut_a(right <= last_idx_on_level);
+
+		/* we do not pass (left, right) because we do not want to ask
+		ut_rnd_interval() to work with too big numbers since
+		ib_uint64_t could be bigger than ulint; instead we draw an
+		offset from [0, right - left] and add it to left below */
+		rnd = ut_rnd_interval(0, (ulint) (right - left));
+
+		dive_below_idx = boundaries->at(left + rnd);
+
+#if 0
+		DEBUG_PRINTF("    %s(): dive below record with index="
+			     UINT64PF "\n", __func__, dive_below_idx);
+#endif
+
+		/* seek to the record with index dive_below_idx; pcur is
+		never repositioned backwards and rec_idx keeps its value
+		across iterations of the enclosing loop, so the level is
+		traversed at most once in total */
+		while (rec_idx < dive_below_idx
+		       && btr_pcur_is_on_user_rec(&pcur)) {
+
+			btr_pcur_move_to_next_user_rec(&pcur, mtr);
+			rec_idx++;
+		}
+
+		/* if the level has finished before the record we are
+		searching for, this means that the B-tree has changed in
+		the meantime, quit our sampling and use whatever stats
+		we have collected so far */
+		if (rec_idx < dive_below_idx) {
+
+			ut_ad(!btr_pcur_is_on_user_rec(&pcur));
+			break;
+		}
+
+		/* it could be that the tree has changed in such a way that
+		the record under dive_below_idx is the supremum record, in
+		this case rec_idx == dive_below_idx and pcur is positioned
+		on the supremum, we do not want to dive below it */
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			break;
+		}
+
+		ut_a(rec_idx == dive_below_idx);
+
+		ib_uint64_t	n_diff_on_leaf_page;
+
+		n_diff_on_leaf_page = dict_stats_analyze_index_below_cur(
+			btr_pcur_get_btr_cur(&pcur), n_prefix, mtr);
+
+		/* We adjust n_diff_on_leaf_page here to avoid counting
+		one record twice - once as the last on some page and once
+		as the first on another page. Consider the following example:
+		Leaf level:
+		page: (2,2,2,2,3,3)
+		... many pages like (3,3,3,3,3,3) ...
+		page: (3,3,3,3,5,5)
+		... many pages like (5,5,5,5,5,5) ...
+		page: (5,5,5,5,8,8)
+		page: (8,8,8,8,9,9)
+		our algo would (correctly) get an estimate that there are
+		2 distinct records per page (average). Having 4 pages below
+		non-boring records, it would (wrongly) estimate the number
+		of distinct records to 8. */
+		if (n_diff_on_leaf_page > 0) {
+			n_diff_on_leaf_page--;
+		}
+
+		n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf_page;
+	}
+
+	/* n_diff_sum_of_all_analyzed_pages can be 0 here if all the leaf
+	pages sampled contained only delete-marked records. In this case
+	we should assign 0 to index->stat_n_diff_key_vals[n_prefix - 1], which
+	the formula below does. */
+
+	/* See REF01 for an explanation of the algorithm.
+	The expression below is evaluated left to right in integer
+	arithmetic, so each of the two divisions truncates. */
+	index->stat_n_diff_key_vals[n_prefix - 1]
+		= index->stat_n_leaf_pages
+
+		* n_diff_for_this_prefix
+		/ total_recs_on_level
+
+		* n_diff_sum_of_all_analyzed_pages
+		/ n_recs_to_dive_below;
+
+	index->stat_n_sample_sizes[n_prefix - 1] = n_recs_to_dive_below;
+
+	DEBUG_PRINTF("    %s(): n_diff=" UINT64PF " for n_prefix=%lu "
+		     "(%lu"
+		     " * " UINT64PF " / " UINT64PF
+		     " * " UINT64PF " / " UINT64PF ")\n",
+		     __func__, index->stat_n_diff_key_vals[n_prefix - 1],
+		     n_prefix,
+		     index->stat_n_leaf_pages,
+		     n_diff_for_this_prefix, total_recs_on_level,
+		     n_diff_sum_of_all_analyzed_pages, n_recs_to_dive_below);
+
+	btr_pcur_close(&pcur);
+}
+
+/*********************************************************************//**
+Calculates new statistics for a given index and saves them to the index
+members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and
+stat_n_leaf_pages. This function could be slow.
+The index stats are first reset with dict_stats_empty_index(). Then the
+sizes are read with btr_get_size() in one mini-transaction and the
+distinct-key estimates are computed in further mini-transactions, each
+taking an S lock on the index lock. Depending on the tree height and on
+N_SAMPLE_PAGES(index), either the whole leaf level is scanned or, for
+each n_prefix from n_uniq down to 1, a non-leaf level is chosen and
+dict_stats_analyze_index_for_n_prefix() samples leaf pages below it. */
+static
+void
+dict_stats_analyze_index(
+/*=====================*/
+	dict_index_t*	index)	/*!< in/out: index to analyze */
+{
+	ulint		root_level;
+	ulint		level;
+	bool		level_is_analyzed;
+	ulint		n_uniq;
+	ulint		n_prefix;
+	ib_uint64_t*	n_diff_on_level;
+	ib_uint64_t	total_recs;
+	ib_uint64_t	total_pages;
+	boundaries_t*	n_diff_boundaries;
+	mtr_t		mtr;
+	ulint		size;
+	DBUG_ENTER("dict_stats_analyze_index");
+
+	DBUG_PRINT("info", ("index: %s, online status: %d", index->name,
+			    dict_index_get_online_status(index)));
+
+	DEBUG_PRINTF("  %s(index=%s)\n", __func__, index->name);
+
+	dict_stats_empty_index(index);
+
+	mtr_start(&mtr);
+
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+	if (size != ULINT_UNDEFINED) {
+		index->stat_index_size = size;
+		size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+	}
+
+	/* Release the X locks on the root page taken by btr_get_size() */
+	mtr_commit(&mtr);
+
+	switch (size) {
+	case ULINT_UNDEFINED:	/* btr_get_size() returned no size: give up */
+		dict_stats_assert_initialized_index(index);
+		DBUG_VOID_RETURN;
+	case 0:
+		/* The root node of the tree is a leaf */
+		size = 1;
+	}
+
+	index->stat_n_leaf_pages = size;
+
+	mtr_start(&mtr);
+
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	root_level = btr_height_get(index, &mtr);
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* If the tree has just one level (and one page) or if the user
+	has requested to sample too many pages then do full scan.
+
+	For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index)
+	will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf
+	pages will be sampled. If that number is bigger than the total
+	number of leaf pages then do full scan of the leaf level instead
+	since it will be faster and will give better results. */
+
+	if (root_level == 0
+	    || N_SAMPLE_PAGES(index) * n_uniq > index->stat_n_leaf_pages) {
+
+		if (root_level == 0) {
+			DEBUG_PRINTF("  %s(): just one page, "
+				     "doing full scan\n", __func__);
+		} else {
+			DEBUG_PRINTF("  %s(): too many pages requested for "
+				     "sampling, doing full scan\n", __func__);
+		}
+
+		/* do full scan of level 0; save results directly
+		into the index */
+
+		dict_stats_analyze_index_level(index,
+					       0 /* leaf level */,
+					       index->stat_n_diff_key_vals,
+					       &total_recs,
+					       &total_pages,
+					       NULL /* boundaries not needed */,
+					       &mtr);
+
+		for (ulint i = 0; i < n_uniq; i++) {
+			index->stat_n_sample_sizes[i] = total_pages;
+		}
+
+		mtr_commit(&mtr);
+
+		dict_stats_assert_initialized_index(index);
+		DBUG_VOID_RETURN;
+	}
+
+	/* set to zero; one counter per n_prefix, released with mem_free()
+	at the end of this function */
+	n_diff_on_level = reinterpret_cast<ib_uint64_t*>
+		(mem_zalloc(n_uniq * sizeof(ib_uint64_t)));
+
+	n_diff_boundaries = new boundaries_t[n_uniq];
+
+	/* total_recs is also used to estimate the number of pages on one
+	level below, so at the start we have 1 page (the root) */
+	total_recs = 1;
+
+	/* Here we use the following optimization:
+	If we find that level L is the first one (searching from the
+	root) that contains at least D distinct keys when looking at
+	the first n_prefix columns, then:
+	if we look at the first n_prefix-1 columns then the first
+	level that contains D distinct keys will be either L or a
+	lower one.
+	So if we find that the first level containing D distinct
+	keys (on n_prefix columns) is L, we continue from L when
+	searching for D distinct keys on n_prefix-1 columns. */
+	level = root_level;
+	level_is_analyzed = false;
+
+	for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
+
+		DEBUG_PRINTF("  %s(): searching level with >=%llu "
+			     "distinct records, n_prefix=%lu\n",
+			     __func__, N_DIFF_REQUIRED(index), n_prefix);
+
+		/* Commit the mtr to release the tree S lock to allow
+		other threads to do some work too. */
+		mtr_commit(&mtr);
+		mtr_start(&mtr);
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+		if (root_level != btr_height_get(index, &mtr)) {
+			/* Just quit if the tree has changed beyond
+			recognition here. The old stats from previous
+			runs will remain in the values that we have
+			not calculated yet. Initially when the index
+			object is created the stats members are given
+			some sensible values so leaving them untouched
+			here even the first time will not cause us to
+			read uninitialized memory later.
+			NOTE(review): dict_stats_empty_index() has already
+			been called at the top of this function, so the
+			values left here are presumably the ones it assigned
+			rather than stats from a previous run - confirm. */
+			break;
+		}
+
+		/* check whether we should pick the current level;
+		we pick level 1 even if it does not have enough
+		distinct records because we do not want to scan the
+		leaf level because it may contain too many records */
+		if (level_is_analyzed
+		    && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index)
+			|| level == 1)) {
+
+			goto found_level;
+		}
+
+		/* search for a level that contains enough distinct records */
+
+		if (level_is_analyzed && level > 1) {
+
+			/* if this does not hold we should be on
+			"found_level" instead of here */
+			ut_ad(n_diff_on_level[n_prefix - 1]
+			      < N_DIFF_REQUIRED(index));
+
+			level--;
+			level_is_analyzed = false;
+		}
+
+		/* descend into the tree, searching for "good enough" level */
+		for (;;) {
+
+			/* make sure we do not scan the leaf level
+			accidentally, it may contain too many pages */
+			ut_ad(level > 0);
+
+			/* scanning the same level twice is an optimization
+			bug */
+			ut_ad(!level_is_analyzed);
+
+			/* Do not scan if this would read too many pages.
+			Here we use the following fact:
+			the number of pages on level L equals the number
+			of records on level L+1, thus we deduce that the
+			following call would scan total_recs pages, because
+			total_recs is left from the previous iteration when
+			we scanned one level upper or we have not scanned any
+			levels yet in which case total_recs is 1. */
+			if (total_recs > N_SAMPLE_PAGES(index)) {
+
+				/* if the above cond is true then we are
+				not at the root level since on the root
+				level total_recs == 1 (set before we
+				enter the n-prefix loop) and cannot
+				be > N_SAMPLE_PAGES(index) */
+				ut_a(level != root_level);
+
+				/* step one level back and be satisfied with
+				whatever it contains; that level is the one
+				whose results are still held in
+				n_diff_on_level, total_recs and
+				n_diff_boundaries */
+				level++;
+				level_is_analyzed = true;
+
+				break;
+			}
+
+			dict_stats_analyze_index_level(index,
+						       level,
+						       n_diff_on_level,
+						       &total_recs,
+						       &total_pages,
+						       n_diff_boundaries,
+						       &mtr);
+
+			level_is_analyzed = true;
+
+			if (n_diff_on_level[n_prefix - 1]
+			    >= N_DIFF_REQUIRED(index)
+			    || level == 1) {
+				/* we found a good level with many distinct
+				records or we have reached the last level we
+				could scan */
+				break;
+			}
+
+			level--;
+			level_is_analyzed = false;
+		}
+found_level:
+
+		DEBUG_PRINTF("  %s(): found level %lu that has " UINT64PF
+			     " distinct records for n_prefix=%lu\n",
+			     __func__, level, n_diff_on_level[n_prefix - 1],
+			     n_prefix);
+
+		/* here we are either on level 1 or the level that we are on
+		contains >= N_DIFF_REQUIRED distinct keys or we did not scan
+		deeper levels because they would contain too many pages */
+
+		ut_ad(level > 0);
+
+		ut_ad(level_is_analyzed);
+
+		/* pick some records from this level and dive below them for
+		the given n_prefix */
+
+		dict_stats_analyze_index_for_n_prefix(
+			index, level, total_recs, n_prefix,
+			n_diff_on_level[n_prefix - 1],
+			&n_diff_boundaries[n_prefix - 1], &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	delete[] n_diff_boundaries;
+
+	mem_free(n_diff_on_level);
+
+	dict_stats_assert_initialized_index(index);
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively slow and is used to calculate persistent statistics that
+will be saved on disk.
+The table stats lock is held in RW_X_LATCH mode while the indexes are
+analyzed. The first (clustered) index is analyzed first and supplies
+table->stat_n_rows and table->stat_clustered_index_size; the sizes of
+the remaining non-FTS, non-ignored indexes are summed into
+table->stat_sum_of_other_index_sizes.
+@return DB_SUCCESS, or DB_CORRUPTION if the first index is missing,
+corrupted or not a clustered index */
+static
+dberr_t
+dict_stats_update_persistent(
+/*=========================*/
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	dict_index_t*	index;
+
+	DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name);
+
+	dict_table_stats_lock(table, RW_X_LATCH);
+
+	/* analyze the clustered index first */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL
+	    || dict_index_is_corrupted(index)
+	    || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) {
+
+		/* Table definition is corrupt: there is no first index, it
+		is flagged as corrupted, or its type has bits set other than
+		DICT_CLUSTERED and (optionally) DICT_UNIQUE. The stats lock
+		is released before dict_stats_empty_table() is called. */
+		dict_table_stats_unlock(table, RW_X_LATCH);
+		dict_stats_empty_table(table);
+
+		return(DB_CORRUPTION);
+	}
+
+	ut_ad(!dict_index_is_univ(index));
+
+	dict_stats_analyze_index(index);
+
+	ulint	n_unique = dict_index_get_n_unique(index);
+
+	table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1];
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	/* analyze other indexes from the table, if any */
+
+	table->stat_sum_of_other_index_sizes = 0;
+
+	for (index = dict_table_get_next_index(index);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		ut_ad(!dict_index_is_univ(index));
+
+		if (index->type & DICT_FTS) {	/* FTS indexes are skipped */
+			continue;
+		}
+
+		dict_stats_empty_index(index);
+
+		if (dict_stats_should_ignore_index(index)) {
+			continue;
+		}
+
+		if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) {
+			dict_stats_analyze_index(index);
+		}	/* else: keep what dict_stats_empty_index() set */
+
+		table->stat_sum_of_other_index_sizes
+			+= index->stat_index_size;
+	}
+
+	table->stats_last_recalc = ut_time();
+
+	table->stat_modified_counter = 0;
+
+	table->stat_initialized = TRUE;
+
+	dict_stats_assert_initialized(table);
+
+	dict_table_stats_unlock(table, RW_X_LATCH);
+
+	return(DB_SUCCESS);
+}
+
+#include "mysql_com.h"
+/*********************************************************************//**
+Creates a pars_info_t and binds to it the eight literals that both the
+INSERT and the UPDATE statement of dict_stats_save_index_stat() refer to:
+database_name, table_name, index_name, last_update, stat_name, stat_value,
+sample_size and stat_description.
+@return the newly created pars_info_t */
+static
+pars_info_t*
+dict_stats_save_index_stat_pinfo(
+/*=============================*/
+	const char*	db_utf8,	/*!< in: database name */
+	const char*	table_utf8,	/*!< in: table name */
+	const char*	index_name,	/*!< in: index name */
+	lint		last_update,	/*!< in: timestamp of the stat */
+	const char*	stat_name,	/*!< in: name of the stat */
+	ib_uint64_t	stat_value,	/*!< in: value of the stat */
+	ib_uint64_t*	sample_size,	/*!< in: n pages sampled or NULL */
+	const char*	stat_description)/*!< in: description of the stat */
+{
+	pars_info_t*	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+	pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+	UNIV_MEM_ASSERT_RW_ABORT(index_name, strlen(index_name));
+	pars_info_add_str_literal(pinfo, "index_name", index_name);
+	UNIV_MEM_ASSERT_RW_ABORT(&last_update, 4);
+	pars_info_add_int4_literal(pinfo, "last_update", last_update);
+	UNIV_MEM_ASSERT_RW_ABORT(stat_name, strlen(stat_name));
+	pars_info_add_str_literal(pinfo, "stat_name", stat_name);
+	UNIV_MEM_ASSERT_RW_ABORT(&stat_value, 8);
+	pars_info_add_ull_literal(pinfo, "stat_value", stat_value);
+	if (sample_size != NULL) {
+		UNIV_MEM_ASSERT_RW_ABORT(sample_size, 8);
+		pars_info_add_ull_literal(pinfo, "sample_size", *sample_size);
+	} else {
+		/* no sample size: bind an SQL NULL instead */
+		pars_info_add_literal(pinfo, "sample_size", NULL,
+				      UNIV_SQL_NULL, DATA_FIXBINARY, 0);
+	}
+	UNIV_MEM_ASSERT_RW_ABORT(stat_description, strlen(stat_description));
+	pars_info_add_str_literal(pinfo, "stat_description",
+				  stat_description);
+
+	return(pinfo);
+}
+
+/*********************************************************************//**
+Save an individual index's statistic into the persistent statistics
+storage. An INSERT is tried first; if it fails with DB_DUPLICATE_KEY the
+existing row is modified with an UPDATE instead. Any other failure, or a
+failure of the UPDATE, is reported on stderr.
+The caller must own dict_sys->mutex (and dict_operation_lock in exclusive
+mode); this is checked with ut_ad().
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_save_index_stat(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: index */
+	lint		last_update,	/*!< in: timestamp of the stat */
+	const char*	stat_name,	/*!< in: name of the stat */
+	ib_uint64_t	stat_value,	/*!< in: value of the stat */
+	ib_uint64_t*	sample_size,	/*!< in: n pages sampled or NULL */
+	const char*	stat_description)/*!< in: description of the stat */
+{
+	pars_info_t*	pinfo;
+	dberr_t		ret;
+	char		db_utf8[MAX_DB_UTF8_LEN];
+	char		table_utf8[MAX_TABLE_UTF8_LEN];
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	dict_fs2utf8(index->table->name, db_utf8, sizeof(db_utf8),
+		     table_utf8, sizeof(table_utf8));
+
+	pinfo = dict_stats_save_index_stat_pinfo(
+		db_utf8, table_utf8, index->name, last_update, stat_name,
+		stat_value, sample_size, stat_description);
+
+	ret = dict_stats_exec_sql(
+		pinfo,
+		"PROCEDURE INDEX_STATS_SAVE_INSERT () IS\n"
+		"BEGIN\n"
+		"INSERT INTO \"" INDEX_STATS_NAME "\"\n"
+		"VALUES\n"
+		"(\n"
+		":database_name,\n"
+		":table_name,\n"
+		":index_name,\n"
+		":last_update,\n"
+		":stat_name,\n"
+		":stat_value,\n"
+		":sample_size,\n"
+		":stat_description\n"
+		");\n"
+		"END;");
+
+	if (ret == DB_DUPLICATE_KEY) {
+
+		/* the row already exists: bind the same values to a fresh
+		pars_info_t (a new one is created for every
+		dict_stats_exec_sql() call) and overwrite the row */
+		pinfo = dict_stats_save_index_stat_pinfo(
+			db_utf8, table_utf8, index->name, last_update,
+			stat_name, stat_value, sample_size,
+			stat_description);
+
+		ret = dict_stats_exec_sql(
+			pinfo,
+			"PROCEDURE INDEX_STATS_SAVE_UPDATE () IS\n"
+			"BEGIN\n"
+			"UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+			"last_update = :last_update,\n"
+			"stat_value = :stat_value,\n"
+			"sample_size = :sample_size,\n"
+			"stat_description = :stat_description\n"
+			"WHERE\n"
+			"database_name = :database_name AND\n"
+			"table_name = :table_name AND\n"
+			"index_name = :index_name AND\n"
+			"stat_name = :stat_name;\n"
+			"END;");
+	}
+
+	if (ret != DB_SUCCESS) {
+		char	buf_table[MAX_FULL_NAME_LEN];
+		char	buf_index[MAX_FULL_NAME_LEN];
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Cannot save index statistics for table "
+			"%s, index %s, stat name \"%s\": %s\n",
+			ut_format_name(index->table->name, TRUE,
+				       buf_table, sizeof(buf_table)),
+			ut_format_name(index->name, FALSE,
+				       buf_index, sizeof(buf_index)),
+			stat_name, ut_strerr(ret));
+	}
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Save the table's statistics into the persistent statistics storage.
+The function works on a snapshot of table_orig that is made with
+dict_stats_snapshot_create() and freed with dict_stats_snapshot_free()
+before returning. dict_operation_lock (X mode) and dict_sys->mutex are
+acquired here and held while the table row and then one row per index
+statistic ("size", "n_leaf_pages" and "n_diff_pfxNN" for each unique
+prefix) are written. Saving stops at the first error.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_save(
+/*============*/
+	dict_table_t*	table_orig)	/*!< in: table */
+{
+	pars_info_t*	pinfo;
+	lint		now;
+	dberr_t		ret;
+	dict_table_t*	table;
+	char		db_utf8[MAX_DB_UTF8_LEN];
+	char		table_utf8[MAX_TABLE_UTF8_LEN];
+
+	table = dict_stats_snapshot_create(table_orig);
+
+	dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+		     table_utf8, sizeof(table_utf8));
+
+	rw_lock_x_lock(&dict_operation_lock);
+	mutex_enter(&dict_sys->mutex);
+
+	/* MySQL's timestamp is 4 byte, so we use
+	pars_info_add_int4_literal() which takes a lint arg, so "now" is
+	lint */
+	now = (lint) ut_time();
+
+/* Binds the literals used by both the INSERT and the UPDATE statement
+below; relies on the locals db_utf8 and table_utf8. The macro body has no
+trailing semicolon so that an invocation followed by ';' expands to
+exactly one statement. */
+#define PREPARE_PINFO_FOR_TABLE_SAVE(p, t, n)				\
+	do {								\
+	pars_info_add_str_literal((p), "database_name", db_utf8);	\
+	pars_info_add_str_literal((p), "table_name", table_utf8);	\
+	pars_info_add_int4_literal((p), "last_update", (n));		\
+	pars_info_add_ull_literal((p), "n_rows", (t)->stat_n_rows);	\
+	pars_info_add_ull_literal((p), "clustered_index_size",		\
+		(t)->stat_clustered_index_size);			\
+	pars_info_add_ull_literal((p), "sum_of_other_index_sizes",	\
+		(t)->stat_sum_of_other_index_sizes);			\
+	} while (false)
+
+	pinfo = pars_info_create();
+
+	PREPARE_PINFO_FOR_TABLE_SAVE(pinfo, table, now);
+
+	ret = dict_stats_exec_sql(
+		pinfo,
+		"PROCEDURE TABLE_STATS_SAVE_INSERT () IS\n"
+		"BEGIN\n"
+		"INSERT INTO \"" TABLE_STATS_NAME "\"\n"
+		"VALUES\n"
+		"(\n"
+		":database_name,\n"
+		":table_name,\n"
+		":last_update,\n"
+		":n_rows,\n"
+		":clustered_index_size,\n"
+		":sum_of_other_index_sizes\n"
+		");\n"
+		"END;");
+
+	if (ret == DB_DUPLICATE_KEY) {
+		/* a row for this table already exists: update it, using
+		a newly created pars_info_t for the second statement */
+		pinfo = pars_info_create();
+
+		PREPARE_PINFO_FOR_TABLE_SAVE(pinfo, table, now);
+
+		ret = dict_stats_exec_sql(
+			pinfo,
+			"PROCEDURE TABLE_STATS_SAVE_UPDATE () IS\n"
+			"BEGIN\n"
+			"UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+			"last_update = :last_update,\n"
+			"n_rows = :n_rows,\n"
+			"clustered_index_size = :clustered_index_size,\n"
+			"sum_of_other_index_sizes = "
+			"  :sum_of_other_index_sizes\n"
+			"WHERE\n"
+			"database_name = :database_name AND\n"
+			"table_name = :table_name;\n"
+			"END;");
+	}
+
+	if (ret != DB_SUCCESS) {
+		char	buf[MAX_FULL_NAME_LEN];
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Cannot save table statistics for table "
+			"%s: %s\n",
+			ut_format_name(table->name, TRUE, buf, sizeof(buf)),
+			ut_strerr(ret));
+		goto end;
+	}
+
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (dict_stats_should_ignore_index(index)) {
+			continue;
+		}
+
+		ut_ad(!dict_index_is_univ(index));
+
+		ret = dict_stats_save_index_stat(index, now, "size",
+						 index->stat_index_size,
+						 NULL,
+						 "Number of pages "
+						 "in the index");
+		if (ret != DB_SUCCESS) {
+			goto end;
+		}
+
+		ret = dict_stats_save_index_stat(index, now, "n_leaf_pages",
+						 index->stat_n_leaf_pages,
+						 NULL,
+						 "Number of leaf pages "
+						 "in the index");
+		if (ret != DB_SUCCESS) {
+			goto end;
+		}
+
+		/* one "n_diff_pfxNN" row per unique-column prefix, NN being
+		the 1-based prefix length */
+		for (ulint i = 0; i < index->n_uniq; i++) {
+
+			char	stat_name[16];
+			char	stat_description[1024];
+			ulint	j;
+
+			ut_snprintf(stat_name, sizeof(stat_name),
+				    "n_diff_pfx%02lu", i + 1);
+
+			/* craft a string that contains the columns names:
+			the names of fields 0..i separated by commas */
+			ut_snprintf(stat_description,
+				    sizeof(stat_description),
+				    "%s", index->fields[0].name);
+			for (j = 1; j <= i; j++) {
+				size_t	len;
+
+				len = strlen(stat_description);
+
+				ut_snprintf(stat_description + len,
+					    sizeof(stat_description) - len,
+					    ",%s", index->fields[j].name);
+			}
+
+			ret = dict_stats_save_index_stat(
+				index, now, stat_name,
+				index->stat_n_diff_key_vals[i],
+				&index->stat_n_sample_sizes[i],
+				stat_description);
+
+			if (ret != DB_SUCCESS) {
+				goto end;
+			}
+		}
+	}
+
+end:
+	/* common exit: release the latches taken above and drop the
+	snapshot */
+	mutex_exit(&dict_sys->mutex);
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	dict_stats_snapshot_free(table);
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Callback invoked for the row that is selected by
+SELECT ... FROM mysql.innodb_table_stats WHERE table='...'
+It expects exactly three selected columns, in this order: n_rows,
+clustered_index_size, sum_of_other_index_sizes. Each must be an 8-byte
+DATA_INT and is copied into the corresponding stat_* member of the table
+passed as the second argument. Any other number of columns trips an
+assertion.
+@return TRUE always; the value itself carries no information */
+static
+ibool
+dict_stats_fetch_table_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node */
+	void*	table_void)	/*!< out: table */
+{
+	sel_node_t*	sel = (sel_node_t*) node_void;
+	dict_table_t*	dst = (dict_table_t*) table_void;
+	int		i = 0;
+
+	for (que_common_t* col
+	     = static_cast<que_common_t*>(sel->select_list);
+	     col != NULL;
+	     col = static_cast<que_common_t*>(que_node_get_next(col))) {
+
+		dfield_t*	field = que_node_get_val(col);
+		dtype_t*	type = dfield_get_type(field);
+		ulint		len = dfield_get_len(field);
+		const byte*	raw = static_cast<const byte*>(
+			dfield_get_data(field));
+
+		if (i > 2) {
+			/* someone changed SELECT
+			n_rows,clustered_index_size,sum_of_other_index_sizes
+			to select more columns from innodb_table_stats
+			without adjusting here */
+			ut_error;
+		}
+
+		/* all three columns share the same storage format */
+		ut_a(dtype_get_mtype(type) == DATA_INT);
+		ut_a(len == 8);
+
+		if (i == 0) {
+			/* mysql.innodb_table_stats.n_rows */
+			dst->stat_n_rows = mach_read_from_8(raw);
+		} else if (i == 1) {
+			/* mysql.innodb_table_stats.clustered_index_size */
+			dst->stat_clustered_index_size
+				= (ulint) mach_read_from_8(raw);
+		} else {
+			/* mysql.innodb_table_stats.
+			sum_of_other_index_sizes */
+			dst->stat_sum_of_other_index_sizes
+				= (ulint) mach_read_from_8(raw);
+		}
+
+		i++;
+	}
+
+	/* fewer than 3 columns means the SELECT list was shortened without
+	adjusting this function; more than 3 would have hit ut_error
+	inside the loop */
+	ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/);
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return(TRUE);
+}
+
+/** Auxiliary argument for dict_stats_fetch_index_stats_step(): carries
+the table in and a "stats were modified" flag out. It is handed to that
+function as a void* and cast back to index_fetch_t* there. */
+struct index_fetch_t {
+	dict_table_t*	table;	/*!< table whose indexes are to be modified */
+	bool		stats_were_modified; /*!< will be set to true if the
+				stats of at least one index were modified */
+};
+
+/*********************************************************************//**
+Called for the rows that are selected by
+SELECT ... FROM mysql.innodb_index_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to its indexes.
+Let a table has N indexes and each index has Ui unique columns for i=1..N,
+then mysql.innodb_index_stats will have SUM(Ui) i=1..N rows for that table.
+So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude
+N*AVG(Ui). In each call it searches for the currently fetched index into
+table->indexes linearly, assuming this list is not sorted. Thus, overall,
+fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N
+is the number of indexes.
+This can be improved if we sort table->indexes in a temporary area just once
+and then search in that sorted list. Then the complexity will be O(N*log(N)).
+We assume a table will not have more than 100 indexes, so we go with the
+simpler N^2 algorithm.
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_index_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node; the values of the
+				fetched row are reached through its
+				select_list */
+	void*	arg_void)	/*!< out: table + a flag that tells if we
+				modified anything; points to an
+				index_fetch_t */
+{
+	sel_node_t*	node = (sel_node_t*) node_void;
+	index_fetch_t*	arg = (index_fetch_t*) arg_void;
+	dict_table_t*	table = arg->table;
+	dict_index_t*	index = NULL;
+	que_common_t*	cnode;
+	const char*	stat_name = NULL;
+	ulint		stat_name_len = ULINT_UNDEFINED;
+	ib_uint64_t	stat_value = UINT64_UNDEFINED;
+	ib_uint64_t	sample_size = UINT64_UNDEFINED;
+	int		i;
+
+	/* this should loop exactly 4 times - for the columns that
+	were selected: index_name,stat_name,stat_value,sample_size;
+	i is the 0-based position of the current column in that list and
+	decides how the column value is checked and decoded */
+	for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+	     cnode != NULL;
+	     cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+	     i++) {
+
+		const byte*	data;
+		dfield_t*	dfield = que_node_get_val(cnode);
+		dtype_t*	type = dfield_get_type(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		data = static_cast<const byte*>(dfield_get_data(dfield));
+
+		switch (i) {
+		case 0: /* mysql.innodb_index_stats.index_name */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+			/* search for index in table's indexes whose name
+			matches data; the fetched index name is in data,
+			has no terminating '\0' and has length len */
+			for (index = dict_table_get_first_index(table);
+			     index != NULL;
+			     index = dict_table_get_next_index(index)) {
+
+				if (strlen(index->name) == len
+				    && memcmp(index->name, data, len) == 0) {
+					/* the corresponding index was found */
+					break;
+				}
+			}
+
+			/* if index is NULL here this means that
+			mysql.innodb_index_stats contains more rows than the
+			number of indexes in the table; this is ok, we just
+			return ignoring those extra rows; in other words
+			dict_stats_fetch_index_stats_step() has been called
+			for a row from index_stats with unknown index_name
+			column; arg->stats_were_modified is not touched in
+			this case */
+			if (index == NULL) {
+
+				return(TRUE);
+			}
+
+			break;
+
+		case 1: /* mysql.innodb_index_stats.stat_name; only the
+			pointer and the length are remembered here, the name
+			is interpreted after the loop */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+			ut_a(index != NULL);
+
+			stat_name = (const char*) data;
+			stat_name_len = len;
+
+			break;
+
+		case 2: /* mysql.innodb_index_stats.stat_value */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != ULINT_UNDEFINED);
+
+			stat_value = mach_read_from_8(data);
+
+			break;
+
+		case 3: /* mysql.innodb_index_stats.sample_size; this is the
+			only column that is allowed to be SQL NULL */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8 || len == UNIV_SQL_NULL);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != ULINT_UNDEFINED);
+			ut_a(stat_value != UINT64_UNDEFINED);
+
+			if (len == UNIV_SQL_NULL) {
+				break;
+			}
+			/* else */
+
+			sample_size = mach_read_from_8(data);
+
+			break;
+
+		default:
+
+			/* someone changed
+			SELECT index_name,stat_name,stat_value,sample_size
+			to select more columns from innodb_index_stats without
+			adjusting here */
+			ut_error;
+		}
+	}
+
+	/* if i < 4 this means someone changed the
+	SELECT index_name,stat_name,stat_value,sample_size
+	to select less columns from innodb_index_stats without adjusting here;
+	if i > 4 we would have ut_error'ed earlier */
+	ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */);
+
+	ut_a(index != NULL);
+	ut_a(stat_name != NULL);
+	ut_a(stat_name_len != ULINT_UNDEFINED);
+	ut_a(stat_value != UINT64_UNDEFINED);
+	/* sample_size could be UINT64_UNDEFINED here, if it is NULL
+
+	What follows dispatches on stat_name, which is compared without
+	regard to case: "size" and "n_leaf_pages" are stored into the index
+	directly, "n_diff_pfxNN" updates the per-prefix arrays of the index
+	at position NN - 1, and any other name is ignored.
+	NOTE(review): PFX and PFX_LEN are not #undef'ed within this
+	function. */
+
+#define PFX	"n_diff_pfx"
+#define PFX_LEN	10
+
+	if (stat_name_len == 4 /* strlen("size") */
+	    && strncasecmp("size", stat_name, stat_name_len) == 0) {
+		index->stat_index_size = (ulint) stat_value;
+		arg->stats_were_modified = true;
+	} else if (stat_name_len == 12 /* strlen("n_leaf_pages") */
+		   && strncasecmp("n_leaf_pages", stat_name, stat_name_len)
+		   == 0) {
+		index->stat_n_leaf_pages = (ulint) stat_value;
+		arg->stats_were_modified = true;
+	} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
+		   && strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
+
+		const char*	num_ptr;
+		unsigned long	n_pfx;
+
+		/* point num_ptr into "1" from "n_diff_pfx12..." */
+		num_ptr = stat_name + PFX_LEN;
+
+		/* stat_name should have exactly 2 chars appended to PFX
+		and they should be digits; the length is tested first so
+		that num_ptr[1] is not read when only one char follows PFX */
+		if (stat_name_len != PFX_LEN + 2
+		    || num_ptr[0] < '0' || num_ptr[0] > '9'
+		    || num_ptr[1] < '0' || num_ptr[1] > '9') {
+
+			char	db_utf8[MAX_DB_UTF8_LEN];
+			char	table_utf8[MAX_TABLE_UTF8_LEN];
+
+			dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+				     table_utf8, sizeof(table_utf8));
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"%s WHERE "
+				"database_name = '%s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name "
+				"is malformed\n",
+				INDEX_STATS_NAME_PRINT,
+				db_utf8,
+				table_utf8,
+				index->name,
+				(int) stat_name_len,
+				stat_name);
+			return(TRUE);
+		}
+		/* else */
+
+		/* extract 12 from "n_diff_pfx12..." into n_pfx
+		note that stat_name does not have a terminating '\0' */
+		n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0');
+
+		ulint	n_uniq = index->n_uniq;
+
+		if (n_pfx == 0 || n_pfx > n_uniq) {
+
+			char	db_utf8[MAX_DB_UTF8_LEN];
+			char	table_utf8[MAX_TABLE_UTF8_LEN];
+
+			dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+				     table_utf8, sizeof(table_utf8));
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"%s WHERE "
+				"database_name = '%s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name is "
+				"out of range, the index has %lu unique "
+				"columns\n",
+				INDEX_STATS_NAME_PRINT,
+				db_utf8,
+				table_utf8,
+				index->name,
+				(int) stat_name_len,
+				stat_name,
+				n_uniq);
+			return(TRUE);
+		}
+		/* else: n_pfx is in the range 1..n_uniq; n_pfx - 1 is used
+		as the 0-based position in the per-prefix arrays below */
+
+		index->stat_n_diff_key_vals[n_pfx - 1] = stat_value;
+
+		if (sample_size != UINT64_UNDEFINED) {
+			index->stat_n_sample_sizes[n_pfx - 1] = sample_size;
+		} else {
+			/* hmm, strange... the user must have UPDATEd the
+			table manually and SET sample_size = NULL */
+			index->stat_n_sample_sizes[n_pfx - 1] = 0;
+		}
+
+		index->stat_n_non_null_key_vals[n_pfx - 1] = 0;
+
+		arg->stats_were_modified = true;
+	} else {
+		/* silently ignore rows with unknown stat_name, the
+		user may have developed her own stats */
+	}
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Read table's statistics from the persistent statistics storage.
+All stats of the table are first reset with dict_stats_empty_table() and
+then overwritten with whatever is found by the two SELECTs below, which
+are executed in a background transaction of their own at the
+READ UNCOMMITTED isolation level. The index stats are fetched only if a
+row was found for the table in the table stats.
+@return DB_STATS_DO_NOT_EXIST if no row from the index stats was applied
+(this takes precedence over the result of que_eval_sql()), otherwise
+DB_SUCCESS or error code as returned by que_eval_sql() */
+static
+dberr_t
+dict_stats_fetch_from_ps(
+/*=====================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	index_fetch_t	index_fetch_arg;
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	dberr_t		ret;
+	char		db_utf8[MAX_DB_UTF8_LEN];
+	char		table_utf8[MAX_TABLE_UTF8_LEN];
+
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	/* Initialize all stats to dummy values before fetching because if
+	the persistent storage contains incomplete stats (e.g. missing stats
+	for some index) then we would end up with (partially) uninitialized
+	stats. */
+	dict_stats_empty_table(table);
+
+	trx = trx_allocate_for_background();
+
+	/* Use 'read-uncommitted' so that the SELECTs we execute
+	do not get blocked in case some user has locked the rows we
+	are SELECTing */
+
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+	trx_start_if_not_started(trx);
+
+	dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+		     table_utf8, sizeof(table_utf8));
+
+	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+
+	pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+
+	pars_info_bind_function(pinfo,
+			       "fetch_table_stats_step",
+			       dict_stats_fetch_table_stats_step,
+			       table);
+
+	index_fetch_arg.table = table;
+	index_fetch_arg.stats_were_modified = false;
+	pars_info_bind_function(pinfo,
+			        "fetch_index_stats_step",
+			        dict_stats_fetch_index_stats_step,
+			        &index_fetch_arg);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE FETCH_STATS () IS\n"
+			   "found INT;\n"
+			   "DECLARE FUNCTION fetch_table_stats_step;\n"
+			   "DECLARE FUNCTION fetch_index_stats_step;\n"
+			   "DECLARE CURSOR table_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_table_stats_step() */
+			   "  n_rows,\n"
+			   "  clustered_index_size,\n"
+			   "  sum_of_other_index_sizes\n"
+			   "  FROM \"" TABLE_STATS_NAME "\"\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+			   "DECLARE CURSOR index_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_index_stats_step(), which
+			   relies on both the number and the order of
+			   the columns listed here */
+			   "  index_name,\n"
+			   "  stat_name,\n"
+			   "  stat_value,\n"
+			   "  sample_size\n"
+			   "  FROM \"" INDEX_STATS_NAME "\"\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+
+			   "BEGIN\n"
+
+			   "OPEN table_stats_cur;\n"
+			   "FETCH table_stats_cur INTO\n"
+			   "  fetch_table_stats_step();\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  CLOSE table_stats_cur;\n"
+			   "  RETURN;\n"
+			   "END IF;\n"
+			   "CLOSE table_stats_cur;\n"
+
+			   "OPEN index_stats_cur;\n"
+			   "found := 1;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "  FETCH index_stats_cur INTO\n"
+			   "    fetch_index_stats_step();\n"
+			   "  IF (SQL % NOTFOUND) THEN\n"
+			   "    found := 0;\n"
+			   "  END IF;\n"
+			   "END LOOP;\n"
+			   "CLOSE index_stats_cur;\n"
+
+			   "END;",
+			   TRUE, trx);
+	/* pinfo is freed by que_eval_sql() */
+
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+
+	if (!index_fetch_arg.stats_were_modified) {
+		return(DB_STATS_DO_NOT_EXIST);
+	}
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Brings the statistics of a single index up to date.
+If the table of the index uses persistent statistics and the persistent
+storage passes dict_stats_persistent_storage_check(), the index is analyzed
+and the statistics of the whole table are saved. In all other cases
+transient statistics are calculated for the index; if persistent statistics
+were wanted but the storage is unusable, a message is printed first. */
+UNIV_INTERN
+void
+dict_stats_update_for_index(
+/*========================*/
+	dict_index_t*	index)	/*!< in/out: index */
+{
+	DBUG_ENTER("dict_stats_update_for_index");
+
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	dict_table_t* const	table = index->table;
+	const ibool		want_persistent
+		= dict_stats_is_persistent_enabled(table);
+
+	if (want_persistent && dict_stats_persistent_storage_check(false)) {
+
+		/* the stats latch covers only the analysis of the index,
+		it is released before the stats are saved */
+		dict_table_stats_lock(table, RW_X_LATCH);
+		dict_stats_analyze_index(index);
+		dict_table_stats_unlock(table, RW_X_LATCH);
+
+		dict_stats_save(table);
+
+		DBUG_VOID_RETURN;
+	}
+
+	if (want_persistent) {
+		/* The persistent storage is not present or is corrupted:
+		say so and continue with transient stats below */
+		char	buf_table[MAX_FULL_NAME_LEN];
+		char	buf_index[MAX_FULL_NAME_LEN];
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Recalculation of persistent statistics "
+			"requested for table %s index %s but the required "
+			"persistent statistics storage is not present or is "
+			"corrupted. Using transient stats instead.\n",
+			ut_format_name(table->name, TRUE,
+				       buf_table, sizeof(buf_table)),
+			ut_format_name(index->name, FALSE,
+				       buf_index, sizeof(buf_index)));
+	}
+
+	dict_table_stats_lock(table, RW_X_LATCH);
+	dict_stats_update_transient_for_index(index);
+	dict_table_stats_unlock(table, RW_X_LATCH);
+
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+If the .ibd file of the table is missing, or if srv_force_recovery is
+SRV_FORCE_NO_IBUF_MERGE or higher, the stats are only reset with
+dict_stats_empty_table() and nothing is calculated. Otherwise the action
+depends on stats_upd_option:
+- DICT_STATS_RECALC_PERSISTENT: recalculate the persistent stats and save
+  them; transient stats are used instead in read-only mode or if the
+  persistent storage is not usable;
+- DICT_STATS_RECALC_TRANSIENT: recalculate the transient stats;
+- DICT_STATS_EMPTY_TABLE: reset the stats and, if the table uses persistent
+  stats, save the reset values;
+- DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: do nothing if
+  table->stat_initialized is set, otherwise fetch the stats from the
+  persistent storage; if they cannot be fetched, either recalculate the
+  persistent stats or use transient stats.
+@return DB_SUCCESS or error code; DB_TABLESPACE_DELETED if the .ibd file
+is missing; DB_SUCCESS whenever transient stats end up being used */
+UNIV_INTERN
+dberr_t
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option)
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent statistics
+					storage */
+{
+	char			buf[MAX_FULL_NAME_LEN];
+
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	if (table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: cannot calculate statistics for table %s "
+			"because the .ibd file is missing. For help, please "
+			"refer to " REFMAN "innodb-troubleshooting.html\n",
+			ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+		dict_stats_empty_table(table);
+		return(DB_TABLESPACE_DELETED);
+	} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+		/* If we have set a high innodb_force_recovery level, do
+		not calculate statistics, as a badly corrupted index can
+		cause a crash in it. */
+		dict_stats_empty_table(table);
+		return(DB_SUCCESS);
+	}
+
+	switch (stats_upd_option) {
+	case DICT_STATS_RECALC_PERSISTENT:
+
+		if (srv_read_only_mode) {
+			goto transient;
+		}
+
+		/* Persistent recalculation requested, called from
+		1) ANALYZE TABLE, or
+		2) the auto recalculation background thread, or
+		3) open table if stats do not exist on disk and auto recalc
+		   is enabled */
+
+		/* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+		persistent stats enabled */
+		ut_a(strchr(table->name, '/') != NULL);
+
+		/* check if the persistent statistics storage exists
+		before calling the potentially slow function
+		dict_stats_update_persistent(); that is a
+		prerequisite for dict_stats_save() succeeding */
+		if (dict_stats_persistent_storage_check(false)) {
+
+			dberr_t	err;
+
+			err = dict_stats_update_persistent(table);
+
+			if (err != DB_SUCCESS) {
+				return(err);
+			}
+
+			err = dict_stats_save(table);
+
+			return(err);
+		}
+
+		/* Fall back to transient stats since the persistent
+		storage is not present or is corrupted */
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Recalculation of persistent statistics "
+			"requested for table %s but the required persistent "
+			"statistics storage is not present or is corrupted. "
+			"Using transient stats instead.\n",
+			ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+
+		goto transient;
+
+	case DICT_STATS_RECALC_TRANSIENT:
+
+		goto transient;
+
+	case DICT_STATS_EMPTY_TABLE:
+
+		dict_stats_empty_table(table);
+
+		/* If table is using persistent stats,
+		then save the stats on disk */
+
+		if (dict_stats_is_persistent_enabled(table)) {
+
+			if (dict_stats_persistent_storage_check(false)) {
+
+				return(dict_stats_save(table));
+			}
+
+			return(DB_STATS_DO_NOT_EXIST);
+		}
+
+		return(DB_SUCCESS);
+
+	case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY:
+
+		/* fetch requested, either fetch from persistent statistics
+		storage or use the old method */
+
+		if (table->stat_initialized) {
+			return(DB_SUCCESS);
+		}
+
+		/* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+		persistent stats enabled */
+		ut_a(strchr(table->name, '/') != NULL);
+
+		if (!dict_stats_persistent_storage_check(false)) {
+			/* persistent statistics storage does not exist
+			or is corrupted, calculate the transient stats */
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error: Fetch of persistent "
+				"statistics requested for table %s but the "
+				"required system tables %s and %s are not "
+				"present or have unexpected structure. "
+				"Using transient stats instead.\n",
+				ut_format_name(table->name, TRUE,
+					       buf, sizeof(buf)),
+				TABLE_STATS_NAME_PRINT,
+				INDEX_STATS_NAME_PRINT);
+
+			goto transient;
+		}
+
+		dict_table_t*	t;
+
+		/* Create a dummy table object with the same name and
+		indexes, suitable for fetching the stats into it; the real
+		table object is updated from it only in the DB_SUCCESS case
+		below, and the dummy is freed in every case. */
+		t = dict_stats_table_clone_create(table);
+
+		dberr_t	err = dict_stats_fetch_from_ps(t);
+
+		t->stats_last_recalc = table->stats_last_recalc;
+		t->stat_modified_counter = 0;
+
+		switch (err) {
+		case DB_SUCCESS:
+
+			dict_table_stats_lock(table, RW_X_LATCH);
+
+			/* Initialize all stats to dummy values before
+			copying because dict_stats_table_clone_create() does
+			skip corrupted indexes so our dummy object 't' may
+			have less indexes than the real object 'table'. */
+			dict_stats_empty_table(table);
+
+			dict_stats_copy(table, t);
+
+			dict_stats_assert_initialized(table);
+
+			dict_table_stats_unlock(table, RW_X_LATCH);
+
+			dict_stats_table_clone_free(t);
+
+			return(DB_SUCCESS);
+		case DB_STATS_DO_NOT_EXIST:
+
+			dict_stats_table_clone_free(t);
+
+			if (srv_read_only_mode) {
+				goto transient;
+			}
+
+			if (dict_stats_auto_recalc_is_enabled(table)) {
+				return(dict_stats_update(
+						table,
+						DICT_STATS_RECALC_PERSISTENT));
+			}
+
+			ut_format_name(table->name, TRUE, buf, sizeof(buf));
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Trying to use table %s which has "
+				"persistent statistics enabled, but auto "
+				"recalculation turned off and the statistics "
+				"do not exist in %s and %s. Please either run "
+				"\"ANALYZE TABLE %s;\" manually or enable the "
+				"auto recalculation with "
+				"\"ALTER TABLE %s STATS_AUTO_RECALC=1;\". "
+				"InnoDB will now use transient statistics for "
+				"%s.\n",
+				buf, TABLE_STATS_NAME, INDEX_STATS_NAME, buf,
+				buf, buf);
+
+			goto transient;
+		default:
+
+			dict_stats_table_clone_free(t);
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error fetching persistent statistics "
+				"for table %s from %s and %s: %s. "
+				"Using transient stats method instead.\n",
+				ut_format_name(table->name, TRUE, buf,
+					       sizeof(buf)),
+				TABLE_STATS_NAME,
+				INDEX_STATS_NAME,
+				ut_strerr(err));
+
+			goto transient;
+		}
+	/* no "default:" in order to produce a compilation warning
+	about unhandled enumeration value */
+	}
+
+transient:
+
+	dict_table_stats_lock(table, RW_X_LATCH);
+
+	dict_stats_update_transient(table);
+
+	dict_table_stats_unlock(table, RW_X_LATCH);
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Deletes the rows that describe one index from the persistent index
+statistics, if the persistent storage exists. Nothing is done for tables
+whose name does not contain a database name (e.g. SYS_TABLES).
+This function creates its own trx and commits it.
+A note from Marko why we cannot edit user and sys_* tables in one trx:
+marko: The problem is that ibuf merges should be disabled while we are
+rolling back dict transactions.
+marko: If ibuf merges are not disabled, we need to scan the *.ibd files.
+But we shouldn't open *.ibd files before we have rolled back dict
+transactions and opened the SYS_* records for the *.ibd files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+	const char*	db_and_table,/*!< in: db and table, e.g. 'db/table' */
+	const char*	iname,	/*!< in: index name */
+	char*		errstr, /*!< out: error message if != DB_SUCCESS
+				is returned */
+	ulint		errstr_sz)/*!< in: size of the errstr buffer */
+{
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	if (strchr(db_and_table, '/') == NULL) {
+		/* no database name in the table name, e.g. an index of
+		SYS_TABLES is being dropped: there is nothing to delete */
+		return(DB_SUCCESS);
+	}
+
+	char	db_utf8[MAX_DB_UTF8_LEN];
+	char	table_utf8[MAX_TABLE_UTF8_LEN];
+
+	dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+		     table_utf8, sizeof(table_utf8));
+
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "database_name", db_utf8);
+	pars_info_add_str_literal(info, "table_name", table_utf8);
+	pars_info_add_str_literal(info, "index_name", iname);
+
+	/* the DELETE is executed while holding both dictionary locks */
+	rw_lock_x_lock(&dict_operation_lock);
+	mutex_enter(&dict_sys->mutex);
+
+	const dberr_t	err = dict_stats_exec_sql(
+		info,
+		"PROCEDURE DROP_INDEX_STATS () IS\n"
+		"BEGIN\n"
+		"DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+		"database_name = :database_name AND\n"
+		"table_name = :table_name AND\n"
+		"index_name = :index_name;\n"
+		"END;\n");
+
+	mutex_exit(&dict_sys->mutex);
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST) {
+		/* missing stats are as good as deleted stats */
+		return(DB_SUCCESS);
+	}
+
+	const char*	why = (err == DB_LOCK_WAIT_TIMEOUT)
+		? " because the rows are locked"
+		: "";
+
+	ut_snprintf(errstr, errstr_sz,
+		    "Unable to delete statistics for index %s "
+		    "from %s%s: %s. They can be deleted later using "
+		    "DELETE FROM %s WHERE "
+		    "database_name = '%s' AND "
+		    "table_name = '%s' AND "
+		    "index_name = '%s';",
+		    iname,
+		    INDEX_STATS_NAME_PRINT,
+		    why,
+		    ut_strerr(err),
+		    INDEX_STATS_NAME_PRINT,
+		    db_utf8,
+		    table_utf8,
+		    iname);
+
+	/* the message is both handed back in errstr and logged */
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: %s\n", errstr);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Deletes the row of one table from the persistent table statistics by
+executing
+DELETE FROM mysql.innodb_table_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+The caller must hold dict_sys->mutex (and dict_operation_lock in exclusive
+mode, which is checked only in UNIV_SYNC_DEBUG builds).
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_table_stats(
+/*===============================*/
+	const char*	database_name,	/*!< in: database name, e.g. 'db' */
+	const char*	table_name)	/*!< in: table name, e.g. 'table' */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* bind the values that the procedure text refers to as
+	:database_name and :table_name */
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "database_name", database_name);
+	pars_info_add_str_literal(info, "table_name", table_name);
+
+	return(dict_stats_exec_sql(
+		info,
+		"PROCEDURE DELETE_FROM_TABLE_STATS () IS\n"
+		"BEGIN\n"
+		"DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n"
+		"database_name = :database_name AND\n"
+		"table_name = :table_name;\n"
+		"END;\n"));
+}
+
+/*********************************************************************//**
+Deletes the rows of all indexes of one table from the persistent index
+statistics by executing
+DELETE FROM mysql.innodb_index_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+The caller must hold dict_sys->mutex (and dict_operation_lock in exclusive
+mode, which is checked only in UNIV_SYNC_DEBUG builds).
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_index_stats(
+/*===============================*/
+	const char*	database_name,	/*!< in: database name, e.g. 'db' */
+	const char*	table_name)	/*!< in: table name, e.g. 'table' */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* bind the values that the procedure text refers to as
+	:database_name and :table_name */
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "database_name", database_name);
+	pars_info_add_str_literal(info, "table_name", table_name);
+
+	return(dict_stats_exec_sql(
+		info,
+		"PROCEDURE DELETE_FROM_INDEX_STATS () IS\n"
+		"BEGIN\n"
+		"DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+		"database_name = :database_name AND\n"
+		"table_name = :table_name;\n"
+		"END;\n"));
+}
+
+/*********************************************************************//**
+Deletes the statistics of a table and of all of its indexes from the
+persistent statistics storage, if the storage exists. Tables whose name
+has no database part, and the two statistics tables themselves, are left
+alone. This function creates its own transaction and commits it.
+The caller must hold dict_sys->mutex (and dict_operation_lock in exclusive
+mode, which is checked only in UNIV_SYNC_DEBUG builds).
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+	const char*	db_and_table,	/*!< in: db and table, e.g. 'db/table' */
+	char*		errstr,		/*!< out: error message
+					if != DB_SUCCESS is returned */
+	ulint		errstr_sz)	/*!< in: size of errstr buffer */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* Nothing to do for
+	- tables without a database name, e.g. SYS_TABLES;
+	- innodb_table_stats and innodb_index_stats themselves */
+	if (strchr(db_and_table, '/') == NULL
+	    || strcmp(db_and_table, TABLE_STATS_NAME) == 0
+	    || strcmp(db_and_table, INDEX_STATS_NAME) == 0) {
+
+		return(DB_SUCCESS);
+	}
+
+	char	db_utf8[MAX_DB_UTF8_LEN];
+	char	table_utf8[MAX_TABLE_UTF8_LEN];
+
+	dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+		     table_utf8, sizeof(table_utf8));
+
+	/* the index stats are deleted only after the table stats have
+	been deleted successfully */
+	dberr_t	err = dict_stats_delete_from_table_stats(db_utf8, table_utf8);
+
+	if (err == DB_SUCCESS) {
+		err = dict_stats_delete_from_index_stats(db_utf8, table_utf8);
+	}
+
+	if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST) {
+		/* missing stats are as good as deleted stats */
+		return(DB_SUCCESS);
+	}
+
+	ut_snprintf(errstr, errstr_sz,
+		    "Unable to delete statistics for table %s.%s: %s. "
+		    "They can be deleted later using "
+
+		    "DELETE FROM %s WHERE "
+		    "database_name = '%s' AND "
+		    "table_name = '%s'; "
+
+		    "DELETE FROM %s WHERE "
+		    "database_name = '%s' AND "
+		    "table_name = '%s';",
+
+		    db_utf8, table_utf8,
+		    ut_strerr(err),
+
+		    INDEX_STATS_NAME_PRINT,
+		    db_utf8, table_utf8,
+
+		    TABLE_STATS_NAME_PRINT,
+		    db_utf8, table_utf8);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Gives the row of one table in the persistent table statistics a new
+database name and table name by executing
+UPDATE mysql.innodb_table_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+The caller must hold dict_sys->mutex (and dict_operation_lock in exclusive
+mode, which is checked only in UNIV_SYNC_DEBUG builds).
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_in_table_stats(
+/*=============================*/
+	const char*	old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+	const char*	old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+	const char*	new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+	const char*	new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* every parameter is bound under its own name, which is how the
+	procedure text refers to it */
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "old_dbname_utf8", old_dbname_utf8);
+	pars_info_add_str_literal(info, "old_tablename_utf8",
+				  old_tablename_utf8);
+	pars_info_add_str_literal(info, "new_dbname_utf8", new_dbname_utf8);
+	pars_info_add_str_literal(info, "new_tablename_utf8",
+				  new_tablename_utf8);
+
+	return(dict_stats_exec_sql(
+		info,
+		"PROCEDURE RENAME_IN_TABLE_STATS () IS\n"
+		"BEGIN\n"
+		"UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+		"database_name = :new_dbname_utf8,\n"
+		"table_name = :new_tablename_utf8\n"
+		"WHERE\n"
+		"database_name = :old_dbname_utf8 AND\n"
+		"table_name = :old_tablename_utf8;\n"
+		"END;\n"));
+}
+
+/*********************************************************************//**
+Gives the rows of one table in the persistent index statistics a new
+database name and table name by executing
+UPDATE mysql.innodb_index_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+The caller must hold dict_sys->mutex (and dict_operation_lock in exclusive
+mode, which is checked only in UNIV_SYNC_DEBUG builds).
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_in_index_stats(
+/*=============================*/
+	const char*	old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+	const char*	old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+	const char*	new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+	const char*	new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* every parameter is bound under its own name, which is how the
+	procedure text refers to it */
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "old_dbname_utf8", old_dbname_utf8);
+	pars_info_add_str_literal(info, "old_tablename_utf8",
+				  old_tablename_utf8);
+	pars_info_add_str_literal(info, "new_dbname_utf8", new_dbname_utf8);
+	pars_info_add_str_literal(info, "new_tablename_utf8",
+				  new_tablename_utf8);
+
+	return(dict_stats_exec_sql(
+		info,
+		"PROCEDURE RENAME_IN_INDEX_STATS () IS\n"
+		"BEGIN\n"
+		"UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+		"database_name = :new_dbname_utf8,\n"
+		"table_name = :new_tablename_utf8\n"
+		"WHERE\n"
+		"database_name = :old_dbname_utf8 AND\n"
+		"table_name = :old_tablename_utf8;\n"
+		"END;\n"));
+}
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+The row in the table stats is renamed first and, only if that succeeded,
+the rows in the index stats. Each of the two renames is attempted up to
+5 times as long as it fails with DB_DEADLOCK, DB_DUPLICATE_KEY or
+DB_LOCK_WAIT_TIMEOUT; on DB_DUPLICATE_KEY the rows stored under the new
+name are deleted before the next attempt. DB_STATS_DO_NOT_EXIST is
+treated as success. dict_operation_lock and dict_sys->mutex are acquired
+here and are released while sleeping between the attempts.
+NOTE(review): the 0.2 sec sleep is done after every failed attempt, also
+after the last one and after errors that are not retried.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+	const char*	old_name,	/*!< in: old name, e.g. 'db/table' */
+	const char*	new_name,	/*!< in: new name, e.g. 'db/table' */
+	char*		errstr,		/*!< out: error string if != DB_SUCCESS
+					is returned */
+	size_t		errstr_sz)	/*!< in: errstr size */
+{
+	char		old_db_utf8[MAX_DB_UTF8_LEN];
+	char		new_db_utf8[MAX_DB_UTF8_LEN];
+	char		old_table_utf8[MAX_TABLE_UTF8_LEN];
+	char		new_table_utf8[MAX_TABLE_UTF8_LEN];
+	dberr_t		ret;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	/* skip innodb_table_stats and innodb_index_stats themselves,
+	no matter whether they are the old or the new name */
+	if (strcmp(old_name, TABLE_STATS_NAME) == 0
+	    || strcmp(old_name, INDEX_STATS_NAME) == 0
+	    || strcmp(new_name, TABLE_STATS_NAME) == 0
+	    || strcmp(new_name, INDEX_STATS_NAME) == 0) {
+
+		return(DB_SUCCESS);
+	}
+
+	dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8),
+		     old_table_utf8, sizeof(old_table_utf8));
+
+	dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8),
+		     new_table_utf8, sizeof(new_table_utf8));
+
+	rw_lock_x_lock(&dict_operation_lock);
+	mutex_enter(&dict_sys->mutex);
+
+	ulint	n_attempts = 0;
+	do {
+		n_attempts++;
+
+		ret = dict_stats_rename_in_table_stats(
+			old_db_utf8, old_table_utf8,
+			new_db_utf8, new_table_utf8);
+
+		if (ret == DB_DUPLICATE_KEY) {
+			dict_stats_delete_from_table_stats(
+				new_db_utf8, new_table_utf8);
+		}
+
+		if (ret == DB_STATS_DO_NOT_EXIST) {
+			ret = DB_SUCCESS;
+		}
+
+		if (ret != DB_SUCCESS) {
+			mutex_exit(&dict_sys->mutex);
+			rw_lock_x_unlock(&dict_operation_lock);
+			os_thread_sleep(200000 /* 0.2 sec */);
+			rw_lock_x_lock(&dict_operation_lock);
+			mutex_enter(&dict_sys->mutex);
+		}
+	} while ((ret == DB_DEADLOCK
+		  || ret == DB_DUPLICATE_KEY
+		  || ret == DB_LOCK_WAIT_TIMEOUT)
+		 && n_attempts < 5);
+
+	if (ret != DB_SUCCESS) {
+		ut_snprintf(errstr, errstr_sz,
+			    "Unable to rename statistics from "
+			    "%s.%s to %s.%s in %s: %s. "
+			    "They can be renamed later using "
+
+			    "UPDATE %s SET "
+			    "database_name = '%s', "
+			    "table_name = '%s' "
+			    "WHERE "
+			    "database_name = '%s' AND "
+			    "table_name = '%s';",
+
+			    old_db_utf8, old_table_utf8,
+			    new_db_utf8, new_table_utf8,
+			    TABLE_STATS_NAME_PRINT,
+			    ut_strerr(ret),
+
+			    TABLE_STATS_NAME_PRINT,
+			    new_db_utf8, new_table_utf8,
+			    old_db_utf8, old_table_utf8);
+		mutex_exit(&dict_sys->mutex);
+		rw_lock_x_unlock(&dict_operation_lock);
+		return(ret);
+	}
+	/* else: the rename in the table stats succeeded (or the stats do
+	not exist); do the same for the index stats, with the same retry
+	policy */
+
+	n_attempts = 0;
+	do {
+		n_attempts++;
+
+		ret = dict_stats_rename_in_index_stats(
+			old_db_utf8, old_table_utf8,
+			new_db_utf8, new_table_utf8);
+
+		if (ret == DB_DUPLICATE_KEY) {
+			dict_stats_delete_from_index_stats(
+				new_db_utf8, new_table_utf8);
+		}
+
+		if (ret == DB_STATS_DO_NOT_EXIST) {
+			ret = DB_SUCCESS;
+		}
+
+		if (ret != DB_SUCCESS) {
+			mutex_exit(&dict_sys->mutex);
+			rw_lock_x_unlock(&dict_operation_lock);
+			os_thread_sleep(200000 /* 0.2 sec */);
+			rw_lock_x_lock(&dict_operation_lock);
+			mutex_enter(&dict_sys->mutex);
+		}
+	} while ((ret == DB_DEADLOCK
+		  || ret == DB_DUPLICATE_KEY
+		  || ret == DB_LOCK_WAIT_TIMEOUT)
+		 && n_attempts < 5);
+
+	mutex_exit(&dict_sys->mutex);
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	if (ret != DB_SUCCESS) {
+		ut_snprintf(errstr, errstr_sz,
+			    "Unable to rename statistics from "
+			    "%s.%s to %s.%s in %s: %s. "
+			    "They can be renamed later using "
+
+			    "UPDATE %s SET "
+			    "database_name = '%s', "
+			    "table_name = '%s' "
+			    "WHERE "
+			    "database_name = '%s' AND "
+			    "table_name = '%s';",
+
+			    old_db_utf8, old_table_utf8,
+			    new_db_utf8, new_table_utf8,
+			    INDEX_STATS_NAME_PRINT,
+			    ut_strerr(ret),
+
+			    INDEX_STATS_NAME_PRINT,
+			    new_db_utf8, new_table_utf8,
+			    old_db_utf8, old_table_utf8);
+	}
+
+	return(ret);
+}
+
+/* tests @{ */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* The following unit tests test some of the functions in this file
+individually, such testing cannot be performed by the mysql-test framework
+via SQL. */
+
+/* test_dict_table_schema_check() @{ */
+void
+test_dict_table_schema_check()
+{
+	/* Exercises dict_table_schema_check() against a table created with:
+	CREATE TABLE tcheck (
+		c01 VARCHAR(123),
+		c02 INT,
+		c03 INT NOT NULL,
+		c04 INT UNSIGNED,
+		c05 BIGINT,
+		c06 BIGINT UNSIGNED NOT NULL,
+		c07 TIMESTAMP
+	) ENGINE=INNODB;
+	*/
+	/* definition for the table 'test/tcheck' */
+	dict_col_meta_t	columns[] = {
+		{"c01", DATA_VARCHAR, 0, 123},
+		{"c02", DATA_INT, 0, 4},
+		{"c03", DATA_INT, DATA_NOT_NULL, 4},
+		{"c04", DATA_INT, DATA_UNSIGNED, 4},
+		{"c05", DATA_INT, 0, 8},
+		{"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+		{"c07", DATA_INT, 0, 4},
+		{"c_extra", DATA_INT, 0, 4}	/* not part of tcheck */
+	};
+	dict_table_schema_t	schema = {
+		"test/tcheck",
+		0 /* will be set individually for each test below */,
+		columns
+	};
+	char	errstr[512];
+
+	ut_snprintf(errstr, sizeof(errstr), "Table not found");
+
+	/* prevent any data dictionary modifications while we are checking
+	the tables' structure */
+
+	mutex_enter(&(dict_sys->mutex));
+
+	/* check that a valid table is reported as valid */
+	schema.n_cols = 7;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    == DB_SUCCESS) {
+		printf("OK: test.tcheck ok\n");
+	} else {
+		printf("ERROR: %s\n", errstr);
+		printf("ERROR: test.tcheck not present or corrupted\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+	/* check columns with wrong length (c02 is declared with len 4) */
+	schema.columns[1].len = 8;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck.c02 has different length and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck.c02 has different length but is "
+		       "reported as ok\n");
+		goto test_dict_table_schema_check_end;
+	}
+	schema.columns[1].len = 4;	/* restore the correct length */
+
+	/* request that c02 is NOT NULL while actually it does not have
+	this flag set */
+	schema.columns[1].prtype_mask |= DATA_NOT_NULL;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is not reported as corrupted\n");
+		goto test_dict_table_schema_check_end;
+	}
+	schema.columns[1].prtype_mask &= ~DATA_NOT_NULL; /* restore the mask */
+
+	/* check a table that contains some extra columns */
+	schema.n_cols = 6;	/* schema lists fewer columns than tcheck */
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    == DB_SUCCESS) {
+		printf("ERROR: test.tcheck has more columns but is not "
+		       "reported as corrupted\n");
+		goto test_dict_table_schema_check_end;
+	} else {
+		printf("OK: test.tcheck has more columns and is "
+		       "reported as corrupted\n");
+	}
+
+	/* check a table that has some columns missing */
+	schema.n_cols = 8;	/* now includes c_extra */
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck has missing columns and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck has missing columns but is "
+		       "reported as ok\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+	/* check non-existent table */
+	schema.table_name = "test/tcheck_nonexistent";
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck_nonexistent is not present\n");
+	} else {
+		printf("ERROR: test.tcheck_nonexistent is present!?\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+test_dict_table_schema_check_end:
+
+	mutex_exit(&(dict_sys->mutex));
+}
+/* @} */
+
+/* save/fetch aux macros @{ */
+#define TEST_DATABASE_NAME		"foobardb"
+#define TEST_TABLE_NAME			"test_dict_stats"
+
+#define TEST_N_ROWS			111
+#define TEST_CLUSTERED_INDEX_SIZE	222
+#define TEST_SUM_OF_OTHER_INDEX_SIZES	333
+
+#define TEST_IDX1_NAME			"tidx1"
+#define TEST_IDX1_COL1_NAME		"tidx1_col1"
+#define TEST_IDX1_INDEX_SIZE		123
+#define TEST_IDX1_N_LEAF_PAGES		234
+#define TEST_IDX1_N_DIFF1		50
+#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE	500
+
+#define TEST_IDX2_NAME			"tidx2"
+#define TEST_IDX2_COL1_NAME		"tidx2_col1"
+#define TEST_IDX2_COL2_NAME		"tidx2_col2"
+#define TEST_IDX2_COL3_NAME		"tidx2_col3"
+#define TEST_IDX2_COL4_NAME		"tidx2_col4"
+#define TEST_IDX2_INDEX_SIZE		321
+#define TEST_IDX2_N_LEAF_PAGES		432
+#define TEST_IDX2_N_DIFF1		60
+#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE	600
+#define TEST_IDX2_N_DIFF2		61
+#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE	610
+#define TEST_IDX2_N_DIFF3		62
+#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE	620
+#define TEST_IDX2_N_DIFF4		63
+#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE	630
+/* @} */
+
+/* test_dict_stats_save() @{ */
+void
+test_dict_stats_save()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	dict_field_t	index1_fields[1];
+	ib_uint64_t	index1_stat_n_diff_key_vals[1];
+	ib_uint64_t	index1_stat_n_sample_sizes[1];
+	dict_index_t	index2;
+	dict_field_t	index2_fields[4];
+	ib_uint64_t	index2_stat_n_diff_key_vals[4];
+	ib_uint64_t	index2_stat_n_sample_sizes[4];
+	dberr_t		ret;
+
+	/* craft a dummy dict_table_t; only the members below are assigned */
+	table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+	table.stat_n_rows = TEST_N_ROWS;
+	table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE;
+	table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES;
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+	ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+	ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+
+	index1.name = TEST_IDX1_NAME;	/* dummy index on one column */
+	index1.table = &table;
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.fields = index1_fields;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+	index1.stat_index_size = TEST_IDX1_INDEX_SIZE;
+	index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES;
+	index1_fields[0].name = TEST_IDX1_COL1_NAME;
+	index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1;
+	index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE;
+
+	ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+	index2.name = TEST_IDX2_NAME;	/* dummy index on four columns */
+	index2.table = &table;
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.fields = index2_fields;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+	index2.stat_index_size = TEST_IDX2_INDEX_SIZE;
+	index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES;
+	index2_fields[0].name = TEST_IDX2_COL1_NAME;
+	index2_fields[1].name = TEST_IDX2_COL2_NAME;
+	index2_fields[2].name = TEST_IDX2_COL3_NAME;
+	index2_fields[3].name = TEST_IDX2_COL4_NAME;
+	index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1;
+	index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2;
+	index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3;
+	index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4;
+	index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE;
+
+	ret = dict_stats_save(&table);	/* the function under test */
+
+	ut_a(ret == DB_SUCCESS);
+
+	printf("\nOK: stats saved successfully, now go ahead and read "
+	       "what's inside %s and %s:\n\n",
+	       TABLE_STATS_NAME_PRINT,
+	       INDEX_STATS_NAME_PRINT);
+
+	printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "n_rows = %d AND\n"
+	       "clustered_index_size = %d AND\n"
+	       "sum_of_other_index_sizes = %d;\n"
+	       "\n",
+	       TABLE_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_N_ROWS,
+	       TEST_CLUSTERED_INDEX_SIZE,
+	       TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"	/* 3 rows: size, n_leaf_pages, n_diff_pfx01 */
+	       " (stat_name = 'size' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s')\n"
+	       ");\n"
+	       "\n",
+	       INDEX_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX1_NAME,
+	       TEST_IDX1_INDEX_SIZE,
+	       TEST_IDX1_N_LEAF_PAGES,
+	       TEST_IDX1_N_DIFF1,
+	       TEST_IDX1_N_DIFF1_SAMPLE_SIZE,
+	       TEST_IDX1_COL1_NAME);
+
+	printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"	/* 6 rows: size, n_leaf_pages, n_diff_pfx01..04 */
+	       " (stat_name = 'size' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s') OR\n"
+	       " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n"
+	       ");\n"
+	       "\n",
+	       INDEX_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX2_NAME,
+	       TEST_IDX2_INDEX_SIZE,
+	       TEST_IDX2_N_LEAF_PAGES,
+	       TEST_IDX2_N_DIFF1,
+	       TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME,
+	       TEST_IDX2_N_DIFF2,
+	       TEST_IDX2_N_DIFF2_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME,
+	       TEST_IDX2_N_DIFF3,
+	       TEST_IDX2_N_DIFF3_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+	       TEST_IDX2_N_DIFF4,
+	       TEST_IDX2_N_DIFF4_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+	       TEST_IDX2_COL4_NAME);
+}
+/* @} */
+
+/* test_dict_stats_fetch_from_ps() @{ */
+void
+test_dict_stats_fetch_from_ps()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	ib_uint64_t	index1_stat_n_diff_key_vals[1];
+	ib_uint64_t	index1_stat_n_sample_sizes[1];
+	dict_index_t	index2;
+	ib_uint64_t	index2_stat_n_diff_key_vals[4];
+	ib_uint64_t	index2_stat_n_sample_sizes[4];
+	dberr_t		ret;
+
+	/* craft a dummy dict_table_t; its stat members are not set here */
+	table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+	ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+
+	index1.name = TEST_IDX1_NAME;	/* same name as in the save test */
+	ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+
+	index2.name = TEST_IDX2_NAME;	/* same name as in the save test */
+	ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+
+	ret = dict_stats_fetch_from_ps(&table);	/* the function under test */
+
+	ut_a(ret == DB_SUCCESS);
+
+	ut_a(table.stat_n_rows == TEST_N_ROWS);	/* table-level stats */
+	ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
+	ut_a(table.stat_sum_of_other_index_sizes
+	     == TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE); /* tidx1 stats */
+	ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
+	ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1);
+	ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);
+
+	ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE); /* tidx2 stats */
+	ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
+	ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1);
+	ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2);
+	ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3);
+	ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4);
+	ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);
+
+	printf("OK: fetch successful\n");
+}
+/* @} */
+
+/* test_dict_stats_all() @{ */
+void
+test_dict_stats_all()
+{
+	test_dict_table_schema_check();	/* prints OK:/ERROR: lines */
+
+	test_dict_stats_save();	/* writes the TEST_* values */
+
+	test_dict_stats_fetch_from_ps();	/* asserts the same TEST_* values */
+}
+/* @} */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+/* @} */
+
+#endif /* UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0stats_bg.cc b/storage/xtradb/dict/dict0stats_bg.cc
new file mode 100644
index 00000000000..9e1f75a13a9
--- /dev/null
+++ b/storage/xtradb/dict/dict0stats_bg.cc
@@ -0,0 +1,367 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats_bg.cc
+Code used for background table and index stats gathering.
+
+Created Apr 25, 2012 Vasil Dimov
+*******************************************************/
+
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+
+#ifdef UNIV_NONINL
+# include "dict0stats_bg.ic"
+#endif
+
+#include <vector>
+
+/** Minimum time interval between stats recalc for a given table */
+#define MIN_RECALC_INTERVAL	10 /* seconds */
+
+#define SHUTTING_DOWN()		(srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/** Event to wake up the stats thread */
+UNIV_INTERN os_event_t		dict_stats_event = NULL;
+
+/** This mutex protects the "recalc_pool" variable. */
+static ib_mutex_t		recalc_pool_mutex;
+#ifdef HAVE_PSI_INTERFACE
+static mysql_pfs_key_t		recalc_pool_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+
+/** The number of tables that can be added to "recalc_pool" before
+it is enlarged */
+static const ulint RECALC_POOL_INITIAL_SLOTS = 128;
+
+/** The multitude of tables whose stats are to be automatically
+recalculated - an STL vector */
+typedef std::vector<table_id_t>	recalc_pool_t;
+static recalc_pool_t		recalc_pool;
+
+typedef recalc_pool_t::iterator	recalc_pool_iterator_t;
+
+/*****************************************************************//**
+Initialize the recalc pool, called once during thread initialization. */
+static
+void
+dict_stats_recalc_pool_init()
+/*=========================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); /* capacity only */
+}
+
+/*****************************************************************//**
+Free the resources occupied by the recalc pool, called once during
+thread de-initialization. */
+static
+void
+dict_stats_recalc_pool_deinit()
+/*===========================*/
+{
+	ut_ad(!srv_read_only_mode);
+	/* clear() would keep the buffer; swap with a temporary frees it */
+	recalc_pool_t().swap(recalc_pool);
+}
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_add(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table to add */
+{
+	ut_ad(!srv_read_only_mode);
+
+	mutex_enter(&recalc_pool_mutex);
+
+	/* quit if already in the list (linear scan, no event is set) */
+	for (recalc_pool_iterator_t iter = recalc_pool.begin();
+	     iter != recalc_pool.end();
+	     ++iter) {
+
+		if (*iter == table->id) {
+			mutex_exit(&recalc_pool_mutex);
+			return;
+		}
+	}
+
+	recalc_pool.push_back(table->id);	/* append at the back */
+
+	mutex_exit(&recalc_pool_mutex);
+
+	os_event_set(dict_stats_event);	/* signaled after the mutex release */
+}
+
+/*****************************************************************//**
+Get a table from the auto recalc pool. The returned table id is removed
+from the pool.
+@return true if the pool was non-empty and "id" was set, false otherwise */
+static
+bool
+dict_stats_recalc_pool_get(
+/*=======================*/
+	table_id_t*	id)	/*!< out: table id, or unmodified if list is
+				empty */
+{
+	ut_ad(!srv_read_only_mode);
+
+	mutex_enter(&recalc_pool_mutex);
+
+	if (recalc_pool.empty()) {
+		mutex_exit(&recalc_pool_mutex);
+		return(false);
+	}
+
+	*id = recalc_pool[0];	/* front element; add() appends at the back */
+
+	recalc_pool.erase(recalc_pool.begin());	/* remove that front element */
+
+	mutex_exit(&recalc_pool_mutex);
+
+	return(true);
+}
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool, if it is present there.
+The caller is expected to own dict_sys->mutex (asserted below). */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table to remove */
+{
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	mutex_enter(&recalc_pool_mutex);
+
+	ut_ad(table->id > 0);
+
+	for (recalc_pool_iterator_t iter = recalc_pool.begin();
+	     iter != recalc_pool.end();
+	     ++iter) {
+
+		if (*iter == table->id) {
+			/* erase() invalidates the iterator, so stop here */
+			recalc_pool.erase(iter);
+			break;
+		}
+	}
+
+	mutex_exit(&recalc_pool_mutex);
+}
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary() and this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys->mutex. */
+UNIV_INTERN
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction to use for
+				unlocking/locking the data dict */
+{
+	while (!dict_stats_stop_bg(table)) {	/* retry until it succeeds */
+		DICT_STATS_BG_YIELD(trx); /* presumably the temporary unlock */
+	}
+}
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread()
+Must be called before dict_stats_thread() is started. */
+UNIV_INTERN
+void
+dict_stats_thread_init()
+/*====================*/
+{
+	ut_a(!srv_read_only_mode);
+
+	dict_stats_event = os_event_create();	/* wakes the stats thread */
+
+	/* The recalc_pool_mutex is acquired from:
+	1) the background stats gathering thread before any other latch
+	   and released without latching anything else in between (thus
+	   any level would do here)
+	2) from row_update_statistics_if_needed()
+	   and released without latching anything else in between. We know
+	   that dict_sys->mutex (SYNC_DICT) is not acquired when
+	   row_update_statistics_if_needed() is called and it may be acquired
+	   inside that function (thus a level <=SYNC_DICT would do).
+	3) from row_drop_table_for_mysql() after dict_sys->mutex (SYNC_DICT)
+	   and dict_operation_lock (SYNC_DICT_OPERATION) have been locked
+	   (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do)
+	So we choose SYNC_STATS_AUTO_RECALC to be a level below SYNC_DICT. */
+	mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
+		     SYNC_STATS_AUTO_RECALC);
+
+	dict_stats_recalc_pool_init();	/* needs the mutex and event above */
+}
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_thread_init(), must be called
+after dict_stats_thread() has exited. */
+UNIV_INTERN
+void
+dict_stats_thread_deinit()
+/*======================*/
+{
+	ut_a(!srv_read_only_mode);
+	ut_ad(!srv_dict_stats_thread_active);	/* the thread must be gone */
+
+	dict_stats_recalc_pool_deinit();
+
+	mutex_free(&recalc_pool_mutex);
+	memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); /* wipe */
+
+	os_event_free(dict_stats_event);
+	dict_stats_event = NULL;	/* back to its initial value */
+}
+
+/*****************************************************************//**
+Get the first table that has been added for auto recalc and eventually
+update its stats. */
+static
+void
+dict_stats_process_entry_from_recalc_pool()
+/*=======================================*/
+{
+	table_id_t	table_id;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* pop the first table from the auto recalc pool */
+	if (!dict_stats_recalc_pool_get(&table_id)) {
+		/* no tables for auto recalc */
+		return;
+	}
+
+	dict_table_t*	table;
+
+	mutex_enter(&dict_sys->mutex);
+
+	table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+	if (table == NULL) {
+		/* table does not exist, must have been DROPped
+		after its id was enqueued */
+		mutex_exit(&dict_sys->mutex);
+		return;
+	}
+
+	/* Check whether table is corrupted; if so, skip it entirely */
+	if (table->corrupted) {
+		dict_table_close(table, TRUE, FALSE);
+		mutex_exit(&dict_sys->mutex);
+		return;
+	}
+
+	table->stats_bg_flag = BG_STAT_IN_PROGRESS; /* set under the mutex */
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* ut_time() could be expensive, the current function
+	is called once every time a table has been changed more than 10% and
+	on a system with lots of small tables, this could become hot. If we
+	find out that this is a problem, then the check below could eventually
+	be replaced with something else, though a time interval is the natural
+	approach. */
+
+	if (ut_difftime(ut_time(), table->stats_last_recalc)
+	    < MIN_RECALC_INTERVAL) {
+
+		/* Stats were (re)calculated not long ago. To avoid
+		too frequent stats updates we put back the table on
+		the auto recalc list and do nothing. */
+
+		dict_stats_recalc_pool_add(table);
+
+	} else {
+
+		dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+	}
+
+	mutex_enter(&dict_sys->mutex);
+
+	table->stats_bg_flag = BG_STAT_NONE;	/* cleared under the mutex */
+
+	dict_table_close(table, TRUE, FALSE);	/* pairs with the open above */
+
+	mutex_exit(&dict_sys->mutex);
+}
+
+/*****************************************************************//**
+This is the thread for background stats gathering. It pops tables from
+the auto recalc list and processes them, eventually recalculating their
+statistics.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(dict_stats_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	ut_a(!srv_read_only_mode);
+
+	srv_dict_stats_thread_active = TRUE;
+
+	while (!SHUTTING_DOWN()) {
+
+		/* Wake up periodically even if not signaled. This is
+		because we may lose an event - if the below call to
+		dict_stats_process_entry_from_recalc_pool() puts the entry back
+		in the list, the os_event_set() will be lost by the subsequent
+		os_event_reset(). */
+		os_event_wait_time(
+			dict_stats_event, MIN_RECALC_INTERVAL * 1000000);
+
+		if (SHUTTING_DOWN()) {	/* re-check after the wait */
+			break;
+		}
+
+		dict_stats_process_entry_from_recalc_pool(); /* one entry */
+
+		os_event_reset(dict_stats_event);
+	}
+
+	srv_dict_stats_thread_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit instead of return(). */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/dyn/dyn0dyn.c b/storage/xtradb/dyn/dyn0dyn.cc
index d0f50ad0c32..3ef5297a7c9 100644
--- a/storage/xtradb/dyn/dyn0dyn.c
+++ b/storage/xtradb/dyn/dyn0dyn.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file dyn/dyn0dyn.c
+@file dyn/dyn0dyn.cc
 The dynamically allocated array
 
 Created 2/5/1996 Heikki Tuuri
@@ -55,7 +55,8 @@ dyn_array_add_block(
 
 	heap = arr->heap;
 
-	block = mem_heap_alloc(heap, sizeof(dyn_block_t));
+	block = static_cast<dyn_block_t*>(
+		mem_heap_alloc(heap, sizeof(dyn_block_t)));
 
 	block->used = 0;
 
diff --git a/storage/xtradb/eval/eval0eval.c b/storage/xtradb/eval/eval0eval.cc
index fc34ce83a0f..ccc54781102 100644
--- a/storage/xtradb/eval/eval0eval.c
+++ b/storage/xtradb/eval/eval0eval.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file eval/eval0eval.c
+@file eval/eval0eval.cc
 SQL evaluator: evaluates simple data structures, like expressions, in
 a query graph
 
@@ -32,6 +32,7 @@ Created 12/29/1997 Heikki Tuuri
 
 #include "data0data.h"
 #include "row0sel.h"
+#include "rem0cmp.h"
 
 /** The RND function seed */
 static ulint	eval_rnd	= 128367121;
@@ -41,6 +42,18 @@ eval_node_alloc_val_buf */
 
 static byte	eval_dummy;
 
+/*************************************************************************
+Gets the like node stored in a symbol node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+				/* out: the like_node member of node */
+	que_node_t*     node)   /* in: node, accessed as a sym_node_t */
+{
+	return(((sym_node_t*) node)->like_node);
+}
+
 /*****************************************************************//**
 Allocate a buffer from global dynamic memory for a value of a que_node.
 NOTE that this memory must be explicitly freed when the query graph is
@@ -65,7 +78,7 @@ eval_node_alloc_val_buf(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data && data != &eval_dummy) {
 		mem_free(data);
@@ -74,7 +87,7 @@ eval_node_alloc_val_buf(
 	if (size == 0) {
 		data = &eval_dummy;
 	} else {
-		data = mem_alloc(size);
+		data = static_cast<byte*>(mem_alloc(size));
 	}
 
 	que_node_set_val_buf_size(node, size);
@@ -102,7 +115,7 @@ eval_node_free_val_buf(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (que_node_get_val_buf_size(node) > 0) {
 		ut_a(data);
@@ -111,10 +124,80 @@ eval_node_free_val_buf(
 	}
 }
 
-/*****************************************************************//**
+/*********************************************************************
+Evaluates a LIKE comparison node.
+@return TRUE if the selected comparison returns 0, FALSE otherwise */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+	que_node_t*	arg1,		/*!< in: left operand */
+	que_node_t*	arg2)		/*!< in: right operand */
+{
+	ib_like_t	op;
+	int		res;
+	que_node_t*	arg3;
+	que_node_t*	arg4;
+	dfield_t*	dfield;
+	dtype_t*	dtype;
+	ibool		val = TRUE;
+
+	arg3 = que_node_get_like_node(arg2);
+
+	/* Get the comparison type operator, held by the like node */
+	ut_a(arg3);
+
+	dfield = que_node_get_val(arg3);
+	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_INT);
+	op = static_cast<ib_like_t>(mach_read_from_4(static_cast<const unsigned char*>(dfield_get_data(dfield))));
+
+	switch (op) {
+	case	IB_LIKE_PREFIX:
+
+		arg4 = que_node_get_next(arg3);	/* node compared with arg1 */
+		res = cmp_dfield_dfield_like_prefix(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_SUFFIX:
+
+		arg4 = que_node_get_next(arg3);	/* node compared with arg1 */
+		res = cmp_dfield_dfield_like_suffix(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_SUBSTR:
+
+		arg4 = que_node_get_next(arg3);	/* node compared with arg1 */
+		res = cmp_dfield_dfield_like_substr(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_EXACT:	/* compares arg1 with arg2 itself */
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1),
+			que_node_get_val(arg2));
+		break;
+
+	default:
+		ut_error;	/* unknown operator value */
+	}
+
+	if (res != 0) {
+		val = FALSE;
+	}
+
+	return(val);
+}
+
+/*********************************************************************
 Evaluates a comparison node.
-@return	the result of the comparison */
-UNIV_INTERN
+@return the result of the comparison */
 ibool
 eval_cmp(
 /*=====*/
@@ -123,45 +206,52 @@ eval_cmp(
 	que_node_t*	arg1;
 	que_node_t*	arg2;
 	int		res;
-	ibool		val;
 	int		func;
+	ibool		val = TRUE;
 
 	ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
 
 	arg1 = cmp_node->args;
 	arg2 = que_node_get_next(arg1);
 
-	res = cmp_dfield_dfield(que_node_get_val(arg1),
-				que_node_get_val(arg2));
-	val = TRUE;
-
 	func = cmp_node->func;
 
-	if (func == '=') {
-		if (res != 0) {
-			val = FALSE;
-		}
-	} else if (func == '<') {
-		if (res != -1) {
-			val = FALSE;
-		}
-	} else if (func == PARS_LE_TOKEN) {
-		if (res == 1) {
-			val = FALSE;
-		}
-	} else if (func == PARS_NE_TOKEN) {
-		if (res == 0) {
-			val = FALSE;
-		}
-	} else if (func == PARS_GE_TOKEN) {
-		if (res == -1) {
-			val = FALSE;
-		}
+	if (func == PARS_LIKE_TOKEN_EXACT
+	    || func == PARS_LIKE_TOKEN_PREFIX
+	    || func == PARS_LIKE_TOKEN_SUFFIX
+	    || func == PARS_LIKE_TOKEN_SUBSTR) {
+
+		val = eval_cmp_like(arg1, arg2);
 	} else {
-		ut_ad(func == '>');
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1), que_node_get_val(arg2));
 
-		if (res != 1) {
-			val = FALSE;
+		if (func == '=') {
+			if (res != 0) {
+				val = FALSE;
+			}
+		} else if (func == '<') {
+			if (res != -1) {
+				val = FALSE;
+			}
+		} else if (func == PARS_LE_TOKEN) {
+			if (res == 1) {
+				val = FALSE;
+			}
+		} else if (func == PARS_NE_TOKEN) {
+			if (res == 0) {
+				val = FALSE;
+			}
+		} else if (func == PARS_GE_TOKEN) {
+			if (res == -1) {
+				val = FALSE;
+			}
+		} else {
+			ut_ad(func == '>');
+
+			if (res != 1) {
+				val = FALSE;
+			}
 		}
 	}
 
@@ -344,8 +434,8 @@ eval_predefined_2(
 
 	} else if (func == PARS_RND_TOKEN) {
 
-		len1 = (ulint)eval_node_get_int_val(arg1);
-		len2 = (ulint)eval_node_get_int_val(arg2);
+		len1 = (ulint) eval_node_get_int_val(arg1);
+		len2 = (ulint) eval_node_get_int_val(arg2);
 
 		ut_ad(len2 >= len1);
 
@@ -362,7 +452,7 @@ eval_predefined_2(
 
 	} else if (func == PARS_RND_STR_TOKEN) {
 
-		len1 = (ulint)eval_node_get_int_val(arg1);
+		len1 = (ulint) eval_node_get_int_val(arg1);
 
 		data = eval_node_ensure_val_buf(func_node, len1);
 
@@ -390,7 +480,7 @@ eval_notfound(
 
 	ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
 
-	cursor = func_node->args;
+	cursor = static_cast<sym_node_t*>(func_node->args);
 
 	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
 
@@ -436,10 +526,10 @@ eval_substr(
 
 	arg3 = que_node_get_next(arg2);
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
 
-	len1 = (ulint)eval_node_get_int_val(arg2);
-	len2 = (ulint)eval_node_get_int_val(arg3);
+	len1 = (ulint) eval_node_get_int_val(arg2);
+	len2 = (ulint) eval_node_get_int_val(arg3);
 
 	dfield = que_node_get_val(func_node);
 
@@ -471,11 +561,11 @@ eval_replstr(
 	arg3 = que_node_get_next(arg2);
 	arg4 = que_node_get_next(arg3);
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
-	str2 = dfield_get_data(que_node_get_val(arg2));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+	str2 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg2)));
 
-	len1 = (ulint)eval_node_get_int_val(arg3);
-	len2 = (ulint)eval_node_get_int_val(arg4);
+	len1 = (ulint) eval_node_get_int_val(arg3);
+	len2 = (ulint) eval_node_get_int_val(arg4);
 
 	if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2)
 	    || (dfield_get_len(que_node_get_val(arg2)) < len2)) {
@@ -513,8 +603,8 @@ eval_instr(
 	dfield1 = que_node_get_val(arg1);
 	dfield2 = que_node_get_val(arg2);
 
-	str1 = dfield_get_data(dfield1);
-	str2 = dfield_get_data(dfield2);
+	str1 = static_cast<byte*>(dfield_get_data(dfield1));
+	str2 = static_cast<byte*>(dfield_get_data(dfield2));
 
 	len1 = dfield_get_len(dfield1);
 	len2 = dfield_get_len(dfield2);
@@ -577,7 +667,7 @@ eval_binary_to_number(
 
 	dfield = que_node_get_val(arg1);
 
-	str1 = dfield_get_data(dfield);
+	str1 = static_cast<byte*>(dfield_get_data(dfield));
 	len1 = dfield_get_len(dfield);
 
 	if (len1 > 4) {
@@ -588,7 +678,7 @@ eval_binary_to_number(
 		str2 = str1;
 	} else {
 		int_val = 0;
-		str2 = (byte*)&int_val;
+		str2 = (byte*) &int_val;
 
 		ut_memcpy(str2 + (4 - len1), str1, len1);
 	}
@@ -659,7 +749,7 @@ eval_to_binary(
 
 	arg1 = func_node->args;
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
 
 	if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
 
@@ -674,7 +764,7 @@ eval_to_binary(
 
 	arg2 = que_node_get_next(arg1);
 
-	len1 = (ulint)eval_node_get_int_val(arg2);
+	len1 = (ulint) eval_node_get_int_val(arg2);
 
 	if (len1 > 4) {
 
@@ -705,7 +795,7 @@ eval_predefined(
 
 	if (func == PARS_LENGTH_TOKEN) {
 
-		int_val = (lint)dfield_get_len(que_node_get_val(arg1));
+		int_val = (lint) dfield_get_len(que_node_get_val(arg1));
 
 	} else if (func == PARS_TO_CHAR_TOKEN) {
 
@@ -768,7 +858,7 @@ eval_predefined(
 			       dfield_get_data(que_node_get_val(arg1)));
 
 	} else if (func == PARS_SYSDATE_TOKEN) {
-		int_val = (lint)ut_time();
+		int_val = (lint) ut_time();
 	} else {
 		eval_predefined_2(func_node);
 
@@ -787,12 +877,12 @@ eval_func(
 	func_node_t*	func_node)	/*!< in: function node */
 {
 	que_node_t*	arg;
-	ulint		class;
+	ulint		fclass;
 	ulint		func;
 
 	ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
 
-	class = func_node->class;
+	fclass = func_node->fclass;
 	func = func_node->func;
 
 	arg = func_node->args;
@@ -805,7 +895,7 @@ eval_func(
 		values, except for eval_cmp and notfound */
 
 		if (dfield_is_null(que_node_get_val(arg))
-		    && (class != PARS_FUNC_CMP)
+		    && (fclass != PARS_FUNC_CMP)
 		    && (func != PARS_NOTFOUND_TOKEN)
 		    && (func != PARS_PRINTF_TOKEN)) {
 			ut_error;
@@ -814,34 +904,47 @@ eval_func(
 		arg = que_node_get_next(arg);
 	}
 
-	if (class == PARS_FUNC_CMP) {
+	switch (fclass) {
+	case PARS_FUNC_CMP:
 		eval_cmp(func_node);
-	} else if (class == PARS_FUNC_ARITH) {
+		return;
+	case PARS_FUNC_ARITH:
 		eval_arith(func_node);
-	} else if (class == PARS_FUNC_AGGREGATE) {
+		return;
+	case PARS_FUNC_AGGREGATE:
 		eval_aggregate(func_node);
-	} else if (class == PARS_FUNC_PREDEFINED) {
-
-		if (func == PARS_NOTFOUND_TOKEN) {
+		return;
+	case PARS_FUNC_PREDEFINED:
+		switch (func) {
+		case PARS_NOTFOUND_TOKEN:
 			eval_notfound(func_node);
-		} else if (func == PARS_SUBSTR_TOKEN) {
+			return;
+		case PARS_SUBSTR_TOKEN:
 			eval_substr(func_node);
-		} else if (func == PARS_REPLSTR_TOKEN) {
+			return;
+		case PARS_REPLSTR_TOKEN:
 			eval_replstr(func_node);
-		} else if (func == PARS_INSTR_TOKEN) {
+			return;
+		case PARS_INSTR_TOKEN:
 			eval_instr(func_node);
-		} else if (func == PARS_BINARY_TO_NUMBER_TOKEN) {
+			return;
+		case PARS_BINARY_TO_NUMBER_TOKEN:
 			eval_binary_to_number(func_node);
-		} else if (func == PARS_CONCAT_TOKEN) {
+			return;
+		case PARS_CONCAT_TOKEN:
 			eval_concat(func_node);
-		} else if (func == PARS_TO_BINARY_TOKEN) {
+			return;
+		case PARS_TO_BINARY_TOKEN:
 			eval_to_binary(func_node);
-		} else {
+			return;
+		default:
 			eval_predefined(func_node);
+			return;
 		}
-	} else {
-		ut_ad(class == PARS_FUNC_LOGICAL);
-
+	case PARS_FUNC_LOGICAL:
 		eval_logical(func_node);
+		return;
 	}
+
+	ut_error;
 }
diff --git a/storage/xtradb/eval/eval0proc.c b/storage/xtradb/eval/eval0proc.cc
index ba93fdd3977..e6f3a32cd48 100644
--- a/storage/xtradb/eval/eval0proc.c
+++ b/storage/xtradb/eval/eval0proc.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file eval/eval0proc.c
+@file eval/eval0proc.cc
 Executes SQL stored procedures and their control structures
 
 Created 1/20/1998 Heikki Tuuri
@@ -43,7 +43,7 @@ if_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<if_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_IF);
 
 	if (thr->prev_node == que_node_get_parent(node)) {
@@ -80,7 +80,8 @@ if_step(
 					break;
 				}
 
-				elsif_node = que_node_get_next(elsif_node);
+				elsif_node = static_cast<elsif_node_t*>(
+					que_node_get_next(elsif_node));
 
 				if (elsif_node == NULL) {
 					thr->run_node = NULL;
@@ -118,7 +119,7 @@ while_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<while_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
 
 	ut_ad((thr->prev_node == que_node_get_parent(node))
@@ -154,7 +155,7 @@ assign_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<assign_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
 
 	/* Evaluate the value to assign */
@@ -183,7 +184,7 @@ for_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<for_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
 
@@ -244,7 +245,7 @@ exit_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<exit_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
 
@@ -276,7 +277,7 @@ return_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<return_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
 
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.cc
index 19e656dfd92..9861f85b814 100644
--- a/storage/xtradb/fil/fil0fil.c
+++ b/storage/xtradb/fil/fil0fil.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file fil/fil0fil.c
+@file fil/fil0fil.cc
 The tablespace memory cache
 
 Created 10/25/1995 Heikki Tuuri
@@ -25,6 +25,9 @@ Created 10/25/1995 Heikki Tuuri
 
 #include "fil0fil.h"
 
+#include <debug_sync.h>
+#include <my_dbug.h>
+
 #include "mem0mem.h"
 #include "hash0hash.h"
 #include "os0file.h"
@@ -40,20 +43,15 @@ Created 10/25/1995 Heikki Tuuri
 #include "dict0dict.h"
 #include "page0page.h"
 #include "page0zip.h"
-#include "trx0trx.h"
 #include "trx0sys.h"
-#include "pars0pars.h"
 #include "row0mysql.h"
-#include "row0row.h"
-#include "que0que.h"
-#include "btr0btr.h"
-#include "btr0sea.h"
 #ifndef UNIV_HOTBACKUP
 # include "buf0lru.h"
 # include "ibuf0ibuf.h"
 # include "sync0sync.h"
 # include "os0sync.h"
 #else /* !UNIV_HOTBACKUP */
+# include "srv0srv.h"
 static ulint srv_data_read, srv_data_written;
 #endif /* !UNIV_HOTBACKUP */
 
@@ -126,6 +124,9 @@ UNIV_INTERN ulint	fil_n_pending_log_flushes		= 0;
 /** Number of pending tablespace flushes */
 UNIV_INTERN ulint	fil_n_pending_tablespace_flushes	= 0;
 
+/** Number of files currently open */
+UNIV_INTERN ulint	fil_n_file_opened			= 0;
+
 /** The null file address */
 UNIV_INTERN fil_addr_t	fil_addr_null = {FIL_NULL, 0};
 
@@ -140,12 +141,14 @@ UNIV_INTERN mysql_pfs_key_t	fil_space_latch_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 /** File node of a tablespace or the log data space */
-struct fil_node_struct {
+struct fil_node_t {
 	fil_space_t*	space;	/*!< backpointer to the space where this node
 				belongs */
 	char*		name;	/*!< path to the file */
 	ibool		open;	/*!< TRUE if file open */
 	os_file_t	handle;	/*!< OS handle to the file, if file open */
+	os_event_t	sync_event;/*!< Condition event to group and
+				serialize calls to fsync */
 	ibool		is_raw_disk;/*!< TRUE if the 'file' is actually a raw
 				device or a raw disk partition */
 	ulint		size;	/*!< size of the file in database pages, 0 if
@@ -159,6 +162,9 @@ struct fil_node_struct {
 				/*!< count of pending flushes on this file;
 				closing of the file is not allowed if
 				this is > 0 */
+	ibool		being_extended;
+				/*!< TRUE if the node is currently
+				being extended. */
 	ib_int64_t	modification_counter;/*!< when we write to the file we
 				increment this by one */
 	ib_int64_t	flush_counter;/*!< up to what
@@ -171,11 +177,11 @@ struct fil_node_struct {
 	ulint		magic_n;/*!< FIL_NODE_MAGIC_N */
 };
 
-/** Value of fil_node_struct::magic_n */
+/** Value of fil_node_t::magic_n */
 #define	FIL_NODE_MAGIC_N	89389
 
 /** Tablespace or log data space: let us call them by a common name space */
-struct fil_space_struct {
+struct fil_space_t {
 	char*		name;	/*!< space name = the path to the first file in
 				it */
 	ulint		id;	/*!< space id */
@@ -213,7 +219,9 @@ struct fil_space_struct {
 				tablespace whose size we do not know yet;
 				last incomplete megabytes in data files may be
 				ignored if space == 0 */
-	ulint		flags;	/*!< compressed page size and file format, or 0 */
+	ulint		flags;	/*!< tablespace flags; see
+				fsp_flags_is_valid(),
+				fsp_flags_get_zip_size() */
 	ulint		n_reserved_extents;
 				/*!< number of reserved free extents for
 				ongoing operations like B-tree page split */
@@ -230,34 +238,30 @@ struct fil_space_struct {
 	hash_node_t	hash;	/*!< hash chain node */
 	hash_node_t	name_hash;/*!< hash chain the name_hash table */
 #ifndef UNIV_HOTBACKUP
-	rw_lock_t	latch;	/*!< latch protecting the file space storage
+	prio_rw_lock_t	latch;	/*!< latch protecting the file space storage
 				allocation */
 #endif /* !UNIV_HOTBACKUP */
 	UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
 				/*!< list of spaces with at least one unflushed
 				file we have written to */
-	ibool		is_in_unflushed_spaces; /*!< TRUE if this space is
-				currently in unflushed_spaces */
+	bool		is_in_unflushed_spaces;
+				/*!< true if this space is currently in
+				unflushed_spaces */
 	ibool		is_corrupt;
 	UT_LIST_NODE_T(fil_space_t) space_list;
 				/*!< list of all spaces */
 	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */
 };
 
-/** Value of fil_space_struct::magic_n */
+/** Value of fil_space_t::magic_n */
 #define	FIL_SPACE_MAGIC_N	89472
 
-/** The tablespace memory cache */
-typedef	struct fil_system_struct	fil_system_t;
-
 /** The tablespace memory cache; also the totality of logs (the log
 data space) is stored here; below we talk about tablespaces, but also
 the ib_logfiles form a 'space' and it is handled here */
-
-struct fil_system_struct {
+struct fil_system_t {
 #ifndef UNIV_HOTBACKUP
-	mutex_t		mutex;		/*!< The mutex protecting the cache */
-	mutex_t		file_extend_mutex;
+	ib_mutex_t		mutex;		/*!< The mutex protecting the cache */
 #endif /* !UNIV_HOTBACKUP */
 	hash_table_t*	spaces;		/*!< The hash table of spaces in the
 					system; they are hashed on the space
@@ -312,6 +316,21 @@ struct fil_system_struct {
 initialized. */
 static fil_system_t*	fil_system	= NULL;
 
+/** Determine if (i) is a user tablespace id or not. */
+# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open)
+
+/** Determine if the user has explicitly disabled fsync() for space (s). */
+#ifndef __WIN__
+# define fil_buffering_disabled(s)					\
+	(((s)->purpose == FIL_TABLESPACE				\
+	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)\
+	  || ((s)->purpose == FIL_LOG					\
+	    && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT))
+    
+#else /* __WIN__ */
+# define fil_buffering_disabled(s)	(0)
+#endif /* __WIN__ */
+
 #ifdef UNIV_DEBUG
 /** Try fil_validate() every this many times */
 # define FIL_VALIDATE_SKIP	17
@@ -342,14 +361,28 @@ fil_validate_skip(void)
 #endif /* UNIV_DEBUG */
 
 /********************************************************************//**
+Determines if the file nodes of a space belong in the fil_system->LRU list.
+@return TRUE if the space is a FIL_TABLESPACE with a user tablespace id */
+UNIV_INLINE
+ibool
+fil_space_belongs_in_lru(
+/*=====================*/
+	const fil_space_t*	space)	/*!< in: file space */
+{
+	return(space->purpose == FIL_TABLESPACE
+	       && fil_is_user_tablespace_id(space->id));
+}
+
+/********************************************************************//**
 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
 
 Prepares a file node for i/o. Opens the file if it is closed. Updates the
 pending i/o's field in the node and the system appropriately. Takes the node
 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
-mutex. */
+mutex.
+@return false if the file can't be opened, otherwise true */
 static
-void
+bool
 fil_node_prepare_for_io(
 /*====================*/
 	fil_node_t*	node,	/*!< in: file node */
@@ -368,16 +401,6 @@ fil_node_complete_io(
 				the node as modified if
 				type == OS_FILE_WRITE */
 /*******************************************************************//**
-Checks if a single-table tablespace for a given table name exists in the
-tablespace memory cache.
-@return	space id, ULINT_UNDEFINED if not found */
-static
-ulint
-fil_get_space_id_for_table(
-/*=======================*/
-	const char*	name);	/*!< in: table name in the standard
-				'databasename/tablename' format */
-/*******************************************************************//**
 Frees a space object from the tablespace memory cache. Closes the files in
 the chain but does not delete them. There must not be any pending i/o's or
 flushes on the files.
@@ -396,10 +419,10 @@ calculating the byte offset within a space.
 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
 i/o on a tablespace which does not exist */
 UNIV_INLINE
-ulint
+dberr_t
 fil_read(
 /*=====*/
-	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	bool	sync,		/*!< in: true if synchronous aio is desired */
 	ulint	space_id,	/*!< in: space id */
 	ulint	zip_size,	/*!< in: compressed page size in bytes;
 				0 for uncompressed pages */
@@ -425,10 +448,10 @@ calculating the byte offset within a space.
 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
 i/o on a tablespace which does not exist */
 UNIV_INLINE
-ulint
+dberr_t
 fil_write(
 /*======*/
-	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	bool	sync,		/*!< in: true if synchronous aio is desired */
 	ulint	space_id,	/*!< in: space id */
 	ulint	zip_size,	/*!< in: compressed page size in bytes;
 				0 for uncompressed pages */
@@ -443,6 +466,8 @@ fil_write(
 	void*	message)	/*!< in: message for aio handler if non-sync
 				aio used, else ignored */
 {
+	ut_ad(!srv_read_only_mode);
+
 	return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
 					   byte_offset, len, buf, message));
 }
@@ -523,7 +548,7 @@ fil_space_get_version(
 Returns the latch of a file space.
 @return	latch protecting storage allocation */
 UNIV_INTERN
-rw_lock_t*
+prio_rw_lock_t*
 fil_space_get_latch(
 /*================*/
 	ulint	id,	/*!< in: space id */
@@ -576,9 +601,9 @@ fil_space_get_type(
 /**********************************************************************//**
 Checks if all the file nodes in a space are flushed. The caller must hold
 the fil_system mutex.
-@return	TRUE if all are flushed */
+@return	true if all are flushed */
 static
-ibool
+bool
 fil_space_is_flushed(
 /*=================*/
 	fil_space_t*	space)	/*!< in: space */
@@ -592,19 +617,21 @@ fil_space_is_flushed(
 	while (node) {
 		if (node->modification_counter > node->flush_counter) {
 
-			return(FALSE);
+			ut_ad(!fil_buffering_disabled(space));
+			return(false);
 		}
 
 		node = UT_LIST_GET_NEXT(chain, node);
 	}
 
-	return(TRUE);
+	return(true);
 }
 
 /*******************************************************************//**
-Appends a new file to the chain of files of a space. File must be closed. */
+Appends a new file to the chain of files of a space. File must be closed.
+@return pointer to the file name, or NULL on error */
 UNIV_INTERN
-void
+char*
 fil_node_create(
 /*============*/
 	const char*	name,	/*!< in: file name (file must be closed) */
@@ -622,21 +649,16 @@ fil_node_create(
 
 	mutex_enter(&fil_system->mutex);
 
-	node = mem_alloc(sizeof(fil_node_t));
+	node = static_cast<fil_node_t*>(mem_zalloc(sizeof(fil_node_t)));
 
 	node->name = mem_strdup(name);
-	node->open = FALSE;
 
 	ut_a(!is_raw || srv_start_raw_disk_in_use);
 
+	node->sync_event = os_event_create();
 	node->is_raw_disk = is_raw;
 	node->size = size;
 	node->magic_n = FIL_NODE_MAGIC_N;
-	node->n_pending = 0;
-	node->n_pending_flushes = 0;
-
-	node->modification_counter = 0;
-	node->flush_counter = 0;
 
 	space = fil_space_get_by_id(id);
 
@@ -653,7 +675,7 @@ fil_node_create(
 
 		mutex_exit(&fil_system->mutex);
 
-		return;
+		return(NULL);
 	}
 
 	space->size += size;
@@ -662,34 +684,36 @@ fil_node_create(
 
 	UT_LIST_ADD_LAST(chain, space->chain, node);
 
-	if (id < SRV_EXTRA_SYS_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
+	if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
 
 		fil_system->max_assigned_id = id;
 	}
 
 	mutex_exit(&fil_system->mutex);
+
+	return(node->name);
 }
 
 /********************************************************************//**
-Opens a the file of a node of a tablespace. The caller must own the fil_system
-mutex. */
+Opens a file of a node of a tablespace. The caller must own the fil_system
+mutex.
+@return false if the file can't be opened, otherwise true */
 static
-void
+bool
 fil_node_open_file(
 /*===============*/
 	fil_node_t*	node,	/*!< in: file node */
 	fil_system_t*	system,	/*!< in: tablespace memory cache */
 	fil_space_t*	space)	/*!< in: space */
 {
-	ib_int64_t	size_bytes;
-	ulint		size_low;
-	ulint		size_high;
+	os_offset_t	size_bytes;
 	ibool		ret;
 	ibool		success;
 	byte*		buf2;
 	byte*		page;
 	ulint		space_id;
 	ulint		flags;
+	ulint		page_size;
 
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->n_pending == 0);
@@ -709,41 +733,38 @@ fil_node_open_file(
 			OS_FILE_READ_ONLY, &success);
 		if (!success) {
 			/* The following call prints an error message */
-			os_file_get_last_error(TRUE);
+			os_file_get_last_error(true);
 
 			ut_print_timestamp(stderr);
 
-			fprintf(stderr,
-				"  InnoDB: Fatal error: cannot open %s\n."
-				"InnoDB: Have you deleted .ibd files"
-				" under a running mysqld server?\n",
+			ib_logf(IB_LOG_LEVEL_WARN, "InnoDB: Error: cannot "
+				"open %s\n. InnoDB: Have you deleted .ibd "
+				"files under a running mysqld server?\n",
 				node->name);
-			ut_a(0);
-		}
 
-		os_file_get_size(node->handle, &size_low, &size_high);
+			return(false);
+		}
 
-		size_bytes = (((ib_int64_t)size_high) << 32)
-			+ (ib_int64_t)size_low;
+		size_bytes = os_file_get_size(node->handle);
+		ut_a(size_bytes != (os_offset_t) -1);
 #ifdef UNIV_HOTBACKUP
-		if (trx_sys_sys_space(space->id)) {
+		if (space->id == 0) {
 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
 			os_file_close(node->handle);
 			goto add_size;
 		}
 #endif /* UNIV_HOTBACKUP */
 		ut_a(space->purpose != FIL_LOG);
-		ut_a(!trx_sys_sys_space(space->id));
+		ut_a(fil_is_user_tablespace_id(space->id));
 
-		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * (lint)UNIV_PAGE_SIZE) {
+		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
 			fprintf(stderr,
 				"InnoDB: Error: the size of single-table"
 				" tablespace file %s\n"
-				"InnoDB: is only %lu %lu,"
+				"InnoDB: is only "UINT64PF","
 				" should be at least %lu!\n",
 				node->name,
-				(ulong) size_high,
-				(ulong) size_low,
+				size_bytes,
 				(ulong) (FIL_IBD_FILE_INITIAL_SIZE
 					 * UNIV_PAGE_SIZE));
 
@@ -752,15 +773,15 @@ fil_node_open_file(
 
 		/* Read the first page of the tablespace */
 
-		buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+		buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
 		/* Align the memory for file i/o if we might have O_DIRECT
 		set */
-		page = ut_align(buf2, UNIV_PAGE_SIZE);
+		page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
-		success = os_file_read(node->handle, page, 0, 0,
-				       UNIV_PAGE_SIZE);
+		success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
 		space_id = fsp_header_get_space_id(page);
 		flags = fsp_header_get_flags(page);
+		page_size = fsp_flags_get_page_size(flags);
 
 		ut_free(buf2);
 
@@ -779,7 +800,7 @@ fil_node_open_file(
 		}
 
 		if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
-				  || trx_sys_sys_space(space_id))) {
+				  || space_id == 0)) {
 			fprintf(stderr,
 				"InnoDB: Error: tablespace id %lu"
 				" in file %s is not sensible\n",
@@ -788,11 +809,24 @@ fil_node_open_file(
 			ut_error;
 		}
 
+		if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags)
+				  != page_size)) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace file %s"
+				" has page size 0x%lx\n"
+				"InnoDB: but the data dictionary"
+				" expects page size 0x%lx!\n",
+				node->name, flags,
+				fsp_flags_get_page_size(space->flags));
+
+			ut_error;
+		}
+
 		if (UNIV_UNLIKELY(space->flags != flags)) {
 			fprintf(stderr,
-				"InnoDB: Error: table flags are %lx"
+				"InnoDB: Error: table flags are 0x%lx"
 				" in the data dictionary\n"
-				"InnoDB: but the flags in file %s are %lx!\n",
+				"InnoDB: but the flags in file %s are 0x%lx!\n",
 				space->flags, node->name, flags);
 
 			ut_error;
@@ -803,12 +837,12 @@ fil_node_open_file(
 			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
 		}
 
-		if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+		if (!fsp_flags_is_compressed(flags)) {
 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
 		} else {
 			node->size = (ulint)
 				(size_bytes
-				 / dict_table_flags_to_zip_size(flags));
+				 / fsp_flags_get_zip_size(flags));
 		}
 
 #ifdef UNIV_HOTBACKUP
@@ -846,11 +880,15 @@ add_size:
 	node->open = TRUE;
 
 	system->n_open++;
+	fil_n_file_opened++;
+
+	if (fil_space_belongs_in_lru(space)) {
 
-	if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)) {
 		/* Put the node to the LRU list */
 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
 	}
+
+	return(true);
 }
 
 /**********************************************************************//**
@@ -867,8 +905,9 @@ fil_node_close_file(
 	ut_ad(node && system);
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->open);
-	ut_a(node->n_pending == 0 || node->space->stop_new_ops);
+	ut_a(node->n_pending == 0);
 	ut_a(node->n_pending_flushes == 0);
+	ut_a(!node->being_extended);
 #ifndef UNIV_HOTBACKUP
 	ut_a(node->modification_counter == node->flush_counter
 	     || srv_fast_shutdown == 2);
@@ -882,8 +921,10 @@ fil_node_close_file(
 	node->open = FALSE;
 	ut_a(system->n_open > 0);
 	system->n_open--;
+	fil_n_file_opened--;
+
+	if (fil_space_belongs_in_lru(node->space)) {
 
-	if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(node->space->id)) {
 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
 
 		/* The node is in the LRU list, remove it */
@@ -910,41 +951,51 @@ fil_try_to_close_file_in_LRU(
 
 	ut_ad(mutex_own(&fil_system->mutex));
 
-	node = UT_LIST_GET_LAST(fil_system->LRU);
-
 	if (print_info) {
 		fprintf(stderr,
 			"InnoDB: fil_sys open file LRU len %lu\n",
 			(ulong) UT_LIST_GET_LEN(fil_system->LRU));
 	}
 
-	while (node != NULL) {
+	for (node = UT_LIST_GET_LAST(fil_system->LRU);
+	     node != NULL;
+	     node = UT_LIST_GET_PREV(LRU, node)) {
+
 		if (node->modification_counter == node->flush_counter
-		    && node->n_pending_flushes == 0) {
+		    && node->n_pending_flushes == 0
+		    && !node->being_extended) {
 
 			fil_node_close_file(node, fil_system);
 
 			return(TRUE);
 		}
 
-		if (print_info && node->n_pending_flushes > 0) {
+		if (!print_info) {
+			continue;
+		}
+
+		if (node->n_pending_flushes > 0) {
 			fputs("InnoDB: cannot close file ", stderr);
 			ut_print_filename(stderr, node->name);
 			fprintf(stderr, ", because n_pending_flushes %lu\n",
 				(ulong) node->n_pending_flushes);
 		}
 
-		if (print_info
-		    && node->modification_counter != node->flush_counter) {
+		if (node->modification_counter != node->flush_counter) {
 			fputs("InnoDB: cannot close file ", stderr);
 			ut_print_filename(stderr, node->name);
 			fprintf(stderr,
 				", because mod_count %ld != fl_count %ld\n",
 				(long) node->modification_counter,
 				(long) node->flush_counter);
+
 		}
 
-		node = UT_LIST_GET_PREV(LRU, node);
+		if (node->being_extended) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, node->name);
+			fprintf(stderr, ", because it is being extended\n");
+		}
 	}
 
 	return(FALSE);
@@ -969,7 +1020,7 @@ fil_mutex_enter_and_prepare_for_io(
 retry:
 	mutex_enter(&fil_system->mutex);
 
-	if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+	if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
 		/* We keep log files and system tablespace files always open;
 		this is important in preventing deadlocks in this module, as
 		a page read completion often performs another read from the
@@ -1101,18 +1152,25 @@ fil_node_free(
 	ut_ad(node && system && space);
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
-	ut_a(node->n_pending == 0 || space->stop_new_ops);
+	ut_a(node->n_pending == 0);
+	ut_a(!node->being_extended);
 
 	if (node->open) {
 		/* We fool the assertion in fil_node_close_file() to think
 		there are no unflushed modifications in the file */
 
 		node->modification_counter = node->flush_counter;
+		os_event_set(node->sync_event);
 
-		if (space->is_in_unflushed_spaces
-		    && fil_space_is_flushed(space)) {
+		if (fil_buffering_disabled(space)) {
 
-			space->is_in_unflushed_spaces = FALSE;
+			ut_ad(!space->is_in_unflushed_spaces);
+			ut_ad(fil_space_is_flushed(space));
+
+		} else if (space->is_in_unflushed_spaces
+			   && fil_space_is_flushed(space)) {
+
+			space->is_in_unflushed_spaces = false;
 
 			UT_LIST_REMOVE(unflushed_spaces,
 				       system->unflushed_spaces,
@@ -1126,6 +1184,7 @@ fil_node_free(
 
 	UT_LIST_REMOVE(chain, space->chain, node);
 
+	os_event_free(node->sync_event);
 	mem_free(node->name);
 	mem_free(node);
 }
@@ -1164,11 +1223,44 @@ fil_space_truncate_start(
 
 	mutex_exit(&fil_system->mutex);
 }
+
+/****************************************************************//**
+Checks if a space has a node with the given name. @return TRUE if found */
+UNIV_INTERN
+ibool
+fil_space_contains_node(
+/*====================*/
+	ulint	id,		/*!< in: space id */
+	char*	node_name)	/*!< in: node name, compared to node->name */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex); /* held for the whole chain scan */
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);	/* asserts that a space with this id was found */
+
+	for (node = UT_LIST_GET_FIRST(space->chain); node != NULL;
+	     node = UT_LIST_GET_NEXT(chain, node)) {
+
+		if (ut_strcmp(node->name, node_name) == 0) {
+			mutex_exit(&fil_system->mutex);
+			return(TRUE);
+		}
+
+	}
+
+	mutex_exit(&fil_system->mutex);
+	return(FALSE);
+}
+
 #endif /* UNIV_LOG_ARCHIVE */
 
 /*******************************************************************//**
-Creates a space memory object and puts it to the tablespace memory cache. If
-there is an error, prints an error message to the .err log.
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
@@ -1176,102 +1268,63 @@ fil_space_create(
 /*=============*/
 	const char*	name,	/*!< in: space name */
 	ulint		id,	/*!< in: space id */
-	ulint		flags,	/*!< in: compressed page size
-				and file format, or 0 */
+	ulint		flags,	/*!< in: tablespace flags */
 	ulint		purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
 {
 	fil_space_t*	space;
 
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
-
-try_again:
-	/*printf(
-	"InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name,
-	purpose);*/
+	DBUG_EXECUTE_IF("fil_space_create_failure", return(false););
 
 	ut_a(fil_system);
-	ut_a(name);
+	ut_a(fsp_flags_is_valid(flags));
 
-	mutex_enter(&fil_system->mutex);
+	/* Look for a matching tablespace and if found free it. */
+	do {
+		mutex_enter(&fil_system->mutex);
 
-	space = fil_space_get_by_name(name);
+		space = fil_space_get_by_name(name);
 
-	if (UNIV_LIKELY_NULL(space)) {
-		ibool	success;
-		ulint	namesake_id;
+		if (space != 0) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Tablespace '%s' exists in the cache "
+				"with id %lu != %lu",
+				name, (ulong) space->id, (ulong) id);
 
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Warning: trying to init to the"
-			" tablespace memory cache\n"
-			"InnoDB: a tablespace %lu of name ", (ulong) id);
-		ut_print_filename(stderr, name);
-		fprintf(stderr, ",\n"
-			"InnoDB: but a tablespace %lu of the same name\n"
-			"InnoDB: already exists in the"
-			" tablespace memory cache!\n",
-			(ulong) space->id);
+			if (id == 0 || purpose != FIL_TABLESPACE) {
 
-		if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) {
+				mutex_exit(&fil_system->mutex);
 
-			mutex_exit(&fil_system->mutex);
+				return(FALSE);
+			}
 
-			return(FALSE);
-		}
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Freeing existing tablespace '%s' entry "
+				"from the cache with id %lu",
+				name, (ulong) id);
 
-		fprintf(stderr,
-			"InnoDB: We assume that InnoDB did a crash recovery,"
-			" and you had\n"
-			"InnoDB: an .ibd file for which the table"
-			" did not exist in the\n"
-			"InnoDB: InnoDB internal data dictionary in the"
-			" ibdata files.\n"
-			"InnoDB: We assume that you later removed the"
-			" .ibd and .frm files,\n"
-			"InnoDB: and are now trying to recreate the table."
-			" We now remove the\n"
-			"InnoDB: conflicting tablespace object"
-			" from the memory cache and try\n"
-			"InnoDB: the init again.\n");
-
-		namesake_id = space->id;
-
-		success = fil_space_free(namesake_id, FALSE);
-		ut_a(success);
+			ibool	success = fil_space_free(space->id, FALSE);
+			ut_a(success);
 
-		mutex_exit(&fil_system->mutex);
+			mutex_exit(&fil_system->mutex);
+		}
 
-		goto try_again;
-	}
+	} while (space != 0);
 
 	space = fil_space_get_by_id(id);
 
-	if (UNIV_LIKELY_NULL(space)) {
-		fprintf(stderr,
-			"InnoDB: Error: trying to add tablespace %lu"
-			" of name ", (ulong) id);
-		ut_print_filename(stderr, name);
-		fprintf(stderr, "\n"
-			"InnoDB: to the tablespace memory cache,"
-			" but tablespace\n"
-			"InnoDB: %lu of name ", (ulong) space->id);
-		ut_print_filename(stderr, space->name);
-		fputs(" already exists in the tablespace\n"
-		      "InnoDB: memory cache!\n", stderr);
+	if (space != 0) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Trying to add tablespace '%s' with id %lu "
+			"to the tablespace memory cache, but tablespace '%s' "
+			"with id %lu already exists in the cache!",
+			name, (ulong) id, space->name, (ulong) space->id);
 
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
 	}
 
-	space = mem_alloc(sizeof(fil_space_t));
+	space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space)));
 
 	space->name = mem_strdup(name);
 	space->id = id;
@@ -1280,16 +1333,15 @@ try_again:
 	space->tablespace_version = fil_system->tablespace_version;
 	space->mark = FALSE;
 
-	if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on)
-	    && UNIV_UNLIKELY(id < SRV_EXTRA_SYS_SPACE_FIRST_ID)
-	    && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) {
+	if (purpose == FIL_TABLESPACE && !recv_recovery_on
+	    && id > fil_system->max_assigned_id) {
+
 		if (!fil_system->space_id_reuse_warned) {
 			fil_system->space_id_reuse_warned = TRUE;
 
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Warning: allocated tablespace %lu,"
-				" old maximum was %lu\n",
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Allocated tablespace %lu, old maximum "
+				"was %lu",
 				(ulong) id,
 				(ulong) fil_system->max_assigned_id);
 		}
@@ -1297,18 +1349,9 @@ try_again:
 		fil_system->max_assigned_id = id;
 	}
 
-	space->stop_ios = FALSE;
-	space->stop_new_ops = FALSE;
 	space->purpose = purpose;
-	space->size = 0;
 	space->flags = flags;
 
-	space->n_reserved_extents = 0;
-
-	space->n_pending_flushes = 0;
-	space->n_pending_ops = 0;
-
-	UT_LIST_INIT(space->chain);
 	space->magic_n = FIL_SPACE_MAGIC_N;
 
 	rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
@@ -1317,7 +1360,7 @@ try_again:
 
 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
 		    ut_fold_string(name), space);
-	space->is_in_unflushed_spaces = FALSE;
+	space->is_in_unflushed_spaces = false;
 
 	space->is_corrupt = FALSE;
 
@@ -1366,7 +1409,7 @@ fil_assign_new_space_id(
 			(ulong) SRV_LOG_SPACE_FIRST_ID);
 	}
 
-	success = (id < SRV_EXTRA_SYS_SPACE_FIRST_ID);
+	success = (id < SRV_LOG_SPACE_FIRST_ID);
 
 	if (success) {
 		*space_id = fil_system->max_assigned_id = id;
@@ -1403,8 +1446,7 @@ fil_space_free(
 					in X mode */
 {
 	fil_space_t*	space;
-	fil_space_t*	namespace;
-	fil_node_t*	fil_node;
+	fil_space_t*	fnamespace;
 
 	ut_ad(mutex_own(&fil_system->mutex));
 
@@ -1422,15 +1464,17 @@ fil_space_free(
 
 	HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
 
-	namespace = fil_space_get_by_name(space->name);
-	ut_a(namespace);
-	ut_a(space == namespace);
+	fnamespace = fil_space_get_by_name(space->name);
+	ut_a(fnamespace);
+	ut_a(space == fnamespace);
 
 	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
 		    ut_fold_string(space->name), space);
 
 	if (space->is_in_unflushed_spaces) {
-		space->is_in_unflushed_spaces = FALSE;
+
+		ut_ad(!fil_buffering_disabled(space));
+		space->is_in_unflushed_spaces = false;
 
 		UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces,
 			       space);
@@ -1441,12 +1485,11 @@ fil_space_free(
 	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
 	ut_a(0 == space->n_pending_flushes);
 
-	fil_node = UT_LIST_GET_FIRST(space->chain);
+	for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain);
+	     fil_node != NULL;
+	     fil_node = UT_LIST_GET_FIRST(space->chain)) {
 
-	while (fil_node != NULL) {
 		fil_node_free(fil_node, fil_system, space);
-
-		fil_node = UT_LIST_GET_FIRST(space->chain);
 	}
 
 	ut_a(0 == UT_LIST_GET_LEN(space->chain));
@@ -1464,36 +1507,28 @@ fil_space_free(
 }
 
 /*******************************************************************//**
-Returns the size of the space in pages. The tablespace must be cached in the
-memory cache.
-@return	space size, 0 if space not found */
-UNIV_INTERN
-ulint
-fil_space_get_size(
-/*===============*/
+Returns a pointer to the fil_space_t that is in the memory cache
+associated with a space id. The caller must lock fil_system->mutex.
+@return	fil_space_t pointer, NULL if not found or file cannot be opened */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_space(
+/*================*/
 	ulint	id)	/*!< in: space id */
 {
-	fil_node_t*	node;
 	fil_space_t*	space;
-	ulint		size;
+	fil_node_t*	node;
 
 	ut_ad(fil_system);
 
-	mutex_enter(&fil_system->mutex);
-
 	space = fil_space_get_by_id(id);
-
 	if (space == NULL) {
-		mutex_exit(&fil_system->mutex);
-
-		return(0);
+		return(NULL);
 	}
 
 	if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
 		ut_a(id != 0);
 
-		ut_a(1 == UT_LIST_GET_LEN(space->chain));
-
 		mutex_exit(&fil_system->mutex);
 
 		/* It is possible that the space gets evicted at this point
@@ -1505,23 +1540,90 @@ fil_space_get_size(
 		/* We are still holding the fil_system->mutex. Check if
 		the space is still in memory cache. */
 		space = fil_space_get_by_id(id);
-
 		if (space == NULL) {
-			mutex_exit(&fil_system->mutex);
-			return(0);
+			return(NULL);
 		}
 
+		/* The following code must change when InnoDB supports
+		multiple datafiles per tablespace. */
+		ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
 		node = UT_LIST_GET_FIRST(space->chain);
 
 		/* It must be a single-table tablespace and we have not opened
 		the file yet; the following calls will open it and update the
 		size fields */
 
-		fil_node_prepare_for_io(node, fil_system, space);
+		if (!fil_node_prepare_for_io(node, fil_system, space)) {
+			/* The single-table tablespace can't be opened,
+			because the ibd file is missing. */
+			return(NULL);
+		}
 		fil_node_complete_io(node, fil_system, OS_FILE_READ);
 	}
 
-	size = space->size;
+	return(space);
+}
+
+/*******************************************************************//**
+Returns the path of the first fil_node_t in the chain of the space with
+the given ID, as a copy of fil_node_t::name. The caller is responsible
+for freeing the memory allocated here for the value returned.
+@return	own: string from mem_strdup(), or NULL if fil_space_get_space()
+returns NULL for the space ID. */
+UNIV_INTERN
+char*
+fil_space_get_first_path(
+/*=====================*/
+	ulint		id)	/*!< in: space id; ut_a() requires nonzero */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	char*		path;
+
+	ut_ad(fil_system);
+	ut_a(id);
+
+	fil_mutex_enter_and_prepare_for_io(id); /* enters fil_system->mutex */
+
+	space = fil_space_get_space(id);
+
+	if (space == NULL) {
+		mutex_exit(&fil_system->mutex);
+
+		return(NULL);
+	}
+
+	ut_ad(mutex_own(&fil_system->mutex));
+	/* NOTE(review): node is used unchecked; confirm chain is non-empty */
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	path = mem_strdup(node->name); /* copied while the mutex is held */
+
+	mutex_exit(&fil_system->mutex);
+
+	return(path);
+}
+
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return	space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ulint		size;
+
+	ut_ad(fil_system);
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_space(id);
+
+	size = space ? space->size : 0;
 
 	mutex_exit(&fil_system->mutex);
 
@@ -1538,19 +1640,18 @@ fil_space_get_flags(
 /*================*/
 	ulint	id)	/*!< in: space id */
 {
-	fil_node_t*	node;
 	fil_space_t*	space;
 	ulint		flags;
 
 	ut_ad(fil_system);
 
-	if (UNIV_UNLIKELY(!id)) {
+	if (!id) {
 		return(0);
 	}
 
 	mutex_enter(&fil_system->mutex);
 
-	space = fil_space_get_by_id(id);
+	space = fil_space_get_space(id);
 
 	if (space == NULL) {
 		mutex_exit(&fil_system->mutex);
@@ -1558,38 +1659,6 @@ fil_space_get_flags(
 		return(ULINT_UNDEFINED);
 	}
 
-	if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
-		ut_a(id != 0);
-
-		ut_a(1 == UT_LIST_GET_LEN(space->chain));
-
-		mutex_exit(&fil_system->mutex);
-
-		/* It is possible that the space gets evicted at this point
-		before the fil_mutex_enter_and_prepare_for_io() acquires
-		the fil_system->mutex. Check for this after completing the
-		call to fil_mutex_enter_and_prepare_for_io(). */
-		fil_mutex_enter_and_prepare_for_io(id);
-
-		/* We are still holding the fil_system->mutex. Check if
-		the space is still in memory cache. */
-		space = fil_space_get_by_id(id);
-
-		if (space == NULL) {
-			mutex_exit(&fil_system->mutex);
-			return(0);
-		}
-
-		node = UT_LIST_GET_FIRST(space->chain);
-
-		/* It must be a single-table tablespace and we have not opened
-		the file yet; the following calls will open it and update the
-		size fields */
-
-		fil_node_prepare_for_io(node, fil_system, space);
-		fil_node_complete_io(node, fil_system, OS_FILE_READ);
-	}
-
 	flags = space->flags;
 
 	mutex_exit(&fil_system->mutex);
@@ -1613,7 +1682,7 @@ fil_space_get_zip_size(
 
 	if (flags && flags != ULINT_UNDEFINED) {
 
-		return(dict_table_flags_to_zip_size(flags));
+		return(fsp_flags_get_zip_size(flags));
 	}
 
 	return(flags);
@@ -1652,12 +1721,11 @@ fil_init(
 	ut_a(hash_size > 0);
 	ut_a(max_n_open > 0);
 
-	fil_system = mem_zalloc(sizeof(fil_system_t));
+	fil_system = static_cast<fil_system_t*>(
+		mem_zalloc(sizeof(fil_system_t)));
 
 	mutex_create(fil_system_mutex_key,
 		     &fil_system->mutex, SYNC_ANY_LATCH);
-	mutex_create(fil_system_mutex_key,
-		     &fil_system->file_extend_mutex, SYNC_OUTER_ANY_LATCH);
 
 	fil_system->spaces = hash_create(hash_size);
 	fil_system->name_hash = hash_create(hash_size);
@@ -1665,8 +1733,6 @@ fil_init(
 	UT_LIST_INIT(fil_system->LRU);
 
 	fil_system->max_n_open = max_n_open;
-
-	fil_system->max_assigned_id = TRX_SYS_SPACE_MAX;
 }
 
 /*******************************************************************//**
@@ -1681,47 +1747,59 @@ fil_open_log_and_system_tablespace_files(void)
 /*==========================================*/
 {
 	fil_space_t*	space;
-	fil_node_t*	node;
 
 	mutex_enter(&fil_system->mutex);
 
-	space = UT_LIST_GET_FIRST(fil_system->space_list);
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
 
-	while (space != NULL) {
-		if (space->purpose != FIL_TABLESPACE || trx_sys_sys_space(space->id)) {
-			node = UT_LIST_GET_FIRST(space->chain);
+		fil_node_t*	node;
 
-			while (node != NULL) {
-				if (!node->open) {
-					fil_node_open_file(node, fil_system,
-							   space);
-				}
-				if (fil_system->max_n_open
-				    < 10 + fil_system->n_open) {
-					fprintf(stderr,
-						"InnoDB: Warning: you must"
-						" raise the value of"
-						" innodb_open_files in\n"
-						"InnoDB: my.cnf! Remember that"
-						" InnoDB keeps all log files"
-						" and all system\n"
-						"InnoDB: tablespace files open"
-						" for the whole time mysqld is"
-						" running, and\n"
-						"InnoDB: needs to open also"
-						" some .ibd files if the"
-						" file-per-table storage\n"
-						"InnoDB: model is used."
-						" Current open files %lu,"
-						" max allowed"
-						" open files %lu.\n",
-						(ulong) fil_system->n_open,
-						(ulong) fil_system->max_n_open);
+		if (fil_space_belongs_in_lru(space)) {
+
+			continue;
+		}
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (!node->open) {
+				if (!fil_node_open_file(node, fil_system,
+							space)) {
+					/* This func is called during server's
+					startup. If some file of log or system
+					tablespace is missing, the server
+					can't start successfully. So we should
+					assert for it. */
+					ut_a(0);
 				}
-				node = UT_LIST_GET_NEXT(chain, node);
+			}
+
+			if (fil_system->max_n_open < 10 + fil_system->n_open) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: you must"
+					" raise the value of"
+					" innodb_open_files in\n"
+					"InnoDB: my.cnf! Remember that"
+					" InnoDB keeps all log files"
+					" and all system\n"
+					"InnoDB: tablespace files open"
+					" for the whole time mysqld is"
+					" running, and\n"
+					"InnoDB: needs to open also"
+					" some .ibd files if the"
+					" file-per-table storage\n"
+					"InnoDB: model is used."
+					" Current open files %lu,"
+					" max allowed"
+					" open files %lu.\n",
+					(ulong) fil_system->n_open,
+					(ulong) fil_system->max_n_open);
 			}
 		}
-		space = UT_LIST_GET_NEXT(space_list, space);
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -1763,6 +1841,49 @@ fil_close_all_files(void)
 }
 
 /*******************************************************************//**
+Closes the open files of every FIL_LOG space (the redo log files). There
+must not be any pending i/o's or unflushed modifications in the files. */
+UNIV_INTERN
+void
+fil_close_log_files(
+/*================*/
+	bool	free)	/*!< in: true = also fil_space_free() each log space */
+{
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+	while (space != NULL) {
+		fil_node_t*	node;
+		fil_space_t*	prev_space = space; /* freed after advancing */
+
+		if (space->purpose != FIL_LOG) { /* skip non-log spaces */
+			space = UT_LIST_GET_NEXT(space_list, space);
+			continue;
+		}
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (node->open) {
+				fil_node_close_file(node, fil_system);
+			}
+		}
+		/* Step to the next space before prev_space may be freed. */
+		space = UT_LIST_GET_NEXT(space_list, space);
+
+		if (free) { /* fil_space_free() result is not checked */
+			fil_space_free(prev_space->id, FALSE);
+		}
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
 Sets the max tablespace id counter if the given number is bigger than the
 previous value. */
 UNIV_INTERN
@@ -1778,10 +1899,6 @@ fil_set_max_space_id_if_bigger(
 		ut_error;
 	}
 
-	if (max_id >= SRV_EXTRA_SYS_SPACE_FIRST_ID) {
-		return;
-	}
-
 	mutex_enter(&fil_system->mutex);
 
 	if (fil_system->max_assigned_id < max_id) {
@@ -1796,34 +1913,36 @@ fil_set_max_space_id_if_bigger(
 Writes the flushed lsn and the latest archived log number to the page header
 of the first page of a data file of the system tablespace (space 0),
 which is uncompressed. */
-static
-ulint
+static __attribute__((warn_unused_result))
+dberr_t
 fil_write_lsn_and_arch_no_to_file(
 /*==============================*/
-	ulint		space_id,
-	ulint		sum_of_sizes,	/*!< in: combined size of previous files
-					in space, in database pages */
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no __attribute__((unused)))
-					/*!< in: archived log number to write */
+	ulint	space,		/*!< in: space to write to */
+	ulint	sum_of_sizes,	/*!< in: combined size of previous files
+				in space, in database pages */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no __attribute__((unused)))
+				/*!< in: archived log number to write */
 {
 	byte*	buf1;
 	byte*	buf;
+	dberr_t	err;
 
-	ut_a(trx_sys_sys_space(space_id));
+	buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE));
+	buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
 
-	buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
-	buf = ut_align(buf1, UNIV_PAGE_SIZE);
+	err = fil_read(TRUE, space, 0, sum_of_sizes, 0,
+		       UNIV_PAGE_SIZE, buf, NULL);
+	if (err == DB_SUCCESS) {
+		mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
 
-	fil_read(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
-
-	mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
-
-	fil_write(TRUE, space_id, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+		err = fil_write(TRUE, space, 0, sum_of_sizes, 0,
+				UNIV_PAGE_SIZE, buf, NULL);
+	}
 
 	mem_free(buf1);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /****************************************************************//**
@@ -1831,39 +1950,42 @@ Writes the flushed lsn and the latest archived log number to the page
 header of the first page of each data file in the system tablespace.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 fil_write_flushed_lsn_to_data_files(
 /*================================*/
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no)	/*!< in: latest archived log
-					file number */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no)	/*!< in: latest archived log file number */
 {
 	fil_space_t*	space;
 	fil_node_t*	node;
-	ulint		sum_of_sizes;
-	ulint		err;
+	dberr_t		err;
 
 	mutex_enter(&fil_system->mutex);
 
-	space = UT_LIST_GET_FIRST(fil_system->space_list);
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
 
-	while (space) {
 		/* We only write the lsn to all existing data files which have
 		been open during the lifetime of the mysqld process; they are
 		represented by the space objects in the tablespace memory
-		cache. Note that all data files in the system tablespace 0 are
-		always open. */
+		cache. Note that all data files in the system tablespace 0
+		and the UNDO log tablespaces (if separate) are always open. */
 
 		if (space->purpose == FIL_TABLESPACE
-		    && trx_sys_sys_space(space->id)) {
-			sum_of_sizes = 0;
+		    && !fil_is_user_tablespace_id(space->id)) {
+			ulint	sum_of_sizes = 0;
+
+			for (node = UT_LIST_GET_FIRST(space->chain);
+			     node != NULL;
+			     node = UT_LIST_GET_NEXT(chain, node)) {
 
-			node = UT_LIST_GET_FIRST(space->chain);
-			while (node) {
 				mutex_exit(&fil_system->mutex);
 
 				err = fil_write_lsn_and_arch_no_to_file(
-					space->id, sum_of_sizes, lsn, arch_log_no);
+					space->id, sum_of_sizes, lsn,
+					arch_log_no);
+
 				if (err != DB_SUCCESS) {
 
 					return(err);
@@ -1872,10 +1994,8 @@ fil_write_flushed_lsn_to_data_files(
 				mutex_enter(&fil_system->mutex);
 
 				sum_of_sizes += node->size;
-				node = UT_LIST_GET_NEXT(chain, node);
 			}
 		}
-		space = UT_LIST_GET_NEXT(space_list, space);
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -1904,6 +2024,10 @@ fil_check_first_page(
 	space_id = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page);
 	flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
 
+	if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) {
+		return("innodb-page-size mismatch");
+	}
+
 	if (!space_id && !flags) {
 		ulint		nonzero_bytes	= UNIV_PAGE_SIZE;
 		const byte*	b		= page;
@@ -1918,7 +2042,7 @@ fil_check_first_page(
 	}
 
 	if (buf_page_is_corrupted(
-		    FALSE, page, dict_table_flags_to_zip_size(flags))) {
+		    false, page, fsp_flags_get_zip_size(flags))) {
 		return("checksum mismatch");
 	}
 
@@ -1944,29 +2068,28 @@ fil_read_first_page(
 						parameters below already
 						contain sensible data */
 	ulint*		flags,			/*!< out: tablespace flags */
-#ifdef UNIV_LOG_ARCHIVE
-	ulint*		min_arch_log_no,	/*!< out: min of archived
-						log numbers in data files */
-	ulint*		max_arch_log_no,	/*!< out: max of archived
-						log numbers in data files */
-#endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,	/*!< out: min of flushed
+	ulint*		space_id,		/*!< out: tablespace ID */
+	lsn_t*		min_flushed_lsn,	/*!< out: min of flushed
 						lsn values in data files */
-	ib_uint64_t*	max_flushed_lsn)	/*!< out: max of flushed
+	lsn_t*		max_flushed_lsn)	/*!< out: max of flushed
 						lsn values in data files */
 {
 	byte*		buf;
-	page_t*		page;
-	ib_uint64_t	flushed_lsn;
+	byte*		page;
+	lsn_t		flushed_lsn;
 	const char*	check_msg = NULL;
 
-	buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
 	/* Align the memory for a possible read from a raw device */
-	page = ut_align(buf, UNIV_PAGE_SIZE);
 
-	os_file_read(data_file, page, 0, 0, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+	os_file_read(data_file, page, 0, UNIV_PAGE_SIZE);
 
-	*flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+	*flags = fsp_header_get_flags(page);
+
+	*space_id = fsp_header_get_space_id(page);
 
 	flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
 
@@ -1983,10 +2106,7 @@ fil_read_first_page(
 	if (!one_read_already) {
 		*min_flushed_lsn = flushed_lsn;
 		*max_flushed_lsn = flushed_lsn;
-#ifdef UNIV_LOG_ARCHIVE
-		*min_arch_log_no = arch_log_no;
-		*max_arch_log_no = arch_log_no;
-#endif /* UNIV_LOG_ARCHIVE */
+
 		return(NULL);
 	}
 
@@ -1996,14 +2116,6 @@ fil_read_first_page(
 	if (*max_flushed_lsn < flushed_lsn) {
 		*max_flushed_lsn = flushed_lsn;
 	}
-#ifdef UNIV_LOG_ARCHIVE
-	if (*min_arch_log_no > arch_log_no) {
-		*min_arch_log_no = arch_log_no;
-	}
-	if (*max_arch_log_no < arch_log_no) {
-		*max_arch_log_no = arch_log_no;
-	}
-#endif /* UNIV_LOG_ARCHIVE */
 
 	return(NULL);
 }
@@ -2091,7 +2203,7 @@ fil_create_directory_for_tablename(
 	len = strlen(fil_path_to_mysql_datadir);
 	namend = strchr(name, '/');
 	ut_a(namend);
-	path = mem_alloc(len + (namend - name) + 2);
+	path = static_cast<char*>(mem_alloc(len + (namend - name) + 2));
 
 	memcpy(path, fil_path_to_mysql_datadir, len);
 	path[len] = '/';
@@ -2183,6 +2295,12 @@ created does not exist, then we create the directory, too.
 
 Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
 datadir that we should use in replaying the file operations.
+
+InnoDB recovery does not replay these fully since it always sets the space id
+to zero. But ibbackup does replay them.  TODO: If remote tablespaces are used,
+ibbackup will only create tables in the default directory since MLOG_FILE_CREATE
+and MLOG_FILE_CREATE2 only know the tablename, not the path.
+
 @return end of log record, or NULL if the record was not completely
 contained between ptr and end_ptr */
 UNIV_INTERN
@@ -2264,7 +2382,6 @@ fil_op_log_parse_or_replay(
 	}
 	*/
 	if (!space_id) {
-
 		return(ptr);
 	} else {
 		/* Only replay file ops during recovery.  This is a
@@ -2275,7 +2392,7 @@ fil_op_log_parse_or_replay(
 
 	/* Let us try to perform the file operation, if sensible. Note that
 	ibbackup has at this stage already read in all space id info to the
-	fil0fil.c data structures.
+	fil0fil.cc data structures.
 
 	NOTE that our algorithm is not guaranteed to work correctly if there
 	were renames of tables during the backup. See ibbackup code for more
@@ -2284,7 +2401,9 @@ fil_op_log_parse_or_replay(
 	switch (type) {
 	case MLOG_FILE_DELETE:
 		if (fil_tablespace_exists_in_mem(space_id)) {
-			ut_a(fil_delete_tablespace(space_id, TRUE));
+			dberr_t	err = fil_delete_tablespace(
+				space_id, BUF_REMOVE_FLUSH_NO_WRITE);
+			ut_a(err == DB_SUCCESS);
 		}
 
 		break;
@@ -2305,10 +2424,10 @@ fil_op_log_parse_or_replay(
 
 			if (fil_get_space_id_for_table(new_name)
 			    == ULINT_UNDEFINED) {
-				/* We do not care of the old name, that is
-				why we pass NULL as the first argument */
+				/* We do not care about the old name, that
+				is why we pass NULL as the first argument. */
 				if (!fil_rename_tablespace(NULL, space_id,
-							   new_name)) {
+							   new_name, NULL)) {
 					ut_error;
 				}
 			}
@@ -2326,12 +2445,15 @@ fil_op_log_parse_or_replay(
 		} else if (log_flags & MLOG_FILE_FLAG_TEMP) {
 			/* Temporary table, do nothing */
 		} else {
+			const char*	path = NULL;
+
 			/* Create the database directory for name, if it does
 			not exist yet */
 			fil_create_directory_for_tablename(name);
 
 			if (fil_create_new_single_table_tablespace(
-				    space_id, name, FALSE, flags,
+				    space_id, name, path, flags,
+				    DICT_TF2_USE_TABLESPACE,
 				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
 				ut_error;
 			}
@@ -2347,115 +2469,271 @@ fil_op_log_parse_or_replay(
 }
 
 /*******************************************************************//**
-Deletes a single-table tablespace. The tablespace must be cached in the
-memory cache.
-@return	TRUE if success */
-UNIV_INTERN
-ibool
-fil_delete_tablespace(
-/*==================*/
-	ulint	id,		/*!< in: space id */
-	ibool	evict_all)	/*!< in: TRUE if we want all pages
-				evicted from LRU. */
+Allocates a file name for the EXPORT/IMPORT config file name.  The
+string must be freed by caller with mem_free().
+@return own: file name */
+static
+char*
+fil_make_cfg_name(
+/*==============*/
+	const char*	filepath)	/*!< in: .ibd file name */
 {
-	ibool		success;
-	fil_space_t*	space;
-	fil_node_t*	node;
-	ulint		count		= 0;
-	char*		path;
+	char*	cfg_name;
 
-	ut_a(id != 0);
-stop_new_ops:
-	mutex_enter(&fil_system->mutex);
+	/* Create a temporary file path by replacing the .ibd suffix
+	with .cfg. */
 
-	space = fil_space_get_by_id(id);
+	ut_ad(strlen(filepath) > 4);
 
-	if (space != NULL) {
-		space->stop_new_ops = TRUE;
+	cfg_name = mem_strdup(filepath);
+	ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg");
+	return(cfg_name);
+}
 
-		if (space->n_pending_ops == 0) {
-			mutex_exit(&fil_system->mutex);
+/*******************************************************************//**
+Check for change buffer merges.
+@return 0 if no merges else count + 1. */
+static
+ulint
+fil_ibuf_check_pending_ops(
+/*=======================*/
+	fil_space_t*	space,	/*!< in/out: Tablespace to check */
+	ulint		count)	/*!< in: number of attempts so far */
+{
+	ut_ad(mutex_own(&fil_system->mutex));
 
-			count = 0;
+	if (space != 0 && space->n_pending_ops != 0) {
 
-			goto try_again;
-		} else {
-			if (count > 5000) {
-				ut_print_timestamp(stderr);
-				fputs("  InnoDB: Warning: trying to"
-				      " delete tablespace ", stderr);
-				ut_print_filename(stderr, space->name);
-				fprintf(stderr, ",\n"
-					"InnoDB: but there are %lu pending"
-					" operations (most likely ibuf merges)"
-					" on it.\n"
-					"InnoDB: Loop %lu.\n",
-					(ulong) space->n_pending_ops,
-					(ulong) count);
-			}
+		if (count > 5000) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Trying to close/delete tablespace "
+				"'%s' but there are %lu pending change "
+				"buffer merges on it.",
+				space->name,
+				(ulong) space->n_pending_ops);
+		}
 
-			mutex_exit(&fil_system->mutex);
+		return(count + 1);
+	}
 
-			os_thread_sleep(20000);
-			count++;
+	return(0);
+}
 
-			goto stop_new_ops;
+/*******************************************************************//**
+Check for pending IO.
+@return 0 if no pending else count + 1. */
+static
+ulint
+fil_check_pending_io(
+/*=================*/
+	fil_space_t*	space,	/*!< in/out: Tablespace to check */
+	fil_node_t**	node,	/*!< out: Node in space list */
+	ulint		count)	/*!< in: number of attempts so far */
+{
+	ut_ad(mutex_own(&fil_system->mutex));
+	ut_a(space->n_pending_ops == 0);
+
+	/* The following code must change when InnoDB supports
+	multiple datafiles per tablespace. */
+	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+
+	*node = UT_LIST_GET_FIRST(space->chain);
+
+	if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) {
+
+		ut_a(!(*node)->being_extended);
+
+		if (count > 1000) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Trying to close/delete tablespace '%s' "
+				"but there are %lu flushes "
+				" and %lu pending i/o's on it.",
+				space->name,
+				(ulong) space->n_pending_flushes,
+				(ulong) (*node)->n_pending);
 		}
+
+		return(count + 1);
 	}
 
-	mutex_exit(&fil_system->mutex);
-	count = 0;
+	return(0);
+}
+
+/*******************************************************************//**
+Check pending operations on a tablespace.
+@return DB_SUCCESS, or DB_TABLESPACE_NOT_FOUND if the space is not cached */
+static
+dberr_t
+fil_check_pending_operations(
+/*=========================*/
+	ulint		id,	/*!< in: space id */
+	fil_space_t**	space,	/*!< out: tablespace instance in memory */
+	char**		path)	/*!< out/own: tablespace path */
+{
+	ulint		count = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+	ut_ad(space);
+
+	*space = 0;
 
-try_again:
 	mutex_enter(&fil_system->mutex);
+	fil_space_t* sp = fil_space_get_by_id(id);
+	if (sp) {
+		sp->stop_new_ops = TRUE;
+	}
+	mutex_exit(&fil_system->mutex);
 
-	space = fil_space_get_by_id(id);
+	/* Check for pending change buffer merges. */
 
-	if (space == NULL) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: cannot delete tablespace %lu\n"
-			"InnoDB: because it is not found in the"
-			" tablespace memory cache.\n",
-			(ulong) id);
+	do {
+		mutex_enter(&fil_system->mutex);
+
+		sp = fil_space_get_by_id(id);
+
+		count = fil_ibuf_check_pending_ops(sp, count);
 
 		mutex_exit(&fil_system->mutex);
 
-		return(FALSE);
-	}
+		if (count > 0) {
+			os_thread_sleep(20000);
+		}
 
-	ut_a(space->stop_new_ops);
-	ut_a(space->n_pending_ops == 0);
+	} while (count > 0);
 
-	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
-	node = UT_LIST_GET_FIRST(space->chain);
+	/* Check for pending IO. */
 
-	if (space->n_pending_flushes > 0 || node->n_pending > 0) {
-		if (count > 1000) {
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Warning: trying to"
-			      " delete tablespace ", stderr);
-			ut_print_filename(stderr, space->name);
-			fprintf(stderr, ",\n"
-				"InnoDB: but there are %lu flushes"
-				" and %lu pending i/o's on it\n"
-				"InnoDB: Loop %lu.\n",
-				(ulong) space->n_pending_flushes,
-				(ulong) node->n_pending,
-				(ulong) count);
+	*path = 0;
+
+	do {
+		mutex_enter(&fil_system->mutex);
+
+		sp = fil_space_get_by_id(id);
+
+		if (sp == NULL) {
+			mutex_exit(&fil_system->mutex);
+			return(DB_TABLESPACE_NOT_FOUND);
+		}
+
+		fil_node_t*	node;
+
+		count = fil_check_pending_io(sp, &node, count);
+
+		if (count == 0) {
+			*path = mem_strdup(node->name);
 		}
+
 		mutex_exit(&fil_system->mutex);
-		os_thread_sleep(20000);
 
-		count++;
+		if (count > 0) {
+			os_thread_sleep(20000);
+		}
+
+	} while (count > 0);
+
+	ut_ad(sp);
+
+	*space = sp;
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Free all pages used by the tablespace.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: Transaction covering the close */
+	ulint		id)	/*!< in: space id */
+{
+	char*		path = 0;
+	fil_space_t*	space = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+
+	dberr_t		err = fil_check_pending_operations(id, &space, &path);
 
-		goto try_again;
+	if (err != DB_SUCCESS) {
+		return(err);
 	}
 
-	path = mem_strdup(space->name);
+	ut_a(space);
+	ut_a(path != 0);
+
+	rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+	/* Invalidate in the buffer pool all pages belonging to the
+	tablespace. Since we have set space->stop_new_ops = TRUE, readahead
+	or ibuf merge can no longer read more pages of this tablespace to the
+	buffer pool. Thus we can clean the tablespace out of the buffer pool
+	completely and permanently. The flag stop_new_ops also prevents
+	fil_flush() from being applied to this tablespace. */
+
+	buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
+#endif
+	mutex_enter(&fil_system->mutex);
+
+	/* If the free is successful, the X lock will be released before
+	the space memory data structure is freed. */
+
+	if (!fil_space_free(id, TRUE)) {
+		rw_lock_x_unlock(&space->latch);
+		err = DB_TABLESPACE_NOT_FOUND;
+	} else {
+		err = DB_SUCCESS;
+	}
 
 	mutex_exit(&fil_system->mutex);
 
+	/* If it is a delete then also delete any generated files, otherwise
+	when we drop the database the remove directory will fail. */
+
+	char*	cfg_name = fil_make_cfg_name(path);
+
+	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+	mem_free(path);
+	mem_free(cfg_name);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove)	/*!< in: specify the action to take
+					on the tables pages in the buffer
+					pool */
+{
+	char*		path = 0;
+	fil_space_t*	space = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+
+	dberr_t		err = fil_check_pending_operations(id, &space, &path);
+
+	if (err != DB_SUCCESS) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot delete tablespace %lu because it is not "
+			"found in the tablespace memory cache.",
+			(ulong) id);
+
+		return(err);
+	}
+
+	ut_a(space);
+	ut_a(path != 0);
+
 	/* Important: We rely on the data dictionary mutex to ensure
 	that a race is not possible here. It should serialize the tablespace
 	drop/free. We acquire an X latch only to avoid a race condition
@@ -2490,13 +2768,22 @@ try_again:
 	To deal with potential read requests by checking the
 	::stop_new_ops flag in fil_io() */
 
-	buf_LRU_flush_or_remove_pages(
-		id, evict_all
-		? BUF_REMOVE_ALL_NO_WRITE
-		: BUF_REMOVE_FLUSH_NO_WRITE);
+	buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
 
-#endif
-	/* printf("Deleting tablespace %s id %lu\n", space->name, id); */
+#endif /* !UNIV_HOTBACKUP */
+
+	/* If it is a delete then also delete any generated files, otherwise
+	when we drop the database the remove directory will fail. */
+	{
+		char*	cfg_name = fil_make_cfg_name(path);
+		os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+		mem_free(cfg_name);
+	}
+
+	/* Delete the link file pointing to the ibd file we are deleting. */
+	if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
+		fil_delete_link_file(space->name);
+	}
 
 	mutex_enter(&fil_system->mutex);
 
@@ -2505,25 +2792,28 @@ try_again:
 	if (fil_space_get_by_id(id)) {
 		ut_a(space->n_pending_ops == 0);
 		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
-		node = UT_LIST_GET_FIRST(space->chain);
+		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
 		ut_a(node->n_pending == 0);
 	}
 
-	success = fil_space_free(id, TRUE);
+	if (!fil_space_free(id, TRUE)) {
+		err = DB_TABLESPACE_NOT_FOUND;
+	}
 
 	mutex_exit(&fil_system->mutex);
 
-	if (success) {
-		success = os_file_delete(path);
-
-		if (!success) {
-			success = os_file_delete_if_exists(path);
-		}
-	} else {
+	if (err != DB_SUCCESS) {
 		rw_lock_x_unlock(&space->latch);
+	} else if (!os_file_delete(innodb_file_data_key, path)
+		   && !os_file_delete_if_exists(innodb_file_data_key, path)) {
+
+		/* Note: This is because we have removed the
+		tablespace instance from the cache. */
+
+		err = DB_IO_ERROR;
 	}
 
-	if (success) {
+	if (err == DB_SUCCESS) {
 #ifndef UNIV_HOTBACKUP
 		/* Write a log record about the deletion of the .ibd
 		file, so that ibbackup can replay it in the
@@ -2538,14 +2828,12 @@ try_again:
 		fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
 		mtr_commit(&mtr);
 #endif
-		mem_free(path);
-
-		return(TRUE);
+		err = DB_SUCCESS;
 	}
 
 	mem_free(path);
 
-	return(FALSE);
+	return(err);
 }
 
 /*******************************************************************//**
@@ -2577,36 +2865,49 @@ fil_tablespace_is_being_deleted(
 /*******************************************************************//**
 Discards a single-table tablespace. The tablespace must be cached in the
 memory cache. Discarding is like deleting a tablespace, but
-1) we do not drop the table from the data dictionary;
-2) we remove all insert buffer entries for the tablespace immediately; in DROP
-TABLE they are only removed gradually in the background;
-3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
-as it originally had.
-@return	TRUE if success */
+
+ 1. We do not drop the table from the data dictionary;
+
+ 2. We remove all insert buffer entries for the tablespace immediately;
+    in DROP TABLE they are only removed gradually in the background;
+
+ 3. Free all the pages in use by the tablespace.
+@return	DB_SUCCESS or error */
 UNIV_INTERN
-ibool
+dberr_t
 fil_discard_tablespace(
 /*===================*/
 	ulint	id)	/*!< in: space id */
 {
-	ibool	success;
+	dberr_t	err;
 
-	success = fil_delete_tablespace(id, TRUE);
+	switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) {
+	case DB_SUCCESS:
+		break;
 
-	if (!success) {
-		fprintf(stderr,
-			"InnoDB: Warning: cannot delete tablespace %lu"
-			" in DISCARD TABLESPACE.\n"
-			"InnoDB: But let us remove the"
-			" insert buffer entries for this tablespace.\n",
-			(ulong) id);
+	case DB_IO_ERROR:
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"While deleting tablespace %lu in DISCARD TABLESPACE."
+			" File rename/delete failed: %s",
+			(ulong) id, ut_strerr(err));
+		break;
+
+	case DB_TABLESPACE_NOT_FOUND:
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Cannot delete tablespace %lu in DISCARD "
+			"TABLESPACE. %s",
+			(ulong) id, ut_strerr(err));
+		break;
+
+	default:
+		ut_error;
 	}
 
 	/* Remove all insert buffer entries for the tablespace */
 
 	ibuf_delete_for_discarded_space(id);
 
-	return(success);
+	return(err);
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -2619,7 +2920,8 @@ fil_rename_tablespace_in_mem(
 /*=========================*/
 	fil_space_t*	space,	/*!< in: tablespace memory object */
 	fil_node_t*	node,	/*!< in: file node of that tablespace */
-	const char*	path)	/*!< in: new name */
+	const char*	new_name,	/*!< in: new name */
+	const char*	new_path)	/*!< in: new file path */
 {
 	fil_space_t*	space2;
 	const char*	old_name	= space->name;
@@ -2635,10 +2937,10 @@ fil_rename_tablespace_in_mem(
 		return(FALSE);
 	}
 
-	space2 = fil_space_get_by_name(path);
+	space2 = fil_space_get_by_name(new_name);
 	if (space2 != NULL) {
 		fputs("InnoDB: Error: ", stderr);
-		ut_print_filename(stderr, path);
+		ut_print_filename(stderr, new_name);
 		fputs(" is already in tablespace memory cache\n", stderr);
 
 		return(FALSE);
@@ -2649,11 +2951,11 @@ fil_rename_tablespace_in_mem(
 	mem_free(space->name);
 	mem_free(node->name);
 
-	space->name = mem_strdup(path);
-	node->name = mem_strdup(path);
+	space->name = mem_strdup(new_name);
+	node->name = mem_strdup(new_path);
 
 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
-		    ut_fold_string(path), space);
+		    ut_fold_string(new_name), space);
 	return(TRUE);
 }
 
@@ -2661,27 +2963,27 @@ fil_rename_tablespace_in_mem(
 Allocates a file name for a single-table tablespace. The string must be freed
 by caller with mem_free().
 @return	own: file name */
-static
+UNIV_INTERN
 char*
 fil_make_ibd_name(
 /*==============*/
-	const char*	name,		/*!< in: table name or a dir path of a
-					TEMPORARY table */
-	ibool		is_temp)	/*!< in: TRUE if it is a dir path */
+	const char*	name,		/*!< in: table name or a dir path */
+	bool		is_full_path)	/*!< in: TRUE if it is a dir path */
 {
+	char*	filename;
 	ulint	namelen		= strlen(name);
 	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
-	char*	filename	= mem_alloc(namelen + dirlen + sizeof "/.ibd");
+	ulint	pathlen		= dirlen + namelen + sizeof "/.ibd";
 
-	if (is_temp) {
+	filename = static_cast<char*>(mem_alloc(pathlen));
+
+	if (is_full_path) {
 		memcpy(filename, name, namelen);
 		memcpy(filename + namelen, ".ibd", sizeof ".ibd");
 	} else {
-		memcpy(filename, fil_path_to_mysql_datadir, dirlen);
-		filename[dirlen] = '/';
+		ut_snprintf(filename, pathlen, "%s/%s.ibd",
+			fil_path_to_mysql_datadir, name);
 
-		memcpy(filename + dirlen + 1, name, namelen);
-		memcpy(filename + dirlen + namelen + 1, ".ibd", sizeof ".ibd");
 	}
 
 	srv_normalize_path_for_win(filename);
@@ -2690,6 +2992,31 @@ fil_make_ibd_name(
 }
 
 /*******************************************************************//**
+Builds the path of the ISL (InnoDB Symbolic Link) file of a table, i.e.
+"<datadir>/<name>.isl". The caller owns the result: free it with mem_free().
+@return	own: file name */
+UNIV_INTERN
+char*
+fil_make_isl_name(
+/*==============*/
+	const char*	name)	/*!< in: table name */
+{
+	const char*	datadir = fil_path_to_mysql_datadir;
+
+	/* sizeof counts the NUL terminator of the "/.isl" literal, so the
+	buffer has room for the whole formatted path. */
+	ulint	buf_len = strlen(datadir) + strlen(name) + sizeof "/.isl";
+	char*	isl_name = static_cast<char*>(mem_alloc(buf_len));
+
+	ut_snprintf(isl_name, buf_len, "%s/%s.isl", datadir, name);
+
+	/* NOTE(review): presumably converts path separators on Windows. */
+	srv_normalize_path_for_win(isl_name);
+
+	return(isl_name);
+}
+
+/*******************************************************************//**
 Renames a single-table tablespace. The tablespace must be cached in the
 tablespace memory cache.
 @return	TRUE if success */
@@ -2697,36 +3024,39 @@ UNIV_INTERN
 ibool
 fil_rename_tablespace(
 /*==================*/
-	const char*	old_name,	/*!< in: old table name in the standard
-					databasename/tablename format of
-					InnoDB, or NULL if we do the rename
-					based on the space id only */
+	const char*	old_name_in,	/*!< in: old table name in the
+					standard databasename/tablename
+					format of InnoDB, or NULL if we
+					do the rename based on the space
+					id only */
 	ulint		id,		/*!< in: space id */
-	const char*	new_name)	/*!< in: new table name in the standard
-					databasename/tablename format
-					of InnoDB */
+	const char*	new_name,	/*!< in: new table name in the
+					standard databasename/tablename
+					format of InnoDB */
+	const char*	new_path_in)	/*!< in: new full datafile path
+					if the tablespace is remotely
+					located, or NULL if it is located
+					in the normal data directory. */
 {
 	ibool		success;
 	fil_space_t*	space;
 	fil_node_t*	node;
 	ulint		count		= 0;
-	char*		path;
-	ibool		old_name_was_specified		= TRUE;
+	char*		new_path;
+	char*		old_name;
 	char*		old_path;
+	const char*	not_given	= "(name not specified)";
 
 	ut_a(id != 0);
 
-	if (old_name == NULL) {
-		old_name = "(name not specified)";
-		old_name_was_specified = FALSE;
-	}
 retry:
 	count++;
 
 	if (!(count % 1000)) {
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Warning: problems renaming ", stderr);
-		ut_print_filename(stderr, old_name);
+		ut_print_filename(stderr,
+				  old_name_in ? old_name_in : not_given);
 		fputs(" to ", stderr);
 		ut_print_filename(stderr, new_name);
 		fprintf(stderr, ", %lu iterations\n", (ulong) count);
@@ -2736,13 +3066,14 @@ retry:
 
 	space = fil_space_get_by_id(id);
 
+	DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; );
+
 	if (space == NULL) {
-		fprintf(stderr,
-			"InnoDB: Error: cannot find space id %lu"
-			" in the tablespace memory cache\n"
-			"InnoDB: though the table ", (ulong) id);
-		ut_print_filename(stderr, old_name);
-		fputs(" in a rename operation should have that id\n", stderr);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot find space id %lu in the tablespace "
+			"memory cache, though the table '%s' in a "
+			"rename operation should have that id.",
+			(ulong) id, old_name_in ? old_name_in : not_given);
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
@@ -2761,11 +3092,16 @@ retry:
 
 	space->stop_ios = TRUE;
 
+	/* The following code must change when InnoDB supports
+	multiple datafiles per tablespace. */
 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
 	node = UT_LIST_GET_FIRST(space->chain);
 
-	if (node->n_pending > 0 || node->n_pending_flushes > 0) {
-		/* There are pending i/o's or flushes, sleep for a while and
+	if (node->n_pending > 0
+	    || node->n_pending_flushes > 0
+	    || node->being_extended) {
+		/* There are pending i/o's or flushes or the file is
+		currently being extended, sleep for a while and
 		retry */
 
 		mutex_exit(&fil_system->mutex);
@@ -2781,7 +3117,7 @@ retry:
 
 		os_thread_sleep(20000);
 
-		fil_flush(id, TRUE);
+		fil_flush(id);
 
 		goto retry;
 
@@ -2793,34 +3129,42 @@ retry:
 
 	/* Check that the old name in the space is right */
 
-	if (old_name_was_specified) {
-		old_path = fil_make_ibd_name(old_name, FALSE);
-
-		ut_a(strcmp(space->name, old_path) == 0);
-		ut_a(strcmp(node->name, old_path) == 0);
+	if (old_name_in) {
+		old_name = mem_strdup(old_name_in);
+		ut_a(strcmp(space->name, old_name) == 0);
 	} else {
-		old_path = mem_strdup(space->name);
+		old_name = mem_strdup(space->name);
 	}
+	old_path = mem_strdup(node->name);
 
 	/* Rename the tablespace and the node in the memory cache */
-	path = fil_make_ibd_name(new_name, FALSE);
-	success = fil_rename_tablespace_in_mem(space, node, path);
+	new_path = new_path_in ? mem_strdup(new_path_in)
+		: fil_make_ibd_name(new_name, false);
+
+	success = fil_rename_tablespace_in_mem(
+		space, node, new_name, new_path);
 
 	if (success) {
-		success = os_file_rename(innodb_file_data_key, old_path, path);
+
+		DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+			goto skip_second_rename; );
+
+		success = os_file_rename(
+			innodb_file_data_key, old_path, new_path);
+
+		DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+skip_second_rename:
+			success = FALSE; );
 
 		if (!success) {
 			/* We have to revert the changes we made
 			to the tablespace memory cache */
 
-			ut_a(fil_rename_tablespace_in_mem(space, node,
-							  old_path));
+			ut_a(fil_rename_tablespace_in_mem(
+					space, node, old_name, old_path));
 		}
 	}
 
-	mem_free(path);
-	mem_free(old_path);
-
 	space->stop_ios = FALSE;
 
 	mutex_exit(&fil_system->mutex);
@@ -2835,7 +3179,192 @@ retry:
 				 &mtr);
 		mtr_commit(&mtr);
 	}
-#endif
+#endif /* !UNIV_HOTBACKUP */
+
+	mem_free(new_path);
+	mem_free(old_path);
+	mem_free(old_name);
+
+	return(success);
+}
+
+/*******************************************************************//**
+Creates a new InnoDB Symbolic Link (ISL) file under the 'datadir' of MySQL
+and writes the tablespace path into it. If a link file already holding the
+same path exists, nothing is rewritten and DB_SUCCESS is returned.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_link_file(
+/*=================*/
+	const char*	tablename,	/*!< in: tablename */
+	const char*	filepath)	/*!< in: pathname of tablespace */
+{
+	os_file_t	file;
+	ibool		success;
+	dberr_t		err = DB_SUCCESS;
+	char*		link_filepath;
+	char*		prev_filepath = fil_read_link_file(tablename);
+
+	ut_ad(!srv_read_only_mode);
+
+	if (prev_filepath) {
+		/* Truncate will call this with an existing
+		link file which contains the same filepath. */
+		if (0 == strcmp(prev_filepath, filepath)) {
+			mem_free(prev_filepath);
+			return(DB_SUCCESS);
+		}
+		mem_free(prev_filepath);
+	}
+
+	link_filepath = fil_make_isl_name(tablename);
+
+	file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, link_filepath,
+		OS_FILE_CREATE, OS_FILE_READ_WRITE, &success);
+
+	if (!success) {
+		/* The following call will print an error message */
+		ulint	error = os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Cannot create file ", stderr);
+		ut_print_filename(stderr, link_filepath);
+		fputs(".\n", stderr);
+
+		if (error == OS_FILE_ALREADY_EXISTS) {
+			fputs("InnoDB: The link file: ", stderr);
+			ut_print_filename(stderr, link_filepath);
+			fputs(" already exists.\n", stderr);
+			err = DB_TABLESPACE_EXISTS;
+
+		} else if (error == OS_FILE_DISK_FULL) {
+			err = DB_OUT_OF_FILE_SPACE;
+
+		} else {
+			err = DB_ERROR;
+		}
+
+		/* file is not open, no need to close it. */
+		mem_free(link_filepath);
+		return(err);
+	}
+
+	if (!os_file_write(link_filepath, file, filepath, 0,
+			    strlen(filepath))) {
+		err = DB_ERROR;
+	}
+
+	/* Close the file, we only need it at startup */
+	os_file_close(file);
+
+	mem_free(link_filepath);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Removes the InnoDB Symbolic Link (ISL) file of a table, if there is one. */
+UNIV_INTERN
+void
+fil_delete_link_file(
+/*=================*/
+	const char*	tablename)	/*!< in: name of table */
+{
+	char*	isl_path = fil_make_isl_name(tablename);
+
+	os_file_delete_if_exists(innodb_file_data_key, isl_path);
+	/* The name string was allocated for us, so release it here. */
+	mem_free(isl_path);
+}
+
+/*******************************************************************//**
+Reads an InnoDB Symbolic Link (ISL) file, which lives under the 'datadir'
+of MySQL. The name has the form {databasename}/{tablename}, and the link
+file is expected at '{databasename}/{tablename}.isl'. The caller must free
+the returned null-terminated path with mem_free() if it is not NULL.
+@return	own: filepath found in link file, NULL if not found. */
+UNIV_INTERN
+char*
+fil_read_link_file(
+/*===============*/
+	const char*	name)		/*!< in: tablespace name */
+{
+	char*		filepath = NULL;
+	char*		link_filepath;
+	FILE*		file = NULL;
+
+	/* The .isl file is in the 'normal' tablespace location. */
+	link_filepath = fil_make_isl_name(name);
+
+	file = fopen(link_filepath, "r+b");
+
+	mem_free(link_filepath);
+
+	if (file) {
+		filepath = static_cast<char*>(mem_alloc(OS_FILE_MAX_PATH));
+
+		os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+		fclose(file);
+
+		if (strlen(filepath)) {
+			/* Trim trailing whitespace. Compare as unsigned:
+			with a signed char, bytes >= 0x80 would pass too. */
+			ulint lastch = strlen(filepath) - 1;
+			while (lastch > 4 && (byte) filepath[lastch] <= 0x20) {
+				filepath[lastch--] = 0x00;
+			}
+			srv_normalize_path_for_win(filepath);
+		}
+	}
+
+	return(filepath);
+}
+
+/*******************************************************************//**
+Opens a handle to the file linked to in an InnoDB Symbolic Link file.
+@return	TRUE if remote linked tablespace file is found and opened. */
+UNIV_INTERN
+ibool
+fil_open_linked_file(
+/*===============*/
+	const char*	tablename,	/*!< in: database/tablename */
+	char**		remote_filepath,/*!< out, own: remote filepath, or NULL */
+	os_file_t*	remote_file)	/*!< out: remote file handle */
+
+{
+	ibool		success;
+
+	*remote_filepath = fil_read_link_file(tablename);
+	if (*remote_filepath == NULL) {
+		return(FALSE);
+	}
+
+	/* Try to open, read-only, the datafile whose path was
+	just read from the link file. */
+	*remote_file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, *remote_filepath,
+		OS_FILE_OPEN, OS_FILE_READ_ONLY,
+		&success);
+
+	if (!success) {
+		char*	link_filepath = fil_make_isl_name(tablename);
+
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"A link file was found named '%s' "
+			"but the linked tablespace '%s' "
+			"could not be opened.",
+			link_filepath, *remote_filepath);
+
+		mem_free(link_filepath);
+		mem_free(*remote_filepath);
+		*remote_filepath = NULL;
+	}
+
 	return(success);
 }
 
@@ -2845,101 +3374,106 @@ Database directories are under the 'datadir' of MySQL. The datadir is the
 directory of a running mysqld program. We can refer to it by simply the
 path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
 dir of the mysqld server.
+
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 fil_create_new_single_table_tablespace(
 /*===================================*/
 	ulint		space_id,	/*!< in: space id */
 	const char*	tablename,	/*!< in: the table name in the usual
 					databasename/tablename format
-					of InnoDB, or a dir path to a temp
-					table */
-	ibool		is_temp,	/*!< in: TRUE if a table created with
-					CREATE TEMPORARY TABLE */
+					of InnoDB */
+	const char*	dir_path,	/*!< in: NULL or a dir path */
 	ulint		flags,		/*!< in: tablespace flags */
+	ulint		flags2,		/*!< in: table flags2 */
 	ulint		size)		/*!< in: the initial size of the
 					tablespace file in pages,
 					must be >= FIL_IBD_FILE_INITIAL_SIZE */
 {
 	os_file_t	file;
 	ibool		ret;
-	ulint		err;
+	dberr_t		err;
 	byte*		buf2;
 	byte*		page;
-	ibool		success;
 	char*		path;
+	ibool		success;
+	/* TRUE if a table is created with CREATE TEMPORARY TABLE */
+	bool		is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
+	bool		has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
 
 	ut_a(space_id > 0);
+	ut_ad(!srv_read_only_mode);
 	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
 	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
-
-	path = fil_make_ibd_name(tablename, is_temp);
-
-	file = os_file_create(innodb_file_data_key, path,
-			      OS_FILE_CREATE, OS_FILE_NORMAL,
-			      OS_DATA_FILE, &ret);
-	if (ret == FALSE) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error creating file ", stderr);
-		ut_print_filename(stderr, path);
-		fputs(".\n", stderr);
-
-		/* The following call will print an error message */
+	ut_a(fsp_flags_is_valid(flags));
 
-		err = os_file_get_last_error(TRUE);
-
-		if (err == OS_FILE_ALREADY_EXISTS) {
-			fputs("InnoDB: The file already exists though"
-			      " the corresponding table did not\n"
-			      "InnoDB: exist in the InnoDB data dictionary."
-			      " Have you moved InnoDB\n"
-			      "InnoDB: .ibd files around without using the"
-			      " SQL commands\n"
-			      "InnoDB: DISCARD TABLESPACE and"
-			      " IMPORT TABLESPACE, or did\n"
-			      "InnoDB: mysqld crash in the middle of"
-			      " CREATE TABLE? You can\n"
-			      "InnoDB: resolve the problem by"
-			      " removing the file ", stderr);
-			ut_print_filename(stderr, path);
-			fputs("\n"
-			      "InnoDB: under the 'datadir' of MySQL.\n",
-			      stderr);
-
-			mem_free(path);
-			return(DB_TABLESPACE_ALREADY_EXISTS);
+	if (is_temp) {
+		/* Temporary table filepath */
+		ut_ad(dir_path);
+		path = fil_make_ibd_name(dir_path, true);
+	} else if (has_data_dir) {
+		ut_ad(dir_path);
+		path = os_file_make_remote_pathname(dir_path, tablename, "ibd");
+
+		/* Since this tablespace file will be created in a
+		remote directory, let's create the subdirectories
+		in the path, if they are not there already. */
+		success = os_file_create_subdirs_if_needed(path);
+		if (!success) {
+			err = DB_ERROR;
+			goto error_exit_3;
 		}
+	} else {
+		path = fil_make_ibd_name(tablename, false);
+	}
+
+	file = os_file_create(
+		innodb_file_data_key, path,
+		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+		OS_FILE_NORMAL,
+		OS_DATA_FILE,
+		&ret);
 
-		if (err == OS_FILE_DISK_FULL) {
+	if (ret == FALSE) {
+		/* The following call will print an error message */
+		ulint	error = os_file_get_last_error(true);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot create file '%s'\n", path);
+
+		if (error == OS_FILE_ALREADY_EXISTS) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"The file '%s' already exists though the "
+				"corresponding table did not exist "
+				"in the InnoDB data dictionary. "
+				"Have you moved InnoDB .ibd files "
+				"around without using the SQL commands "
+				"DISCARD TABLESPACE and IMPORT TABLESPACE, "
+				"or did mysqld crash in the middle of "
+				"CREATE TABLE? "
+				"You can resolve the problem by removing "
+				"the file '%s' under the 'datadir' of MySQL.",
+				path, path);
+
+			err = DB_TABLESPACE_EXISTS;
+			goto error_exit_3;
+		}
 
-			mem_free(path);
-			return(DB_OUT_OF_FILE_SPACE);
+		if (error == OS_FILE_DISK_FULL) {
+			err = DB_OUT_OF_FILE_SPACE;
+			goto error_exit_3;
 		}
 
-		mem_free(path);
-		return(DB_ERROR);
+		err = DB_ERROR;
+		goto error_exit_3;
 	}
 
-	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0);
+	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE);
 
 	if (!ret) {
 		err = DB_OUT_OF_FILE_SPACE;
-error_exit:
-		os_file_close(file);
-error_exit2:
-		os_file_delete(path);
-
-		mem_free(path);
-		return(err);
+		goto error_exit_2;
 	}
 
 	/* printf("Creating tablespace %s id %lu\n", path, space_id); */
@@ -2953,25 +3487,26 @@ error_exit2:
 	with zeros from the call of os_file_set_size(), until a buffer pool
 	flush would write to it. */
 
-	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 	memset(page, '\0', UNIV_PAGE_SIZE);
 
+	/* Add the UNIV_PAGE_SIZE to the table flags and write them to the
+	tablespace header. */
+	flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
 	fsp_header_init_fields(page, space_id, flags);
 	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
 
-	if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+	if (!(fsp_flags_is_compressed(flags))) {
 		buf_flush_init_for_writing(page, NULL, 0);
-		ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE);
+		ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE);
 	} else {
 		page_zip_des_t	page_zip;
 		ulint		zip_size;
 
-		zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
-			    << ((flags & DICT_TF_ZSSIZE_MASK)
-				>> DICT_TF_ZSSIZE_SHIFT));
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		page_zip_set_size(&page_zip, zip_size);
 		page_zip.data = page + UNIV_PAGE_SIZE;
@@ -2981,1195 +3516,497 @@ error_exit2:
 			page_zip.m_end = page_zip.m_nonempty =
 			page_zip.n_blobs = 0;
 		buf_flush_init_for_writing(page, &page_zip, 0);
-		ret = os_file_write(path, file, page_zip.data, 0, 0, zip_size);
+		ret = os_file_write(path, file, page_zip.data, 0, zip_size);
 	}
 
 	ut_free(buf2);
 
 	if (!ret) {
-		fputs("InnoDB: Error: could not write the first page"
-		      " to tablespace ", stderr);
-		ut_print_filename(stderr, path);
-		putc('\n', stderr);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Could not write the first page to tablespace "
+			"'%s'", path);
+
 		err = DB_ERROR;
-		goto error_exit;
+		goto error_exit_2;
 	}
 
-	ret = os_file_flush(file, TRUE);
+	ret = os_file_flush(file);
 
 	if (!ret) {
-		fputs("InnoDB: Error: file flush of tablespace ", stderr);
-		ut_print_filename(stderr, path);
-		fputs(" failed\n", stderr);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"File flush of tablespace '%s' failed", path);
 		err = DB_ERROR;
-		goto error_exit;
+		goto error_exit_2;
 	}
 
-	os_file_close(file);
-
-	success = fil_space_create(path, space_id, flags, FIL_TABLESPACE);
+	if (has_data_dir) {
+		/* Now that the IBD file is created, make the ISL file. */
+		err = fil_create_link_file(tablename, path);
+		if (err != DB_SUCCESS) {
+			goto error_exit_2;
+		}
+	}
 
-	if (!success) {
+	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
+	if (!success || !fil_node_create(path, size, space_id, FALSE)) {
 		err = DB_ERROR;
-		goto error_exit2;
+		goto error_exit_1;
 	}
 
-	fil_node_create(path, size, space_id, FALSE);
-
 #ifndef UNIV_HOTBACKUP
 	{
 		mtr_t		mtr;
+		ulint		mlog_file_flag = 0;
+
+		if (is_temp) {
+			mlog_file_flag |= MLOG_FILE_FLAG_TEMP;
+		}
 
 		mtr_start(&mtr);
 
 		fil_op_write_log(flags
 				 ? MLOG_FILE_CREATE2
 				 : MLOG_FILE_CREATE,
-				 space_id,
-				 is_temp ? MLOG_FILE_FLAG_TEMP : 0,
-				 flags,
+				 space_id, mlog_file_flag, flags,
 				 tablename, NULL, &mtr);
 
 		mtr_commit(&mtr);
 	}
 #endif
-	mem_free(path);
-	return(DB_SUCCESS);
-}
-
-#ifndef UNIV_HOTBACKUP
-/********************************************************************//**
-It is possible, though very improbable, that the lsn's in the tablespace to be
-imported have risen above the current system lsn, if a lengthy purge, ibuf
-merge, or rollback was performed on a backup taken with ibbackup. If that is
-the case, reset page lsn's in the file. We assume that mysqld was shut down
-after it performed these cleanup operations on the .ibd file, so that it at
-the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
-first page of the .ibd file, and we can determine whether we need to reset the
-lsn's just by looking at that flush lsn.
-@return	TRUE if success */
-UNIV_INTERN
-ibool
-fil_reset_too_high_lsns(
-/*====================*/
-	const char*	name,		/*!< in: table name in the
-					databasename/tablename format */
-	ib_uint64_t	current_lsn)	/*!< in: reset lsn's if the lsn stamped
-					to FIL_PAGE_FILE_FLUSH_LSN in the
-					first page is too high */
-{
-	os_file_t	file;
-	char*		filepath;
-	byte*		page;
-	byte*		buf2;
-	ib_uint64_t	flush_lsn;
-	ulint		space_id;
-	ib_int64_t	file_size;
-	ib_int64_t	offset;
-	ulint		zip_size;
-	ibool		success;
-	page_zip_des_t	page_zip;
-
-	filepath = fil_make_ibd_name(name, FALSE);
-
-	file = os_file_create_simple_no_error_handling(
-		innodb_file_data_key, filepath, OS_FILE_OPEN,
-		OS_FILE_READ_WRITE, &success);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
+	err = DB_SUCCESS;
 
-		ut_print_timestamp(stderr);
-
-		fputs("  InnoDB: Error: trying to open a table,"
-		      " but could not\n"
-		      "InnoDB: open the tablespace file ", stderr);
-		ut_print_filename(stderr, filepath);
-		fputs("!\n", stderr);
-		mem_free(filepath);
-
-		return(FALSE);
-	}
-
-	/* Read the first page of the tablespace */
-
-	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
-	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
-
-	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
-	if (!success) {
-
-		goto func_exit;
-	}
-
-	/* We have to read the file flush lsn from the header of the file */
-
-	flush_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
-
-	if (current_lsn >= flush_lsn) {
-		/* Ok */
-		success = TRUE;
-
-		goto func_exit;
+	/* Error code is set.  Cleanup the various variables used.
+	These labels reflect the order in which variables are assigned or
+	actions are done. */
+error_exit_1:
+	if (has_data_dir && err != DB_SUCCESS) {
+		fil_delete_link_file(tablename);
 	}
-
-	space_id = fsp_header_get_space_id(page);
-	zip_size = fsp_header_get_zip_size(page);
-
-	page_zip_des_init(&page_zip);
-	page_zip_set_size(&page_zip, zip_size);
-	if (zip_size) {
-		page_zip.data = page + UNIV_PAGE_SIZE;
-	}
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		"  InnoDB: Flush lsn in the tablespace file %lu"
-		" to be imported\n"
-		"InnoDB: is %llu, which exceeds current"
-		" system lsn %llu.\n"
-		"InnoDB: We reset the lsn's in the file ",
-		(ulong) space_id,
-		flush_lsn, current_lsn);
-	ut_print_filename(stderr, filepath);
-	fputs(".\n", stderr);
-
-	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
-
-	/* Loop through all the pages in the tablespace and reset the lsn and
-	the page checksum if necessary */
-
-	file_size = os_file_get_size_as_iblonglong(file);
-
-	for (offset = 0; offset < file_size;
-	     offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
-		success = os_file_read(file, page,
-				       (ulint)(offset & 0xFFFFFFFFUL),
-				       (ulint)(offset >> 32),
-				       zip_size ? zip_size : UNIV_PAGE_SIZE);
-		if (!success) {
-
-			goto func_exit;
-		}
-		if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) {
-			/* We have to reset the lsn */
-
-			if (zip_size) {
-				memcpy(page_zip.data, page, zip_size);
-				buf_flush_init_for_writing(
-					page, &page_zip, current_lsn);
-				success = os_file_write(
-					filepath, file, page_zip.data,
-					(ulint) offset & 0xFFFFFFFFUL,
-					(ulint) (offset >> 32), zip_size);
-			} else {
-				buf_flush_init_for_writing(
-					page, NULL, current_lsn);
-				success = os_file_write(
-					filepath, file, page,
-					(ulint)(offset & 0xFFFFFFFFUL),
-					(ulint)(offset >> 32),
-					UNIV_PAGE_SIZE);
-			}
-
-			if (!success) {
-
-				goto func_exit;
-			}
-		}
-	}
-
-	success = os_file_flush(file, TRUE);
-	if (!success) {
-
-		goto func_exit;
-	}
-
-	/* We now update the flush_lsn stamp at the start of the file */
-	success = os_file_read(file, page, 0, 0,
-			       zip_size ? zip_size : UNIV_PAGE_SIZE);
-	if (!success) {
-
-		goto func_exit;
-	}
-
-	mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
-
-	success = os_file_write(filepath, file, page, 0, 0,
-				zip_size ? zip_size : UNIV_PAGE_SIZE);
-	if (!success) {
-
-		goto func_exit;
-	}
-	success = os_file_flush(file, TRUE);
-func_exit:
+error_exit_2:
 	os_file_close(file);
-	ut_free(buf2);
-	mem_free(filepath);
-
-	return(success);
-}
-
-/********************************************************************//**
-Checks if a page is corrupt. (for offline page)
-*/
-static
-ibool
-fil_page_buf_page_is_corrupted_offline(
-/*===================================*/
-	const byte*	page,		/*!< in: a database page */
-	ulint		zip_size)	/*!< in: size of compressed page;
-					0 for uncompressed pages */
-{
-	ulint		checksum_field;
-	ulint		old_checksum_field;
-
-	if (!zip_size
-	    && memcmp(page + FIL_PAGE_LSN + 4,
-		      page + UNIV_PAGE_SIZE
-		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
-		return(TRUE);
-	}
-
-	checksum_field = mach_read_from_4(page
-					  + FIL_PAGE_SPACE_OR_CHKSUM);
-
-	if (zip_size) {
-		return(checksum_field != BUF_NO_CHECKSUM_MAGIC
-		       && checksum_field
-		       != page_zip_calc_checksum(page, zip_size));
-	}
-
-	old_checksum_field = mach_read_from_4(
-		page + UNIV_PAGE_SIZE
-		- FIL_PAGE_END_LSN_OLD_CHKSUM);
-
-	if (old_checksum_field != mach_read_from_4(page
-						   + FIL_PAGE_LSN)
-	    && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
-	    && old_checksum_field
-	    != buf_calc_page_old_checksum(page)) {
-		return(TRUE);
-	}
-
-	if (!srv_fast_checksum
-	    && checksum_field != 0
-	    && checksum_field != BUF_NO_CHECKSUM_MAGIC
-	    && checksum_field
-	    != buf_calc_page_new_checksum(page)) {
-		return(TRUE);
-	}
-
-	if (srv_fast_checksum
-	    && checksum_field != 0
-	    && checksum_field != BUF_NO_CHECKSUM_MAGIC
-	    && checksum_field
-	    != buf_calc_page_new_checksum_32(page)
-	    && checksum_field
-	    != buf_calc_page_new_checksum(page)) {
-		return(TRUE);
+	if (err != DB_SUCCESS) {
+		os_file_delete(innodb_file_data_key, path);
 	}
+error_exit_3:
+	mem_free(path);
 
-	return(FALSE);
+	return(err);
 }
 
+#ifndef UNIV_HOTBACKUP
 /********************************************************************//**
-*/
+Report information about a bad tablespace. */
 static
 void
-fil_page_buf_page_store_checksum(
-/*=============================*/
-	byte*	page,
-	ulint	zip_size)
+fil_report_bad_tablespace(
+/*======================*/
+	const char*	filepath,	/*!< in: filepath */
+	const char*	check_msg,	/*!< in: page check message or NULL */
+	ulint		found_id,	/*!< in: found space ID */
+	ulint		found_flags,	/*!< in: found flags */
+	ulint		expected_id,	/*!< in: expected space id */
+	ulint		expected_flags)	/*!< in: expected flags */
 {
-	if (!zip_size) {
-		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
-				srv_use_checksums
-				? (!srv_fast_checksum
-				   ? buf_calc_page_new_checksum(page)
-				   : buf_calc_page_new_checksum_32(page))
-						: BUF_NO_CHECKSUM_MAGIC);
-		mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
-				srv_use_checksums
-				? buf_calc_page_old_checksum(page)
-						: BUF_NO_CHECKSUM_MAGIC);
-	} else {
-		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
-				srv_use_checksums
-				? page_zip_calc_checksum(page, zip_size)
-				: BUF_NO_CHECKSUM_MAGIC);
+	if (check_msg) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error %s in file '%s', "
+			"tablespace id=%lu, flags=%lu. "
+			"Please refer to "
+			REFMAN "innodb-troubleshooting-datadict.html "
+			"for how to resolve the issue.",
+			check_msg, filepath,
+			(ulong) expected_id, (ulong) expected_flags);
+		return;
 	}
+
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"In file '%s', tablespace id and flags are %lu and %lu, "
+		"but in the InnoDB data dictionary they are %lu and %lu. "
+		"Have you moved InnoDB .ibd files around without using the "
+		"commands DISCARD TABLESPACE and IMPORT TABLESPACE? "
+		"Please refer to "
+		REFMAN "innodb-troubleshooting-datadict.html "
+		"for how to resolve the issue.",
+		filepath, (ulong) found_id, (ulong) found_flags,
+		(ulong) expected_id, (ulong) expected_flags);
 }
 
+struct fsp_open_info {
+	ibool		success;	/*!< Was the file opened? */
+	const char*	check_msg;	/*!< First-page error text or NULL */
+	ibool		valid;		/*!< Header OK, id and flags match? */
+	os_file_t	file;		/*!< File handle */
+	char*		filepath;	/*!< File path to open */
+	lsn_t		lsn;		/*!< Flushed LSN from header page */
+	ulint		id;		/*!< Space ID read from the file */
+	ulint		flags;		/*!< Tablespace flags from the file */
+};
+
 /********************************************************************//**
-Tries to open a single-table tablespace and optionally checks the space id is
-right in it. If does not succeed, prints an error message to the .err log. This
-function is used to open a tablespace when we start up mysqld, and also in
-IMPORT TABLESPACE.
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If this does not succeed, print an error message
+to the .err log. This function is used to open a tablespace when we start
+mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
+
 NOTE that we assume this operation is used either at the database startup
 or under the protection of the dictionary mutex, so that two users cannot
 race here. This operation does not leave the file associated with the
 tablespace open, but closes it after we have looked at the space id in it.
-@return	TRUE if success */
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file.  This boolean may be initially FALSE, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@return	DB_SUCCESS or error code */
 UNIV_INTERN
-ibool
+dberr_t
 fil_open_single_table_tablespace(
 /*=============================*/
-	ibool		check_space_id,	/*!< in: should we check that the space
-					id in the file is right; we assume
-					that this function runs much faster
-					if no check is made, since accessing
-					the file inode probably is much
-					faster (the OS caches them) than
-					accessing the first page of the file */
+	bool		validate,	/*!< in: Do we validate tablespace? */
+	bool		fix_dict,	/*!< in: Can we fix the dictionary? */
 	ulint		id,		/*!< in: space id */
 	ulint		flags,		/*!< in: tablespace flags */
-	const char*	name,		/*!< in: table name in the
+	const char*	tablename,	/*!< in: table name in the
 					databasename/tablename format */
-	trx_t*		trx)		/*!< in: transaction. This is only used
-					for IMPORT TABLESPACE, must be NULL
-					otherwise */
+	const char*	path_in)	/*!< in: tablespace filepath */
 {
-	os_file_t	file;
-	char*		filepath;
-	ibool		success;
-	const char*	check_msg;
-	byte*		buf2;
-	byte*		page;
-	ulint		space_id;
-	ulint		space_flags;
-
-	filepath = fil_make_ibd_name(name, FALSE);
-
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
-
-	file = os_file_create_simple_no_error_handling(
-		innodb_file_data_key, filepath, OS_FILE_OPEN,
-		OS_FILE_READ_WRITE, &success);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-
-		ut_print_timestamp(stderr);
-
-		fputs("  InnoDB: Error: trying to open a table,"
-		      " but could not\n"
-		      "InnoDB: open the tablespace file ", stderr);
-		ut_print_filename(stderr, filepath);
-		fputs("!\n"
-		      "InnoDB: Have you moved InnoDB .ibd files around"
-		      " without using the\n"
-		      "InnoDB: commands DISCARD TABLESPACE and"
-		      " IMPORT TABLESPACE?\n"
-		      "InnoDB: It is also possible that this is"
-		      " a temporary table #sql...,\n"
-		      "InnoDB: and MySQL removed the .ibd file for this.\n"
-		      "InnoDB: Please refer to\n"
-		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
-		      "InnoDB: for how to resolve the issue.\n", stderr);
-
-		mem_free(filepath);
-
-		return(FALSE);
+	dberr_t		err = DB_SUCCESS;
+	bool		dict_filepath_same_as_default = false;
+	bool		link_file_found = false;
+	bool		link_file_is_bad = false;
+	fsp_open_info	def;
+	fsp_open_info	dict;
+	fsp_open_info	remote;
+	ulint		tablespaces_found = 0;
+	ulint		valid_tablespaces_found = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex)));
+
+	if (!fsp_flags_is_valid(flags)) {
+		return(DB_CORRUPTION);
+	}
+
+	/* If the tablespace was relocated, we do not
+	compare the DATA_DIR flag */
+	ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR;
+
+	memset(&def, 0, sizeof(def));
+	memset(&dict, 0, sizeof(dict));
+	memset(&remote, 0, sizeof(remote));
+
+	/* Discover the correct filepath.  We will always look for an ibd
+	in the default location. If it is remote, it should not be here. */
+	def.filepath = fil_make_ibd_name(tablename, false);
+
+	/* The path_in was read from SYS_DATAFILES. */
+	if (path_in) {
+		if (strcmp(def.filepath, path_in)) {
+			dict.filepath = mem_strdup(path_in);
+			/* possibility of multiple files. */
+			validate = true;
+		} else {
+			dict_filepath_same_as_default = true;
+		}
 	}
 
-	if (!check_space_id) {
-		space_id = id;
-
-		goto skip_check;
+	link_file_found = fil_open_linked_file(
+		tablename, &remote.filepath, &remote.file);
+	remote.success = link_file_found;
+	if (remote.success) {
+		/* possibility of multiple files. */
+		validate = true;
+		tablespaces_found++;
+
+		/* A link file was found. MySQL does not allow a DATA
+		DIRECTORY to be the same as the default filepath. */
+		ut_a(strcmp(def.filepath, remote.filepath));
+
+		/* If there was a filepath found in SYS_DATAFILES,
+		we hope it was the same as this remote.filepath found
+		in the ISL file. */
+		if (dict.filepath
+		    && (0 == strcmp(dict.filepath, remote.filepath))) {
+			remote.success = FALSE;
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+			remote.filepath = NULL;
+			tablespaces_found--;
+		}
 	}
 
-	/* Read the first page of the tablespace */
+	/* Attempt to open the tablespace at other possible filepaths. */
+	if (dict.filepath) {
+		dict.file = os_file_create_simple_no_error_handling(
+			innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
+			OS_FILE_READ_ONLY, &dict.success);
+		if (dict.success) {
+			/* possibility of multiple files. */
+			validate = true;
+			tablespaces_found++;
+		}
+	}
 
-	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
-	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
-
-	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
-
-	check_msg = fil_check_first_page(page);
-
-	/* We have to read the tablespace id and flags from the file. */
-
-	space_id = fsp_header_get_space_id(page);
-	space_flags = fsp_header_get_flags(page);
-
-	if (srv_expand_import) {
-
-		ibool		file_is_corrupt = FALSE;
-		byte*		buf3;
-		byte*		descr_page;
-		ibool		descr_is_corrupt = FALSE;
-		index_id_t	old_id[31];
-		index_id_t	new_id[31];
-		ulint		root_page[31];
-		ulint		n_index;
-		os_file_t	info_file = (os_file_t) -1;
-		char*		info_file_path;
-		ulint	i;
-		int		len;
-		ib_uint64_t	current_lsn;
-		ulint		size_low, size_high, size, free_limit;
-		ib_int64_t	size_bytes, free_limit_bytes;
-		dict_table_t*	table;
-		dict_index_t*	index;
-		fil_system_t*	system;
-		fil_node_t*	node = NULL;
-		fil_space_t*	space;
-		ulint		zip_size;
+	/* Always look for a file at the default location. */
+	ut_a(def.filepath);
+	def.file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+		OS_FILE_READ_ONLY, &def.success);
+	if (def.success) {
+		tablespaces_found++;
+	}
 
-		buf3 = ut_malloc(2 * UNIV_PAGE_SIZE);
-		descr_page = ut_align(buf3, UNIV_PAGE_SIZE);
+	/*  We have now checked all possible tablespace locations and
+	have a count of how many we found.  If things are normal, we
+	only found 1. */
+	if (!validate && tablespaces_found == 1) {
+		goto skip_validate;
+	}
 
-		current_lsn = log_get_lsn();
+	/* Read the first page of the datadir tablespace, if found. */
+	if (def.success) {
+		def.check_msg = fil_read_first_page(
+			def.file, FALSE, &def.flags, &def.id,
+			&def.lsn, &def.lsn);
+		def.valid = !def.check_msg;
 
-		/* check the header page's consistency */
-		if (buf_page_is_corrupted(TRUE, page,
-					  dict_table_flags_to_zip_size(space_flags))) {
-			fprintf(stderr, "InnoDB: page 0 of %s seems corrupt.\n", filepath);
-			file_is_corrupt = TRUE;
-			descr_is_corrupt = TRUE;
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (def.valid && def.id == id
+		    && (def.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			def.valid = false;
+			/* Do not use this tablespace. */
+			fil_report_bad_tablespace(
+				def.filepath, def.check_msg, def.id,
+				def.flags, id, flags);
 		}
+	}
 
-		/* store as first descr page */
-		memcpy(descr_page, page, UNIV_PAGE_SIZE);
-
-		zip_size = dict_table_flags_to_zip_size(flags);
-		ut_a(zip_size == dict_table_flags_to_zip_size(space_flags));
-
-		/* get free limit (page number) of the table space */
-/* these should be same to the definition in fsp0fsp.c */
-#define FSP_HEADER_OFFSET	FIL_PAGE_DATA
-#define	FSP_FREE_LIMIT		12
-		free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page);
-		free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)(zip_size ? zip_size : UNIV_PAGE_SIZE);
-
-		/* overwrite fsp header */
-		fsp_header_init_fields(page, id, flags);
-		mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
-		space_id = id;
-		space_flags = flags;
-		if (mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn)
-			mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
-
-		fil_page_buf_page_store_checksum(page, zip_size);
-
-		success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE);
-
-		/* get file size */
-		os_file_get_size(file, &size_low, &size_high);
-		size_bytes = (((ib_int64_t)size_high) << 32)
-				+ (ib_int64_t)size_low;
-
-		if (size_bytes < free_limit_bytes) {
-			free_limit_bytes = size_bytes;
-			if (size_bytes >= (lint)FSP_EXTENT_SIZE * (lint)(zip_size ? zip_size : UNIV_PAGE_SIZE)) {
-				fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath);
-				file_is_corrupt = TRUE;
-			}
-		}
+	/* Read the first page of the remote tablespace */
+	if (remote.success) {
+		remote.check_msg = fil_read_first_page(
+			remote.file, FALSE, &remote.flags, &remote.id,
+			&remote.lsn, &remote.lsn);
+		remote.valid = !remote.check_msg;
 
-		/* get cruster index information */
-		table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
-		index = dict_table_get_first_index(table);
-		ut_a(index->page==3);
-
-		/* read metadata from .exp file */
-		n_index = 0;
-		memset(old_id, 0, sizeof(old_id));
-		memset(new_id, 0, sizeof(new_id));
-		memset(root_page, 0, sizeof(root_page));
-
-		info_file_path = fil_make_ibd_name(name, FALSE);
-		len = strlen(info_file_path);
-		info_file_path[len - 3] = 'e';
-		info_file_path[len - 2] = 'x';
-		info_file_path[len - 1] = 'p';
-
-		info_file = os_file_create_simple_no_error_handling(innodb_file_data_key,
-				info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
-		if (!success) {
-			fprintf(stderr, "InnoDB: Cannot open the file: %s\n", info_file_path);
-			file_is_corrupt = TRUE;
-			goto skip_info;
-		}
-		success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE);
-		if (!success) {
-			fprintf(stderr, "InnoDB: Cannot read the file: %s\n", info_file_path);
-			file_is_corrupt = TRUE;
-			goto skip_info;
-		}
-		if (mach_read_from_4(page) != 0x78706f72UL
-		    || mach_read_from_4(page + 4) != 0x74696e66UL) {
-			fprintf(stderr, "InnoDB: %s seems to be an incorrect .exp file.\n", info_file_path);
-			file_is_corrupt = TRUE;
-			goto skip_info;
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (remote.valid && remote.id == id
+		    && (remote.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			remote.valid = false;
+			/* Do not use this linked tablespace. */
+			fil_report_bad_tablespace(
+				remote.filepath, remote.check_msg, remote.id,
+				remote.flags, id, flags);
+			link_file_is_bad = true;
 		}
+	}
 
-		fprintf(stderr, "InnoDB: Import: The extended import of %s is being started.\n", name);
+	/* Read the first page of the SYS_DATAFILES tablespace, if found. */
+	if (dict.success) {
+		dict.check_msg = fil_read_first_page(
+			dict.file, FALSE, &dict.flags, &dict.id,
+			&dict.lsn, &dict.lsn);
+		dict.valid = !dict.check_msg;
 
-		n_index = mach_read_from_4(page + 8);
-		fprintf(stderr, "InnoDB: Import: %lu indexes have been detected.\n", (ulong)n_index);
-		for (i = 0; i < n_index; i++) {
-			new_id[i] =
-				dict_table_get_index_on_name(table,
-						(char*)(page + (i + 1) * 512 + 12))->id;
-			old_id[i] = mach_read_from_8(page + (i + 1) * 512);
-			root_page[i] = mach_read_from_4(page + (i + 1) * 512 + 8);
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (dict.valid && dict.id == id
+		    && (dict.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			dict.valid = false;
+			/* Do not use this tablespace. */
+			fil_report_bad_tablespace(
+				dict.filepath, dict.check_msg, dict.id,
+				dict.flags, id, flags);
 		}
+	}
 
-skip_info:
-		if (info_file != (os_file_t) -1)
-			os_file_close(info_file);
-
-		/*
-		if (size_bytes >= 1024 * 1024) {
-			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+	/* Make sense of these three possible locations.
+	First, bail out if no tablespace files were found. */
+	if (valid_tablespaces_found == 0) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Could not find a valid tablespace file for '%s'. "
+			"See " REFMAN "innodb-troubleshooting-datadict.html "
+			"for how to resolve the issue.",
+			tablename);
+
+		err = DB_CORRUPTION;
+
+		goto cleanup_and_exit;
+	}
+
+	/* Do not open any tablespaces if more than one tablespace with
+	the correct space ID and flags were found. */
+	if (tablespaces_found > 1) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"A tablespace for %s has been found in "
+			"multiple places;", tablename);
+		if (def.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Default location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				def.filepath, def.lsn,
+				(ulong) def.id, (ulong) def.flags);
 		}
-		*/
-
-		if (zip_size) {
-			fprintf(stderr, "InnoDB: Warning: importing compressed table is still EXPERIMENTAL, currently.\n");
+		if (remote.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Remote location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				remote.filepath, remote.lsn,
+				(ulong) remote.id, (ulong) remote.flags);
+		}
+		if (dict.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Dictionary location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				dict.filepath, dict.lsn,
+				(ulong) dict.id, (ulong) dict.flags);
 		}
 
-		{
-			mem_heap_t*	heap = NULL;
-			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-			ulint*		offsets = offsets_;
-			ib_int64_t	offset;
-
-			size = (ulint) (size_bytes / (zip_size ? zip_size : UNIV_PAGE_SIZE));
-			/* over write space id of all pages */
-			rec_offs_init(offsets_);
-
-			/* Unlock the data dictionary to not block queries
-			accessing other tables */
-			ut_a(trx);
-			row_mysql_unlock_data_dictionary(trx);
-
-			fprintf(stderr, "InnoDB: Progress in %%:");
-
-			for (offset = 0; offset < free_limit_bytes;
-			     offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
-				ibool		page_is_corrupt;
-				ibool		is_descr_page = FALSE;
-
-				success = os_file_read(file, page,
-							(ulint)(offset & 0xFFFFFFFFUL),
-							(ulint)(offset >> 32),
-							zip_size ? zip_size : UNIV_PAGE_SIZE);
-
-				page_is_corrupt = FALSE;
-
-				/* check consistency */
-				if (fil_page_buf_page_is_corrupted_offline(page, zip_size)) {
-					page_is_corrupt = TRUE;
-				}
-
-				if (mach_read_from_4(page + FIL_PAGE_OFFSET)
-				    != offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) {
-
-					page_is_corrupt = TRUE;
-				}
-
-				/* if it is free page, inconsistency is acceptable */
-				if (!offset) {
-					/* header page*/
-					/* it should be overwritten already */
-					ut_a(!page_is_corrupt);
-
-				} else if (!((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE))
-					     % (zip_size ? zip_size : UNIV_PAGE_SIZE))) {
-					/* descr page (not header) */
-					if (page_is_corrupt) {
-						file_is_corrupt = TRUE;
-						descr_is_corrupt = TRUE;
-					} else {
-						ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES);
-						descr_is_corrupt = FALSE;
-					}
-
-					/* store as descr page */
-					memcpy(descr_page, page, (zip_size ? zip_size : UNIV_PAGE_SIZE));
-					is_descr_page = TRUE;
-
-				} else if (descr_is_corrupt) {
-					/* unknown state of the page */
-					if (page_is_corrupt) {
-						file_is_corrupt = TRUE;
-					}
-
-				} else {
-					/* check free page or not */
-					/* These definitions should be same to fsp0fsp.c */
-#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
-
-#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
-#define	XDES_BITS_PER_PAGE	2
-#define	XDES_FREE_BIT		0
-#define	XDES_SIZE							\
-	(XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
-#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
-
-					/*descr = descr_page + XDES_ARR_OFFSET + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)*/
-					/*xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)*/
-					byte*	descr;
-					ulint	index;
-					ulint	byte_index;
-					ulint	bit_index;
-
-					descr = descr_page + XDES_ARR_OFFSET
-						+ XDES_SIZE * (ut_2pow_remainder(
-							(offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)),
-							(zip_size ? zip_size : UNIV_PAGE_SIZE)) / FSP_EXTENT_SIZE);
-
-					index = XDES_FREE_BIT
-						+ XDES_BITS_PER_PAGE * ((offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)) % FSP_EXTENT_SIZE);
-					byte_index = index / 8;
-					bit_index = index % 8;
-
-					if (ut_bit_get_nth(mach_read_from_1(descr + XDES_BITMAP + byte_index), bit_index)) {
-						/* free page */
-						if (page_is_corrupt) {
-							goto skip_write;
-						}
-					} else {
-						/* not free */
-						if (page_is_corrupt) {
-							file_is_corrupt = TRUE;
-						}
-					}
-				}
-
-				if (page_is_corrupt) {
-					fprintf(stderr, " [errp:%ld]", (long)(offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)));
-
-					/* cannot treat corrupt page */
-					goto skip_write;
-				}
-
-				if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) {
-					mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
-
-					for (i = 0; i < n_index; i++) {
-						if (offset / (zip_size ? zip_size : UNIV_PAGE_SIZE) == root_page[i]) {
-							if (fil_page_get_type(page) != FIL_PAGE_INDEX) {
-								file_is_corrupt = TRUE;
-								fprintf(stderr, " [etyp:%ld]",
-									(long)(offset / (zip_size ? zip_size : UNIV_PAGE_SIZE)));
-								goto skip_write;
-							}
-							/* this is index root page */
-							mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
-											+ FSEG_HDR_SPACE, id);
-							mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
-											+ FSEG_HDR_SPACE, id);
-							break;
-						}
-					}
-
-					if (fil_page_get_type(page) ==
-					    FIL_PAGE_INDEX && !is_descr_page) {
-						index_id_t tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID));
-
-						for (i = 0; i < n_index; i++) {
-							if (old_id[i] == tmp) {
-								mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]);
-								break;
-							}
-						}
-
-						if (!zip_size && mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
-						    && old_id[0] == tmp) {
-							/* leaf page of cluster index, reset trx_id of records */
-							rec_t*	rec;
-							rec_t*	supremum;
-							ulint	n_recs;
-
-							supremum = page_get_supremum_rec(page);
-							rec = page_rec_get_next(page_get_infimum_rec(page));
-							n_recs = page_get_n_recs(page);
-
-							while (rec && rec != supremum && n_recs > 0) {
-								ulint	n_fields;
-								ulint	i;
-								ulint	offset = index->trx_id_offset;
-								offsets = rec_get_offsets(rec, index, offsets,
-										ULINT_UNDEFINED, &heap);
-								n_fields = rec_offs_n_fields(offsets);
-								if (!offset) {
-									offset = row_get_trx_id_offset(index, offsets);
-								}
-								trx_write_trx_id(rec + offset, 1);
-
-								for (i = 0; i < n_fields; i++) {
-									if (rec_offs_nth_extern(offsets, i)) {
-										ulint	local_len;
-										byte*	data;
-
-										data = rec_get_nth_field(rec, offsets, i, &local_len);
-
-										local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
-										mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id);
-									}
-								}
-
-								rec = page_rec_get_next(rec);
-								n_recs--;
-							}
-						} else if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
-							   && old_id[0] != tmp) {
-							mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 1);
-						}
-					}
-
-					if (mach_read_from_8(page + FIL_PAGE_LSN) > current_lsn) {
-						mach_write_to_8(page + FIL_PAGE_LSN, current_lsn);
-						if (!zip_size) {
-							mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
-									current_lsn);
-						}
-					}
-
-					fil_page_buf_page_store_checksum(page, zip_size);
-
-					success = os_file_write(filepath, file, page,
-								(ulint)(offset & 0xFFFFFFFFUL),
-								(ulint)(offset >> 32),
-								zip_size ? zip_size : UNIV_PAGE_SIZE);
-				}
-
-skip_write:
-				if (free_limit_bytes
-				    && ((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes)
-					!= ((offset * 100) / free_limit_bytes)) {
-					fprintf(stderr, " %lu",
-						(ulong)((ib_int64_t)((offset + (zip_size ? zip_size : UNIV_PAGE_SIZE)) * 100) / free_limit_bytes));
-				}
+		/* Force-recovery will allow some tablespaces to be
+		skipped by REDO if there was more than one file found.
+		Unlike during the REDO phase of recovery, we now know
+		if the tablespace is valid according to the dictionary,
+		which was not available then. So if we did not force
+		recovery and there is only one good tablespace, ignore
+		any bad tablespaces. */
+		if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Will not open the tablespace for '%s'",
+				tablename);
+
+			if (def.success != def.valid
+			    || dict.success != dict.valid
+			    || remote.success != remote.valid) {
+				err = DB_CORRUPTION;
+			} else {
+				err = DB_ERROR;
 			}
+			goto cleanup_and_exit;
+		}
 
-			fprintf(stderr, " done.\n");
-
-			/* Reacquire the data dictionary lock */
-			row_mysql_lock_data_dictionary(trx);
-
-			/* update SYS_INDEXES set root page */
-			index = dict_table_get_first_index(table);
-			while (index) {
-				for (i = 0; i < n_index; i++) {
-					if (new_id[i] == index->id) {
-						break;
-					}
-				}
-
-				if (i != n_index
-				    && root_page[i] != index->page) {
-					/* must update */
-					ulint	error;
-					trx_t*	trx;
-					pars_info_t*	info = NULL;
-
-					trx = trx_allocate_for_mysql();
-					trx->op_info = "extended import";
-
-					info = pars_info_create();
-
-					pars_info_add_ull_literal(info, "indexid", new_id[i]);
-					pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]);
-
-					error = que_eval_sql(info,
-						"PROCEDURE UPDATE_INDEX_PAGE () IS\n"
-						"BEGIN\n"
-						"UPDATE SYS_INDEXES"
-						" SET PAGE_NO = :new_page"
-						" WHERE ID = :indexid;\n"
-						"COMMIT WORK;\n"
-						"END;\n",
-						FALSE, trx);
-
-					if (error != DB_SUCCESS) {
-						fprintf(stderr, "InnoDB: failed to update SYS_INDEXES\n");
-					}
-
-					trx_commit_for_mysql(trx);
-
-					trx_free_for_mysql(trx);
-
-					index->page = root_page[i];
-				}
-
-				index = dict_table_get_next_index(index);
-			}
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
+		/* There is only one valid tablespace found and we did
+		not use srv_force_recovery during REDO.  Use this one
+		tablespace and clean up invalid tablespace pointers */
+		if (def.success && !def.valid) {
+			def.success = false;
+			os_file_close(def.file);
+			tablespaces_found--;
 		}
-		/* .exp file should be removed */
-		success = os_file_delete(info_file_path);
-		if (!success) {
-			success = os_file_delete_if_exists(info_file_path);
+		if (dict.success && !dict.valid) {
+			dict.success = false;
+			os_file_close(dict.file);
+			/* Leave dict.filepath so that SYS_DATAFILES
+			can be corrected below. */
+			tablespaces_found--;
+		}
+		if (remote.success && !remote.valid) {
+			remote.success = false;
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+			remote.filepath = NULL;
+			tablespaces_found--;
 		}
-		mem_free(info_file_path);
+	}
 
-		system	= fil_system;
-		mutex_enter(&(system->mutex));
-		space = fil_space_get_by_id(id);
-		if (space)
-			node = UT_LIST_GET_FIRST(space->chain);
-		if (node && node->size < size) {
-			space->size += (size - node->size);
-			node->size = size;
+	/* At this point, there should be only one filepath. */
+	ut_a(tablespaces_found == 1);
+	ut_a(valid_tablespaces_found == 1);
+
+	/* Only fix the dictionary at startup when there is only one thread.
+	Calls to dict_load_table() can be done while holding other latches. */
+	if (!fix_dict) {
+		goto skip_validate;
+	}
+
+	/* We may need to change what is stored in SYS_DATAFILES or
+	SYS_TABLESPACES or adjust the link file.
+	Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does
+	not prevent opening and using the single_table_tablespace either
+	this time or the next, we do not check the return code or fail
+	to open the tablespace. But dict_update_filepath() will issue a
+	warning to the log. */
+	if (dict.filepath) {
+		if (remote.success) {
+			dict_update_filepath(id, remote.filepath);
+		} else if (def.success) {
+			dict_update_filepath(id, def.filepath);
+			if (link_file_is_bad) {
+				fil_delete_link_file(tablename);
+			}
+		} else if (!link_file_found || link_file_is_bad) {
+			ut_ad(dict.success);
+			/* Fix the link file if we got our filepath
+			from the dictionary but a link file did not
+			exist or it did not point to a valid file. */
+			fil_delete_link_file(tablename);
+			fil_create_link_file(tablename, dict.filepath);
 		}
-		mutex_exit(&(system->mutex));
 
-		ut_free(buf3);
+	} else if (remote.success && dict_filepath_same_as_default) {
+		dict_update_filepath(id, remote.filepath);
 
-		if (file_is_corrupt) {
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: file ",
-			      stderr);
-			ut_print_filename(stderr, filepath);
-			fprintf(stderr, " seems to be corrupt.\n"
-				"InnoDB: An attempt to convert and salvage all corrupt pages was not made.\n"
-				"InnoDB: ##### CAUTION #####\n"
-				"InnoDB: ## The .ibd file may cause InnoDB to crash, even though its re-import seems to have succeeded.\n"
-				"InnoDB: ## If you don't know how to salvage data from a .ibd, you should not use the file.\n"
-				"InnoDB: ###################\n");
-			success = FALSE;
+	} else if (remote.success && path_in == NULL) {
+		/* SYS_DATAFILES record for this space ID was not found. */
+		dict_insert_tablespace_and_filepath(
+			id, tablename, remote.filepath, flags);
+	}
 
-			ut_free(buf2);
+skip_validate:
+	if (err != DB_SUCCESS) {
+		; // Don't load the tablespace into the cache
+	} else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) {
+		err = DB_ERROR;
+	} else {
+		/* We do not measure the size of the file, that is why
+		we pass the 0 below */
 
-			goto func_exit;
+		if (!fil_node_create(remote.success ? remote.filepath :
+				     dict.success ? dict.filepath :
+				     def.filepath, 0, id, FALSE)) {
+			err = DB_ERROR;
 		}
 	}
 
-	ut_free(buf2);
-
-	if (check_msg) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB: Error: %s in file ", check_msg);
-		ut_print_filename(stderr, filepath);
-		fprintf(stderr, " (tablespace id=%lu, flags=%lu)\n"
-			"InnoDB: Please refer to " REFMAN
-			"innodb-troubleshooting-datadict.html\n",
-			(ulong) id, (ulong) flags);
-		success = FALSE;
-		goto func_exit;
+cleanup_and_exit:
+	if (remote.success) {
+		os_file_close(remote.file);
 	}
-
-	if (space_id != id
-	    || space_flags != (flags & ~(~0 << DICT_TF_BITS))) {
-		ut_print_timestamp(stderr);
-
-		fputs("  InnoDB: Error: tablespace id and flags in file ",
-		      stderr);
-		ut_print_filename(stderr, filepath);
-		fprintf(stderr, " are %lu and %lu, but in the InnoDB\n"
-			"InnoDB: data dictionary they are %lu and %lu.\n"
-			"InnoDB: Have you moved InnoDB .ibd files"
-			" around without using the\n"
-			"InnoDB: commands DISCARD TABLESPACE and"
-			" IMPORT TABLESPACE?\n"
-			"InnoDB: Please refer to\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
-			"InnoDB: for how to resolve the issue.\n",
-			(ulong) space_id, (ulong) space_flags,
-			(ulong) id, (ulong) flags);
-
-		success = FALSE;
-
-		goto func_exit;
+	if (remote.filepath) {
+		mem_free(remote.filepath);
 	}
-
-skip_check:
-	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
-
-	if (!success) {
-		goto func_exit;
+	if (dict.success) {
+		os_file_close(dict.file);
 	}
-
-	/* We do not measure the size of the file, that is why we pass the 0
-	below */
-
-	fil_node_create(filepath, 0, space_id, FALSE);
-func_exit:
-	os_file_close(file);
-	mem_free(filepath);
-
-	if (srv_expand_import && dict_table_flags_to_zip_size(flags)) {
-		ulint		page_no;
-		ulint		zip_size;
-		ulint		height;
-		rec_t*		node_ptr;
-		dict_table_t*	table;
-		dict_index_t*	index;
-		buf_block_t*	block;
-		page_t*		page;
-		page_zip_des_t*	page_zip;
-		mtr_t		mtr;
-
-		mem_heap_t*	heap		= NULL;
-		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-		ulint*		offsets		= offsets_;
-
-		rec_offs_init(offsets_);
-
-		zip_size = dict_table_flags_to_zip_size(flags);
-
-		table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
-		index = dict_table_get_first_index(table);
-		page_no = dict_index_get_page(index);
-		ut_a(page_no == 3);
-
-		fprintf(stderr, "InnoDB: It is compressed .ibd file. need to convert additionaly on buffer pool.\n");
-
-		/* down to leaf */
-		mtr_start(&mtr);
-		mtr_set_log_mode(&mtr, MTR_LOG_NONE);
-
-		height = ULINT_UNDEFINED;
-
-		for (;;) {
-			block = buf_page_get(space_id, zip_size, page_no,
-					     RW_NO_LATCH, &mtr);
-			page = buf_block_get_frame(block);
-
-			block->check_index_page_at_flush = TRUE;
-
-			if (height == ULINT_UNDEFINED) {
-				height = btr_page_get_level(page, &mtr);
-			}
-
-			if (height == 0) {
-				break;
-			}
-
-			node_ptr = page_rec_get_next(page_get_infimum_rec(page));
-
-			height--;
-
-			offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
-			page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
-		}
-
-		mtr_commit(&mtr);
-
-		fprintf(stderr, "InnoDB: pages needs split are ...");
-
-		/* scan reaf pages */
-		while (page_no != FIL_NULL) {
-			rec_t*	rec;
-			rec_t*	supremum;
-			ulint	n_recs;
-
-			mtr_start(&mtr);
-
-			block = buf_page_get(space_id, zip_size, page_no,
-					     RW_X_LATCH, &mtr);
-			page = buf_block_get_frame(block);
-			page_zip = buf_block_get_page_zip(block);
-
-			if (!page_zip) {
-				/*something wrong*/
-				fprintf(stderr, "InnoDB: Something wrong with reading page %lu.\n", page_no);
-convert_err_exit:
-				mtr_commit(&mtr);
-				mutex_enter(&fil_system->mutex);
-				fil_space_free(space_id, FALSE);
-				mutex_exit(&fil_system->mutex);
-				success = FALSE;
-				goto convert_exit;
-			}
-
-			supremum = page_get_supremum_rec(page);
-			rec = page_rec_get_next(page_get_infimum_rec(page));
-			n_recs = page_get_n_recs(page);
-
-			/* illegal operation as InnoDB online system. so not logged */
-			while (rec && rec != supremum && n_recs > 0) {
-				ulint	n_fields;
-				ulint	i;
-				ulint	offset = index->trx_id_offset;
-
-				offsets = rec_get_offsets(rec, index, offsets,
-						ULINT_UNDEFINED, &heap);
-				n_fields = rec_offs_n_fields(offsets);
-				if (!offset) {
-					offset = row_get_trx_id_offset(index, offsets);
-				}
-				trx_write_trx_id(rec + offset, 1);
-
-				for (i = 0; i < n_fields; i++) {
-					if (rec_offs_nth_extern(offsets, i)) {
-						ulint	local_len;
-						byte*	data;
-
-						data = rec_get_nth_field(rec, offsets, i, &local_len);
-
-						local_len -= BTR_EXTERN_FIELD_REF_SIZE;
-
-						mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id);
-					}
-				}
-
-				rec = page_rec_get_next(rec);
-				n_recs--;
-			}
-
-			/* dummy logged update for along with modified page path */
-			if (index->id != btr_page_get_index_id(page)) {
-				/* this should be adjusted already */
-				fprintf(stderr, "InnoDB: The page %lu seems to be converted wrong.\n", page_no);
-				goto convert_err_exit;
-			}
-			btr_page_set_index_id(page, page_zip, index->id, &mtr);
-
-			/* confirm whether fits to the page size or not */
-			if (!page_zip_compress(page_zip, page, index, &mtr)
-			    && !btr_page_reorganize(block, index, &mtr)) {
-				buf_block_t*	new_block;
-				page_t*		new_page;
-				page_zip_des_t*	new_page_zip;
-				rec_t*		split_rec;
-				ulint		n_uniq;
-
-				/* split page is needed */
-				fprintf(stderr, " %lu", page_no);
-
-				mtr_x_lock(dict_index_get_lock(index), &mtr);
-
-				n_uniq = dict_index_get_n_unique_in_tree(index);
-
-				if(page_get_n_recs(page) < 2) {
-					/* no way to make smaller */
-					fprintf(stderr, "InnoDB: The page %lu cannot be store to the page size.\n", page_no);
-					goto convert_err_exit;
-				}
-
-				if (UNIV_UNLIKELY(page_no == dict_index_get_page(index))) {
-					ulint		new_page_no;
-					dtuple_t*	node_ptr;
-					ulint		level;
-					rec_t*		node_ptr_rec;
-					page_cur_t	page_cursor;
-
-					/* it is root page, need to raise before split */
-
-					level = btr_page_get_level(page, &mtr);
-
-					new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, &mtr, &mtr);
-					new_page = buf_block_get_frame(new_block);
-					new_page_zip = buf_block_get_page_zip(new_block);
-					btr_page_create(new_block, new_page_zip, index, level, &mtr);
-
-					btr_page_set_next(new_page, new_page_zip, FIL_NULL, &mtr);
-					btr_page_set_prev(new_page, new_page_zip, FIL_NULL, &mtr);
-
-					page_zip_copy_recs(new_page_zip, new_page,
-							   page_zip, page, index, &mtr);
-					btr_search_move_or_delete_hash_entries(new_block, block, index);
-
-					rec = page_rec_get_next(page_get_infimum_rec(new_page));
-					new_page_no = buf_block_get_page_no(new_block);
-
-					node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
-									     level);
-					dtuple_set_info_bits(node_ptr,
-							     dtuple_get_info_bits(node_ptr)
-							     | REC_INFO_MIN_REC_FLAG);
-					btr_page_empty(block, page_zip, index, level + 1, &mtr);
-
-					btr_page_set_next(page, page_zip, FIL_NULL, &mtr);
-					btr_page_set_prev(page, page_zip, FIL_NULL, &mtr);
-
-					page_cur_set_before_first(block, &page_cursor);
-
-					node_ptr_rec = page_cur_tuple_insert(&page_cursor, node_ptr,
-									     index, 0, &mtr);
-					ut_a(node_ptr_rec);
-
-					if (!btr_page_reorganize(block, index, &mtr)) {
-						fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no);
-						goto convert_err_exit;
-					}
-
-					/* move to the raised page */
-					page_no = new_page_no;
-					block = new_block;
-					page = new_page;
-					page_zip = new_page_zip;
-
-					fprintf(stderr, "(raise_to:%lu)", page_no);
-				}
-
-				split_rec = page_get_middle_rec(page);
-
-				new_block = btr_page_alloc(index, page_no + 1, FSP_UP,
-							   btr_page_get_level(page, &mtr), &mtr, &mtr);
-				new_page = buf_block_get_frame(new_block);
-				new_page_zip = buf_block_get_page_zip(new_block);
-				btr_page_create(new_block, new_page_zip, index,
-						btr_page_get_level(page, &mtr), &mtr);
-
-				offsets = rec_get_offsets(split_rec, index, offsets, n_uniq, &heap);
-
-				btr_attach_half_pages(index, block,
-						      split_rec, new_block, FSP_UP, &mtr);
-
-				page_zip_copy_recs(new_page_zip, new_page,
-						   page_zip, page, index, &mtr);
-				page_delete_rec_list_start(split_rec - page + new_page,
-							   new_block, index, &mtr);
-				btr_search_move_or_delete_hash_entries(new_block, block, index);
-				page_delete_rec_list_end(split_rec, block, index,
-							 ULINT_UNDEFINED, ULINT_UNDEFINED, &mtr);
-
-				fprintf(stderr, "(new:%lu)", buf_block_get_page_no(new_block));
-
-				/* Are they needed? */
-				if (!btr_page_reorganize(block, index, &mtr)) {
-					fprintf(stderr, "InnoDB: failed to store the page %lu.\n", page_no);
-					goto convert_err_exit;
-				}
-				if (!btr_page_reorganize(new_block, index, &mtr)) {
-					fprintf(stderr, "InnoDB: failed to store the page %lu.\n", buf_block_get_page_no(new_block));
-					goto convert_err_exit;
-				}
-			}
-
-			page_no = btr_page_get_next(page, &mtr);
-
-			mtr_commit(&mtr);
-
-			if (heap) {
-				mem_heap_empty(heap);
-			}
-		}
-
-		fprintf(stderr, "...done.\nInnoDB: waiting the flush batch of the additional conversion.\n");
-
-		/* should wait for the not-logged changes are all flushed */
-		buf_flush_list(ULINT_MAX, mtr.end_lsn + 1);
-		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-
-		fprintf(stderr, "InnoDB: done.\n");
-convert_exit:
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
+	if (dict.filepath) {
+		mem_free(dict.filepath);
+	}
+	if (def.success) {
+		os_file_close(def.file);
 	}
+	mem_free(def.filepath);
 
-	return(success);
+	return(err);
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -4185,46 +4022,125 @@ fil_make_ibbackup_old_name(
 	const char*	name)		/*!< in: original file name */
 {
 	static const char suffix[] = "_ibbackup_old_vers_";
+	char*	path;
 	ulint	len	= strlen(name);
-	char*	path	= mem_alloc(len + (15 + sizeof suffix));
+
+	path = static_cast<char*>(mem_alloc(len + (15 + sizeof suffix)));
 
 	memcpy(path, name, len);
 	memcpy(path + len, suffix, (sizeof suffix) - 1);
-	ut_sprintf_timestamp_without_extra_chars(path + len + sizeof suffix);
+	ut_sprintf_timestamp_without_extra_chars(
+		path + len + ((sizeof suffix) - 1));
 	return(path);
 }
 #endif /* UNIV_HOTBACKUP */
 
 /********************************************************************//**
 Opens an .ibd file and adds the associated single-table tablespace to the
-InnoDB fil0fil.c data structures. */
+InnoDB fil0fil.cc data structures.
+Set fsp->success to TRUE if tablespace is valid, FALSE if not. */
+static
+void
+fil_validate_single_table_tablespace(
+/*=================================*/
+	const char*	tablename,	/*!< in: database/tablename */
+	fsp_open_info*	fsp)		/*!< in/out: tablespace info */
+{
+	if (const char* check_msg = fil_read_first_page(
+		    fsp->file, FALSE, &fsp->flags, &fsp->id,
+		    &fsp->lsn, &fsp->lsn)) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"%s in tablespace %s (table %s)",
+			check_msg, fsp->filepath, tablename);
+		fsp->success = FALSE;
+		return;
+	}
+
+	if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tablespace is not sensible;"
+			" Table: %s  Space ID: %lu  Filepath: %s\n",
+		tablename, (ulong) fsp->id, fsp->filepath);
+		fsp->success = FALSE;
+		return;
+	}
+
+	mutex_enter(&fil_system->mutex);
+	fil_space_t* space = fil_space_get_by_id(fsp->id);
+	mutex_exit(&fil_system->mutex);
+	if (space != NULL) {
+		char* prev_filepath = fil_space_get_first_path(fsp->id);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Attempted to open a previously opened tablespace. "
+			"Previous tablespace %s uses space ID: %lu at "
+			"filepath: %s. Cannot open tablespace %s which uses "
+			"space ID: %lu at filepath: %s",
+			space->name, (ulong) space->id, prev_filepath,
+			tablename, (ulong) fsp->id, fsp->filepath);
+
+		mem_free(prev_filepath);
+		fsp->success = FALSE;
+		return;
+	}
+
+	fsp->success = TRUE;
+}
+
+
+/********************************************************************//**
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.cc data structures. */
 static
 void
 fil_load_single_table_tablespace(
 /*=============================*/
 	const char*	dbname,		/*!< in: database name */
 	const char*	filename)	/*!< in: file name (not a path),
-					including the .ibd extension */
+					including the .ibd or .isl extension */
 {
-	os_file_t	file;
-	char*		filepath;
-	ibool		success;
-	byte*		buf2;
-	byte*		page;
-	ulint		space_id;
-	ulint		flags;
-	ulint		size_low;
-	ulint		size_high;
-	ib_int64_t	size;
+	char*		tablename;
+	ulint		tablename_len;
+	ulint		dbname_len = strlen(dbname);
+	ulint		filename_len = strlen(filename);
+	fsp_open_info	def;
+	fsp_open_info	remote;
+	os_offset_t	size;
 #ifdef UNIV_HOTBACKUP
 	fil_space_t*	space;
 #endif
-	filepath = mem_alloc(strlen(dbname) + strlen(filename)
-			     + strlen(fil_path_to_mysql_datadir) + 3);
 
-	sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname,
-		filename);
-	srv_normalize_path_for_win(filepath);
+	memset(&def, 0, sizeof(def));
+	memset(&remote, 0, sizeof(remote));
+
+	/* The caller assured that the extension is ".ibd" or ".isl". */
+	ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4)
+	      || 0 == memcmp(filename + filename_len - 4, ".isl", 4));
+
+	/* Build up the tablename in the standard form database/table. */
+	tablename = static_cast<char*>(
+		mem_alloc(dbname_len + filename_len + 2));
+	sprintf(tablename, "%s/%s", dbname, filename);
+	tablename_len = strlen(tablename) - strlen(".ibd");
+	tablename[tablename_len] = '\0';
+
+	/* There may be both .ibd and .isl files in the directory.
+	And it is possible that the .isl file refers to a different
+	.ibd file.  If so, we open and compare them the first time
+	one of them is sent to this function.  So if this table has
+	already been loaded, there is nothing to do.*/
+	mutex_enter(&fil_system->mutex);
+	if (fil_space_get_by_name(tablename)) {
+		mem_free(tablename);
+		mutex_exit(&fil_system->mutex);
+		return;
+	}
+	mutex_exit(&fil_system->mutex);
+
+	/* Build up the filepath of the .ibd tablespace in the datadir.
+	This must be freed independent of def.success. */
+	def.filepath = fil_make_ibd_name(tablename, false);
+
 #ifdef __WIN__
 # ifndef UNIV_HOTBACKUP
 	/* If lower_case_table_names is 0 or 2, then MySQL allows database
@@ -4233,31 +4149,69 @@ fil_load_single_table_tablespace(
 	file path to lower case, so that we are consistent with InnoDB's
 	internal data dictionary. */
 
-	dict_casedn_str(filepath);
+	dict_casedn_str(def.filepath);
 # endif /* !UNIV_HOTBACKUP */
 #endif
-	file = os_file_create_simple_no_error_handling(
-		innodb_file_data_key, filepath, OS_FILE_OPEN,
-		OS_FILE_READ_ONLY, &success);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
 
+	/* Check for a link file which locates a remote tablespace. */
+	remote.success = fil_open_linked_file(
+		tablename, &remote.filepath, &remote.file);
+
+	/* Read the first page of the remote tablespace */
+	if (remote.success) {
+		fil_validate_single_table_tablespace(tablename, &remote);
+		if (!remote.success) {
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+			remote.filepath = NULL;	/* cleanup must not free it again */
+		}
+	}
+
+	/* Try to open the tablespace in the datadir. */
+	def.file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+		OS_FILE_READ_ONLY, &def.success);
+
+	/* Read the first page of the tablespace in the datadir */
+	if (def.success) {
+		fil_validate_single_table_tablespace(tablename, &def);
+		if (!def.success) {
+			os_file_close(def.file);
+		}
+	}
+
+	if (!def.success && !remote.success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+		fprintf(stderr,
+			"InnoDB: Error: could not open single-table"
+			" tablespace file %s\n", def.filepath);
+
+		if (!strncmp(filename,
+			     tmp_file_prefix, tmp_file_prefix_length)) {
+			/* Ignore errors for #sql tablespaces. */
+			mem_free(tablename);
+			if (remote.filepath) {
+				mem_free(remote.filepath);
+			}
+			if (def.filepath) {
+				mem_free(def.filepath);
+			}
+			return;
+		}
+no_good_file:
 		fprintf(stderr,
-			"InnoDB: Error: could not open single-table tablespace"
-			" file\n"
-			"InnoDB: %s!\n"
 			"InnoDB: We do not continue the crash recovery,"
 			" because the table may become\n"
-			"InnoDB: corrupt if we cannot apply the log records"
-			" in the InnoDB log to it.\n"
+			"InnoDB: corrupt if we cannot apply the log"
+			" records in the InnoDB log to it.\n"
 			"InnoDB: To fix the problem and start mysqld:\n"
 			"InnoDB: 1) If there is a permission problem"
 			" in the file and mysqld cannot\n"
 			"InnoDB: open the file, you should"
 			" modify the permissions.\n"
-			"InnoDB: 2) If the table is not needed, or you can"
-			" restore it from a backup,\n"
+			"InnoDB: 2) If the table is not needed, or you"
+			" can restore it from a backup,\n"
 			"InnoDB: then you can remove the .ibd file,"
 			" and InnoDB will do a normal\n"
 			"InnoDB: crash recovery and ignore that table.\n"
@@ -4266,63 +4220,21 @@ fil_load_single_table_tablespace(
 			"InnoDB: the .ibd file, you can set"
 			" innodb_force_recovery > 0 in my.cnf\n"
 			"InnoDB: and force InnoDB to continue crash"
-			" recovery here.\n", filepath);
-
-		mem_free(filepath);
-
-		if (srv_force_recovery > 0) {
-			fprintf(stderr,
-				"InnoDB: innodb_force_recovery"
-				" was set to %lu. Continuing crash recovery\n"
-				"InnoDB: even though we cannot access"
-				" the .ibd file of this table.\n",
-				srv_force_recovery);
-			return;
+			" recovery here.\n");
+will_not_choose:
+		mem_free(tablename);
+		if (remote.filepath) {
+			mem_free(remote.filepath);
+		}
+		if (def.filepath) {
+			mem_free(def.filepath);
 		}
-
-		exit(1);
-	}
-
-	success = os_file_get_size(file, &size_low, &size_high);
-
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-
-		fprintf(stderr,
-			"InnoDB: Error: could not measure the size"
-			" of single-table tablespace file\n"
-			"InnoDB: %s!\n"
-			"InnoDB: We do not continue crash recovery,"
-			" because the table will become\n"
-			"InnoDB: corrupt if we cannot apply the log records"
-			" in the InnoDB log to it.\n"
-			"InnoDB: To fix the problem and start mysqld:\n"
-			"InnoDB: 1) If there is a permission problem"
-			" in the file and mysqld cannot\n"
-			"InnoDB: access the file, you should"
-			" modify the permissions.\n"
-			"InnoDB: 2) If the table is not needed,"
-			" or you can restore it from a backup,\n"
-			"InnoDB: then you can remove the .ibd file,"
-			" and InnoDB will do a normal\n"
-			"InnoDB: crash recovery and ignore that table.\n"
-			"InnoDB: 3) If the file system or the disk is broken,"
-			" and you cannot remove\n"
-			"InnoDB: the .ibd file, you can set"
-			" innodb_force_recovery > 0 in my.cnf\n"
-			"InnoDB: and force InnoDB to continue"
-			" crash recovery here.\n", filepath);
-
-		os_file_close(file);
-		mem_free(filepath);
 
 		if (srv_force_recovery > 0) {
-			fprintf(stderr,
-				"InnoDB: innodb_force_recovery"
-				" was set to %lu. Continuing crash recovery\n"
-				"InnoDB: even though we cannot access"
-				" the .ibd file of this table.\n",
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"innodb_force_recovery was set to %lu. "
+				"Continuing crash recovery even though we "
+				"cannot access the .ibd file of this table.",
 				srv_force_recovery);
 			return;
 		}
@@ -4330,68 +4242,61 @@ fil_load_single_table_tablespace(
 		exit(1);
 	}
 
-	/* TODO: What to do in other cases where we cannot access an .ibd
-	file during a crash recovery? */
-
-	/* Every .ibd file is created >= 4 pages in size. Smaller files
-	cannot be ok. */
-
-	size = (((ib_int64_t)size_high) << 32) + (ib_int64_t)size_low;
-#ifndef UNIV_HOTBACKUP
-	if (size < FIL_IBD_FILE_INITIAL_SIZE * (lint)UNIV_PAGE_SIZE) {
-		fprintf(stderr,
-			"InnoDB: Error: the size of single-table tablespace"
-			" file %s\n"
-			"InnoDB: is only %lu %lu, should be at least %lu!",
-			filepath,
-			(ulong) size_high,
-			(ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE));
-		os_file_close(file);
-		mem_free(filepath);
+	if (def.success && remote.success) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tablespaces for %s have been found in two places;\n"
+			"Location 1: SpaceID: %lu  LSN: %lu  File: %s\n"
+			"Location 2: SpaceID: %lu  LSN: %lu  File: %s\n"
+			"You must delete one of them.",
+			tablename, (ulong) def.id, (ulong) def.lsn,
+			def.filepath, (ulong) remote.id, (ulong) remote.lsn,
+			remote.filepath);
 
-		return;
+		def.success = FALSE;
+		os_file_close(def.file);
+		os_file_close(remote.file);
+		goto will_not_choose;
 	}
-#endif
-	/* Read the first page of the tablespace if the size big enough */
-
-	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
-	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
 
-	if (size >= FIL_IBD_FILE_INITIAL_SIZE * (lint)UNIV_PAGE_SIZE) {
-		const char*	check_msg;
+	/* At this point, only one tablespace is open */
+	ut_a(def.success == !remote.success);
 
-		success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	fsp_open_info*	fsp = def.success ? &def : &remote;
 
-		/* We have to read the tablespace id from the file */
+	/* Get and test the file size. */
+	size = os_file_get_size(fsp->file);
 
-		check_msg = fil_check_first_page(page);
+	if (size == (os_offset_t) -1) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
 
-		if (check_msg) {
-			fprintf(stderr,
-				"InnoDB: Error: %s in file %s",
-				check_msg, filepath);
-			goto func_exit;
-		}
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"could not measure the size of single-table "
+			"tablespace file %s", fsp->filepath);
 
-		space_id = fsp_header_get_space_id(page);
-		flags = fsp_header_get_flags(page);
-	} else {
-		space_id = ULINT_UNDEFINED;
-		flags = 0;
+		os_file_close(fsp->file);
+		goto no_good_file;
 	}
 
+	/* Every .ibd file is created >= 4 pages in size. Smaller files
+	cannot be ok. */
+	ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE;
+	if (size < minimum_size) {
 #ifndef UNIV_HOTBACKUP
-	if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
-		fprintf(stderr,
-			"InnoDB: Error: tablespace id %lu in file %s"
-			" is not sensible\n",
-			(ulong) space_id,
-			filepath);
-		goto func_exit;
-	}
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"The size of single-table tablespace file %s "
+			"is only " UINT64PF ", should be at least %lu!",
+			fsp->filepath, size, minimum_size);
+		os_file_close(fsp->file);
+		goto no_good_file;
 #else
-	if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
+		fsp->id = ULINT_UNDEFINED;
+		fsp->flags = 0;
+#endif /* !UNIV_HOTBACKUP */
+	}
+
+#ifdef UNIV_HOTBACKUP
+	if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
 		char*	new_path;
 
 		fprintf(stderr,
@@ -4403,17 +4308,19 @@ fil_load_single_table_tablespace(
 			" is not sensible.\n"
 			"InnoDB: This can happen in an ibbackup run,"
 			" and is not dangerous.\n",
-			filepath, space_id, filepath, size);
-		os_file_close(file);
+			fsp->filepath, fsp->id, fsp->filepath, size);
+		os_file_close(fsp->file);
 
-		new_path = fil_make_ibbackup_old_name(filepath);
-		ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
+		new_path = fil_make_ibbackup_old_name(fsp->filepath);
+
+		bool	success = os_file_rename(
+			innodb_file_data_key, fsp->filepath, new_path);
+
+		ut_a(success);
 
-		ut_free(buf2);
-		mem_free(filepath);
 		mem_free(new_path);
 
-		return;
+		goto func_exit_after_close;
 	}
 
 	/* A backup may contain the same space several times, if the space got
@@ -4425,7 +4332,7 @@ fil_load_single_table_tablespace(
 
 	mutex_enter(&fil_system->mutex);
 
-	space = fil_space_get_by_id(space_id);
+	space = fil_space_get_by_id(fsp->id);
 
 	if (space) {
 		char*	new_path;
@@ -4437,50 +4344,64 @@ fil_load_single_table_tablespace(
 			"InnoDB: was scanned earlier. This can happen"
 			" if you have renamed tables\n"
 			"InnoDB: during an ibbackup run.\n",
-			filepath, space_id, filepath,
+			fsp->filepath, fsp->id, fsp->filepath,
 			space->name);
-		os_file_close(file);
+		os_file_close(fsp->file);
 
-		new_path = fil_make_ibbackup_old_name(filepath);
+		new_path = fil_make_ibbackup_old_name(fsp->filepath);
 
 		mutex_exit(&fil_system->mutex);
 
-		ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
+		bool	success = os_file_rename(
+			innodb_file_data_key, fsp->filepath, new_path);
+
+		ut_a(success);
 
-		ut_free(buf2);
-		mem_free(filepath);
 		mem_free(new_path);
 
-		return;
+		goto func_exit_after_close;
 	}
 	mutex_exit(&fil_system->mutex);
-#endif
-	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
-
-	if (!success) {
+#endif /* UNIV_HOTBACKUP */
+	ibool file_space_create_success = fil_space_create(
+		tablename, fsp->id, fsp->flags, FIL_TABLESPACE);
 
+	if (!file_space_create_success) {
 		if (srv_force_recovery > 0) {
 			fprintf(stderr,
-				"InnoDB: innodb_force_recovery"
-				" was set to %lu. Continuing crash recovery\n"
-				"InnoDB: even though the tablespace creation"
-				" of this table failed.\n",
+				"InnoDB: innodb_force_recovery was set"
+				" to %lu. Continuing crash recovery\n"
+				"InnoDB: even though the tablespace"
+				" creation of this table failed.\n",
 				srv_force_recovery);
 			goto func_exit;
 		}
 
-		exit(1);
+		/* Exit here with a core dump, stack, etc. */
+		ut_a(file_space_create_success);
 	}
 
 	/* We do not use the size information we have about the file, because
 	the rounding formula for extents and pages is somewhat complex; we
 	let fil_node_open() do that task. */
 
-	fil_node_create(filepath, 0, space_id, FALSE);
+	if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) {
+		ut_error;
+	}
+
 func_exit:
-	os_file_close(file);
-	ut_free(buf2);
-	mem_free(filepath);
+	os_file_close(fsp->file);
+
+#ifdef UNIV_HOTBACKUP
+func_exit_after_close:
+#else
+	ut_ad(!mutex_own(&fil_system->mutex));
+#endif
+	mem_free(tablename);
+	if (remote.success) {
+		mem_free(remote.filepath);
+	}
+	mem_free(def.filepath);
 }
 
 /***********************************************************************//**
@@ -4493,29 +4414,25 @@ static
 int
 fil_file_readdir_next_file(
 /*=======================*/
-	ulint*		err,	/*!< out: this is set to DB_ERROR if an error
+	dberr_t*	err,	/*!< out: this is set to DB_ERROR if an error
 				was encountered, otherwise not changed */
 	const char*	dirname,/*!< in: directory name or path */
 	os_file_dir_t	dir,	/*!< in: directory stream */
-	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
+	os_file_stat_t*	info)	/*!< in/out: buffer where the
+				info is returned */
 {
-	ulint	i;
-	int	ret;
-
-	for (i = 0; i < 100; i++) {
-		ret = os_file_readdir_next_file(dirname, dir, info);
+	for (ulint i = 0; i < 100; i++) {
+		int	ret = os_file_readdir_next_file(dirname, dir, info);
 
 		if (ret != -1) {
 
 			return(ret);
 		}
 
-		fprintf(stderr,
-			"InnoDB: Error: os_file_readdir_next_file()"
-			" returned -1 in\n"
-			"InnoDB: directory %s\n"
-			"InnoDB: Crash recovery may have failed"
-			" for some .ibd files!\n", dirname);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"os_file_readdir_next_file() returned -1 in "
+			"directory %s, crash recovery may have failed "
+			"for some .ibd files!", dirname);
 
 		*err = DB_ERROR;
 	}
@@ -4532,7 +4449,7 @@ in the doublewrite buffer, also to know where to apply log records where the
 space id is != 0.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 fil_load_single_table_tablespaces(void)
 /*===================================*/
 {
@@ -4543,7 +4460,7 @@ fil_load_single_table_tablespaces(void)
 	os_file_dir_t	dbdir;
 	os_file_stat_t	dbinfo;
 	os_file_stat_t	fileinfo;
-	ulint		err		= DB_SUCCESS;
+	dberr_t		err		= DB_SUCCESS;
 
 	/* The datadir of MySQL is always the default directory of mysqld */
 
@@ -4554,7 +4471,7 @@ fil_load_single_table_tablespaces(void)
 		return(DB_ERROR);
 	}
 
-	dbpath = mem_alloc(dbpath_len);
+	dbpath = static_cast<char*>(mem_alloc(dbpath_len));
 
 	/* Scan all directories under the datadir. They are the database
 	directories of MySQL. */
@@ -4583,16 +4500,15 @@ fil_load_single_table_tablespaces(void)
 				mem_free(dbpath);
 			}
 
-			dbpath = mem_alloc(dbpath_len);
+			dbpath = static_cast<char*>(mem_alloc(dbpath_len));
 		}
-		sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
-			dbinfo.name);
+		ut_snprintf(dbpath, dbpath_len,
+			    "%s/%s", fil_path_to_mysql_datadir, dbinfo.name);
 		srv_normalize_path_for_win(dbpath);
 
 		dbdir = os_file_opendir(dbpath, FALSE);
 
 		if (dbdir != NULL) {
-			/* printf("Opened dir %s\n", dbinfo.name); */
 
 			/* We found a database directory; loop through it,
 			looking for possible .ibd files in it */
@@ -4600,8 +4516,6 @@ fil_load_single_table_tablespaces(void)
 			ret = fil_file_readdir_next_file(&err, dbpath, dbdir,
 							 &fileinfo);
 			while (ret == 0) {
-				/* printf(
-				"     Looking at file %s\n", fileinfo.name); */
 
 				if (fileinfo.type == OS_FILE_TYPE_DIR) {
 
@@ -4610,11 +4524,14 @@ fil_load_single_table_tablespaces(void)
 
 				/* We found a symlink or a file */
 				if (strlen(fileinfo.name) > 4
-				    && 0 == strcmp(fileinfo.name
+				    && (0 == strcmp(fileinfo.name
+						   + strlen(fileinfo.name) - 4,
+						   ".ibd")
+					|| 0 == strcmp(fileinfo.name
 						   + strlen(fileinfo.name) - 4,
-						   ".ibd")) {
-					/* The name ends in .ibd; try opening
-					the file */
+						   ".isl"))) {
+					/* The name ends in .ibd or .isl;
+					try opening the file */
 					fil_load_single_table_tablespace(
 						dbinfo.name, fileinfo.name);
 				}
@@ -4655,7 +4572,7 @@ next_datadir_item:
 /*******************************************************************//**
 Returns TRUE if a single-table tablespace does not exist in the memory cache,
 or is being deleted there.
-@return	TRUE if does not exist or is being\ deleted */
+@return	TRUE if does not exist or is being deleted */
 UNIV_INTERN
 ibool
 fil_tablespace_deleted_or_being_deleted_in_mem(
@@ -4714,6 +4631,29 @@ fil_tablespace_exists_in_mem(
 }
 
 /*******************************************************************//**
+Report that a tablespace for a table was not found. */
+static
+void
+fil_report_missing_tablespace(
+/*===========================*/
+	const char*	name,			/*!< in: table name */
+	ulint		space_id)		/*!< in: table's space id */
+{
+	char index_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(index_name, sizeof(index_name), name, TRUE);
+
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"Table %s in the InnoDB data dictionary has tablespace id %lu, "
+		"but tablespace with that id or name does not exist. Have "
+		"you deleted or moved .ibd files? This may also be a table "
+		"created with CREATE TEMPORARY TABLE whose .ibd and .frm "
+		"files MySQL automatically removed, but the table still "
+		"exists in the InnoDB internal data dictionary.",
+		name, space_id);
+}
+
+/*******************************************************************//**
 Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
 cache. Note that if we have not done a crash recovery at the database startup,
 there may be many tablespaces which are not yet in the memory cache.
@@ -4723,33 +4663,33 @@ ibool
 fil_space_for_table_exists_in_mem(
 /*==============================*/
 	ulint		id,		/*!< in: space id */
-	const char*	name,		/*!< in: table name in the standard
-					'databasename/tablename' format or
-					the dir path to a temp table */
-	ibool		is_temp,	/*!< in: TRUE if created with CREATE
-					TEMPORARY TABLE */
+	const char*	name,		/*!< in: table name used in
+					fil_space_create().  Either the
+					standard 'dbname/tablename' format
+					or table->dir_path_of_temp_table */
 	ibool		mark_space,	/*!< in: in crash recovery, at database
 					startup we mark all spaces which have
 					an associated table in the InnoDB
 					data dictionary, so that
 					we can print a warning about orphaned
 					tablespaces */
-	ibool		print_error_if_does_not_exist)
+	ibool		print_error_if_does_not_exist,
 					/*!< in: print detailed error
 					information to the .err log if a
 					matching tablespace is not found from
 					memory */
+	bool		adjust_space,	/*!< in: whether to adjust space id
+					when a tablespace mismatch is found */
+	mem_heap_t*	heap,		/*!< in: heap memory */
+	table_id_t	table_id)	/*!< in: table id */
 {
-	fil_space_t*	namespace;
+	fil_space_t*	fnamespace;
 	fil_space_t*	space;
-	char*		path;
 
 	ut_ad(fil_system);
 
 	mutex_enter(&fil_system->mutex);
 
-	path = fil_make_ibd_name(name, is_temp);
-
 	/* Look if there is a space with the same id */
 
 	space = fil_space_get_by_id(id);
@@ -4757,15 +4697,55 @@ fil_space_for_table_exists_in_mem(
 	/* Look if there is a space with the same name; the name is the
 	directory path from the datadir to the file */
 
-	namespace = fil_space_get_by_name(path);
-	if (space && space == namespace) {
+	fnamespace = fil_space_get_by_name(name);
+	if (space && space == fnamespace) {
 		/* Found */
 
 		if (mark_space) {
 			space->mark = TRUE;
 		}
 
-		mem_free(path);
+		mutex_exit(&fil_system->mutex);
+
+		return(TRUE);
+	}
+
+	/* Info from "fnamespace" comes from the ibd file itself; it can
+	differ from the data obtained from the system tables since it is
+	not transactional. If adjust_space is set, and the mismatch is
+	between a user table and its temp table, we shall adjust the
+	ibd file name according to the system table info. */
+	if (adjust_space
+	    && space != NULL
+	    && row_is_mysql_tmp_table_name(space->name)
+	    && !row_is_mysql_tmp_table_name(name)) {
+
+		mutex_exit(&fil_system->mutex);
+
+		DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space",
+				DBUG_SUICIDE(););
+
+		if (fnamespace) {
+			char*	tmp_name;
+
+			tmp_name = dict_mem_create_temporary_tablename(
+				heap, name, table_id);
+
+			fil_rename_tablespace(fnamespace->name, fnamespace->id,
+					      tmp_name, NULL);
+		}
+
+		DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space",
+				DBUG_SUICIDE(););
+
+		fil_rename_tablespace(space->name, id, name, NULL);
+
+		DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space",
+				DBUG_SUICIDE(););
+
+		mutex_enter(&fil_system->mutex);
+		fnamespace = fil_space_get_by_name(name);
+		ut_ad(space == fnamespace);
 		mutex_exit(&fil_system->mutex);
 
 		return(TRUE);
@@ -4773,30 +4753,16 @@ fil_space_for_table_exists_in_mem(
 
 	if (!print_error_if_does_not_exist) {
 
-		mem_free(path);
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
 	}
 
 	if (space == NULL) {
-		if (namespace == NULL) {
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: table ", stderr);
-			ut_print_filename(stderr, name);
-			fprintf(stderr, "\n"
-				"InnoDB: in InnoDB data dictionary"
-				" has tablespace id %lu,\n"
-				"InnoDB: but tablespace with that id"
-				" or name does not exist. Have\n"
-				"InnoDB: you deleted or moved .ibd files?\n"
-				"InnoDB: This may also be a table created with"
-				" CREATE TEMPORARY TABLE\n"
-				"InnoDB: whose .ibd and .frm files"
-				" MySQL automatically removed, but the\n"
-				"InnoDB: table still exists in the"
-				" InnoDB internal data dictionary.\n",
-				(ulong) id);
+		if (fnamespace == NULL) {
+			/* print_error_if_does_not_exist is known to be
+			set here: the FALSE case returned earlier. */
+			fil_report_missing_tablespace(name, id);
 		} else {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Error: table ", stderr);
@@ -4809,21 +4775,20 @@ fil_space_for_table_exists_in_mem(
 				"InnoDB: a tablespace of name %s and id %lu,"
 				" though. Have\n"
 				"InnoDB: you deleted or moved .ibd files?\n",
-				(ulong) id, namespace->name,
-				(ulong) namespace->id);
+				(ulong) id, fnamespace->name,
+				(ulong) fnamespace->id);
 		}
 error_exit:
 		fputs("InnoDB: Please refer to\n"
 		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
 		      "InnoDB: for how to resolve the issue.\n", stderr);
 
-		mem_free(path);
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
 	}
 
-	if (0 != strcmp(space->name, path)) {
+	if (0 != strcmp(space->name, name)) {
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Error: table ", stderr);
 		ut_print_filename(stderr, name);
@@ -4835,19 +4800,18 @@ error_exit:
 			"InnoDB: Have you deleted or moved .ibd files?\n",
 			(ulong) id, space->name);
 
-		if (namespace != NULL) {
+		if (fnamespace != NULL) {
 			fputs("InnoDB: There is a tablespace"
 			      " with the right name\n"
 			      "InnoDB: ", stderr);
-			ut_print_filename(stderr, namespace->name);
+			ut_print_filename(stderr, fnamespace->name);
 			fprintf(stderr, ", but its id is %lu.\n",
-				(ulong) namespace->id);
+				(ulong) fnamespace->id);
 		}
 
 		goto error_exit;
 	}
 
-	mem_free(path);
 	mutex_exit(&fil_system->mutex);
 
 	return(FALSE);
@@ -4857,34 +4821,28 @@ error_exit:
 Checks if a single-table tablespace for a given table name exists in the
 tablespace memory cache.
 @return	space id, ULINT_UNDEFINED if not found */
-static
+UNIV_INTERN
 ulint
 fil_get_space_id_for_table(
 /*=======================*/
-	const char*	name)	/*!< in: table name in the standard
+	const char*	tablename)	/*!< in: table name in the standard
 				'databasename/tablename' format */
 {
-	fil_space_t*	namespace;
+	fil_space_t*	fnamespace;
 	ulint		id		= ULINT_UNDEFINED;
-	char*		path;
 
 	ut_ad(fil_system);
 
 	mutex_enter(&fil_system->mutex);
 
-	path = fil_make_ibd_name(name, FALSE);
-
-	/* Look if there is a space with the same name; the name is the
-	directory path to the file */
+	/* Look if there is a space with the same name. */
 
-	namespace = fil_space_get_by_name(path);
+	fnamespace = fil_space_get_by_name(tablename);
 
-	if (namespace) {
-		id = namespace->id;
+	if (fnamespace) {
+		id = fnamespace->id;
 	}
 
-	mem_free(path);
-
 	mutex_exit(&fil_system->mutex);
 
 	return(id);
@@ -4914,15 +4872,16 @@ fil_extend_space_to_desired_size(
 	ulint		buf_size;
 	ulint		start_page_no;
 	ulint		file_start_page_no;
-	ulint		offset_high;
-	ulint		offset_low;
 	ulint		page_size;
-	ibool		success		= TRUE;
+	ulint		pages_added;
+	ibool		success;
+
+	ut_ad(!srv_read_only_mode);
+
+retry:
+	pages_added = 0;
+	success = TRUE;
 
-	/* file_extend_mutex is for http://bugs.mysql.com/56433 */
-	/* to protect from the other fil_extend_space_to_desired_size() */
-	/* during temprary releasing &fil_system->mutex */
-	mutex_enter(&fil_system->file_extend_mutex);
 	fil_mutex_enter_and_prepare_for_io(space_id);
 
 	space = fil_space_get_by_id(space_id);
@@ -4934,99 +4893,121 @@ fil_extend_space_to_desired_size(
 		*actual_size = space->size;
 
 		mutex_exit(&fil_system->mutex);
-		mutex_exit(&fil_system->file_extend_mutex);
 
 		return(TRUE);
 	}
 
-	page_size = dict_table_flags_to_zip_size(space->flags);
+	page_size = fsp_flags_get_zip_size(space->flags);
 	if (!page_size) {
 		page_size = UNIV_PAGE_SIZE;
 	}
 
 	node = UT_LIST_GET_LAST(space->chain);
 
-	fil_node_prepare_for_io(node, fil_system, space);
+	if (!node->being_extended) {
+		/* Mark this node as undergoing extension. This flag
+		is used by other threads to wait for the extension
+		operation to finish. */
+		node->being_extended = TRUE;
+	} else {
+		/* Another thread is currently extending the file. Wait
+		for it to finish.
+		It'd have been better to use event driven mechanism but
+		the entire module is peppered with polling stuff. */
+		mutex_exit(&fil_system->mutex);
+		os_thread_sleep(100000);
+		goto retry;
+	}
+
+	if (!fil_node_prepare_for_io(node, fil_system, space)) {
+		/* The tablespace data file, such as .ibd file, is missing */
+		node->being_extended = false;
+		mutex_exit(&fil_system->mutex);
+
+		return(false);
+	}
+
+	/* At this point it is safe to release fil_system mutex. No
+	other thread can rename, delete or close the file because
+	we have set the node->being_extended flag. */
+	mutex_exit(&fil_system->mutex);
 
 	start_page_no = space->size;
 	file_start_page_no = space->size - node->size;
 
 #ifdef HAVE_POSIX_FALLOCATE
 	if (srv_use_posix_fallocate) {
-		offset_high = (size_after_extend - file_start_page_no)
-			* page_size / (4ULL * 1024 * 1024 * 1024);
-		offset_low = (size_after_extend - file_start_page_no)
-			* page_size % (4ULL * 1024 * 1024 * 1024);
 
-		mutex_exit(&fil_system->mutex);
+		/* fil_system->mutex was already released above. */
 		success = os_file_set_size(node->name, node->handle,
-					   offset_low, offset_high);
+					   (os_offset_t) (size_after_extend
+					    - file_start_page_no) * page_size);
 		mutex_enter(&fil_system->mutex);
 		if (success) {
 			node->size += (size_after_extend - start_page_no);
 			space->size += (size_after_extend - start_page_no);
 			os_has_said_disk_full = FALSE;
 		}
+		node->being_extended = FALSE;
 		goto complete_io;
 	}
 #endif
 
 	/* Extend at most 64 pages at a time */
 	buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
-	buf2 = mem_alloc(buf_size + page_size);
-	buf = ut_align(buf2, page_size);
+	buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size));
+	buf = static_cast<byte*>(ut_align(buf2, page_size));
 
 	memset(buf, 0, buf_size);
 
 	while (start_page_no < size_after_extend) {
-		ulint	n_pages = ut_min(buf_size / page_size,
-					 size_after_extend - start_page_no);
+		ulint		n_pages
+			= ut_min(buf_size / page_size,
+				 size_after_extend - start_page_no);
 
-		offset_high = (start_page_no - file_start_page_no)
-			/ (4096 * ((1024 * 1024) / page_size));
-		offset_low  = ((start_page_no - file_start_page_no)
-			       % (4096 * ((1024 * 1024) / page_size)))
+		os_offset_t	offset
+			= ((os_offset_t) (start_page_no - file_start_page_no))
 			* page_size;
-
-		mutex_exit(&fil_system->mutex);
 #ifdef UNIV_HOTBACKUP
 		success = os_file_write(node->name, node->handle, buf,
-					offset_low, offset_high,
-					page_size * n_pages);
+					offset, page_size * n_pages);
 #else
 		success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
 				 node->name, node->handle, buf,
-				 offset_low, offset_high,
-				 page_size * n_pages,
+				 offset, page_size * n_pages,
 				 NULL, NULL, space_id, NULL);
-#endif
-		mutex_enter(&fil_system->mutex);
-
+#endif /* UNIV_HOTBACKUP */
 		if (success) {
-			node->size += n_pages;
-			space->size += n_pages;
-
 			os_has_said_disk_full = FALSE;
 		} else {
 			/* Let us measure the size of the file to determine
 			how much we were able to extend it */
+			os_offset_t	size;
 
-			n_pages = ((ulint)
-				   (os_file_get_size_as_iblonglong(
-					   node->handle)
-				    / page_size)) - node->size;
+			size = os_file_get_size(node->handle);
+			ut_a(size != (os_offset_t) -1);
 
-			node->size += n_pages;
-			space->size += n_pages;
+			n_pages = ((ulint) (size / page_size))
+				- node->size - pages_added;
 
+			pages_added += n_pages;
 			break;
 		}
 
 		start_page_no += n_pages;
+		pages_added += n_pages;
 	}
 
 	mem_free(buf2);
 
+	mutex_enter(&fil_system->mutex);
+
+	ut_a(node->being_extended);
+
+	space->size += pages_added;
+	node->size += pages_added;
+	node->being_extended = FALSE;
+
 #ifdef HAVE_POSIX_FALLOCATE
 complete_io:
 	/* If posix_fallocate was used to extent the file space
@@ -5061,9 +5042,8 @@ complete_io:
 	printf("Extended %s to %lu, actual size %lu pages\n", space->name,
 	size_after_extend, *actual_size); */
 	mutex_exit(&fil_system->mutex);
-	mutex_exit(&fil_system->file_extend_mutex);
 
-	fil_flush(space_id, TRUE);
+	fil_flush(space_id);
 
 	return(success);
 }
@@ -5083,7 +5063,7 @@ fil_extend_tablespaces_to_stored_len(void)
 	byte*		buf;
 	ulint		actual_size;
 	ulint		size_in_header;
-	ulint		error;
+	dberr_t		error;
 	ibool		success;
 
 	buf = mem_alloc(UNIV_PAGE_SIZE);
@@ -5099,7 +5079,7 @@ fil_extend_tablespaces_to_stored_len(void)
 					      mutex, because this is a
 					      single-threaded operation */
 		error = fil_read(TRUE, space->id,
-				 dict_table_flags_to_zip_size(space->flags),
+				 fsp_flags_get_zip_size(space->flags),
 				 0, 0, UNIV_PAGE_SIZE, buf, NULL);
 		ut_a(error == DB_SUCCESS);
 
@@ -5117,7 +5097,7 @@ fil_extend_tablespaces_to_stored_len(void)
 				"InnoDB: Check that you have free disk space"
 				" and retry!\n",
 				space->name, size_in_header, actual_size);
-			exit(1);
+			ut_a(success);
 		}
 
 		mutex_enter(&fil_system->mutex);
@@ -5227,9 +5207,10 @@ NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
 Prepares a file node for i/o. Opens the file if it is closed. Updates the
 pending i/o's field in the node and the system appropriately. Takes the node
 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
-mutex. */
+mutex.
+@return false if the file can't be opened, otherwise true */
 static
-void
+bool
 fil_node_prepare_for_io(
 /*====================*/
 	fil_node_t*	node,	/*!< in: file node */
@@ -5252,11 +5233,12 @@ fil_node_prepare_for_io(
 		/* File is closed: open it */
 		ut_a(node->n_pending == 0);
 
-		fil_node_open_file(node, system, space);
+		if (!fil_node_open_file(node, system, space)) {
+			return(false);
+		}
 	}
 
-	if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
-	    && !trx_sys_sys_space(space->id)) {
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
 		/* The node is in the LRU list, remove it */
 
 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
@@ -5265,6 +5247,8 @@ fil_node_prepare_for_io(
 	}
 
 	node->n_pending++;
+
+	return(true);
 }
 
 /********************************************************************//**
@@ -5289,20 +5273,29 @@ fil_node_complete_io(
 	node->n_pending--;
 
 	if (type == OS_FILE_WRITE) {
+		ut_ad(!srv_read_only_mode);
 		system->modification_counter++;
 		node->modification_counter = system->modification_counter;
 
-		if (!node->space->is_in_unflushed_spaces) {
+		if (fil_buffering_disabled(node->space)) {
+
+			/* We don't need to keep track of unflushed
+			changes as user has explicitly disabled
+			buffering. */
+			ut_ad(!node->space->is_in_unflushed_spaces);
+			node->flush_counter = node->modification_counter;
 
-			node->space->is_in_unflushed_spaces = TRUE;
+		} else if (!node->space->is_in_unflushed_spaces) {
+
+			node->space->is_in_unflushed_spaces = true;
 			UT_LIST_ADD_FIRST(unflushed_spaces,
 					  system->unflushed_spaces,
 					  node->space);
 		}
 	}
 
-	if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
-	    && !trx_sys_sys_space(node->space->id)) {
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
+
 		/* The node must be put back to the LRU list */
 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
 	}
@@ -5341,7 +5334,7 @@ Reads or writes data. This operation is asynchronous (aio).
 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
 i/o on a tablespace which does not exist */
 UNIV_INTERN
-ulint
+dberr_t
 _fil_io(
 /*===*/
 	ulint	type,		/*!< in: OS_FILE_READ or OS_FILE_WRITE,
@@ -5353,7 +5346,7 @@ _fil_io(
 				because i/os are not actually handled until
 				all have been posted: use with great
 				caution! */
-	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	bool	sync,		/*!< in: true if synchronous aio is desired */
 	ulint	space_id,	/*!< in: space id */
 	ulint	zip_size,	/*!< in: compressed page size in bytes;
 				0 for uncompressed pages */
@@ -5374,11 +5367,11 @@ _fil_io(
 	ulint		mode;
 	fil_space_t*	space;
 	fil_node_t*	node;
-	ulint		offset_high;
-	ulint		offset_low;
 	ibool		ret;
 	ulint		is_log;
 	ulint		wake_later;
+	os_offset_t	offset;
+	ibool		ignore_nonexistent_pages;
 
 	is_log = type & OS_FILE_LOG;
 	type = type & ~OS_FILE_LOG;
@@ -5386,21 +5379,30 @@ _fil_io(
 	wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
 	type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
 
+	ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+	type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
 	ut_ad(byte_offset < UNIV_PAGE_SIZE);
 	ut_ad(!zip_size || !byte_offset);
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(buf);
 	ut_ad(len > 0);
-//#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE
-//# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE"
-//#endif
+	ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT));
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX"
+#endif
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN"
+#endif
 	ut_ad(fil_validate_skip());
 #ifndef UNIV_HOTBACKUP
 # ifndef UNIV_LOG_DEBUG
 	/* ibuf bitmap pages must be read in the sync aio mode: */
-	ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
+	ut_ad(recv_no_ibuf_operations
+	      || type == OS_FILE_WRITE
 	      || !ibuf_bitmap_page(zip_size, block_offset)
-	      || sync || is_log);
+	      || sync
+	      || is_log);
 # endif /* UNIV_LOG_DEBUG */
 	if (sync) {
 		mode = OS_AIO_SYNC;
@@ -5419,9 +5421,10 @@ _fil_io(
 #endif /* !UNIV_HOTBACKUP */
 
 	if (type == OS_FILE_READ) {
-		srv_data_read+= len;
+		srv_stats.data_read.add(len);
 	} else if (type == OS_FILE_WRITE) {
-		srv_data_written+= len;
+		ut_ad(!srv_read_only_mode);
+		srv_stats.data_written.add(len);
 	}
 
 	/* Reserve the fil_system mutex and make sure that we can open at
@@ -5433,42 +5436,43 @@ _fil_io(
 
 	/* If we are deleting a tablespace we don't allow any read
 	operations on that. However, we do allow write operations. */
-	if (!space || (type == OS_FILE_READ && space->stop_new_ops)) {
+	if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) {
 		mutex_exit(&fil_system->mutex);
 
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: trying to do i/o"
-			" to a tablespace which does not exist.\n"
-			"InnoDB: i/o type %lu, space id %lu,"
-			" page no. %lu, i/o length %lu bytes\n",
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Trying to do i/o to a tablespace which does "
+			"not exist. i/o type %lu, space id %lu, "
+			"page no. %lu, i/o length %lu bytes",
 			(ulong) type, (ulong) space_id, (ulong) block_offset,
 			(ulong) len);
 
 		return(DB_TABLESPACE_DELETED);
 	}
 
-	ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
+	ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE);
 
 	node = UT_LIST_GET_FIRST(space->chain);
 
 	for (;;) {
-		if (UNIV_UNLIKELY(node == NULL)) {
+		if (node == NULL) {
+			if (ignore_nonexistent_pages) {
+				mutex_exit(&fil_system->mutex);
+				return(DB_ERROR);
+			}
+
 			fil_report_invalid_page_access(
 				block_offset, space_id, space->name,
 				byte_offset, len, type);
 
 			ut_error;
-		}
 
-		if (space->id != 0 && node->size == 0) {
+		} else if (fil_is_user_tablespace_id(space->id)
+			   && node->size == 0) {
+
 			/* We do not know the size of a single-table tablespace
 			before we open the file */
-
 			break;
-		}
-
-		if (node->size > block_offset) {
+		} else if (node->size > block_offset) {
 			/* Found! */
 			break;
 		} else {
@@ -5478,10 +5482,31 @@ _fil_io(
 	}
 
 	/* Open file if closed */
-	fil_node_prepare_for_io(node, fil_system, space);
+	if (!fil_node_prepare_for_io(node, fil_system, space)) {
+		if (space->purpose == FIL_TABLESPACE
+		    && fil_is_user_tablespace_id(space->id)) {
+			mutex_exit(&fil_system->mutex);
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Trying to do i/o to a tablespace which "
+				"exists without .ibd data file. "
+				"i/o type %lu, space id %lu, page no %lu, "
+				"i/o length %lu bytes",
+				(ulong) type, (ulong) space_id,
+				(ulong) block_offset, (ulong) len);
+
+			return(DB_TABLESPACE_DELETED);
+		}
+
+		/* The tablespace is for log. Currently, we just assert here
+		to prevent handling errors along the way fil_io returns.
+		Also, if the log files are missing, it would be hard to
+		promise the server can continue running. */
+		ut_a(0);
+	}
 
 	/* Check that at least the start offset is within the bounds of a
-	single-table tablespace */
+	single-table tablespace, including rollback tablespaces. */
 	if (UNIV_UNLIKELY(node->size <= block_offset)
 	    && space->id != 0 && space->purpose == FIL_TABLESPACE) {
 
@@ -5498,9 +5523,8 @@ _fil_io(
-	/* Calculate the low 32 bits and the high 32 bits of the file offset */
+	/* Calculate the file offset in bytes */
 
 	if (!zip_size) {
-		offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
-		offset_low  = ((block_offset << UNIV_PAGE_SIZE_SHIFT)
-			       & 0xFFFFFFFFUL) + byte_offset;
+		offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT)
+			+ byte_offset;
 
 		ut_a(node->size - block_offset
 		     >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
@@ -5515,8 +5539,7 @@ _fil_io(
 		case 16384: zip_size_shift = 14; break;
 		default: ut_error;
 		}
-		offset_high = block_offset >> (32 - zip_size_shift);
-		offset_low = (block_offset << zip_size_shift & 0xFFFFFFFFUL)
+		offset = ((os_offset_t) block_offset << zip_size_shift)
 			+ byte_offset;
 		ut_a(node->size - block_offset
 		     >= (len + (zip_size - 1)) / zip_size);
@@ -5539,7 +5562,8 @@ _fil_io(
 			mutex_exit(&fil_system->mutex);
 			if (mode == OS_AIO_NORMAL) {
 				ut_a(space->purpose == FIL_TABLESPACE);
-				buf_page_io_complete(message);
+				buf_page_io_complete(static_cast<buf_page_t *>
+						     (message));
 			}
 		}
 
@@ -5551,24 +5575,22 @@ _fil_io(
 
 			return(DB_SUCCESS);
 		}
-	} /**/
+	}
 
 	/* Queue the aio request */
 	ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
-		offset_low, offset_high, len, node, message, space_id,
-		trx);
+		offset, len, node, message, space_id, trx);
 
 #else
 	/* In ibbackup do normal i/o, not aio */
 	if (type == OS_FILE_READ) {
-		ret = os_file_read(node->handle, buf, offset_low, offset_high,
-				   len);
+		ret = os_file_read(node->handle, buf, offset, len);
 	} else {
+		ut_ad(!srv_read_only_mode);
 		ret = os_file_write(node->name, node->handle, buf,
-				    offset_low, offset_high, len);
+				    offset, len);
 	}
-#endif
-
+#endif /* !UNIV_HOTBACKUP */
 	ut_a(ret);
 
 	if (mode == OS_AIO_SYNC) {
@@ -5587,75 +5609,11 @@ _fil_io(
 	return(DB_SUCCESS);
 }
 
-/********************************************************************//**
-Confirm whether the parameters are valid or not */
-UNIV_INTERN
-ibool
-fil_is_exist(
-/*==============*/
-	ulint	space_id,	/*!< in: space id */
-	ulint	block_offset)	/*!< in: offset in number of blocks */
-{
-	fil_space_t*	space;
-	fil_node_t*	node;
-
-	/* Reserve the fil_system mutex and make sure that we can open at
-	least one file while holding it, if the file is not already open */
-
-	fil_mutex_enter_and_prepare_for_io(space_id);
-
-	space = fil_space_get_by_id(space_id);
-
-	if (!space) {
-		mutex_exit(&fil_system->mutex);
-		return(FALSE);
-	}
-
-	node = UT_LIST_GET_FIRST(space->chain);
-
-	for (;;) {
-		if (UNIV_UNLIKELY(node == NULL)) {
-			mutex_exit(&fil_system->mutex);
-			return(FALSE);
-		}
-
-		if (space->id != 0 && node->size == 0) {
-			/* We do not know the size of a single-table tablespace
-			before we open the file */
-
-			break;
-		}
-
-		if (node->size > block_offset) {
-			/* Found! */
-			break;
-		} else {
-			block_offset -= node->size;
-			node = UT_LIST_GET_NEXT(chain, node);
-		}
-	}
-
-	/* Open file if closed */
-	fil_node_prepare_for_io(node, fil_system, space);
-	fil_node_complete_io(node, fil_system, OS_FILE_READ);
-
-	/* Check that at least the start offset is within the bounds of a
-	single-table tablespace */
-	if (UNIV_UNLIKELY(node->size <= block_offset)
-	    && space->id != 0 && space->purpose == FIL_TABLESPACE) {
-		mutex_exit(&fil_system->mutex);
-		return(FALSE);
-	}
-
-	mutex_exit(&fil_system->mutex);
-	return(TRUE);
-}
-
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Waits for an aio operation to complete. This function is used to write the
 handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.c for more info). The thread specifies which
+into segments (see os0file.cc for more info). The thread specifies which
 segment it wants to wait for. */
 UNIV_INTERN
 void
@@ -5675,24 +5633,24 @@ fil_aio_wait(
 	if (srv_use_native_aio) {
 		srv_set_io_thread_op_info(segment, "native aio handle");
 #ifdef WIN_ASYNC_IO
-		ret = os_aio_windows_handle(segment, 0, &fil_node,
-					    &message, &type, &space_id);
+		ret = os_aio_windows_handle(
+			segment, 0, &fil_node, &message, &type, &space_id);
 #elif defined(LINUX_NATIVE_AIO)
-		ret = os_aio_linux_handle(segment, &fil_node,
-					  &message, &type, &space_id);
+		ret = os_aio_linux_handle(
+			segment, &fil_node, &message, &type, &space_id);
 #else
 		ut_error;
 		ret = 0; /* Eliminate compiler warning */
-#endif
+#endif /* WIN_ASYNC_IO */
 	} else {
 		srv_set_io_thread_op_info(segment, "simulated aio handle");
 
-		ret = os_aio_simulated_handle(segment, &fil_node,
-					      &message, &type, &space_id);
+		ret = os_aio_simulated_handle(
+			segment, &fil_node, &message, &type, &space_id);
 	}
 
 	ut_a(ret);
-	if (UNIV_UNLIKELY(fil_node == NULL)) {
+	if (fil_node == NULL) {
 		ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
 		return;
 	}
@@ -5715,10 +5673,10 @@ fil_aio_wait(
 
 	if (fil_node->space->purpose == FIL_TABLESPACE) {
 		srv_set_io_thread_op_info(segment, "complete io for buf page");
-		buf_page_io_complete(message);
+		buf_page_io_complete(static_cast<buf_page_t*>(message));
 	} else {
 		srv_set_io_thread_op_info(segment, "complete io for log");
-		log_io_complete(message);
+		log_io_complete(static_cast<log_group_t*>(message));
 	}
 }
 #endif /* UNIV_HOTBACKUP */
@@ -5730,14 +5688,13 @@ UNIV_INTERN
 void
 fil_flush(
 /*======*/
-	ulint	space_id,	/*!< in: file space id (this can be a group of
+	ulint	space_id)	/*!< in: file space id (this can be a group of
 				log files or a tablespace of the database) */
-	ibool	metadata)
 {
 	fil_space_t*	space;
 	fil_node_t*	node;
 	os_file_t	file;
-	ib_int64_t	old_mod_counter;
+
 
 	mutex_enter(&fil_system->mutex);
 
@@ -5749,89 +5706,112 @@ fil_flush(
 		return;
 	}
 
+	if (fil_buffering_disabled(space)) {
+
+		/* No need to flush. User has explicitly disabled
+		buffering. */
+		ut_ad(!space->is_in_unflushed_spaces);
+		ut_ad(fil_space_is_flushed(space));
+		ut_ad(space->n_pending_flushes == 0);
+
+#ifdef UNIV_DEBUG
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+			ut_ad(node->modification_counter
+			      == node->flush_counter);
+			ut_ad(node->n_pending_flushes == 0);
+		}
+#endif /* UNIV_DEBUG */
+
+		mutex_exit(&fil_system->mutex);
+		return;
+	}
+
 	space->n_pending_flushes++;	/*!< prevent dropping of the space while
 					we are flushing */
-	node = UT_LIST_GET_FIRST(space->chain);
+	for (node = UT_LIST_GET_FIRST(space->chain);
+	     node != NULL;
+	     node = UT_LIST_GET_NEXT(chain, node)) {
 
-	while (node) {
-		if (node->modification_counter > node->flush_counter) {
-			ut_a(node->open);
+		ib_int64_t old_mod_counter = node->modification_counter;
+
+		if (old_mod_counter <= node->flush_counter) {
+			continue;
+		}
 
-			/* We want to flush the changes at least up to
-			old_mod_counter */
-			old_mod_counter = node->modification_counter;
+		ut_a(node->open);
 
-			if (space->purpose == FIL_TABLESPACE) {
-				fil_n_pending_tablespace_flushes++;
-			} else {
-				fil_n_pending_log_flushes++;
-				fil_n_log_flushes++;
-			}
+		if (space->purpose == FIL_TABLESPACE) {
+			fil_n_pending_tablespace_flushes++;
+		} else {
+			fil_n_pending_log_flushes++;
+			fil_n_log_flushes++;
+		}
 #ifdef __WIN__
-			if (node->is_raw_disk) {
+		if (node->is_raw_disk) {
 
-				goto skip_flush;
-			}
-#endif
+			goto skip_flush;
+		}
+#endif /* __WIN__ */
 retry:
-			if (node->n_pending_flushes > 0) {
-				/* We want to avoid calling os_file_flush() on
-				the file twice at the same time, because we do
-				not know what bugs OS's may contain in file
-				i/o; sleep for a while */
+		if (node->n_pending_flushes > 0) {
+			/* We want to avoid calling os_file_flush() on
+			the file twice at the same time, because we do
+			not know what bugs OS's may contain in file
+			i/o */
 
-				mutex_exit(&fil_system->mutex);
+			ib_int64_t sig_count =
+				os_event_reset(node->sync_event);
 
-				os_thread_sleep(20000);
+			mutex_exit(&fil_system->mutex);
 
-				mutex_enter(&fil_system->mutex);
+			os_event_wait_low(node->sync_event, sig_count);
 
-				if (node->flush_counter >= old_mod_counter) {
+			mutex_enter(&fil_system->mutex);
 
-					goto skip_flush;
-				}
+			if (node->flush_counter >= old_mod_counter) {
 
-				goto retry;
+				goto skip_flush;
 			}
 
-			ut_a(node->open);
-			file = node->handle;
-			node->n_pending_flushes++;
+			goto retry;
+		}
 
-			mutex_exit(&fil_system->mutex);
+		ut_a(node->open);
+		file = node->handle;
+		node->n_pending_flushes++;
 
-			/* fprintf(stderr, "Flushing to file %s\n",
-			node->name); */
+		mutex_exit(&fil_system->mutex);
 
-			os_file_flush(file, metadata);
+		os_file_flush(file);
 
-			mutex_enter(&fil_system->mutex);
+		mutex_enter(&fil_system->mutex);
 
-			node->n_pending_flushes--;
-skip_flush:
-			if (node->flush_counter < old_mod_counter) {
-				node->flush_counter = old_mod_counter;
+		os_event_set(node->sync_event);
 
-				if (space->is_in_unflushed_spaces
-				    && fil_space_is_flushed(space)) {
+		node->n_pending_flushes--;
+skip_flush:
+		if (node->flush_counter < old_mod_counter) {
+			node->flush_counter = old_mod_counter;
 
-					space->is_in_unflushed_spaces = FALSE;
+			if (space->is_in_unflushed_spaces
+			    && fil_space_is_flushed(space)) {
 
-					UT_LIST_REMOVE(
-						unflushed_spaces,
-						fil_system->unflushed_spaces,
-						space);
-				}
-			}
+				space->is_in_unflushed_spaces = false;
 
-			if (space->purpose == FIL_TABLESPACE) {
-				fil_n_pending_tablespace_flushes--;
-			} else {
-				fil_n_pending_log_flushes--;
+				UT_LIST_REMOVE(
+					unflushed_spaces,
+					fil_system->unflushed_spaces,
+					space);
 			}
 		}
 
-		node = UT_LIST_GET_NEXT(chain, node);
+		if (space->purpose == FIL_TABLESPACE) {
+			fil_n_pending_tablespace_flushes--;
+		} else {
+			fil_n_pending_log_flushes--;
+		}
 	}
 
 	space->n_pending_flushes--;
@@ -5866,7 +5846,8 @@ fil_flush_file_spaces(
 	traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
 	on a space that was just removed from the list by fil_flush().
 	Thus, the space could be dropped and the memory overwritten. */
-	space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
+	space_ids = static_cast<ulint*>(
+		mem_alloc(n_space_ids * sizeof *space_ids));
 
 	n_space_ids = 0;
 
@@ -5886,12 +5867,20 @@ fil_flush_file_spaces(
 	a non-existing space id. */
 	for (i = 0; i < n_space_ids; i++) {
 
-		fil_flush(space_ids[i], TRUE);
+		fil_flush(space_ids[i]);
 	}
 
 	mem_free(space_ids);
 }
 
+/** Functor to validate the space list. */
+struct	Check {
+	void	operator()(const fil_node_t* elem)
+	{
+		ut_a(elem->open || !elem->n_pending);
+	}
+};
+
 /******************************************************************//**
 Checks the consistency of the tablespace cache.
 @return	TRUE if ok */
@@ -5911,16 +5900,19 @@ fil_validate(void)
 
 	for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
 
-		space = HASH_GET_FIRST(fil_system->spaces, i);
+		for (space = static_cast<fil_space_t*>(
+				HASH_GET_FIRST(fil_system->spaces, i));
+		     space != 0;
+		     space = static_cast<fil_space_t*>(
+			     	HASH_GET_NEXT(hash, space))) {
 
-		while (space != NULL) {
-			UT_LIST_VALIDATE(chain, fil_node_t, space->chain,
-					 ut_a(ut_list_node_313->open
-					      || !ut_list_node_313->n_pending));
+			UT_LIST_VALIDATE(
+				chain, fil_node_t, space->chain, Check());
 
-			fil_node = UT_LIST_GET_FIRST(space->chain);
+			for (fil_node = UT_LIST_GET_FIRST(space->chain);
+			     fil_node != 0;
+			     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
 
-			while (fil_node != NULL) {
 				if (fil_node->n_pending > 0) {
 					ut_a(fil_node->open);
 				}
@@ -5928,25 +5920,22 @@ fil_validate(void)
 				if (fil_node->open) {
 					n_open++;
 				}
-				fil_node = UT_LIST_GET_NEXT(chain, fil_node);
 			}
-			space = HASH_GET_NEXT(hash, space);
 		}
 	}
 
 	ut_a(fil_system->n_open == n_open);
 
-	UT_LIST_VALIDATE(LRU, fil_node_t, fil_system->LRU, (void) 0);
+	UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU);
 
-	fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+	for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+	     fil_node != 0;
+	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
 
-	while (fil_node != NULL) {
 		ut_a(fil_node->n_pending == 0);
+		ut_a(!fil_node->being_extended);
 		ut_a(fil_node->open);
-		ut_a(fil_node->space->purpose == FIL_TABLESPACE);
-		ut_a(!trx_sys_sys_space(fil_node->space->id));
-
-		fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
+		ut_a(fil_space_belongs_in_lru(fil_node->space));
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -6020,7 +6009,7 @@ fil_page_get_type(
 }
 
 /****************************************************************//**
-Initializes the tablespace memory cache. */
+Closes the tablespace memory cache. */
 UNIV_INTERN
 void
 fil_close(void)
@@ -6044,6 +6033,333 @@ fil_close(void)
 	fil_system = NULL;
 }
 
+/********************************************************************//**
+Initializes a buffer control block when the buf_pool is created. */
+static
+void
+fil_buf_block_init(
+/*===============*/
+	buf_block_t*	block,		/*!< in: pointer to control block */
+	byte*		frame)		/*!< in: pointer to buffer frame */
+{
+	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
+
+	block->frame = frame;
+
+	block->page.io_fix = BUF_IO_NONE;
+	/* There are assertions that check for this. */
+	block->page.buf_fix_count = 1;
+	block->page.state = BUF_BLOCK_READY_FOR_USE;
+
+	page_zip_des_init(&block->page.zip);
+}
+
+struct fil_iterator_t {
+	os_file_t	file;			/*!< File handle */
+	const char*	filepath;		/*!< File path name */
+	os_offset_t	start;			/*!< From where to start */
+	os_offset_t	end;			/*!< Where to stop */
+	os_offset_t	file_size;		/*!< File size in bytes */
+	ulint		page_size;		/*!< Page size */
+	ulint		n_io_buffers;		/*!< Number of pages to use
+						for IO */
+	byte*		io_buffer;		/*!< Buffer to use for IO */
+};
+
+/********************************************************************//**
+TODO: This can be made parallel trivially by chunking up the file and creating
+a callback per thread. The main benefit will be to use multiple CPUs for
+checksums and compressed tables. We have to do compressed tables block by
+block right now. Secondly, we need to decompress/compress and copy too much
+data. These are CPU intensive.
+
+Iterate over all the pages in the tablespace.
+@param iter - Tablespace iterator
+@param block - block to use for IO
+@param callback - Callback to inspect and update page contents
+@retval DB_SUCCESS or error code */
+static
+dberr_t
+fil_iterate(
+/*========*/
+	const fil_iterator_t&	iter,
+	buf_block_t*		block,
+	PageCallback&		callback)
+{
+	os_offset_t		offset;
+	ulint			page_no = 0;
+	ulint			space_id = callback.get_space_id();
+	ulint			n_bytes = iter.n_io_buffers * iter.page_size;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* TODO: For compressed tables we do a lot of useless
+	copying for non-index pages. Unfortunately, it is
+	required by buf_zip_decompress() */
+
+	for (offset = iter.start; offset < iter.end; offset += n_bytes) {
+
+		byte*		io_buffer = iter.io_buffer;
+
+		block->frame = io_buffer;
+
+		if (callback.get_zip_size() > 0) {
+			page_zip_des_init(&block->page.zip);
+			page_zip_set_size(&block->page.zip, iter.page_size);
+			block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
+			ut_d(block->page.zip.m_external = true);
+			ut_ad(iter.page_size == callback.get_zip_size());
+
+			/* Zip IO is done in the compressed page buffer. */
+			io_buffer = block->page.zip.data;
+		} else {
+			io_buffer = iter.io_buffer;
+		}
+
+		/* We have to read the exact number of bytes. Otherwise the
+		InnoDB IO functions croak on failed reads. */
+
+		n_bytes = static_cast<ulint>(
+			ut_min(static_cast<os_offset_t>(n_bytes),
+			       iter.end - offset));
+
+		ut_ad(n_bytes > 0);
+		ut_ad(!(n_bytes % iter.page_size));
+
+		if (!os_file_read(iter.file, io_buffer, offset,
+				  (ulint) n_bytes)) {
+
+			ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed");
+
+			return(DB_IO_ERROR);
+		}
+
+		bool		updated = false;
+		os_offset_t	page_off = offset;
+		ulint		n_pages_read = (ulint) n_bytes / iter.page_size;
+
+		for (ulint i = 0; i < n_pages_read; ++i) {
+
+			buf_block_set_file_page(block, space_id, page_no++);
+
+			dberr_t	err;
+
+			if ((err = callback(page_off, block)) != DB_SUCCESS) {
+
+				return(err);
+
+			} else if (!updated) {
+				updated = buf_block_get_state(block)
+					== BUF_BLOCK_FILE_PAGE;
+			}
+
+			buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+			buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+
+			page_off += iter.page_size;
+			block->frame += iter.page_size;
+		}
+
+		/* A page was updated in the set, write back to disk. */
+		if (updated
+		    && !os_file_write(
+				iter.filepath, iter.file, io_buffer,
+				offset, (ulint) n_bytes)) {
+
+			ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed");
+
+			return(DB_IO_ERROR);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+	dict_table_t*	table,
+	ulint		n_io_buffers,
+	PageCallback&	callback)
+{
+	dberr_t		err;
+	os_file_t	file;
+	char*		filepath;
+
+	ut_a(n_io_buffers > 0);
+	ut_ad(!srv_read_only_mode);
+
+	DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
+			return(DB_CORRUPTION););
+
+	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+		dict_get_and_save_data_dir_path(table, false);
+		ut_a(table->data_dir_path);
+
+		filepath = os_file_make_remote_pathname(
+			table->data_dir_path, table->name, "ibd");
+	} else {
+		filepath = fil_make_ibd_name(table->name, false);
+	}
+
+	{
+		ibool	success;
+
+		file = os_file_create_simple_no_error_handling(
+			innodb_file_data_key, filepath,
+			OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+
+		DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
+		{
+			static bool once;
+
+			if (!once || ut_rnd_interval(0, 10) == 5) {
+				once = true;
+				success = FALSE;
+				os_file_close(file);
+			}
+		});
+
+		if (!success) {
+			/* The following call prints an error message */
+			os_file_get_last_error(true);
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Trying to import a tablespace, but could not "
+				"open the tablespace file %s", filepath);
+
+			mem_free(filepath);
+
+			return(DB_TABLESPACE_NOT_FOUND);
+
+		} else {
+			err = DB_SUCCESS;
+		}
+	}
+
+	callback.set_file(filepath, file);
+
+	os_offset_t	file_size = os_file_get_size(file);
+	ut_a(file_size != (os_offset_t) -1);
+
+	/* The block we will use for every physical page */
+	buf_block_t	block;
+
+	memset(&block, 0x0, sizeof(block));
+
+	/* Allocate a page to read in the tablespace header, so that we
+	can determine the page size and zip_size (if it is compressed).
+	We allocate an extra page in case it is a compressed table. One
+	page is to ensure alignment. */
+
+	void*	page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE);
+	byte*	page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE));
+
+	fil_buf_block_init(&block, page);
+
+	/* Read the first page and determine the page and zip size. */
+
+	if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) {
+
+		err = DB_IO_ERROR;
+
+	} else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) {
+		fil_iterator_t	iter;
+
+		iter.file = file;
+		iter.start = 0;
+		iter.end = file_size;
+		iter.filepath = filepath;
+		iter.file_size = file_size;
+		iter.n_io_buffers = n_io_buffers;
+		iter.page_size = callback.get_page_size();
+
+		/* Compressed pages can't be optimised for block IO for now.
+		We do the IMPORT page by page. */
+
+		if (callback.get_zip_size() > 0) {
+			iter.n_io_buffers = 1;
+			ut_a(iter.page_size == callback.get_zip_size());
+		}
+
+		/** Add an extra page for compressed page scratch area. */
+
+		void*	io_buffer = mem_alloc(
+			(2 + iter.n_io_buffers) * UNIV_PAGE_SIZE);
+
+		iter.io_buffer = static_cast<byte*>(
+			ut_align(io_buffer, UNIV_PAGE_SIZE));
+
+		err = fil_iterate(iter, &block, callback);
+
+		mem_free(io_buffer);
+	}
+
+	if (err == DB_SUCCESS) {
+
+		ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk");
+
+		if (!os_file_flush(file)) {
+			ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!");
+			err = DB_IO_ERROR;
+		} else {
+			ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!");
+		}
+	}
+
+	os_file_close(file);
+
+	mem_free(page_ptr);
+	mem_free(filepath);
+
+	return(err);
+}
+
+/**
+Set the tablespace compressed table size.
+@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */
+dberr_t
+PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW
+{
+	m_zip_size = fsp_header_get_zip_size(page);
+
+	if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) {
+		return(DB_CORRUPTION);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+UNIV_INTERN
+void
+fil_delete_file(
+/*============*/
+	const char*	ibd_name)	/*!< in: filepath of the ibd
+					tablespace */
+{
+	/* Force a delete of any stale .ibd files that are lying around. */
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name);
+
+	os_file_delete_if_exists(innodb_file_data_key, ibd_name);
+
+	char*	cfg_name = fil_make_cfg_name(ibd_name);
+
+	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+	mem_free(cfg_name);
+}
+
 /*************************************************************************
 Return local hash table informations. */
 
@@ -6071,6 +6387,79 @@ fil_system_hash_nodes(void)
        }
 }
 
+/**
+Iterate over all the spaces in the space list and fetch the
+tablespace names. It will return a copy of the name that must be
+freed by the caller using: delete[].
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fil_get_space_names(
+/*================*/
+	space_name_list_t&	space_name_list)
+				/*!< in/out: List to append to */
+{
+	fil_space_t*	space;
+	dberr_t		err = DB_SUCCESS;
+
+	mutex_enter(&fil_system->mutex);
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose == FIL_TABLESPACE) {
+			ulint	len;
+			char*	name;
+
+			len = strlen(space->name);
+			name = new(std::nothrow) char[len + 1];
+
+			if (name == 0) {
+				/* Caller to free elements allocated so far. */
+				err = DB_OUT_OF_MEMORY;
+				break;
+			}
+
+			memcpy(name, space->name, len);
+			name[len] = 0;
+
+			space_name_list.push_back(name);
+		}
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(err);
+}
+
+/****************************************************************//**
+Generate redo logs for swapping two .ibd files */
+UNIV_INTERN
+void
+fil_mtr_rename_log(
+/*===============*/
+	ulint		old_space_id,	/*!< in: tablespace id of the old
+					table. */
+	const char*	old_name,	/*!< in: old table name */
+	ulint		new_space_id,	/*!< in: tablespace id of the new
+					table */
+	const char*	new_name,	/*!< in: new table name */
+	const char*	tmp_name,	/*!< in: temp table name used while
+					swapping */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	if (old_space_id != TRX_SYS_SPACE) {
+		fil_op_write_log(MLOG_FILE_RENAME, old_space_id,
+				 0, 0, old_name, tmp_name, mtr);
+	}
+
+	if (new_space_id != TRX_SYS_SPACE) {
+		fil_op_write_log(MLOG_FILE_RENAME, new_space_id,
+				 0, 0, new_name, old_name, mtr);
+	}
+}
+
 /*************************************************************************
 functions to access is_corrupt flag of fil_space_t*/
 
@@ -6086,7 +6475,7 @@ fil_space_is_corrupt(
 
 	space = fil_space_get_by_id(space_id);
 
-	if (space && space->is_corrupt) {
+	if (UNIV_UNLIKELY(space && space->is_corrupt)) {
 		ret = TRUE;
 	}
 
@@ -6112,27 +6501,3 @@ fil_space_set_corrupt(
 
 	mutex_exit(&fil_system->mutex);
 }
-
-/****************************************************************//**
-Generate redo logs for swapping two .ibd files */
-UNIV_INTERN
-void
-fil_mtr_rename_log(
-/*===============*/
-	ulint		old_space_id,	/*!< in: tablespace id of the old
-					table. */
-	const char*	old_name,	/*!< in: old table name */
-	ulint		new_space_id,	/*!< in: tablespace id of the new
-					table */
-	const char*	new_name,	/*!< in: new table name */
-	const char*	tmp_name)	/*!< in: temp table name used while
-					swapping */
-{
-	mtr_t           mtr;
-	mtr_start(&mtr);
-	fil_op_write_log(MLOG_FILE_RENAME, old_space_id,
-			 0, 0, old_name, tmp_name, &mtr);
-	fil_op_write_log(MLOG_FILE_RENAME, new_space_id,
-			 0, 0, new_name, old_name, &mtr);
-	mtr_commit(&mtr);
-}
diff --git a/storage/xtradb/fsp/fsp0fsp.c b/storage/xtradb/fsp/fsp0fsp.cc
index 772e224f6f7..c06d4213d73 100644
--- a/storage/xtradb/fsp/fsp0fsp.c
+++ b/storage/xtradb/fsp/fsp0fsp.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /******************************************************************//**
-@file fsp/fsp0fsp.c
+@file fsp/fsp0fsp.cc
 File space management
 
 Created 11/29/1995 Heikki Tuuri
@@ -48,129 +48,11 @@ Created 11/29/1995 Heikki Tuuri
 # include "log0log.h"
 #endif /* UNIV_HOTBACKUP */
 #include "dict0mem.h"
-#include "trx0sys.h"
-
-/*			FILE SEGMENT INODE
-			==================
-
-Segment inode which is created for each segment in a tablespace. NOTE: in
-purge we assume that a segment having only one currently used page can be
-freed in a few steps, so that the freeing cannot fill the file buffer with
-bufferfixed file pages. */
-
-typedef	byte	fseg_inode_t;
-
-#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
-					/* the list node for linking
-					segment inode pages */
-
-#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
-/*-------------------------------------*/
-#define	FSEG_ID			0	/* 8 bytes of segment id: if this is 0,
-					it means that the header is unused */
-#define FSEG_NOT_FULL_N_USED	8
-					/* number of used segment pages in
-					the FSEG_NOT_FULL list */
-#define	FSEG_FREE		12
-					/* list of free extents of this
-					segment */
-#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
-					/* list of partially free extents */
-#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
-					/* list of full extents */
-#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
-					/* magic number used in debugging */
-#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
-					/* array of individual pages
-					belonging to this segment in fsp
-					fragment extent lists */
-#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
-					/* number of slots in the array for
-					the fragment pages */
-#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
-					page number within space, FIL_NULL
-					means that the slot is not in use */
-/*-------------------------------------*/
-#define FSEG_INODE_SIZE					\
-	(16 + 3 * FLST_BASE_NODE_SIZE			\
-	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
-
-#define FSP_SEG_INODES_PER_PAGE(zip_size)		\
-	(((zip_size ? zip_size : UNIV_PAGE_SIZE)	\
-	  - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
-				/* Number of segment inodes which fit on a
-				single page */
-
-#define FSEG_MAGIC_N_VALUE	97937874
-
-#define	FSEG_FILLFACTOR		8	/* If this value is x, then if
-					the number of unused but reserved
-					pages in a segment is less than
-					reserved pages * 1/x, and there are
-					at least FSEG_FRAG_LIMIT used pages,
-					then we allow a new empty extent to
-					be added to the segment in
-					fseg_alloc_free_page. Otherwise, we
-					use unused pages of the segment. */
-
-#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
-					/* If the segment has >= this many
-					used pages, it may be expanded by
-					allocating extents to the segment;
-					until that only individual fragment
-					pages are allocated from the space */
-
-#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
-					is at least this many extents, we
-					allow extents to be put to the free
-					list of the extent: at most
-					FSEG_FREE_LIST_MAX_LEN many */
-#define	FSEG_FREE_LIST_MAX_LEN	4
-
-
-/*			EXTENT DESCRIPTOR
-			=================
-
-File extent descriptor data structure: contains bits to tell which pages in
-the extent are free and which contain old tuple version to clean. */
-
-/*-------------------------------------*/
-#define	XDES_ID			0	/* The identifier of the segment
-					to which this extent belongs */
-#define XDES_FLST_NODE		8	/* The list node data structure
-					for the descriptors */
-#define	XDES_STATE		(FLST_NODE_SIZE + 8)
-					/* contains state information
-					of the extent */
-#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
-					/* Descriptor bitmap of the pages
-					in the extent */
-/*-------------------------------------*/
-
-#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
-#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
-					the page is free */
-#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
-					Index of the bit which tells if
-					there are old versions of tuples
-					on the page */
-/* States of a descriptor */
-#define	XDES_FREE		1	/* extent is in free list of space */
-#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
-					space */
-#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
-					space */
-#define	XDES_FSEG		4	/* extent belongs to a segment */
-
-/* File extent data structure size in bytes. */
-#define	XDES_SIZE							\
-	(XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
-
-/* Offset of the descriptor array on a descriptor page */
-#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+#include "srv0start.h"
+
 
 #ifndef UNIV_HOTBACKUP
-/* Flag to indicate if we have printed the tablespace full error. */
+/** Flag to indicate if we have printed the tablespace full error. */
 static ibool fsp_tbs_full_error_printed = FALSE;
 
 /**********************************************************************//**
@@ -183,7 +65,7 @@ fsp_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset in the extent */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Frees an extent of a segment to the space free list. */
 static
@@ -195,7 +77,7 @@ fseg_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset in the extent */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Calculates the number of pages reserved by a segment, and how
 many pages are currently used.
@@ -207,7 +89,7 @@ fseg_n_reserved_pages_low(
 	fseg_inode_t*	header,	/*!< in: segment inode */
 	ulint*		used,	/*!< out: number of pages used (not
 				more than reserved) */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /********************************************************************//**
 Marks a page used. The page must reside within the extents of the given
 segment. */
@@ -217,8 +99,8 @@ fseg_mark_page_used(
 /*================*/
 	fseg_inode_t*	seg_inode,/*!< in: segment inode */
 	ulint		page,	/*!< in: page offset */
-	xdes_t*		descr, /* extent descriptor */
-	mtr_t*		mtr);	/*!< in: mtr */
+	xdes_t*		descr,  /*!< in: extent descriptor */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Returns the first extent descriptor for a segment. We think of the extent
 lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
@@ -232,7 +114,7 @@ fseg_get_first_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Puts new extents to the free list if
 there are free extents above the free limit. If an extent happens
@@ -250,7 +132,7 @@ fsp_fill_free_list(
 	ulint		space,		/*!< in: space */
 	fsp_header_t*	header,		/*!< in/out: space header */
 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
-	__attribute__((nonnull));
+	UNIV_COLD __attribute__((nonnull));
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -305,14 +187,14 @@ fsp_get_space_header(
 	ulint	id,	/*!< in: space id */
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	buf_block_t*	block;
 	fsp_header_t*	header;
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
-	ut_ad(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_ad(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(id || !zip_size);
 
 	block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr);
@@ -323,7 +205,7 @@ fsp_get_space_header(
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
 	ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header));
-	ut_ad(zip_size == dict_table_flags_to_zip_size(
+	ut_ad(zip_size == fsp_flags_get_zip_size(
 		      mach_read_from_4(FSP_SPACE_FLAGS + header)));
 	return(header);
 }
@@ -333,30 +215,18 @@ Gets a descriptor bit of a page.
 @return	TRUE if free */
 UNIV_INLINE
 ibool
-xdes_get_bit(
-/*=========*/
+xdes_mtr_get_bit(
+/*=============*/
 	const xdes_t*	descr,	/*!< in: descriptor */
 	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ulint		offset,	/*!< in: page offset within extent:
 				0 ... FSP_EXTENT_SIZE - 1 */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
 {
-	ulint	index;
-	ulint	byte_index;
-	ulint	bit_index;
-
+	ut_ad(mtr->state == MTR_ACTIVE);
 	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
-	ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
-	ut_ad(offset < FSP_EXTENT_SIZE);
-
-	index = bit + XDES_BITS_PER_PAGE * offset;
-
-	byte_index = index / 8;
-	bit_index = index % 8;
 
-	return(ut_bit_get_nth(mtr_read_ulint(descr + XDES_BITMAP + byte_index,
-					     MLOG_1BYTE, mtr),
-			      bit_index));
+	return(xdes_get_bit(descr, bit, offset));
 }
 
 /**********************************************************************//**
@@ -370,7 +240,7 @@ xdes_set_bit(
 	ulint	offset,	/*!< in: page offset within extent:
 			0 ... FSP_EXTENT_SIZE - 1 */
 	ibool	val,	/*!< in: bit value */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	index;
 	ulint	byte_index;
@@ -406,8 +276,9 @@ xdes_find_bit(
 	xdes_t*	descr,	/*!< in: descriptor */
 	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ibool	val,	/*!< in: desired bit value */
-	ulint	hint,	/*!< in: hint of which bit position would be desirable */
-	mtr_t*	mtr)	/*!< in: mtr */
+	ulint	hint,	/*!< in: hint of which bit position would
+			be desirable */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -416,14 +287,14 @@ xdes_find_bit(
 	ut_ad(hint < FSP_EXTENT_SIZE);
 	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
 	for (i = hint; i < FSP_EXTENT_SIZE; i++) {
-		if (val == xdes_get_bit(descr, bit, i, mtr)) {
+		if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
 
 			return(i);
 		}
 	}
 
 	for (i = 0; i < hint; i++) {
-		if (val == xdes_get_bit(descr, bit, i, mtr)) {
+		if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
 
 			return(i);
 		}
@@ -443,8 +314,9 @@ xdes_find_bit_downward(
 	xdes_t*	descr,	/*!< in: descriptor */
 	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ibool	val,	/*!< in: desired bit value */
-	ulint	hint,	/*!< in: hint of which bit position would be desirable */
-	mtr_t*	mtr)	/*!< in: mtr */
+	ulint	hint,	/*!< in: hint of which bit position would
+			be desirable */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -453,14 +325,14 @@ xdes_find_bit_downward(
 	ut_ad(hint < FSP_EXTENT_SIZE);
 	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
 	for (i = hint + 1; i > 0; i--) {
-		if (val == xdes_get_bit(descr, bit, i - 1, mtr)) {
+		if (val == xdes_mtr_get_bit(descr, bit, i - 1, mtr)) {
 
 			return(i - 1);
 		}
 	}
 
 	for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) {
-		if (val == xdes_get_bit(descr, bit, i, mtr)) {
+		if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
 
 			return(i);
 		}
@@ -477,15 +349,14 @@ ulint
 xdes_get_n_used(
 /*============*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	ulint	i;
 	ulint	count	= 0;
 
 	ut_ad(descr && mtr);
 	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
-	for (i = 0; i < FSP_EXTENT_SIZE; i++) {
-		if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+	for (ulint i = 0; i < FSP_EXTENT_SIZE; ++i) {
+		if (FALSE == xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
 			count++;
 		}
 	}
@@ -501,7 +372,7 @@ ibool
 xdes_is_free(
 /*=========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	if (0 == xdes_get_n_used(descr, mtr)) {
 
@@ -519,7 +390,7 @@ ibool
 xdes_is_full(
 /*=========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
 
@@ -537,7 +408,7 @@ xdes_set_state(
 /*===========*/
 	xdes_t*	descr,	/*!< in/out: descriptor */
 	ulint	state,	/*!< in: state to set */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ut_ad(descr && mtr);
 	ut_ad(state >= XDES_FREE);
@@ -555,7 +426,7 @@ ulint
 xdes_get_state(
 /*===========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	state;
 
@@ -574,7 +445,7 @@ void
 xdes_init(
 /*======*/
 	xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -590,61 +461,6 @@ xdes_init(
 }
 
 /********************************************************************//**
-Calculates the page where the descriptor of a page resides.
-@return	descriptor page offset */
-UNIV_INLINE
-ulint
-xdes_calc_descriptor_page(
-/*======================*/
-	ulint	zip_size,	/*!< in: compressed page size in bytes;
-				0 for uncompressed pages */
-	ulint	offset)		/*!< in: page offset */
-{
-//#ifndef DOXYGEN /* Doxygen gets confused of these */
-//# if UNIV_PAGE_SIZE <= XDES_ARR_OFFSET
-//		+ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
-//#  error
-//# endif
-//# if PAGE_ZIP_MIN_SIZE <= XDES_ARR_OFFSET
-//		+ (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
-//#  error
-//# endif
-//#endif /* !DOXYGEN */
-	ut_a(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE);
-	ut_a(PAGE_ZIP_MIN_SIZE > XDES_ARR_OFFSET + (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE);
-	ut_ad(ut_is_2pow(zip_size));
-
-	if (!zip_size) {
-		return(ut_2pow_round(offset, UNIV_PAGE_SIZE));
-	} else {
-		ut_ad(zip_size > XDES_ARR_OFFSET
-		      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
-		return(ut_2pow_round(offset, zip_size));
-	}
-}
-
-/********************************************************************//**
-Calculates the descriptor index within a descriptor page.
-@return	descriptor index */
-UNIV_INLINE
-ulint
-xdes_calc_descriptor_index(
-/*=======================*/
-	ulint	zip_size,	/*!< in: compressed page size in bytes;
-				0 for uncompressed pages */
-	ulint	offset)		/*!< in: page offset */
-{
-	ut_ad(ut_is_2pow(zip_size));
-
-	if (!zip_size) {
-		return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
-		       / FSP_EXTENT_SIZE);
-	} else {
-		return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
-	}
-}
-
-/********************************************************************//**
 Gets pointer to a the extent descriptor of a page. The page where the extent
 descriptor resides is x-locked. This function no longer extends the data
 file.
@@ -676,7 +492,7 @@ xdes_get_descriptor_with_space_hdr(
 	/* Read free limit and space size */
 	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
 	size  = mach_read_from_4(sp_header + FSP_SIZE);
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
 
 	if ((offset >= size) || (offset >= limit)) {
@@ -748,7 +564,7 @@ xdes_lst_get_descriptor(
 				or 0 for uncompressed pages */
 	fil_addr_t	lst_node,/*!< in: file address of the list node
 				contained in the descriptor */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 
@@ -768,7 +584,7 @@ UNIV_INLINE
 ulint
 xdes_get_offset(
 /*============*/
-	xdes_t*	descr)	/*!< in: extent descriptor */
+	const xdes_t*	descr)	/*!< in: extent descriptor */
 {
 	ut_ad(descr);
 
@@ -793,7 +609,7 @@ fsp_init_file_page_low(
 	block->check_index_page_at_flush = FALSE;
 #endif /* !UNIV_HOTBACKUP */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		memset(page, 0, UNIV_PAGE_SIZE);
 		memset(page_zip->data, 0, page_zip_get_size(page_zip));
 		mach_write_to_4(page + FIL_PAGE_OFFSET,
@@ -822,7 +638,7 @@ void
 fsp_init_file_page(
 /*===============*/
 	buf_block_t*	block,	/*!< in: pointer to a page */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_init_file_page_low(block);
 
@@ -858,27 +674,33 @@ void
 fsp_init(void)
 /*==========*/
 {
+	/* Page size and zip size must be multiples of FSP_EXTENT_SIZE */
+	ut_a(0 == (UNIV_PAGE_SIZE % FSP_EXTENT_SIZE));
+	ut_a(UNIV_PAGE_SIZE);
+
+#if UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX
+# error "UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX != 0"
+#endif
+#if UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN
+# error "UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN != 0"
+#endif
+
 	/* Does nothing at the moment */
 }
 
 /**********************************************************************//**
-Writes the space id and compressed page size to a tablespace header.
-This function is used past the buffer pool when we in fil0fil.c create
-a new single-table tablespace. */
+Writes the space id and flags to a tablespace header.  The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
 UNIV_INTERN
 void
 fsp_header_init_fields(
 /*===================*/
 	page_t*	page,		/*!< in/out: first page in the space */
 	ulint	space_id,	/*!< in: space id */
-	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS):
-				0, or table->flags if newer than COMPACT */
+	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS) */
 {
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal table->flags. */
-	ut_a(flags != DICT_TF_COMPACT);
+	ut_a(fsp_flags_is_valid(flags));
 
 	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
 			space_id);
@@ -896,7 +718,7 @@ fsp_header_init(
 /*============*/
 	ulint	space,		/*!< in: space id */
 	ulint	size,		/*!< in: current size in blocks */
-	mtr_t*	mtr)		/*!< in: mini-transaction handle */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	buf_block_t*	block;
@@ -908,7 +730,7 @@ fsp_header_init(
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
 
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 	block = buf_page_create(space, 0, zip_size, mtr);
 	buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -939,10 +761,10 @@ fsp_header_init(
 	flst_init(header + FSP_SEG_INODES_FREE, mtr);
 
 	mlog_write_ull(header + FSP_SEG_ID, 1, mtr);
-	if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) {
+	if (space == 0) {
 		fsp_fill_free_list(FALSE, space, header, mtr);
 		btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
-			   space, 0, DICT_IBUF_ID_MIN + space,
+			   0, 0, DICT_IBUF_ID_MIN + space,
 			   dict_ind_redundant, mtr);
 	} else {
 		fsp_fill_free_list(TRUE, space, header, mtr);
@@ -966,11 +788,13 @@ fsp_header_get_space_id(
 
 	id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 
+	DBUG_EXECUTE_IF("fsp_header_get_space_id_failure",
+			id = ULINT_UNDEFINED;);
+
 	if (id != fsp_id) {
-		fprintf(stderr,
-			"InnoDB: Error: space id in fsp header %lu,"
-			" but in the page header %lu\n",
-			(ulong) fsp_id, (ulong) id);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Space id in fsp header %lu,but in the page header "
+			"%lu", fsp_id, id);
 
 		return(ULINT_UNDEFINED);
 	}
@@ -1003,7 +827,7 @@ fsp_header_get_zip_size(
 {
 	ulint	flags = fsp_header_get_flags(page);
 
-	return(dict_table_flags_to_zip_size(flags));
+	return(fsp_flags_get_zip_size(flags));
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -1013,9 +837,9 @@ UNIV_INTERN
 void
 fsp_header_inc_size(
 /*================*/
-	ulint	space,	/*!< in: space id */
-	ulint	size_inc,/*!< in: size increment in pages */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	ulint	space,		/*!< in: space id */
+	ulint	size_inc,	/*!< in: size increment in pages */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	ulint		size;
@@ -1026,7 +850,7 @@ fsp_header_inc_size(
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
 
 	header = fsp_get_space_header(space,
-				      dict_table_flags_to_zip_size(flags),
+				      fsp_flags_get_zip_size(flags),
 				      mtr);
 
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
@@ -1036,38 +860,6 @@ fsp_header_inc_size(
 }
 
 /**********************************************************************//**
-Gets the current free limit of the system tablespace.  The free limit
-means the place of the first page which has never been put to the
-free list for allocation.  The space above that address is initialized
-to zero.  Sets also the global variable log_fsp_current_free_limit.
-@return	free limit in megabytes */
-UNIV_INTERN
-ulint
-fsp_header_get_free_limit(void)
-/*===========================*/
-{
-	fsp_header_t*	header;
-	ulint		limit;
-	mtr_t		mtr;
-
-	mtr_start(&mtr);
-
-	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
-
-	header = fsp_get_space_header(0, 0, &mtr);
-
-	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr);
-
-	limit /= ((1024 * 1024) / UNIV_PAGE_SIZE);
-
-	log_fsp_current_free_limit_set_and_checkpoint(limit);
-
-	mtr_commit(&mtr);
-
-	return(limit);
-}
-
-/**********************************************************************//**
 Gets the size of the system tablespace from the tablespace header.  If
 we do not have an auto-extending data file, this should be equal to
 the size of the data files.  If there is an auto-extending data file,
@@ -1099,7 +891,7 @@ fsp_header_get_tablespace_size(void)
 Tries to extend a single-table tablespace so that a page would fit in the
 data file.
 @return	TRUE if success */
-static __attribute__((nonnull, warn_unused_result))
+static UNIV_COLD __attribute__((nonnull, warn_unused_result))
 ibool
 fsp_try_extend_data_file_with_pages(
 /*================================*/
@@ -1131,7 +923,7 @@ fsp_try_extend_data_file_with_pages(
 /***********************************************************************//**
 Tries to extend the last data file of a tablespace if it is auto-extending.
 @return	FALSE if not auto-extending */
-static __attribute__((nonnull))
+static UNIV_COLD __attribute__((nonnull))
 ibool
 fsp_try_extend_data_file(
 /*=====================*/
@@ -1173,7 +965,7 @@ fsp_try_extend_data_file(
 	}
 
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(header + FSP_SPACE_FLAGS));
 
 	old_size = size;
@@ -1246,6 +1038,11 @@ fsp_try_extend_data_file(
 
 	success = fil_extend_space_to_desired_size(&actual_size, space,
 						   size + size_increase);
+	if (!success) {
+
+		return(false);
+	}
+
 	/* We ignore any fragments of a full megabyte when storing the size
 	to the space header */
 
@@ -1297,11 +1094,11 @@ fsp_fill_free_list(
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
 	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
 
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + header));
 	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
-	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 
 	if (space == 0 && srv_auto_extend_last_data_file
 	    && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
@@ -1334,15 +1131,6 @@ fsp_fill_free_list(
 		mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
 				 MLOG_4BYTES, mtr);
 
-		/* Update the free limit info in the log system and make
-		a checkpoint */
-		if (space == 0) {
-			ut_a(!zip_size);
-			log_fsp_current_free_limit_set_and_checkpoint(
-				(i + FSP_EXTENT_SIZE)
-				/ ((1024 * 1024) / UNIV_PAGE_SIZE));
-		}
-
 		if (UNIV_UNLIKELY(init_xdes)) {
 
 			buf_block_t*	block;
@@ -1392,13 +1180,6 @@ fsp_fill_free_list(
 							   mtr);
 		xdes_init(descr, mtr);
 
-//#if UNIV_PAGE_SIZE % FSP_EXTENT_SIZE
-//# error "UNIV_PAGE_SIZE % FSP_EXTENT_SIZE != 0"
-//#endif
-//#if PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE
-//# error "PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE != 0"
-//#endif
-
 		if (UNIV_UNLIKELY(init_xdes)) {
 
 			/* The first page in the extent is a descriptor page
@@ -1439,7 +1220,7 @@ fsp_alloc_free_extent(
 	ulint	hint,	/*!< in: hint of which extent would be desirable: any
 			page offset in the extent goes; the hint must not
 			be > FSP_FREE_LIMIT */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	fil_addr_t	first;
@@ -1490,7 +1271,7 @@ fsp_alloc_from_free_frag(
 	ulint		frag_n_used;
 
 	ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
-	ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr));
+	ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, bit, mtr));
 	xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr);
 
 	/* Update the FRAG_N_USED field */
@@ -1639,7 +1420,7 @@ fsp_alloc_free_page(
 			     hint % FSP_EXTENT_SIZE, mtr);
 	if (free == ULINT_UNDEFINED) {
 
-		ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+		ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
 		putc('\n', stderr);
 
 		ut_error;
@@ -1672,7 +1453,6 @@ fsp_alloc_free_page(
 	}
 
 	fsp_alloc_from_free_frag(header, descr, free, mtr);
-
 	return(fsp_page_create(space, zip_size, page_no, mtr, init_mtr));
 }
 
@@ -1686,7 +1466,7 @@ fsp_free_page(
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
 	ulint	page,	/*!< in: page offset */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	xdes_t*		descr;
@@ -1710,7 +1490,7 @@ fsp_free_page(
 			(ulong) page,
 			(ulong) state);
 		fputs("InnoDB: Dump of descriptor: ", stderr);
-		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		ut_print_buf(stderr, ((byte*) descr) - 50, 200);
 		putc('\n', stderr);
 		/* Crash in debug version, so that we get a core dump
 		of this corruption. */
@@ -1726,12 +1506,14 @@ fsp_free_page(
 		ut_error;
 	}
 
-	if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+	if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+			     page % FSP_EXTENT_SIZE, mtr)) {
+
 		fprintf(stderr,
 			"InnoDB: Error: File space extent descriptor"
 			" of page %lu says it is free\n"
 			"InnoDB: Dump of descriptor: ", (ulong) page);
-		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		ut_print_buf(stderr, ((byte*) descr) - 50, 200);
 		putc('\n', stderr);
 		/* Crash in debug version, so that we get a core dump
 		of this corruption. */
@@ -1784,7 +1566,7 @@ fsp_free_extent(
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
 	ulint	page,	/*!< in: page offset in the extent */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	xdes_t*		descr;
@@ -1797,7 +1579,7 @@ fsp_free_extent(
 
 	if (xdes_get_state(descr, mtr) == XDES_FREE) {
 
-		ut_print_buf(stderr, (byte*)descr - 500, 1000);
+		ut_print_buf(stderr, (byte*) descr - 500, 1000);
 		putc('\n', stderr);
 
 		ut_error;
@@ -1820,7 +1602,7 @@ fsp_seg_inode_page_get_nth_inode(
 	ulint	zip_size __attribute__((unused)),
 			/*!< in: compressed page size, or 0 */
 	mtr_t*	mtr __attribute__((unused)))
-			/*!< in: mini-transaction handle */
+			/*!< in/out: mini-transaction */
 {
 	ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size));
 	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
@@ -1837,7 +1619,7 @@ fsp_seg_inode_page_find_used(
 /*=========================*/
 	page_t*	page,	/*!< in: segment inode page */
 	ulint	zip_size,/*!< in: compressed page size, or 0 */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		i;
 	fseg_inode_t*	inode;
@@ -1869,20 +1651,19 @@ fsp_seg_inode_page_find_free(
 	page_t*	page,	/*!< in: segment inode page */
 	ulint	i,	/*!< in: search forward starting from this index */
 	ulint	zip_size,/*!< in: compressed page size, or 0 */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
-	fseg_inode_t*	inode;
-
 	SRV_CORRUPT_TABLE_CHECK(page, return(ULINT_UNDEFINED););
 
 	for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
 
+		fseg_inode_t*	inode;
+
 		inode = fsp_seg_inode_page_get_nth_inode(
 			page, i, zip_size, mtr);
 
 		if (!mach_read_from_8(inode + FSEG_ID)) {
 			/* This is unused */
-
 			return(i);
 		}
 
@@ -1901,19 +1682,19 @@ ibool
 fsp_alloc_seg_inode_page(
 /*=====================*/
 	fsp_header_t*	space_header,	/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	buf_block_t*	block;
 	page_t*		page;
 	ulint		space;
 	ulint		zip_size;
-	ulint		i;
 
 	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
 
 	space = page_get_space_id(page_align(space_header));
-	zip_size = dict_table_flags_to_zip_size(
+
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
 
 	block = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr);
@@ -1933,16 +1714,18 @@ fsp_alloc_seg_inode_page(
 	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE,
 			 MLOG_2BYTES, mtr);
 
-	for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+	for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
 
-		inode = fsp_seg_inode_page_get_nth_inode(page, i,
-							 zip_size, mtr);
+		inode = fsp_seg_inode_page_get_nth_inode(
+			page, i, zip_size, mtr);
 
 		mlog_write_ull(inode + FSEG_ID, 0, mtr);
 	}
 
-	flst_add_last(space_header + FSP_SEG_INODES_FREE,
-		      page + FSEG_INODE_PAGE_NODE, mtr);
+	flst_add_last(
+		space_header + FSP_SEG_INODES_FREE,
+		page + FSEG_INODE_PAGE_NODE, mtr);
+
 	return(TRUE);
 }
 
@@ -1954,7 +1737,7 @@ fseg_inode_t*
 fsp_alloc_seg_inode(
 /*================*/
 	fsp_header_t*	space_header,	/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint		page_no;
 	buf_block_t*	block;
@@ -1979,7 +1762,7 @@ fsp_alloc_seg_inode(
 
 	page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page;
 
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
 	block = buf_page_get(page_get_space_id(page_align(space_header)),
 			     zip_size, page_no, RW_X_LATCH, mtr);
@@ -2022,7 +1805,7 @@ fsp_free_seg_inode(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	page_t*		page;
 	fsp_header_t*	space_header;
@@ -2071,7 +1854,7 @@ fseg_inode_try_get(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fil_addr_t	inode_addr;
 	fseg_inode_t*	inode;
@@ -2106,7 +1889,7 @@ fseg_inode_get(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode
 		= fseg_inode_try_get(header, space, zip_size, mtr);
@@ -2123,7 +1906,8 @@ fseg_get_nth_frag_page_no(
 /*======================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint		n,	/*!< in: slot index */
-	mtr_t*		mtr __attribute__((unused))) /*!< in: mtr handle */
+	mtr_t*		mtr __attribute__((unused)))
+				/*!< in/out: mini-transaction */
 {
 	ut_ad(inode && mtr);
 	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
@@ -2142,7 +1926,7 @@ fseg_set_nth_frag_page_no(
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint		n,	/*!< in: slot index */
 	ulint		page_no,/*!< in: page number to set */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ut_ad(inode && mtr);
 	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
@@ -2161,7 +1945,7 @@ ulint
 fseg_find_free_frag_page_slot(
 /*==========================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	page_no;
@@ -2188,7 +1972,7 @@ ulint
 fseg_find_last_used_frag_page_slot(
 /*===============================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	page_no;
@@ -2216,7 +2000,7 @@ ulint
 fseg_get_n_frag_pages(
 /*==================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	count	= 0;
@@ -2253,7 +2037,7 @@ fseg_create_general(
 			the inode and the other for the segment) then there is
 			no need to do the check for this individual
 			operation */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		flags;
 	ulint		zip_size;
@@ -2262,7 +2046,7 @@ fseg_create_general(
 	ib_id_t		seg_id;
 	buf_block_t*	block	= 0; /* remove warning */
 	fseg_header_t*	header	= 0; /* remove warning */
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	ibool		success;
 	ulint		n_reserved;
 	ulint		i;
@@ -2272,16 +2056,13 @@ fseg_create_general(
 	      <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	if (page != 0) {
 		block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
 		header = byte_offset + buf_block_get_frame(block);
 	}
 
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
-
 	mtr_x_lock(latch, mtr);
 
 	if (rw_lock_get_x_lock_count(latch) == 1) {
@@ -2381,7 +2162,7 @@ fseg_create(
 			will belong to the created segment */
 	ulint	byte_offset, /*!< in: byte offset of the created segment header
 			on the page */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	return(fseg_create_general(space, page, byte_offset, FALSE, mtr));
 }
@@ -2397,7 +2178,7 @@ fseg_n_reserved_pages_low(
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint*		used,	/*!< out: number of pages used (not
 				more than reserved) */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	ret;
 
@@ -2426,21 +2207,18 @@ fseg_n_reserved_pages(
 /*==================*/
 	fseg_header_t*	header,	/*!< in: segment header */
 	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		ret;
 	fseg_inode_t*	inode;
 	ulint		space;
 	ulint		flags;
 	ulint		zip_size;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 
 	space = page_get_space_id(page_align(header));
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -2466,7 +2244,7 @@ fseg_fill_free_list(
 				or 0 for uncompressed pages */
 	ulint		hint,	/*!< in: hint which extent would be good as
 				the first extent */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	i;
@@ -2533,7 +2311,7 @@ fseg_alloc_free_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*		descr;
 	ib_id_t		seg_id;
@@ -2640,8 +2418,8 @@ fseg_alloc_free_page_low(
 	/*-------------------------------------------------------------*/
 	if ((xdes_get_state(descr, mtr) == XDES_FSEG)
 	    && mach_read_from_8(descr + XDES_ID) == seg_id
-	    && (xdes_get_bit(descr, XDES_FREE_BIT,
-			     hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
+	    && (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+				 hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
 take_hinted_page:
 		/* 1. We can take the hinted page
 		=================================*/
@@ -2806,14 +2584,16 @@ got_hinted_page:
 
 		ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr)
 		      == ret_descr);
-		ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT,
-				   ret_page % FSP_EXTENT_SIZE, mtr) == TRUE);
+
+		ut_ad(xdes_mtr_get_bit(
+				ret_descr, XDES_FREE_BIT,
+				ret_page % FSP_EXTENT_SIZE, mtr));
 
 		fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr);
 	}
 
 	return(fsp_page_create(
-		       space, dict_table_flags_to_zip_size(
+		       space, fsp_flags_get_zip_size(
 			       mach_read_from_4(FSP_SPACE_FLAGS
 						+ space_header)),
 		       ret_page, mtr, init_mtr));
@@ -2844,7 +2624,7 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr,	/*!< in/out: mini-transaction handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
 				in which the page should be initialized.
 				If init_mtr!=mtr, but the page is already
@@ -2854,7 +2634,7 @@ fseg_alloc_free_page_general(
 	ulint		space;
 	ulint		flags;
 	ulint		zip_size;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	buf_block_t*	block;
 	ulint		n_reserved;
 
@@ -2862,10 +2642,7 @@ fseg_alloc_free_page_general(
 
 	latch = fil_space_get_latch(space, &flags);
 
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -2912,7 +2689,7 @@ fsp_reserve_free_pages(
 					x-latched */
 	ulint		size,		/*!< in: size of the tablespace in
 					pages, must be < FSP_EXTENT_SIZE */
-	mtr_t*		mtr)		/*!< in: mtr */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	n_used;
@@ -2971,10 +2748,10 @@ fsp_reserve_free_extents(
 	ulint	space,	/*!< in: space id */
 	ulint	n_ext,	/*!< in: number of extents to reserve */
 	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	space_header;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	ulint		n_free_list_ext;
 	ulint		free_limit;
 	ulint		size;
@@ -2990,10 +2767,7 @@ fsp_reserve_free_extents(
 	*n_reserved = n_ext;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3093,14 +2867,12 @@ fsp_get_available_space_in_free_extents(
 	ulint		n_free;
 	ulint		n_free_up;
 	ulint		reserve;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	mtr_t		mtr;
 
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	/* The convoluted mutex acquire is to overcome latching order
 	issues: The problem is that the fil_mutex is at a lower level
-	than the tablespace latch and the buffer pool mutex. We have to
+	than the tablespace latch and the buffer pool mutexes. We have to
 	first prevent any operations on the file system by acquiring the
 	dictionary mutex. Then acquire the tablespace latch to obey the
 	latching order and then release the dictionary mutex. That way we
@@ -3110,7 +2882,7 @@ fsp_get_available_space_in_free_extents(
 	However, there is one further complication, we release the fil_mutex
 	when we need to invalidate the the pages in the buffer pool and we
 	reacquire the fil_mutex when deleting and freeing the tablespace
-	instance in fil0fil.c. Here we need to account for that situation
+	instance in fil0fil.cc. Here we need to account for that situation
 	too. */
 
 	mutex_enter(&dict_sys->mutex);
@@ -3133,7 +2905,7 @@ fsp_get_available_space_in_free_extents(
 	by another thread. However, the tablespace pages can still be freed
 	from the buffer pool. We need to check for that again. */
 
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, &mtr);
 
@@ -3222,8 +2994,8 @@ fseg_mark_page_used(
 /*================*/
 	fseg_inode_t*	seg_inode,/*!< in: segment inode */
 	ulint		page,	/*!< in: page offset */
-	xdes_t*		descr, /* extent descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	xdes_t*		descr,  /*!< in: extent descriptor */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	not_full_n_used;
 
@@ -3243,8 +3015,9 @@ fseg_mark_page_used(
 			      descr + XDES_FLST_NODE, mtr);
 	}
 
-	ut_ad(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
-	      == TRUE);
+	ut_ad(xdes_mtr_get_bit(
+			descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr));
+
 	/* We mark the page as used */
 	xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr);
 
@@ -3278,7 +3051,7 @@ fseg_free_page_low(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	not_full_n_used;
@@ -3305,7 +3078,8 @@ fseg_free_page_low(
 		return;
 	});
 
-	if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+	if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+			     page % FSP_EXTENT_SIZE, mtr)) {
 		fputs("InnoDB: Dump of the tablespace extent descriptor: ",
 		      stderr);
 		ut_print_buf(stderr, descr, 40);
@@ -3418,18 +3192,15 @@ fseg_free_page(
 	fseg_header_t*	seg_header, /*!< in: segment header */
 	ulint		space,	/*!< in: space id */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		flags;
 	ulint		zip_size;
 	fseg_inode_t*	seg_inode;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3443,6 +3214,49 @@ fseg_free_page(
 }
 
 /**********************************************************************//**
+Checks if a single page of a segment is free.  The extent descriptor bit
+is read inside a private mini-transaction that x-latches the tablespace.
+@return	true if free */
+UNIV_INTERN
+bool
+fseg_page_is_free(
+/*==============*/
+	fseg_header_t*	seg_header,	/*!< in: segment header */
+	ulint		space,		/*!< in: space id */
+	ulint		page)		/*!< in: page offset */
+{
+	mtr_t		mtr;
+	ibool		is_free;
+	ulint		flags;
+	prio_rw_lock_t*	latch;
+	xdes_t*		descr;
+	ulint		zip_size;
+	fseg_inode_t*	seg_inode;
+
+	/* These are tablespace flags, so decode them with the fsp_flags
+	accessor, as every other fil_space_get_latch() caller here does. */
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = fsp_flags_get_zip_size(flags);
+
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	/* Only used to sanity-check the segment header. */
+	seg_inode = fseg_inode_get(seg_header, space, zip_size, &mtr);
+	ut_a(seg_inode);
+	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+	descr = xdes_get_descriptor(space, zip_size, page, &mtr);
+	ut_a(descr);
+	is_free = xdes_mtr_get_bit(
+		descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, &mtr);
+	mtr_commit(&mtr);
+	return(is_free);
+}
+
+/**********************************************************************//**
 Frees an extent of a segment to the space free list. */
 static
 void
@@ -3453,7 +3267,7 @@ fseg_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: a page in the extent */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	first_page_in_extent;
 	xdes_t*	descr;
@@ -3473,7 +3287,7 @@ fseg_free_extent(
 	first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
 
 	for (i = 0; i < FSP_EXTENT_SIZE; i++) {
-		if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+		if (!xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
 
 			/* Drop search system page hash index if the page is
 			found in the pool and is hashed */
@@ -3528,7 +3342,7 @@ fseg_free_step(
 				resides on the first page of the frag list
 				of the segment, this pointer becomes obsolete
 				after the last freeing step */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
 	ulint		page;
@@ -3538,16 +3352,13 @@ fseg_free_step(
 	ulint		flags;
 	ulint		zip_size;
 	ulint		header_page;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 
 	space = page_get_space_id(page_align(header));
 	header_page = page_get_page_no(page_align(header));
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3562,8 +3373,9 @@ fseg_free_step(
 	/* Check that the header resides on a page which has not been
 	freed yet */
 
-	ut_a(xdes_get_bit(descr, XDES_FREE_BIT,
-			  header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
+	ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+			      header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
+
 	inode = fseg_inode_try_get(header, space, zip_size, mtr);
 
 	if (UNIV_UNLIKELY(inode == NULL)) {
@@ -3618,7 +3430,7 @@ fseg_free_step_not_header(
 /*======================*/
 	fseg_header_t*	header,	/*!< in: segment header which must reside on
 				the first fragment page of the segment */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
 	ulint		page;
@@ -3628,15 +3440,12 @@ fseg_free_step_not_header(
 	ulint		flags;
 	ulint		zip_size;
 	ulint		page_no;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 
 	space = page_get_space_id(page_align(header));
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3692,7 +3501,7 @@ fseg_get_first_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fil_addr_t	first;
 	xdes_t*		descr;
@@ -3734,7 +3543,7 @@ ibool
 fseg_validate_low(
 /*==============*/
 	fseg_inode_t*	inode, /*!< in: segment inode */
-	mtr_t*		mtr2)	/*!< in: mtr */
+	mtr_t*		mtr2)	/*!< in/out: mini-transaction */
 {
 	ulint		space;
 	ib_id_t		seg_id;
@@ -3765,7 +3574,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3788,7 +3597,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3814,7 +3623,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3841,7 +3650,7 @@ ibool
 fseg_validate(
 /*==========*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	ibool		ret;
@@ -3852,7 +3661,7 @@ fseg_validate(
 	space = page_get_space_id(page_align(header));
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	inode = fseg_inode_get(header, space, zip_size, mtr);
 
@@ -3869,7 +3678,7 @@ void
 fseg_print_low(
 /*===========*/
 	fseg_inode_t*	inode, /*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	space;
 	ulint	n_used;
@@ -3918,7 +3727,7 @@ void
 fseg_print(
 /*=======*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	ulint		space;
@@ -3928,7 +3737,7 @@ fseg_print(
 	space = page_get_space_id(page_align(header));
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	inode = fseg_inode_get(header, space, zip_size, mtr);
 
@@ -3948,7 +3757,7 @@ fsp_validate(
 	fsp_header_t*	header;
 	fseg_inode_t*	seg_inode;
 	page_t*		seg_inode_page;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	ulint		size;
 	ulint		flags;
 	ulint		zip_size;
@@ -3968,10 +3777,10 @@ fsp_validate(
 	ulint		seg_inode_len_full;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
-	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 
 	/* Start first a mini-transaction mtr2 to lock out all other threads
 	from the fsp system */
@@ -4200,7 +4009,7 @@ fsp_print(
 	fsp_header_t*	header;
 	fseg_inode_t*	seg_inode;
 	page_t*		seg_inode_page;
-	rw_lock_t*	latch;
+	prio_rw_lock_t*	latch;
 	ulint		flags;
 	ulint		zip_size;
 	ulint		size;
@@ -4218,7 +4027,7 @@ fsp_print(
 	mtr_t		mtr2;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	/* Start first a mini-transaction mtr2 to lock out all other threads
 	from the fsp system */
diff --git a/storage/xtradb/fts/Makefile.query b/storage/xtradb/fts/Makefile.query
new file mode 100644
index 00000000000..12dcd833064
--- /dev/null
+++ b/storage/xtradb/fts/Makefile.query
@@ -0,0 +1,21 @@
+# Regenerate the FTS parser (bison) and lexers (flex) from fts0pars.y,
+# fts0blex.l and fts0tlex.l.
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all:	fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+# The scanner prefix is the file stem with "lex" removed (fts0blex -> fts0b);
+# the generated header is written to ../include.
+.l.cc:
+	$(LEX) -P$(subst lex,,$*) -o $*.cc --header-file=../include/$*.h $<
+
+# bison -d also emits a header next to the output; move it to ../include.
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc
new file mode 100644
index 00000000000..3a03fc63303
--- /dev/null
+++ b/storage/xtradb/fts/fts0ast.cc
@@ -0,0 +1,632 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+	FTS_PASS_FIRST,		/*!< First visit pass,
+				process operators excluding
+				FTS_EXIST and FTS_IGNORE */
+	FTS_PASS_EXIST,		/*!< Exist visit pass,
+				process operator FTS_EXIST */
+	FTS_PASS_IGNORE		/*!< Ignore visit pass,
+				process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return Create a new node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+	fts_ast_node_t*	node;
+
+	node = (fts_ast_node_t*) ut_malloc(sizeof(*node));
+	memset(node, 0x0, sizeof(*node));
+
+	return(node);
+}
+
+/******************************************************************//**
+Create a operator fts_ast_node_t.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_oper_t	oper)			/*!< in: ast operator */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_OPER;
+	node->oper = oper;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+Create term node(s) from ptr. The string is tokenized and each token is
+copied into its own node; ptr itself is not modified or freed here.
+@return new node or a node list with tokenized words */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	const char*	ptr)			/*!< in: ast term string */
+{
+	fts_ast_state_t*	state = static_cast<fts_ast_state_t*>(arg);
+	ulint			len = strlen(ptr);
+	ulint			cur_pos = 0;
+	fts_ast_node_t*         node = NULL;
+	fts_ast_node_t*		node_list = NULL;
+	fts_ast_node_t*		first_node = NULL;
+
+	/* Scan the incoming string and filter out any "non-word" characters */
+	while (cur_pos < len) {
+		fts_string_t	str;
+		ulint		offset;
+		ulint		cur_len;
+
+		cur_len = innobase_mysql_fts_get_token(
+			state->charset,
+			reinterpret_cast<const byte*>(ptr) + cur_pos,
+			reinterpret_cast<const byte*>(ptr) + len, &str, &offset);
+
+		if (cur_len == 0) {
+			break;
+		}
+
+		cur_pos += cur_len;
+
+		if (str.f_n_char > 0) {
+			/* If a subsequent term (any term after the first one)
+			is shorter than fts_min_token_size, ignore it.
+			This keeps the behavior consistent with MyISAM. */
+			if (first_node && (str.f_n_char < fts_min_token_size)) {
+				continue;
+			}
+
+			node = fts_ast_node_create();
+
+			node->type = FTS_AST_TERM;
+
+			node->term.ptr = static_cast<byte*>(ut_malloc(
+				str.f_len + 1));
+			memcpy(node->term.ptr, str.f_str, str.f_len);
+			node->term.ptr[str.f_len] = '\0';
+
+			fts_ast_state_add_node(
+				static_cast<fts_ast_state_t*>(arg), node);
+
+			if (first_node) {
+				/* There is more than one word, create
+				a list to organize them */
+				if (!node_list) {
+					node_list = fts_ast_create_node_list(
+						static_cast<fts_ast_state_t*>(
+							arg),
+						 first_node);
+				}
+
+				fts_ast_add_node(node_list, node);
+			} else {
+				first_node = node;
+			}
+		}
+	}
+
+	return((node_list != NULL) ? node_list : first_node);
+}
+
+/******************************************************************//**
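+Create a text node from a quoted string. The bytes between the two
+enclosing quotes are copied; ptr itself is not modified or freed here.
+@return new node, or NULL if ptr is at most two bytes long */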
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	const char*	ptr)			/*!< in: ast text string */
+{
+	ulint		len = strlen(ptr);
+	fts_ast_node_t*	node = NULL;
+
+
+	ut_ad(len >= 1);
+
+	if (len <= 2) {
+		/* There is a way to directly supply null terminator
+		in the query string (by using 0x220022) and get here,
+		and certainly it would not make a valid query text */
+		ut_ad(ptr[0] == '\"');
+
+		if (len == 2) {
+			ut_ad(ptr[1] == '\"');
+		}
+
+		return(NULL);
+	}
+
+	node = fts_ast_node_create();
+
+	/*!< We ignore the actual quotes "" */
+	len -= 2;
+
+	node->type = FTS_AST_TEXT;
+	node->text.ptr = static_cast<byte*>(ut_malloc(len + 1));
+
+	/*!< Skip copying the first quote */
+	memcpy(node->text.ptr, ptr + 1, len);
+	node->text.ptr[len] = 0;
+	node->text.distance = ULINT_UNDEFINED;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the expr and is responsible
+for free'ing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr)			/*!< in: ast expr instance */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_LIST;
+	node->list.head = node->list.tail = expr;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr)			/*!< in: ast expr instance */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_SUBEXP_LIST;
+	node->list.head = node->list.tail = expr;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+Free an expr list node elements. */
+static
+void
+fts_ast_free_list(
+/*==============*/
+	fts_ast_node_t*	node)			/*!< in: ast node to free */
+{
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	for (node = node->list.head;
+	     node != NULL;
+	     node = fts_ast_free_node(node)) {
+
+		/*!< No op */
+	}
+}
+
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node)			/*!< in: the node to free */
+{
+	fts_ast_node_t*	next_node;
+
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		if (node->text.ptr) {
+			ut_free(node->text.ptr);
+			node->text.ptr = NULL;
+		}
+		break;
+
+	case FTS_AST_TERM:
+		if (node->term.ptr) {
+			ut_free(node->term.ptr);
+			node->term.ptr = NULL;
+		}
+		break;
+
+	case FTS_AST_LIST:
+	case FTS_AST_SUBEXP_LIST:
+		fts_ast_free_list(node);
+		node->list.head = node->list.tail = NULL;
+		break;
+
+	case FTS_AST_OPER:
+		break;
+
+	default:
+		ut_error;
+	}
+
+	/*!< Get next node before freeing the node itself */
+	next_node = node->next;
+
+	ut_free(node);
+
+	return(next_node);
+}
+
+/******************************************************************//**
+This AST takes ownership of the expr and is responsible
+for free'ing it.
+@return the list passed in as "node", or NULL if elem is NULL */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	node,			/*!< in: list instance */
+	fts_ast_node_t*	elem)			/*!< in: node to add to list */
+{
+	if (!elem) {
+		return(NULL);
+	}
+
+	ut_a(!elem->next);
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	if (!node->list.head) {
+		ut_a(!node->list.tail);
+
+		node->list.head = node->list.tail = elem;
+	} else {
+		ut_a(node->list.tail);
+
+		node->list.tail->next = elem;
+		node->list.tail = elem;
+	}
+
+	return(node);
+}
+
+/******************************************************************//**
+For tracking node allocations, in case there is an error during
+parsing. */
+UNIV_INTERN
+void
+fts_ast_state_add_node(
+/*===================*/
+	fts_ast_state_t*state,			/*!< in: ast instance */
+	fts_ast_node_t*	node)			/*!< in: node to add to ast */
+{
+	if (!state->list.head) {
+		ut_a(!state->list.tail);
+
+		state->list.head = state->list.tail = node;
+	} else {
+		state->list.tail->next_alloc = node;
+		state->list.tail = node;
+	}
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+UNIV_INTERN
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node)			/*!< in/out: set attribute of
+						a term node */
+{
+	if (!node) {
+		return;
+	}
+
+	/* If it's a node list, the wildcard should be set to the tail node*/
+	if (node->type == FTS_AST_LIST)	{
+		ut_ad(node->list.tail != NULL);
+		node = node->list.tail;
+	}
+
+	ut_a(node->type == FTS_AST_TERM);
+	ut_a(!node->term.wildcard);
+
+	node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+UNIV_INTERN
+void
+fts_ast_term_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance)		/*!< in: the text proximity
+						distance */
+{
+	ut_a(node->type == FTS_AST_TEXT);
+	ut_a(node->text.distance == ULINT_UNDEFINED);
+
+	node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+UNIV_INTERN
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state)			/*!< in: ast state to free */
+{
+	fts_ast_node_t*	node = state->list.head;
+
+	/* Free the nodes that were allocated during parsing. */
+	while (node) {
+		fts_ast_node_t*	next = node->next_alloc;
+
+		if (node->type == FTS_AST_TEXT && node->text.ptr) {
+			ut_free(node->text.ptr);
+			node->text.ptr = NULL;
+		} else if (node->type == FTS_AST_TERM && node->term.ptr) {
+			ut_free(node->term.ptr);
+			node->term.ptr = NULL;
+		}
+
+		ut_free(node);
+		node = next;
+	}
+
+	state->root = state->list.head = state->list.tail = NULL;
+}
+
+/******************************************************************//**
+Print an ast node to stdout; LIST and SUBEXP_LIST nodes print their tag
+followed by every child node in list order. */
+UNIV_INTERN
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node)			/*!< in: ast node to print */
+{
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		printf("TEXT: %s\n", node->text.ptr);
+		break;
+
+	case FTS_AST_TERM:
+		printf("TERM: %s\n", node->term.ptr);
+		break;
+
+	case FTS_AST_LIST:
+		printf("LIST: ");
+
+		for (node = node->list.head; node; node = node->next) {
+			fts_ast_node_print(node);
+		}
+		break;
+
+	case FTS_AST_SUBEXP_LIST:
+		printf("SUBEXP_LIST: ");
+
+		for (node = node->list.head; node; node = node->next) {
+			fts_ast_node_print(node);
+		}
+		/* node is NULL here: do not fall into FTS_AST_OPER. */
+		break;
+
+	case FTS_AST_OPER:
+		printf("OPER: %d\n", node->oper);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/******************************************************************//**
+Traverse the AST - in-order traversal, except for the FTS_EXIST and FTS_IGNORE
+nodes, which will be ignored in the first pass of each level, and visited in a
+second and third pass after all other nodes in the same level are visited.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: current operator */
+	fts_ast_node_t*		node,		/*!< in: current root node */
+	fts_ast_callback	visitor,	/*!< in: callback function */
+	void*			arg,		/*!< in: arg for callback */
+	bool*			has_ignore)	/*!< out: true, if the operator
+						was ignored during processing,
+						currently we ignore FTS_EXIST
+						and FTS_IGNORE operators */
+{
+	dberr_t			error = DB_SUCCESS;
+	fts_ast_node_t*		oper_node = NULL;
+	fts_ast_node_t*		start_node;
+	bool			revisit = false;
+	bool			will_be_ignored = false;
+	fts_ast_visit_pass_t	visit_pass = FTS_PASS_FIRST;
+
+	start_node = node->list.head;
+
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	if (oper == FTS_EXIST_SKIP) {
+		visit_pass = FTS_PASS_EXIST;
+	} else if (oper == FTS_IGNORE_SKIP) {
+		visit_pass = FTS_PASS_IGNORE;
+	}
+
+	/* In the first pass of the tree, at the leaf level of the
+	tree, FTS_EXIST and FTS_IGNORE operation will be ignored.
+	It will be repeated at the level above the leaf level.
+
+	The basic idea here is that when we encounter FTS_EXIST or
+	FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP
+	or FTS_IGNORE_SKIP, and term node & text node with the operators
+	is ignored in the first pass. We have two passes during the revisit:
+	We process nodes with FTS_EXIST_SKIP in the exist pass, and then
+	process nodes with FTS_IGNORE_SKIP in the ignore pass.
+
+	The order must be strictly followed, or we will get wrong results.
+	For example, we have a query 'a +b -c d +e -f'.
+	first pass: process 'a' and 'd' by union;
+	exist pass: process '+b' and '+e' by intersection;
+	ignore pass: process '-c' and '-f' by difference. */
+
+	for (node = node->list.head;
+	     node && (error == DB_SUCCESS);
+	     node = node->next) {
+
+		switch(node->type) {
+		case FTS_AST_LIST:
+			if (visit_pass != FTS_PASS_FIRST) {
+				break;
+			}
+
+			error = fts_ast_visit(oper, node, visitor,
+					      arg, &will_be_ignored);
+
+			/* If will_be_ignored is set to true, then
+			we encountered and ignored a FTS_EXIST or FTS_IGNORE
+			operator. */
+			if (will_be_ignored) {
+				revisit = true;
+				/* Remember oper for list in case '-abc&def',
+				ignored oper is from previous node of list.*/
+				node->oper = oper;
+			}
+
+			break;
+
+		case FTS_AST_SUBEXP_LIST:
+			if (visit_pass != FTS_PASS_FIRST) {
+				break;
+			}
+
+			error = fts_ast_visit_sub_exp(node, visitor, arg);
+			break;
+
+		case FTS_AST_OPER:
+			oper = node->oper;
+			oper_node = node;
+
+			/* Change the operator for revisit */
+			if (oper == FTS_EXIST) {
+				oper_node->oper = FTS_EXIST_SKIP;
+			} else if (oper == FTS_IGNORE) {
+				oper_node->oper = FTS_IGNORE_SKIP;
+			}
+
+			break;
+
+		default:
+			if (node->visited) {
+				continue;
+			}
+
+			ut_a(oper == FTS_NONE || !oper_node
+			     || oper_node->oper == oper
+			     || oper_node->oper == FTS_EXIST_SKIP
+			     || oper_node->oper == FTS_IGNORE_SKIP);
+
+			if (oper== FTS_EXIST || oper == FTS_IGNORE) {
+				*has_ignore = true;
+				continue;
+			}
+
+			/* Process leaf node according to its pass. */
+			if (oper == FTS_EXIST_SKIP
+			    && visit_pass == FTS_PASS_EXIST) {
+				error = visitor(FTS_EXIST, node, arg);
+				node->visited = true;
+			} else if (oper == FTS_IGNORE_SKIP
+				   && visit_pass == FTS_PASS_IGNORE) {
+				error = visitor(FTS_IGNORE, node, arg);
+				node->visited = true;
+			} else if (visit_pass == FTS_PASS_FIRST) {
+				error = visitor(oper, node, arg);
+				node->visited = true;
+			}
+		}
+	}
+
+	if (revisit) {
+		/* Exist pass processes the skipped FTS_EXIST operation. */
+                for (node = start_node;
+		     node && error == DB_SUCCESS;
+		     node = node->next) {
+
+			if (node->type == FTS_AST_LIST
+			    && node->oper != FTS_IGNORE) {
+				error = fts_ast_visit(FTS_EXIST_SKIP, node,
+					visitor, arg, &will_be_ignored);
+			}
+		}
+
+		/* Ignore pass processes the skipped FTS_IGNORE operation. */
+		for (node = start_node;
+		     node && error == DB_SUCCESS;
+		     node = node->next) {
+
+			if (node->type == FTS_AST_LIST) {
+				error = fts_ast_visit(FTS_IGNORE_SKIP, node,
+					visitor, arg, &will_be_ignored);
+			}
+		}
+	}
+
+	return(error);
+}
diff --git a/storage/xtradb/fts/fts0blex.cc b/storage/xtradb/fts/fts0blex.cc
new file mode 100644
index 00000000000..dccedac0212
--- /dev/null
+++ b/storage/xtradb/fts/fts0blex.cc
@@ -0,0 +1,1959 @@
+#include "univ.i"
+#line 2 "fts0blex.cc"
+
+#line 4 "fts0blex.cc"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index.  If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition.  This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state.  The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0brestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+    #define YY_LESS_LINENO(n)
+    
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		*yy_cp = yyg->yy_hold_char; \
+		YY_RESTORE_YY_MORE_OFFSET \
+		yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+		YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+		} \
+	while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+	/* When an EOF's been seen but there's still some text to process
+	 * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+	 * shouldn't try reading from the input source any more.  We might
+	 * still have a bunch of tokens to match, though, because of
+	 * possible backing-up.
+	 *
+	 * When we actually see the EOF, we change the status to "new"
+	 * (via fts0brestart()), so that the user can continue scanning by
+	 * just pointing yyin at a new input file.
+	 */
+#define YY_BUFFER_EOF_PENDING 2
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+                          ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+                          : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0bensure_buffer_stack (yyscan_t yyscanner );
+static void fts0b_load_buffer_state (yyscan_t yyscanner );
+static void fts0b_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0b_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+void *fts0brealloc (void *,yy_size_t ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+void fts0bfree (void * ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+
+#define yy_new_buffer fts0b_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){ \
+        fts0bensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+	}
+
+#define yy_set_bol(at_bol) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){\
+        fts0bensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+	}
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state  ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+	yyg->yytext_ptr = yy_bp; \
+	yyleng = (size_t) (yy_cp - yy_bp); \
+	yyg->yy_hold_char = *yy_cp; \
+	*yy_cp = '\0'; \
+	yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+   but its presence is necessary. */
+struct yy_trans_info
+	{
+	flex_int32_t yy_verify;
+	flex_int32_t yy_nxt;
+	};
+static yyconst flex_int16_t yy_accept[19] =	/* indexed by DFA state; a nonzero
+	 * entry marks an accepting state and is the action number that the
+	 * scanner's action switch dispatches on */
+    {   0,
+        4,    4,    8,    4,    1,    6,    1,    7,    7,    2,
+        3,    4,    1,    1,    0,    5,    3,    0
+    } ;
+
+static yyconst flex_int32_t yy_ec[256] =	/* indexed by input byte (through
+	 * YY_SC_TO_UI); yields the byte's equivalence class.  Reading the
+	 * indices as ASCII codes: 2 = TAB, 3 = newline, 4 = space, 5 = '"',
+	 * 6 = '%', 7 = one of ( ) * + - < > @ ~, 8 = a digit, 1 = any other
+	 * byte */
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    4,    1,    5,    1,    1,    6,    1,    1,    7,
+        7,    7,    7,    1,    7,    1,    1,    8,    8,    8,
+        8,    8,    8,    8,    8,    8,    8,    1,    1,    7,
+        1,    7,    1,    7,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    7,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1
+    } ;
+
+static yyconst flex_int32_t yy_meta[9] =	/* remaps an equivalence class when
+	 * the lookup falls back to a default state numbered 19 or higher */
+    {   0,
+        1,    2,    3,    4,    5,    5,    5,    1
+    } ;
+
+static yyconst flex_int16_t yy_base[22] =	/* per-state offset into yy_nxt and yy_chk */
+    {   0,
+        0,    0,   22,    0,    7,   23,    0,   14,   23,   23,
+        7,    0,    0,    0,    5,   23,    0,   23,   11,   12,
+       16
+    } ;
+
+static yyconst flex_int16_t yy_def[22] =	/* per-state fallback state, followed
+	 * when yy_chk shows that a slot does not belong to the current state */
+    {   0,
+       18,    1,   18,   19,   19,   18,   20,   21,   18,   18,
+       19,   19,    5,   20,   21,   18,   11,    0,   18,   18,
+       18
+    } ;
+
+static yyconst flex_int16_t yy_nxt[32] =	/* next state, read at index
+	 * yy_base[state] + class once yy_chk has confirmed the slot */
+    {   0,
+        4,    5,    6,    7,    8,    9,   10,   11,   13,   16,
+       14,   12,   12,   14,   17,   14,   15,   15,   16,   15,
+       15,   18,    3,   18,   18,   18,   18,   18,   18,   18,
+       18
+    } ;
+
+static yyconst flex_int16_t yy_chk[32] =	/* owner check: the slot at
+	 * yy_base[state] + class is used for a state only if it holds that
+	 * state's number */
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    5,   15,
+        5,   19,   19,   20,   11,   20,   21,   21,    8,   21,
+       21,    3,   18,   18,   18,   18,   18,   18,   18,   18,
+       18
+    } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ * yymore() gets the same treatment: either name expands to an identifier
+ * that is not declared in the code shown here, so a stray use should fail
+ * to compile.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0	/* the scanner adds this to yytext_ptr; it is zero here */
+#define YY_RESTORE_YY_MORE_OFFSET	/* expands to nothing in this scanner */
+#line 1 "fts0blex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0blex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser.
+ * Names the generated scanner function fts_blexer and gives it an extra
+ * YYSTYPE* argument, val, through which the rule actions store the token
+ * value (val->oper, val->token). */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1	/* compiles out input()/yyinput(), which are guarded by #ifndef YY_NO_INPUT */
+#line 484 "fts0blex.cc"
+
+#define INITIAL 0	/* the only start condition named in this part of the file */
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *	/* default type of the user data slot (yyextra_r) in struct yyguts_t */
+#endif
+
+/* Holds the entire state of the reentrant scanner.
+ * Functions in this file receive it as an opaque yyscan_t and cast it back
+ * to struct yyguts_t *. */
+struct yyguts_t
+    {
+
+    /* User-defined. Not touched by flex. */
+    YY_EXTRA_TYPE yyextra_r;
+
+    /* The rest are the same as the globals declared in the non-reentrant scanner. */
+    FILE *yyin_r, *yyout_r;
+    size_t yy_buffer_stack_top; /**< index of top of stack. */
+    size_t yy_buffer_stack_max; /**< capacity of stack. */
+    YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+    char yy_hold_char;	/* character that YY_DO_BEFORE_ACTION replaced with NUL to terminate the token */
+    int yy_n_chars;	/* number of characters held in the current buffer's yy_ch_buf */
+    int yyleng_r;
+    char *yy_c_buf_p;	/* current scan position inside the buffer */
+    int yy_init;	/* nonzero once the scanner function has done its first-call setup */
+    int yy_start;	/* start state the DFA begins each match in; set to 1 if still zero on the first call */
+    int yy_did_buffer_switch_on_eof;	/* set to 1 whenever the current buffer is switched, pushed or popped */
+    int yy_start_stack_ptr;
+    int yy_start_stack_depth;
+    int *yy_start_stack;
+    yy_state_type yy_last_accepting_state;	/* most recent accepting state seen during a match */
+    char* yy_last_accepting_cpos;	/* buffer position that goes with yy_last_accepting_state */
+
+    int yylineno_r;
+    int yy_flex_debug_r;
+
+    char *yytext_r;	/* start of the current token; accessed as yytext_ptr */
+    int yy_more_flag;
+    int yy_more_len;
+
+    }; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0blex_init (yyscan_t* scanner);	/* takes a pointer to the scanner handle, not the handle itself */
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);	/* same, plus a user data value */
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience.
+   Every function declared below takes the scanner handle (yyscan_t) as its
+   last argument. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );	/* the rule actions use this to read the matched text */
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP	/* YY_SKIP_YYWRAP is defined earlier, next to the fts0bwrap() macro, so this group is skipped */
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr	/* yytext_ptr is #defined earlier, so this prototype is skipped */
+static void yy_flex_strncpy (char *,yyconst char *,int ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT	/* YY_NO_INPUT is defined to 1 earlier, so no input() prototype is declared */
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read.
+ * yy_get_next_buffer() caps every YY_INPUT request at this many bytes. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ * The fwrite() result is tested by an if with an empty body, so a short
+ * write is ignored.
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
+ * is returned in "result".
+ *
+ * For a buffer flagged yy_is_interactive, characters are read one at a time
+ * with getc() until max_size is reached, EOF is seen, or a newline has been
+ * stored.  Otherwise a single fread() is issued and repeated only while it
+ * returns 0 with the stream error flag set and errno == EINTR; any other
+ * read error is reported through YY_FATAL_ERROR.
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error.
+ * The default forwards to yy_fatal_error() and needs a variable named
+ * yyscanner in scope at the point of use. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ * YY_DECL is already defined earlier in this file (as fts_blexer), so the
+ * fts0blex default below is not compiled.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+/* Placed at the start of every rule action; here it only runs YY_USER_ACTION. */
+#define YY_RULE_SETUP \
+	YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ *
+ * YY_DECL is defined earlier in this file as
+ *   int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+ * so the rule actions below store the token value through val.
+ *
+ * Returns the operator character for rule 2, FTS_NUMB, FTS_TERM or FTS_TEXT
+ * for rules 3-5, and YY_NULL (through yyterminate()) at end of input.
+ * For rules 3-5 val->token receives a strdup() copy of the matched text;
+ * nothing in this function frees it.
+ * NOTE(review): the strdup() results are not checked for NULL - confirm
+ * that callers tolerate a NULL token.
+ */
+YY_DECL
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp, *yy_bp;
+	register int yy_act;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+#line 43 "fts0blex.l"
+
+
+#line 711 "fts0blex.cc"
+
+	if ( !yyg->yy_init )	/* one-time setup on the first call */
+		{
+		yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+		YY_USER_INIT;
+#endif
+
+		if ( ! yyg->yy_start )
+			yyg->yy_start = 1;	/* first start state */
+
+		if ( ! yyin )
+			yyin = stdin;
+
+		if ( ! yyout )
+			yyout = stdout;
+
+		if ( ! YY_CURRENT_BUFFER ) {	/* no buffer installed yet: create one that reads from yyin */
+			fts0bensure_buffer_stack (yyscanner);
+			YY_CURRENT_BUFFER_LVALUE =
+				fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+		}
+
+		fts0b_load_buffer_state(yyscanner );
+		}
+
+	while ( 1 )		/* loops until end-of-file is reached */
+		{
+		yy_cp = yyg->yy_c_buf_p;
+
+		/* Support of yytext. */
+		*yy_cp = yyg->yy_hold_char;
+
+		/* yy_bp points to the position in yy_ch_buf of the start of
+		 * the current run.
+		 */
+		yy_bp = yy_cp;
+
+		yy_current_state = yyg->yy_start;
+yy_match:
+		do	/* advance the DFA one character at a time until it jams */
+			{
+			register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];	/* equivalence class of the next character */
+			if ( yy_accept[yy_current_state] )
+				{	/* remember the latest accepting state so the match can back up to it */
+				yyg->yy_last_accepting_state = yy_current_state;
+				yyg->yy_last_accepting_cpos = yy_cp;
+				}
+			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+				{	/* slot not owned by this state: retry from its default state */
+				yy_current_state = (int) yy_def[yy_current_state];
+				if ( yy_current_state >= 19 )
+					yy_c = yy_meta[(unsigned int) yy_c];
+				}
+			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+			++yy_cp;
+			}
+		while ( yy_current_state != 18 );	/* 18 is the jam state, see yy_try_NUL_trans() */
+		yy_cp = yyg->yy_last_accepting_cpos;
+		yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+		yy_act = yy_accept[yy_current_state];
+
+		YY_DO_BEFORE_ACTION;
+
+do_action:	/* This label is used only to access EOF actions. */
+
+		switch ( yy_act )
+	{ /* beginning of action switch */
+			case 0: /* must back up */
+			/* undo the effects of YY_DO_BEFORE_ACTION */
+			*yy_cp = yyg->yy_hold_char;
+			yy_cp = yyg->yy_last_accepting_cpos;
+			yy_current_state = yyg->yy_last_accepting_state;
+			goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 45 "fts0blex.l"
+/* Ignore whitespace */ ;
+	YY_BREAK
+case 2:	/* operator: the token value and the return code are both the first matched character */
+YY_RULE_SETUP
+#line 47 "fts0blex.l"
+{
+	val->oper = fts0bget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+	YY_BREAK
+case 3:	/* FTS_NUMB token: val->token is a strdup() copy of the matched text */
+YY_RULE_SETUP
+#line 53 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_NUMB);
+}
+	YY_BREAK
+case 4:	/* FTS_TERM token: val->token is a strdup() copy of the matched text */
+YY_RULE_SETUP
+#line 59 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+	YY_BREAK
+case 5:	/* FTS_TEXT token: val->token is a strdup() copy of the matched text */
+YY_RULE_SETUP
+#line 65 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+	YY_BREAK
+case 6:
+/* rule 6 can match eol; it has no action, so the matched text is discarded */
+YY_RULE_SETUP
+#line 71 "fts0blex.l"
+
+	YY_BREAK
+case 7:	/* copies the matched text to yyout through ECHO */
+YY_RULE_SETUP
+#line 73 "fts0blex.l"
+ECHO;
+	YY_BREAK
+#line 842 "fts0blex.cc"
+case YY_STATE_EOF(INITIAL):
+	yyterminate();
+
+	case YY_END_OF_BUFFER:
+		{
+		/* Amount of text matched not including the EOB char. */
+		int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+		/* Undo the effects of YY_DO_BEFORE_ACTION. */
+		*yy_cp = yyg->yy_hold_char;
+		YY_RESTORE_YY_MORE_OFFSET
+
+		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+			{
+			/* We're scanning a new file or input source.  It's
+			 * possible that this happened because the user
+			 * just pointed yyin at a new source and called
+			 * fts0blex().  If so, then we have to assure
+			 * consistency between YY_CURRENT_BUFFER and our
+			 * globals.  Here is the right place to do so, because
+			 * this is the first action (other than possibly a
+			 * back-up) that will match for the new input source.
+			 */
+			yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+			}
+
+		/* Note that here we test for yy_c_buf_p "<=" to the position
+		 * of the first EOB in the buffer, since yy_c_buf_p will
+		 * already have been incremented past the NUL character
+		 * (since all states make transitions on EOB to the
+		 * end-of-buffer state).  Contrast this with the test
+		 * in input().
+		 */
+		if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			{ /* This was really a NUL. */
+			yy_state_type yy_next_state;
+
+			yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+			yy_current_state = yy_get_previous_state( yyscanner );
+
+			/* Okay, we're now positioned to make the NUL
+			 * transition.  We couldn't have
+			 * yy_get_previous_state() go ahead and do it
+			 * for us because it doesn't know how to deal
+			 * with the possibility of jamming (and we don't
+			 * want to build jamming into it because then it
+			 * will run more slowly).
+			 */
+
+			yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+			yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+			if ( yy_next_state )
+				{
+				/* Consume the NUL. */
+				yy_cp = ++yyg->yy_c_buf_p;
+				yy_current_state = yy_next_state;
+				goto yy_match;
+				}
+
+			else
+				{
+				yy_cp = yyg->yy_last_accepting_cpos;
+				yy_current_state = yyg->yy_last_accepting_state;
+				goto yy_find_action;
+				}
+			}
+
+		else switch ( yy_get_next_buffer( yyscanner ) )
+			{
+			case EOB_ACT_END_OF_FILE:
+				{
+				yyg->yy_did_buffer_switch_on_eof = 0;
+
+				if ( fts0bwrap(yyscanner ) )	/* fts0bwrap() is a macro that is always 1 in this file */
+					{
+					/* Note: because we've taken care in
+					 * yy_get_next_buffer() to have set up
+					 * yytext, we can now set up
+					 * yy_c_buf_p so that if some total
+					 * hoser (like flex itself) wants to
+					 * call the scanner after we return the
+					 * YY_NULL, it'll still work - another
+					 * YY_NULL will get returned.
+					 */
+					yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+					yy_act = YY_STATE_EOF(YY_START);
+					goto do_action;
+					}
+
+				else
+					{
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+					}
+				break;
+				}
+
+			case EOB_ACT_CONTINUE_SCAN:	/* more input was read: resume matching where the scan stopped */
+				yyg->yy_c_buf_p =
+					yyg->yytext_ptr + yy_amount_of_matched_text;
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_match;
+
+			case EOB_ACT_LAST_MATCH:	/* no more input: act on the text scanned before the end of the buffer */
+				yyg->yy_c_buf_p =
+				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_find_action;
+			}
+		break;
+		}
+
+	default:
+		YY_FATAL_ERROR(
+			"fatal flex scanner internal error--no action found" );
+	} /* end of action switch */
+		} /* end of scanning one token */
+} /* end of fts0blex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Moves the not-yet-consumed text to the start of the current buffer,
+ * growing the buffer when it is full and owned by the scanner, reads more
+ * input through YY_INPUT, and re-appends the two end-of-buffer characters.
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - no new input was obtained, but text scanned
+ *		before the end of the buffer still has to be matched
+ *	EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ *	EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+	register char *source = yyg->yytext_ptr;
+	register int number_to_move, i;
+	int ret_val;
+
+	if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+		YY_FATAL_ERROR(
+		"fatal flex scanner internal error--end of buffer missed" );
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+		{ /* Don't try to fill the buffer, so this is an EOF. */
+		if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+			{
+			/* We matched a single character, the EOB, so
+			 * treat this as a final EOF.
+			 */
+			return EOB_ACT_END_OF_FILE;
+			}
+
+		else
+			{
+			/* We matched some text prior to the EOB, first
+			 * process it.
+			 */
+			return EOB_ACT_LAST_MATCH;
+			}
+		}
+
+	/* Try to read more data. */
+
+	/* First move last chars to start of buffer. */
+	number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+	for ( i = 0; i < number_to_move; ++i )
+		*(dest++) = *(source++);
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+		/* don't do the read, it's not guaranteed to return an EOF,
+		 * just force an EOF
+		 */
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+	else
+		{
+			int num_to_read =
+			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+		while ( num_to_read <= 0 )
+			{ /* Not enough room in the buffer - grow it. */
+
+			/* just a shorter name for the current buffer */
+			YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+			int yy_c_buf_p_offset =
+				(int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+			if ( b->yy_is_our_buffer )
+				{
+				int new_size = b->yy_buf_size * 2;
+
+				if ( new_size <= 0 )	/* doubling did not yield a positive size: grow by one eighth instead */
+					b->yy_buf_size += b->yy_buf_size / 8;
+				else
+					b->yy_buf_size *= 2;
+
+				b->yy_ch_buf = (char *)
+					/* Include room in for 2 EOB chars. */
+					fts0brealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+				}
+			else
+				/* Can't grow it, we don't own it. */
+				b->yy_ch_buf = 0;
+
+			if ( ! b->yy_ch_buf )
+				YY_FATAL_ERROR(
+				"fatal error - scanner input buffer overflow" );
+
+			yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];	/* re-anchor after the storage may have moved */
+
+			num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+						number_to_move - 1;
+
+			}
+
+		if ( num_to_read > YY_READ_BUF_SIZE )
+			num_to_read = YY_READ_BUF_SIZE;
+
+		/* Read in more data. */
+		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+			yyg->yy_n_chars, (size_t) num_to_read );
+
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	if ( yyg->yy_n_chars == 0 )
+		{
+		if ( number_to_move == YY_MORE_ADJ )
+			{
+			ret_val = EOB_ACT_END_OF_FILE;
+			fts0brestart(yyin  ,yyscanner);
+			}
+
+		else
+			{
+			ret_val = EOB_ACT_LAST_MATCH;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+				YY_BUFFER_EOF_PENDING;
+			}
+		}
+
+	else
+		ret_val = EOB_ACT_CONTINUE_SCAN;
+
+	if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need.
+		 * NOTE(review): yy_buf_size is not updated after this
+		 * realloc, and new_size leaves room for the two EOB
+		 * characters written below only when yy_n_chars >= 4.  With
+		 * the YY_INPUT defined in this file the read never exceeds
+		 * num_to_read, so this branch looks unreachable here - TODO
+		 * confirm before relying on it with a custom YY_INPUT. */
+		yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0brealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
+	yyg->yy_n_chars += number_to_move;	/* the count now includes the text moved to the front */
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+	yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+	return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached
+ *
+ * Re-runs the DFA from the start state over the text between yytext_ptr
+ * (plus YY_MORE_ADJ) and yy_c_buf_p, updating the last-accepting state and
+ * position on the way, and returns the state reached.  A NUL byte in that
+ * text is given equivalence class 1 instead of being looked up in yy_ec.
+ */
+
+    static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	yy_current_state = yyg->yy_start;
+
+	for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+		{
+		register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+		if ( yy_accept[yy_current_state] )
+			{
+			yyg->yy_last_accepting_state = yy_current_state;
+			yyg->yy_last_accepting_cpos = yy_cp;
+			}
+		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+			{	/* slot not owned by this state: retry from its default state */
+			yy_current_state = (int) yy_def[yy_current_state];
+			if ( yy_current_state >= 19 )
+				yy_c = yy_meta[(unsigned int) yy_c];
+			}
+		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+		}
+
+	return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ *	next_state = yy_try_NUL_trans( current_state );
+ *
+ * Performs one table step using equivalence class 1 for the NUL.  Returns
+ * 0 if the step ends in state 18 (the jam state), otherwise the new state.
+ * Also records current_state as the last accepting state if it is one.
+ */
+    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+	register int yy_is_jam;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+	register char *yy_cp = yyg->yy_c_buf_p;
+
+	register YY_CHAR yy_c = 1;
+	if ( yy_accept[yy_current_state] )
+		{
+		yyg->yy_last_accepting_state = yy_current_state;
+		yyg->yy_last_accepting_cpos = yy_cp;
+		}
+	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+		{	/* slot not owned by this state: retry from its default state */
+		yy_current_state = (int) yy_def[yy_current_state];
+		if ( yy_current_state >= 19 )
+			yy_c = yy_meta[(unsigned int) yy_c];
+		}
+	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+	yy_is_jam = (yy_current_state == 18);
+
+	return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT	/* YY_NO_INPUT is defined to 1 earlier in this file, so this function is compiled out */
+#ifdef __cplusplus
+    static int yyinput (yyscan_t yyscanner)
+#else
+    static int input  (yyscan_t yyscanner)
+#endif
+
+{	/* Returns the next input character as an unsigned char value, or EOF
+	 * when the input is exhausted, and refills the buffer through
+	 * yy_get_next_buffer() when the end of the buffer is reached.
+	 */
+	int c;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	*yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+	if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+		{
+		/* yy_c_buf_p now points to the character we want to return.
+		 * If this occurs *before* the EOB characters, then it's a
+		 * valid NUL; if not, then we've hit the end of the buffer.
+		 */
+		if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			/* This was really a NUL. */
+			*yyg->yy_c_buf_p = '\0';
+
+		else
+			{ /* need more input */
+			int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+			++yyg->yy_c_buf_p;
+
+			switch ( yy_get_next_buffer( yyscanner ) )
+				{
+				case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_g_n_b()
+					 * sees that we've accumulated a
+					 * token and flags that we need to
+					 * try matching the token before
+					 * proceeding.  But for input(),
+					 * there's no matching to consider.
+					 * So convert the EOB_ACT_LAST_MATCH
+					 * to EOB_ACT_END_OF_FILE.
+					 */
+
+					/* Reset buffer status. */
+					fts0brestart(yyin ,yyscanner);
+
+					/*FALLTHROUGH*/
+
+				case EOB_ACT_END_OF_FILE:
+					{
+					if ( fts0bwrap(yyscanner ) )
+						return EOF;
+
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+#ifdef __cplusplus
+					return yyinput(yyscanner);
+#else
+					return input(yyscanner);
+#endif
+					}
+
+				case EOB_ACT_CONTINUE_SCAN:
+					yyg->yy_c_buf_p = yyg->yytext_ptr + offset;	/* the text was moved to the buffer start, so re-derive the position */
+					break;
+				}
+			}
+		}
+
+	c = *(unsigned char *) yyg->yy_c_buf_p;	/* cast for 8-bit char's */
+	*yyg->yy_c_buf_p = '\0';	/* preserve yytext */
+	yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+	return c;
+}
+#endif	/* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * Creates a current buffer first if none exists, then reinitializes the
+ * current buffer for input_file and reloads the scanner's cached buffer
+ * state from it.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+    void fts0brestart  (FILE * input_file , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if ( ! YY_CURRENT_BUFFER ){
+        fts0bensure_buffer_stack (yyscanner);
+		YY_CURRENT_BUFFER_LVALUE =
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+	}
+
+	fts0b_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+	fts0b_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * The scan position and character count of the buffer being left are saved
+ * into it first.  The old buffer is not deleted, and the new buffer takes
+ * its place in the current stack slot.  Does nothing beyond ensuring the
+ * stack exists if new_buffer is already current.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_switch_to_buffer  (YY_BUFFER_STATE  new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	/* TODO. We should be able to replace this entire function body
+	 * with
+	 *		fts0bpop_buffer_state();
+	 *		fts0bpush_buffer_state(new_buffer);
+     */
+	fts0bensure_buffer_stack (yyscanner);
+	if ( YY_CURRENT_BUFFER == new_buffer )
+		return;
+
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+	fts0b_load_buffer_state(yyscanner );
+
+	/* We don't actually know whether we did this switch during
+	 * EOF (fts0bwrap()) processing, but the only time this flag
+	 * is looked at is after fts0bwrap() is called, so it's safe
+	 * to go ahead and always set it.
+	 */
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0b_load_buffer_state  (yyscan_t yyscanner)
+{	/* Copies the current buffer's saved state into the scanner: character
+	 * count, scan position (also used as the token start), input stream,
+	 * and the character under the scan position as the hold character.
+	 * Dereferences the current buffer without a check, so one must exist.
+	 */
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+	yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+	yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+	yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Create a buffer state object together with its character storage.
+ * @param file Stream the new buffer is initialized to read from.
+ * @param size Capacity of the character storage in bytes, not counting the
+ *             two end-of-buffer slots. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the freshly initialized buffer state.
+ */
+    YY_BUFFER_STATE fts0b_create_buffer  (FILE * file, int  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE new_state =
+		(YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ), yyscanner);
+
+	if ( new_state == NULL )
+		{
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+		}
+
+	/* Reserve two bytes beyond the requested size for the pair of
+	 * end-of-buffer characters.
+	 */
+	new_state->yy_buf_size = size;
+	new_state->yy_ch_buf = (char *) fts0balloc(new_state->yy_buf_size + 2, yyscanner);
+	if ( new_state->yy_ch_buf == NULL )
+		{
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+		}
+
+	/* The storage was allocated here, so mark the buffer as owned. */
+	new_state->yy_is_our_buffer = 1;
+
+	fts0b_init_buffer(new_state, file, yyscanner);
+	return new_state;
+}
+
+/** Destroy the buffer.
+ * A null b is ignored.  If b is the current buffer, the current stack slot
+ * is cleared.  The character storage is freed only when the buffer owns it
+ * (yy_is_our_buffer); the state object itself is always freed.
+ * @param b a buffer created with fts0b_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_delete_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if ( ! b )
+		return;
+
+	if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+	if ( b->yy_is_our_buffer )
+		fts0bfree((void *) b->yy_ch_buf ,yyscanner );
+
+	fts0bfree((void *) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0brestart() or at EOF.
+ *
+ * Flushes the buffer, attaches file as its input stream, enables refilling
+ * (yy_fill_buffer = 1) and marks it non-interactive.  errno is saved on
+ * entry and restored on exit, so the caller's errno is preserved.
+ */
+    static void fts0b_init_buffer  (YY_BUFFER_STATE  b, FILE * file , yyscan_t yyscanner)
+
+{
+	int oerrno = errno;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	fts0b_flush_buffer(b ,yyscanner);
+
+	b->yy_input_file = file;
+	b->yy_fill_buffer = 1;
+
+    /* If b is the current buffer, then fts0b_init_buffer was _probably_
+     * called from fts0brestart() or through yy_get_next_buffer.
+     * In that case, we don't want to reset the lineno or column.
+     */
+    if (b != YY_CURRENT_BUFFER){
+        b->yy_bs_lineno = 1;
+        b->yy_bs_column = 0;
+    }
+
+        b->yy_is_interactive = 0;
+    
+	errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * A null b is ignored.  The buffer is left empty, positioned at its start,
+ * flagged as at beginning of line and in the YY_BUFFER_NEW state; if it is
+ * the current buffer, the scanner's cached state is reloaded from it.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_flush_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if ( ! b )
+		return;
+
+	b->yy_n_chars = 0;
+
+	/* We always need two end-of-buffer characters.  The first causes
+	 * a transition to the end-of-buffer state.  The second causes
+	 * a jam in that state.
+	 */
+	b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+	b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+	b->yy_buf_pos = &b->yy_ch_buf[0];
+
+	b->yy_at_bol = 1;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	if ( b == YY_CURRENT_BUFFER )
+		fts0b_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ *  the current state. This function will allocate the stack
+ *  if necessary.
+ *  A NULL new_buffer is ignored.  The scan position and character count of
+ *  the buffer being covered are saved into it before the push.
+ *  @param new_buffer The new state.
+ *  @param yyscanner The scanner object.
+ */
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if (new_buffer == NULL)
+		return;
+
+	fts0bensure_buffer_stack(yyscanner);
+
+	/* This block is copied from fts0b_switch_to_buffer. */
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	/* Only push if top exists. Otherwise, replace top. */
+	if (YY_CURRENT_BUFFER)
+		yyg->yy_buffer_stack_top++;
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+	/* copied from fts0b_switch_to_buffer. */
+	fts0b_load_buffer_state(yyscanner );
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ *  The next element becomes the new top.
+ *  If a buffer is current after the pop, the scanner's cached state is
+ *  reloaded from it and yy_did_buffer_switch_on_eof is set.
+ *  @param yyscanner The scanner object.
+ */
+void fts0bpop_buffer_state (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if (!YY_CURRENT_BUFFER)
+		return;
+
+	fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+	YY_CURRENT_BUFFER_LVALUE = NULL;
+	if (yyg->yy_buffer_stack_top > 0)	/* the bottom slot is only cleared; the index never goes below zero */
+		--yyg->yy_buffer_stack_top;
+
+	if (YY_CURRENT_BUFFER) {
+		fts0b_load_buffer_state(yyscanner );
+		yyg->yy_did_buffer_switch_on_eof = 1;
+	}
+}
+
+/* Allocates the stack if it does not exist.
+ *  Guarantees space for at least one push.
+ *  On the first call a one-slot, zero-filled stack is created.  On later
+ *  calls the stack is grown by 8 slots whenever the top index has reached
+ *  the last slot.
+ */
+static void fts0bensure_buffer_stack (yyscan_t yyscanner)
+{
+	int num_to_alloc;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if (!yyg->yy_buffer_stack) {
+
+		/* The first allocation holds a single element (num_to_alloc
+		 * is 1 below), since we don't know if this scanner will even
+		 * need a stack.  With a capacity of 1 the growth test further
+		 * down is already true on the next call, so that call
+		 * reallocates.
+         */
+		num_to_alloc = 1;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0balloc
+								(num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+								  
+		memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+				
+		yyg->yy_buffer_stack_max = num_to_alloc;
+		yyg->yy_buffer_stack_top = 0;
+		return;
+	}
+
+	if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+		/* Increase the buffer to prepare for a possible push. */
+		int grow_size = 8 /* arbitrary grow size */;
+
+		num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0brealloc
+								(yyg->yy_buffer_stack,
+								num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+
+		/* zero only the new slots.*/
+		memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+		yyg->yy_buffer_stack_max = num_to_alloc;
+	}
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object. 
+ */
+YY_BUFFER_STATE fts0b_scan_buffer  (char * base, yy_size_t  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE state;
+	int has_eob_pair;
+
+	/* The caller must have terminated the buffer with two end-of-buffer
+	 * characters; anything else is rejected without allocating.
+	 */
+	has_eob_pair = size >= 2
+		&& base[size-2] == YY_END_OF_BUFFER_CHAR
+		&& base[size-1] == YY_END_OF_BUFFER_CHAR;
+	if ( ! has_eob_pair )
+		return 0;
+
+	state = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ), yyscanner);
+	if ( ! state )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_buffer()" );
+
+	/* Scan the caller's memory in place; the two EOB bytes are not data. */
+	state->yy_ch_buf = base;
+	state->yy_buf_pos = base;
+	state->yy_buf_size = size - 2;
+	state->yy_n_chars = state->yy_buf_size;
+
+	/* The buffer belongs to the caller and is never refilled from a file. */
+	state->yy_is_our_buffer = 0;
+	state->yy_input_file = 0;
+	state->yy_fill_buffer = 0;
+	state->yy_is_interactive = 0;
+	state->yy_at_bol = 1;
+	state->yy_buffer_status = YY_BUFFER_NEW;
+
+	fts0b_switch_to_buffer(state, yyscanner);
+
+	return state;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0blex() will
+ * scan from a @e copy of @a str.
+ * This is a thin wrapper: it measures @a yystr with strlen() and forwards to
+ * fts0b_scan_bytes().
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       fts0b_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0b_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+    
+	return fts0b_scan_bytes(yystr,strlen(yystr) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0blex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0b_scan_bytes  (yyconst char * yybytes, int  _yybytes_len , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE state;
+	char *copy;
+	yy_size_t total;
+	int pos = 0;
+
+	/* Room for the payload plus the two trailing end-of-buffer bytes. */
+	total = _yybytes_len + 2;
+	copy = (char *) fts0balloc(total, yyscanner);
+	if ( ! copy )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_bytes()" );
+
+	/* Duplicate the caller's bytes, then append the EOB pair. */
+	while ( pos < _yybytes_len ) {
+		copy[pos] = yybytes[pos];
+		++pos;
+	}
+	copy[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+	copy[_yybytes_len] = YY_END_OF_BUFFER_CHAR;
+
+	state = fts0b_scan_buffer(copy, total, yyscanner);
+	if ( ! state )
+		YY_FATAL_ERROR( "bad buffer in fts0b_scan_bytes()" );
+
+	/* The copy was allocated here, so mark the buffer as owned by
+	 * the scanner rather than by the caller.
+	 */
+	state->yy_is_our_buffer = 1;
+
+	return state;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg ,       yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)))
+{
+    	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code.
+ * yyless(n) keeps only the first n characters of the current token: it puts
+ * the held character back at yytext[yyleng], moves yy_c_buf_p to yytext + n,
+ * saves the character found there in yy_hold_char, writes a NUL in its place
+ * and sets yyleng to n. The expansion needs a local `yyg` in scope.
+ */
+
+#undef yyless
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = yyg->yy_hold_char; \
+		yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+		yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+		*yyg->yy_c_buf_p = '\0'; \
+		yyleng = yyless_macro_arg; \
+		} \
+	while ( 0 )
+
+/* Accessor  methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ * @return the scanner's current yyextra value, i.e. whatever was last
+ *         stored with fts0bset_extra().
+ */
+YY_EXTRA_TYPE fts0bget_extra  (yyscan_t yyscanner)
+{
+    /* The yyextra macro is resolved through the local yyg. */
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ * @return yylineno, or 0 when the scanner has no current buffer.
+ */
+int fts0bget_lineno  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+    /* The line number is only meaningful while a buffer is active. */
+    if (YY_CURRENT_BUFFER) {
+        return yylineno;
+    }
+
+    return 0;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ * @return yycolumn, or 0 when the scanner has no current buffer.
+ */
+int fts0bget_column  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+    /* The column is only meaningful while a buffer is active. */
+    if (YY_CURRENT_BUFFER) {
+        return yycolumn;
+    }
+
+    return 0;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ * @return the scanner's current yyin value (see fts0bset_in()).
+ */
+FILE *fts0bget_in  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ * @return the scanner's current yyout value (see fts0bset_out()).
+ */
+FILE *fts0bget_out  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ * @return the scanner's current yyleng value.
+ */
+int fts0bget_leng  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ * @return the scanner's current yytext pointer; the text is not copied.
+ */
+
+char *fts0bget_text  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * It can be read back with fts0bget_extra().
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_extra (YY_EXTRA_TYPE  user_defined , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * Calls yy_fatal_error() when the scanner has no current buffer.
+ * @param line_number the value to store in yylineno
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_lineno (int  line_number , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+        /* lineno is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner); 
+    
+    yylineno = line_number;
+}
+
+/** Set the current column.
+ * Calls yy_fatal_error() when the scanner has no current buffer.
+ * @param column_no the value to store in yycolumn
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_column (int  column_no , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+        /* column is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner); 
+    
+    yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * Only yyin is assigned here; no buffer is touched.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0b_switch_to_buffer
+ */
+void fts0bset_in (FILE *  in_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyin = in_str ;
+}
+
+/** Set the output stream.
+ * @param out_str the stream to store in yyout
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_out (FILE *  out_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyout = out_str ;
+}
+
+/** Get the scanner's debug flag.
+ * @param yyscanner The scanner object.
+ * @return the current yy_flex_debug value (see fts0bset_debug()).
+ */
+int fts0bget_debug  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yy_flex_debug;
+}
+
+/** Set the scanner's debug flag.
+ * @param bdebug the value to store in yy_flex_debug
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_debug (int  bdebug , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0blex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0blex_init(yyscan_t* ptr_yy_globals)
+
+{
+    yyscan_t scanner;
+
+    /* The caller must supply somewhere to store the new scanner. */
+    if (!ptr_yy_globals) {
+        errno = EINVAL;
+        return 1;
+    }
+
+    scanner = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), NULL );
+    *ptr_yy_globals = scanner;
+
+    if (!scanner) {
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* Start from an all-zero state. (Filling with 0xAA instead exposes
+     * bugs in yy_init_globals; keep 0x00 for releases.) */
+    memset(scanner, 0x00, sizeof(struct yyguts_t));
+
+    return yy_init_globals ( scanner );
+}
+
+/* fts0blex_init_extra has the same functionality as fts0blex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0balloc in
+ * the yyextra field.
+ */
+
+int fts0blex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+
+{
+    struct yyguts_t dummy_yyguts;
+    yyscan_t scanner;
+
+    /* Put the user data into a throwaway state so that the allocator
+     * call below can be handed a scanner argument that carries it. */
+    fts0bset_extra (yy_user_defined, &dummy_yyguts);
+
+    if (!ptr_yy_globals) {
+        errno = EINVAL;
+        return 1;
+    }
+
+    scanner = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+    *ptr_yy_globals = scanner;
+
+    if (!scanner) {
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* Start from an all-zero state (0xAA would expose bugs in
+    yy_init_globals; keep 0x00 for releases), then record the user data
+    in the real scanner. */
+    memset(scanner, 0x00, sizeof(struct yyguts_t));
+    fts0bset_extra (yy_user_defined, scanner);
+
+    return yy_init_globals ( scanner );
+}
+
+/* Reset the per-scanner state to its initial values: no buffer stack, no
+ * start-condition stack, yy_init and yy_start cleared, and yyin/yyout set
+ * to stdin/stdout when YY_STDINIT is defined, otherwise to null.
+ * Always returns 0.
+ */
+static int yy_init_globals (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from fts0blex_destroy(), so don't allocate here.
+     */
+
+    yyg->yy_buffer_stack = 0;
+    yyg->yy_buffer_stack_top = 0;
+    yyg->yy_buffer_stack_max = 0;
+    yyg->yy_c_buf_p = (char *) 0;
+    yyg->yy_init = 0;
+    yyg->yy_start = 0;
+
+    yyg->yy_start_stack_ptr = 0;
+    yyg->yy_start_stack_depth = 0;
+    yyg->yy_start_stack =  NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE *) 0;
+    yyout = (FILE *) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * fts0blex_init()
+     */
+    return 0;
+}
+
+/* fts0blex_destroy is for both reentrant and non-reentrant scanners.
+ * Frees the current buffer(s), the buffer stack, the start-condition stack
+ * and finally the scanner object itself; the handle must not be used
+ * afterwards. Always returns 0.
+ */
+int fts0blex_destroy  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+    /* Pop the buffer stack, destroying each element. */
+	while(YY_CURRENT_BUFFER){
+		fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+		YY_CURRENT_BUFFER_LVALUE = NULL;
+		/* NOTE(review): the slot was just cleared, and
+		 * fts0bpop_buffer_state() returns early when there is no
+		 * current buffer - confirm buffers below the top are freed. */
+		fts0bpop_buffer_state(yyscanner);
+	}
+
+	/* Destroy the stack itself. */
+	fts0bfree(yyg->yy_buffer_stack ,yyscanner);
+	yyg->yy_buffer_stack = NULL;
+
+    /* Destroy the start condition stack. */
+        fts0bfree(yyg->yy_start_stack ,yyscanner );
+        yyg->yy_start_stack = NULL;
+
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * fts0blex() is called, initialization will occur. */
+    yy_init_globals( yyscanner);
+
+    /* Destroy the main struct (reentrant only). */
+    fts0bfree ( yyscanner , yyscanner );
+    /* Only clears the local copy of the handle; the caller's is unchanged. */
+    yyscanner = NULL;
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+/* Copy exactly n bytes from s2 to s1. Unlike strncpy() this neither stops
+ * at a NUL nor pads the destination. The scanner argument is not used.
+ */
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{
+	register int i;
+	for ( i = 0; i < n; ++i )
+		s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+/* Return the number of bytes in s before its terminating NUL.
+ * The scanner argument is not used.
+ */
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{
+	register int n;
+	for ( n = 0; s[n]; ++n )
+		;
+
+	return n;
+}
+#endif
+
+/* Scanner allocation hook: forwards to malloc().
+ * @param size number of bytes requested
+ * @param yyscanner The scanner object (not used here).
+ * @return whatever malloc() returns, including NULL on failure
+ */
+void *fts0balloc (yy_size_t  size , yyscan_t yyscanner __attribute__((unused)))
+{
+	return (void *) malloc( size );
+}
+
+/* Scanner reallocation hook: forwards to realloc().
+ * @param ptr block to resize
+ * @param size new size in bytes
+ * @param yyscanner The scanner object (not used here).
+ * @return whatever realloc() returns, including NULL on failure
+ */
+void *fts0brealloc  (void * ptr, yy_size_t  size , yyscan_t yyscanner __attribute__((unused)))
+{
+	/* The cast to (char *) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment.
+	 */
+	return (void *) realloc( (char *) ptr, size );
+}
+
+/* Scanner deallocation hook: forwards to free().
+ * @param ptr block to release
+ * @param yyscanner The scanner object (not used here).
+ */
+void fts0bfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{
+	free( (char *) ptr );	/* see fts0brealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 73 "fts0blex.l"
+
+
+
diff --git a/storage/xtradb/fts/fts0blex.l b/storage/xtradb/fts/fts0blex.l
new file mode 100644
index 00000000000..6193f0df187
--- /dev/null
+++ b/storage/xtradb/fts/fts0blex.l
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser: gives the generated scanner function its name and signature */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+%%
+
+[\t ]+	/* Ignore whitespace */ ;
+
+[*()+\-<>~@]		{
+	val->oper = fts0bget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+
+[0-9]+			{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_NUMB);
+}
+
+[^" \n*()+\-<>~@%]*		{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+
+\"[^\"\n]*\"		{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+
+\n
+
+%%
diff --git a/storage/xtradb/fts/fts0config.cc b/storage/xtradb/fts/fts0config.cc
new file mode 100644
index 00000000000..29d6771f9e7
--- /dev/null
+++ b/storage/xtradb/fts/fts0config.cc
@@ -0,0 +1,562 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0config.cc
+Full Text Search configuration table.
+
+Created 2007/5/9 Sunny Bains
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0sel.h"
+
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/******************************************************************//**
+Callback function for fetching the config value. Copies the selected
+column into the caller's string, truncated to one byte less than the
+capacity passed in f_len, NUL-terminates it and stores the copied
+length in f_len. An SQL NULL column leaves the string untouched.
+@return always returns TRUE */
+static
+ibool
+fts_config_fetch_value(
+/*===================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in/out: pointer to
+						fts_string_t */
+{
+	sel_node_t*	sel = static_cast<sel_node_t*>(row);
+	fts_string_t*	out = static_cast<fts_string_t*>(user_arg);
+	dfield_t*	field = que_node_get_val(sel->select_list);
+	ulint		len = dfield_get_len(field);
+
+	ut_a(dtype_get_mtype(dfield_get_type(field)) == DATA_VARCHAR);
+
+	if (len == UNIV_SQL_NULL) {
+		return(TRUE);
+	}
+
+	/* Leave room for the terminating NUL. */
+	ulint		n_copy = ut_min(out->f_len - 1, len);
+
+	memcpy(out->f_str, dfield_get_data(field), n_copy);
+	out->f_len = n_copy;
+	out->f_str[n_copy] = '\0';
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Get value from the config table. The caller must ensure that enough
+space is allocated for value to hold the column contents: value->f_len
+must hold the capacity of value->f_str on entry and must be non-zero.
+The value is reset to the empty string before the query runs, so it
+stays empty when no row matches. As a side effect fts_table->suffix is
+set to "CONFIG".
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: get config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< in/out: value read from
+						config table */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	dberr_t		error;
+	ulint		name_len = strlen(name);
+
+	/* The len field of value must be set to the max bytes that
+	it can hold. On a successful read, the len field will be set
+	to the actual number of bytes copied to value. Verify the
+	capacity before writing anything into the buffer. */
+	ut_a(value->f_len > 0);
+	*value->f_str = '\0';
+
+	info = pars_info_create();
+
+	/* fts_config_fetch_value() is invoked for each fetched row and
+	copies the column into value. */
+	pars_info_bind_function(info, "my_func", fts_config_fetch_value,
+				value);
+
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM %s"
+		" WHERE key = :name;\n"
+		"BEGIN\n"
+		""
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	trx->op_info = "getting FTS config value";
+
+	error = fts_eval_sql(trx, graph);
+
+	/* The graph is freed while holding the dictionary mutex. */
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create the config table name for retrieving index specific value.
+The result is "<param>_" followed by the index id as written by
+fts_write_object_id(). It is allocated with ut_malloc() and must be
+released by the caller with ut_free().
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+	const char*		param,		/*!< in: base name of param */
+	const dict_index_t*	index)		/*!< in: index for config */
+{
+	const ulint	param_len = strlen(param);
+
+	/* Room for the base name, the '_' separator, the id text and
+	the terminator. Caller is responsible for deleting the result. */
+	char*		result = static_cast<char*>(ut_malloc(
+		param_len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2));
+
+	/* The format of the config name is: name_<index_id>. */
+	strcpy(result, param);
+	result[param_len] = '_';
+	fts_write_object_id(index->id, result + param_len + 1);
+
+	return(result);
+}
+
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents. The lookup key is built from param and the index by
+fts_config_create_index_param_name().
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: index */
+	const char*	param,			/*!< in: get config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< out: value read from
+						config table */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* The per-index key is heap allocated; it is released below. */
+	char*		index_param = fts_config_create_index_param_name(
+		param, index);
+
+	const dberr_t	err = fts_config_get_value(
+		trx, &fts_table, index_param, value);
+
+	ut_free(index_param);
+
+	return(err);
+}
+
+/******************************************************************//**
+Set the value in the config table for name. An UPDATE is tried first;
+if the transaction's undo_no did not advance, i.e. no row was changed,
+the key/value pair is INSERTed instead. As a side effect
+fts_table->suffix is set to "CONFIG".
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: set config value for
+						this parameter name */
+	const fts_string_t*
+			value)			/*!< in: value to update */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	dberr_t		error;
+	undo_no_t	undo_no;
+	undo_no_t	n_rows_updated;
+	ulint		name_len = strlen(name);
+
+	info = pars_info_create();
+
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+	pars_info_bind_varchar_literal(info, "value",
+				       value->f_str, value->f_len);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table, info,
+		"BEGIN UPDATE \"%s\" SET value = :value WHERE key = :name;");
+
+	trx->op_info = "setting FTS config value";
+
+	/* Snapshot the undo counter so the number of rows touched by the
+	UPDATE can be derived afterwards. */
+	undo_no = trx->undo_no;
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+	n_rows_updated = trx->undo_no - undo_no;
+
+	/* Check if we need to do an insert. */
+	/* NOTE(review): this test does not look at error, so the INSERT is
+	also attempted (and error overwritten) when the UPDATE failed without
+	advancing undo_no - confirm that is intended. */
+	if (n_rows_updated == 0) {
+		info = pars_info_create();
+
+		pars_info_bind_varchar_literal(
+			info, "name", (byte*) name, name_len);
+
+		pars_info_bind_varchar_literal(
+			info, "value", value->f_str, value->f_len);
+
+		graph = fts_parse_sql(
+			fts_table, info,
+			"BEGIN\n"
+			"INSERT INTO \"%s\" VALUES(:name, :value);");
+
+		trx->op_info = "inserting FTS config value";
+
+		error = fts_eval_sql(trx, graph);
+
+		fts_que_graph_free_check_lock(fts_table, NULL, graph);
+	}
+
+	return(error);
+}
+
+/******************************************************************//**
+Set the value specific to an FTS index in the config table. The key is
+built from param and the index by fts_config_create_index_param_name().
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: index */
+	const char*	param,			/*!< in: set config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< in: value to store in
+						config table */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* The per-index key is heap allocated; it is released below. */
+	char*		index_param = fts_config_create_index_param_name(
+		param, index);
+
+	const dberr_t	err = fts_config_set_value(
+		trx, &fts_table, index_param, value);
+
+	ut_free(index_param);
+
+	return(err);
+}
+
+/******************************************************************//**
+Get an ulint value from the config table. The stored text is read
+through fts_config_get_index_value() and converted with strtoul() in
+base 10; *int_value is only written on success.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	name,			/*!< in: param name */
+	ulint*		int_value)		/*!< out: value */
+{
+	fts_string_t	text;
+
+	/* f_len tells the fetch callback how many bytes the buffer can
+	hold; one extra byte is allocated beyond that. */
+	text.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	text.f_str = static_cast<byte*>(ut_malloc(text.f_len + 1));
+
+	const dberr_t	err = fts_config_get_index_value(
+		trx, index, name, &text);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%s) reading `%s'\n",
+			ut_strerr(err), name);
+	} else {
+		*int_value = strtoul(
+			reinterpret_cast<char*>(text.f_str), NULL, 10);
+	}
+
+	ut_free(text.f_str);
+
+	return(err);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table. The number is formatted as
+decimal text and stored through fts_config_set_index_value().
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	name,			/*!< in: param name */
+	ulint		int_value)		/*!< in: value */
+{
+	fts_string_t	text;
+
+	/* Allocate a buffer of the maximum config value size plus one. */
+	text.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	text.f_str = static_cast<byte*>(ut_malloc(text.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	/* f_len becomes whatever the formatting call reports. */
+	text.f_len = ut_snprintf(
+		reinterpret_cast<char*>(text.f_str), FTS_MAX_INT_LEN,
+		"%lu", int_value);
+
+	const dberr_t	err = fts_config_set_index_value(
+		trx, index, name, &text);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%s) writing `%s'\n",
+			ut_strerr(err), name);
+	}
+
+	ut_free(text.f_str);
+
+	return(err);
+}
+
+/******************************************************************//**
+Get an ulint value from the config table. The stored text is read
+through fts_config_get_value() and converted with strtoul() in base 10;
+*int_value is only written on success.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: param name */
+	ulint*		int_value)		/*!< out: value */
+{
+	fts_string_t	text;
+
+	/* f_len tells the fetch callback how many bytes the buffer can
+	hold; one extra byte is allocated beyond that. */
+	text.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	text.f_str = static_cast<byte*>(ut_malloc(text.f_len + 1));
+
+	const dberr_t	err = fts_config_get_value(
+		trx, fts_table, name, &text);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%s) reading `%s'\n",
+			ut_strerr(err), name);
+	} else {
+		*int_value = strtoul(
+			reinterpret_cast<char*>(text.f_str), NULL, 10);
+	}
+
+	ut_free(text.f_str);
+
+	return(err);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table. The number is formatted as
+decimal text and stored through fts_config_set_value().
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: param name */
+	ulint		int_value)		/*!< in: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* Allocate a buffer of the maximum config value size plus one;
+	f_len is replaced below by the length of the formatted number. */
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	/* Use the same formatting wrapper as fts_config_set_index_ulint()
+	instead of calling snprintf() directly. */
+	value.f_len = ut_snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+	error = fts_config_set_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%s) writing `%s'\n",
+			ut_strerr(error), name);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Increment the value in the config table for column name. The current
+value is read with a FOR UPDATE cursor, parsed with strtoul() in base
+10, increased by delta and written back through fts_config_set_value().
+A failure of either the read or the write-back is logged and returned.
+As a side effect fts_table->suffix is set to "CONFIG".
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: increment config value
+						for this parameter name */
+	ulint		delta)			/*!< in: increment by this
+						much */
+{
+	dberr_t		error;
+	fts_string_t	value;
+	que_t*		graph = NULL;
+	ulint		name_len = strlen(name);
+	pars_info_t*	info = pars_info_create();
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	/* Start from the empty string so that a missing row parses as 0. */
+	*value.f_str = '\0';
+
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+	pars_info_bind_function(
+		info, "my_func", fts_config_fetch_value, &value);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM %s"
+		" WHERE key = :name FOR UPDATE;\n"
+		"BEGIN\n"
+		""
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	trx->op_info = "read  FTS config value";
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+	if (error == DB_SUCCESS) {
+		ulint		int_value;
+
+		int_value = strtoul((char*) value.f_str, NULL, 10);
+
+		int_value += delta;
+
+		ut_a(FTS_MAX_CONFIG_VALUE_LEN > FTS_MAX_INT_LEN);
+
+		// FIXME: Get rid of snprintf
+		value.f_len = ut_snprintf(
+			(char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+		/* Keep the result: a failed write-back must not be
+		reported to the caller as success. */
+		error = fts_config_set_value(trx, fts_table, name, &value);
+	}
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%s) "
+			"while incrementing %s.\n", ut_strerr(error), name);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+The key is built from param and the index by
+fts_config_create_index_param_name().
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_index_value(
+/*=============================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	param,			/*!< in: increment config value
+						for this parameter name */
+	ulint		delta)			/*!< in: increment by this
+						much */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* The per-index key is heap allocated; it is released below. */
+	char*		index_param = fts_config_create_index_param_name(
+		param, index);
+
+	const dberr_t	err = fts_config_increment_value(
+		trx, &fts_table, index_param, delta);
+
+	ut_free(index_param);
+
+	return(err);
+}
+
diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc
new file mode 100644
index 00000000000..83c6d1f5e80
--- /dev/null
+++ b/storage/xtradb/fts/fts0fts.cc
@@ -0,0 +1,6569 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+
+#include "ha_prototypes.h"
+
+#define FTS_MAX_ID_LEN	32
+
+/** Name (key) of the cache size parameter in the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB	"cache_size_in_mb"
+
+/** This is maximum FTS cache for each table and would be
+a configurable variable */
+UNIV_INTERN ulong	fts_max_cache_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+UNIV_INTERN bool       fts_need_sync = false;
+
+/** Variable specifying the total memory allocated for FTS cache */
+UNIV_INTERN ulong      fts_max_total_cache_size;
+
+/** This is FTS result cache limit for each query and would be
+a configurable variable */
+UNIV_INTERN ulong	fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS token size */
+UNIV_INTERN ulong	fts_max_token_size;
+
+/** Variable specifying the minimum FTS token size */
+UNIV_INTERN ulong	fts_min_token_size;
+
+
+// FIXME: testing
+ib_time_t elapsed_time = 0;
+ulint n_nodes = 0;
+
+/** Error condition reported by fts_utf8_decode() */
+const ulint UTF8_ERROR = 0xFFFFFFFF;
+
+/** The cache size permissible lower limit, in megabytes (1MB) */
+static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1;
+
+/** The cache size permissible upper limit, in megabytes (1GB) */
+static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024;
+
+/** Time to sleep after DEADLOCK error before retrying operation. */
+static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000;
+
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t	fts_cache_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t	fts_cache_init_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	fts_delete_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_optimize_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_bg_threads_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_doc_id_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** variable to record innodb_fts_internal_tbl_name for information
+schema table INNODB_FTS_INSERTED etc. */
+UNIV_INTERN char* fts_internal_tbl_name		= NULL;
+
+/** InnoDB default stopword list (NULL-terminated):
+There are different versions of stopword lists; the stop words listed
+below come from the "Google Stopword" list. Reference:
+http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list.
+The final version of the InnoDB default stopword list is still pending
+a decision */
+const char *fts_default_stopword[] =
+{
+	"a",
+	"about",
+	"an",
+	"are",
+	"as",
+	"at",
+	"be",
+	"by",
+	"com",
+	"de",
+	"en",
+	"for",
+	"from",
+	"how",
+	"i",
+	"in",
+	"is",
+	"it",
+	"la",
+	"of",
+	"on",
+	"or",
+	"that",
+	"the",
+	"this",
+	"to",
+	"was",
+	"what",
+	"when",
+	"where",
+	"who",
+	"will",
+	"with",
+	"und",
+	"the",	/* NOTE(review): second "the" entry in this list */
+	"www",
+	NULL
+};
+
+/** Table info (ids and name) stored when checking for orphaned tables. */
+struct fts_aux_table_t {
+	table_id_t	id;		/*!< Table id */
+	table_id_t	parent_id;	/*!< Parent table id */
+	table_id_t	index_id;	/*!< Table FT index id */
+	char*		name;		/*!< Name of the table */
+};
+
+/** SQL template creating the common ancillary FTS tables and their indexes. */
+static const char* fts_create_common_tables_sql = {
+	"BEGIN\n"
+	""
+	"CREATE TABLE \"%s_DELETED\" (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DELETED\"(doc_id);\n"
+	""
+	"CREATE TABLE \"%s_DELETED_CACHE\" (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON \"%s_DELETED_CACHE\"(doc_id);\n"
+	""
+	"CREATE TABLE \"%s_BEING_DELETED\" (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON \"%s_BEING_DELETED\"(doc_id);\n"
+	""
+	"CREATE TABLE \"%s_BEING_DELETED_CACHE\" (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON \"%s_BEING_DELETED_CACHE\"(doc_id);\n"
+	""
+	"CREATE TABLE \"%s_CONFIG\" (\n"
+	"  key CHAR(50),\n"
+	"  value CHAR(50) NOT NULL\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_CONFIG\"(key);\n"
+};
+
+#ifdef FTS_DOC_STATS_DEBUG
+/** SQL template creating the index specific "%s_DOC_ID" table and its index.
+This is mainly designed for the statistics work in the future */
+static const char* fts_create_index_tables_sql = {
+	"BEGIN\n"
+	""
+	"CREATE TABLE \"%s_DOC_ID\" (\n"
+	"   doc_id BIGINT UNSIGNED,\n"
+	"   word_count INTEGER UNSIGNED NOT NULL\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DOC_ID\"(doc_id);\n"
+};
+#endif
+
+/** SQL template creating the (word, first_doc_id) index on table "%s". */
+static const char* fts_create_index_sql = {
+	"BEGIN\n"
+	""
+	"CREATE UNIQUE CLUSTERED INDEX FTS_INDEX_TABLE_IND "
+		"ON \"%s\"(word, first_doc_id);\n"
+};
+
+/** Auxiliary table suffixes common to all FT indexes (NULL-terminated). */
+static const char* fts_common_tables[] = {
+	"BEING_DELETED",
+	"BEING_DELETED_CACHE",
+	"CONFIG",
+	"DELETED",
+	"DELETED_CACHE",
+	NULL
+};
+
+/** FTS auxiliary INDEX split intervals, ended by a { 0, NULL } entry. */
+const  fts_index_selector_t fts_index_selector[] = {
+	{ 9, "INDEX_1" },
+	{ 65, "INDEX_2" },
+	{ 70, "INDEX_3" },
+	{ 75, "INDEX_4" },
+	{ 80, "INDEX_5" },
+	{ 85, "INDEX_6" },
+	{  0 , NULL	 }
+};
+
+/** SQL template inserting the default FTS config values into table "%s". */
+static const char* fts_config_table_insert_values_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO \"%s\" VALUES('"
+		FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n"
+	""
+	"INSERT INTO \"%s\" VALUES('"
+		FTS_OPTIMIZE_LIMIT_IN_SECS  "', '180');\n"
+	""
+	"INSERT INTO \"%s\" VALUES ('"
+		FTS_SYNCED_DOC_ID "', '0');\n"
+	""
+	"INSERT INTO \"%s\" VALUES ('"
+		FTS_TOTAL_DELETED_COUNT "', '0');\n"
+	"" /* Note: 0 == FTS_TABLE_STATE_RUNNING */
+	"INSERT INTO \"%s\" VALUES ('"
+		FTS_TABLE_STATE "', '0');\n";
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK  */
+static
+dberr_t
+fts_sync(
+/*=====*/
+	fts_sync_t*	sync)		/*!< in: sync state */
+	__attribute__((nonnull));
+
+/****************************************************************//**
+Release all resources held by the words rb tree e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+	ib_rbt_t*	words)		/*!< in: rb tree of words */
+	__attribute__((nonnull));
+#ifdef FTS_CACHE_SIZE_DEBUG
+/****************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+	fts_sync_t*	sync);		/*!< in: sync state */
+#endif
+
+/*********************************************************************//**
+This function fetches the document just inserted right before
+we commit the transaction, and tokenize the inserted text data
+and insert into FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+	fts_trx_table_t*ftt,		/*!< in: FTS trx table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	fts_indexes __attribute__((unused)));
+					/*!< in: affected fts indexes */
+#ifdef FTS_DOC_STATS_DEBUG
+/****************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went fine */
+static
+dberr_t
+fts_is_word_in_index(
+/*=================*/
+	trx_t*		trx,		/*!< in: FTS query state */
+	que_t**		graph,		/*!< out: Query graph */
+	fts_table_t*	fts_table,	/*!< in: table instance */
+	const fts_string_t* word,	/*!< in: the word to check */
+	ibool*		found)		/*!< out: TRUE if exists */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/******************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name, or NULL */
+	doc_id_t		doc_id,		/*!< in: last document id */
+	trx_t*			trx)		/*!< in: update trx, or NULL */
+	__attribute__((nonnull(1)));
+/********************************************************************
+Check if BG_THREAD_STOP is set in fts_status. @return TRUE if it is set */
+UNIV_INLINE
+ibool
+fts_is_stop_signalled(
+/*==================*/
+	fts_t*		fts)			/*!< in: fts instance */
+{
+	ibool		stop_signalled = FALSE;
+
+	mutex_enter(&fts->bg_threads_mutex);
+	/* fts_status is read while holding bg_threads_mutex. */
+	if (fts->fts_status & BG_THREAD_STOP) {
+
+		stop_signalled = TRUE;
+	}
+
+	mutex_exit(&fts->bg_threads_mutex);
+
+	return(stop_signalled);
+}
+
+/****************************************************************//**
+Load the built-in fts_default_stopword[] list into the cached stopword tree */
+static
+void
+fts_load_default_stopword(
+/*======================*/
+	fts_stopword_t*		stopword_info)	/*!< in: stopword info */
+{
+	fts_string_t		str;
+	mem_heap_t*		heap;
+	ib_alloc_t*		allocator;
+	ib_rbt_t*		stop_words;
+
+	allocator = stopword_info->heap;
+	heap = static_cast<mem_heap_t*>(allocator->arg);
+
+	if (!stopword_info->cached_stopword) {
+		/* For default stopword, we always use fts_utf8_string_cmp() */
+		stopword_info->cached_stopword = rbt_create(
+			sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp);
+	}
+
+	stop_words = stopword_info->cached_stopword;
+
+	str.f_n_char = 0;
+
+	for (ulint i = 0; fts_default_stopword[i]; ++i) {
+		char*			word;
+		fts_tokenizer_word_t	new_word;
+
+		/* The const is cast away only to build str; the text is
+		duplicated into the heap below. */
+		word = const_cast<char*>(fts_default_stopword[i]);
+		new_word.nodes = ib_vector_create(
+			allocator, sizeof(fts_node_t), 4);
+
+		str.f_len = ut_strlen(word);
+		str.f_str = reinterpret_cast<byte*>(word);
+
+		fts_utf8_string_dup(&new_word.text, &str, heap);
+
+		rbt_insert(stop_words, &new_word, &new_word);
+	}
+
+	stopword_info->status = STOPWORD_FROM_DEFAULT;
+}
+
+/****************************************************************//**
+Callback function to read a single stopword value into the stopword tree.
+@return Always return TRUE */
+static
+ibool
+fts_read_stopword(
+/*==============*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_stopword_t */
+{
+	ib_alloc_t*	allocator;
+	fts_stopword_t*	stopword_info;
+	sel_node_t*	sel_node;
+	que_node_t*	exp;
+	ib_rbt_t*	stop_words;
+	dfield_t*	dfield;
+	fts_string_t	str;
+	mem_heap_t*	heap;
+	ib_rbt_bound_t	parent;
+
+	sel_node = static_cast<sel_node_t*>(row);
+	stopword_info = static_cast<fts_stopword_t*>(user_arg);
+
+	stop_words = stopword_info->cached_stopword;
+	allocator =  static_cast<ib_alloc_t*>(stopword_info->heap);
+	heap = static_cast<mem_heap_t*>(allocator->arg);
+
+	exp = sel_node->select_list;
+
+	/* We only need to read the first column */
+	dfield = que_node_get_val(exp);
+
+	str.f_n_char = 0;
+	str.f_str = static_cast<byte*>(dfield_get_data(dfield));
+	str.f_len = dfield_get_len(dfield);
+
+	/* Only create a new node if the value is not SQL NULL and is not
+	already in the tree */
+	if (str.f_len != UNIV_SQL_NULL
+	    && rbt_search(stop_words, &parent, &str) != 0) {
+
+		fts_tokenizer_word_t	new_word;
+
+		new_word.nodes = ib_vector_create(
+			allocator, sizeof(fts_node_t), 4);
+
+		new_word.text.f_str = static_cast<byte*>(
+			 mem_heap_alloc(heap, str.f_len + 1));
+
+		memcpy(new_word.text.f_str, str.f_str, str.f_len);
+		/* Terminate the copy with a 0 byte (hence f_len + 1 above). */
+		new_word.text.f_n_char = 0;
+		new_word.text.f_len = str.f_len;
+		new_word.text.f_str[str.f_len] = 0;
+
+		rbt_insert(stop_words, &new_word, &new_word);
+	}
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Load user defined stopwords from the designated user table into the cache
+@return TRUE if load operation is successful, FALSE otherwise */
+static
+ibool
+fts_load_user_stopword(
+/*===================*/
+	fts_t*		fts,			/*!< in: FTS struct */
+	const char*	stopword_table_name,	/*!< in: Stopword table
+						name */
+	fts_stopword_t*	stopword_info)		/*!< in: Stopword info */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	dberr_t		error = DB_SUCCESS;
+	ibool		ret = TRUE;
+	trx_t*		trx;
+	ibool		has_lock = fts->fts_status & TABLE_DICT_LOCKED;
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Load user stopword table into FTS cache";
+	/* Take dict_sys->mutex unless fts_status has TABLE_DICT_LOCKED set. */
+	if (!has_lock) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	/* Validate the user table existence and in the right
+	format */
+	stopword_info->charset = fts_valid_stopword_table(stopword_table_name);
+	if (!stopword_info->charset) {
+		ret = FALSE;
+		goto cleanup;
+	} else if (!stopword_info->cached_stopword) {
+		/* Create the stopword RB tree with the stopword column
+		charset. All comparison will use this charset */
+		stopword_info->cached_stopword = rbt_create_arg_cmp(
+			sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+			(void*)stopword_info->charset);
+
+	}
+
+	info = pars_info_create();
+
+	pars_info_bind_id(info, TRUE, "table_stopword", stopword_table_name);
+
+	pars_info_bind_function(info, "my_func", fts_read_stopword,
+				stopword_info);
+
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT value "
+		" FROM $table_stopword;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+	/* Retry for as long as the failure is a lock wait timeout. */
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+			stopword_info->status = STOPWORD_USER_TABLE;
+			break;
+		} else {
+
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading user stopword table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error '%s' "
+					"while reading user stopword table.\n",
+					ut_strerr(error));
+				ret = FALSE;
+				break;
+			}
+		}
+	}
+
+	que_graph_free(graph);
+
+cleanup:
+	if (!has_lock) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	trx_free_for_background(trx);
+	return(ret);
+}
+
+/******************************************************************//**
+Initialize an index cache: create its word tree and doc_stats vector. */
+static
+void
+fts_index_cache_init(
+/*=================*/
+	ib_alloc_t*		allocator,	/*!< in: the allocator to use */
+	fts_index_cache_t*	index_cache)	/*!< in: index cache */
+{
+	ulint			i;
+
+	ut_a(index_cache->words == NULL);
+
+	index_cache->words = rbt_create_arg_cmp(
+		sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+		(void*)index_cache->charset);
+
+	ut_a(index_cache->doc_stats == NULL);
+
+	index_cache->doc_stats = ib_vector_create(
+		allocator, sizeof(fts_doc_stats_t), 4);
+	/* The per-selector query graphs must not have been created yet. */
+	for (i = 0; fts_index_selector[i].value; ++i) {
+		ut_a(index_cache->ins_graph[i] == NULL);
+		ut_a(index_cache->sel_graph[i] == NULL);
+	}
+}
+
+/*********************************************************************//**
+Initialize FTS cache: create the sync heap and init every index cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache)		/*!< in: cache to initialize */
+{
+	ulint		i;
+
+	/* The sync heap must not exist yet */
+	ut_a(cache->sync_heap->arg == NULL);
+
+	cache->sync_heap->arg = mem_heap_create(1024);
+
+	cache->total_size = 0;
+
+	cache->deleted_doc_ids = ib_vector_create(
+		cache->sync_heap, sizeof(fts_update_t), 4);
+
+	/* Reset the cache data for all the FTS indexes. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		fts_index_cache_init(cache->sync_heap, index_cache);
+	}
+}
+
+/****************************************************************//**
+Create and initialize a FTS cache on its own heap. @return the new cache */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table)	/*!< in: table owns the FTS cache */
+{
+	mem_heap_t*	heap;
+	fts_cache_t*	cache;
+
+	heap = static_cast<mem_heap_t*>(mem_heap_create(512));
+
+	cache = static_cast<fts_cache_t*>(
+		mem_heap_zalloc(heap, sizeof(*cache)));
+
+	cache->cache_heap = heap;
+
+	rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE);
+
+	rw_lock_create(
+		fts_cache_init_rw_lock_key, &cache->init_lock,
+		SYNC_FTS_CACHE_INIT);
+
+	mutex_create(
+		fts_delete_mutex_key, &cache->deleted_lock, SYNC_FTS_OPTIMIZE);
+
+	mutex_create(
+		fts_optimize_mutex_key, &cache->optimize_lock,
+		SYNC_FTS_OPTIMIZE);
+
+	mutex_create(
+		fts_doc_id_mutex_key, &cache->doc_id_lock, SYNC_FTS_OPTIMIZE);
+
+	/* This is the heap used to create the cache itself. */
+	cache->self_heap = ib_heap_allocator_create(heap);
+
+	/* This is a transient heap, used for storing sync data. Its arg is
+	reset to NULL here; fts_cache_init() below creates the real heap. */
+	cache->sync_heap = ib_heap_allocator_create(heap);
+	cache->sync_heap->arg = NULL;
+	/* NOTE(review): this resets the file-scope fts_need_sync flag. */
+	fts_need_sync = false;
+
+	cache->sync = static_cast<fts_sync_t*>(
+		mem_heap_zalloc(heap, sizeof(fts_sync_t)));
+
+	cache->sync->table = table;
+
+	/* Create the index cache vector that will hold the inverted indexes. */
+	cache->indexes = ib_vector_create(
+		cache->self_heap, sizeof(fts_index_cache_t), 2);
+
+	fts_cache_init(cache);
+
+	cache->stopword_info.cached_stopword = NULL;
+	cache->stopword_info.charset = NULL;
+
+	cache->stopword_info.heap = cache->self_heap;
+
+	cache->stopword_info.status = STOPWORD_NOT_INIT;
+
+	return(cache);
+}
+
+/*******************************************************************//**
+Add a newly created index into fts->indexes and the FTS cache */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,		/*!< in: FTS index to be added */
+	dict_table_t*	table)		/*!< in: table */
+{
+	fts_t*			fts = table->fts;
+	fts_cache_t*		cache;
+	fts_index_cache_t*	index_cache;
+
+	ut_ad(fts);
+	cache = table->fts->cache;
+
+	rw_lock_x_lock(&cache->init_lock);
+
+	ib_vector_push(fts->indexes, &index);
+
+	index_cache = fts_find_index_cache(cache, index);
+
+	if (!index_cache) {
+		/* Add new index cache structure */
+		index_cache = fts_cache_index_cache_create(table, index);
+	}
+
+	rw_lock_x_unlock(&cache->init_lock);
+}
+
+/*******************************************************************//**
+Rebuild cache->get_docs with one zeroed entry per index in cache->indexes */
+static
+void
+fts_reset_get_doc(
+/*==============*/
+	fts_cache_t*	cache)	/*!< in: FTS cache */
+{
+	fts_get_doc_t*  get_doc;
+	ulint		i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+	ib_vector_reset(cache->get_docs);
+
+	for (i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*	ind_cache;
+
+		ind_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(cache->get_docs, NULL));
+		/* The pushed slot is zeroed before its index cache is set. */
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = ind_cache;
+	}
+
+	ut_ad(ib_vector_size(cache->get_docs)
+	      == ib_vector_size(cache->indexes));
+}
+
+/*******************************************************************//**
+Check whether an index is in the table's index list (compared by pointer)
+@return TRUE if it exists */
+static
+ibool
+fts_in_dict_index(
+/*==============*/
+	dict_table_t*	table,		/*!< in: Table */
+	dict_index_t*	index_check)	/*!< in: index to be checked */
+{
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index == index_check) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Check whether an index has an entry in the fts->cache->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_index_cache(
+/*===============*/
+	dict_table_t*	table,	/*!< in: Table */
+	dict_index_t*	index)	/*!< in: index to be checked */
+{
+	ulint	i;
+
+	for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) {
+		fts_index_cache_t*      index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(table->fts->cache->indexes, i));
+
+		if (index_cache->index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Check that every index in fts->indexes is also present in the index cache
+and in the table->indexes list
+@return TRUE if all indexes match, or if the table has no FTS cache */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table)	/*!< in: Table where indexes are dropped */
+{
+	ulint	i;
+
+	if (!table->fts || !table->fts->cache) {
+		return(TRUE);
+	}
+	/* Both vectors are required to have the same number of entries. */
+	ut_a(ib_vector_size(table->fts->indexes)
+	      == ib_vector_size(table->fts->cache->indexes));
+
+	for (i = 0; i < ib_vector_size(table->fts->indexes); i++) {
+		dict_index_t*	index;
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(table->fts->indexes, i));
+
+		if (!fts_in_index_cache(table, index)) {
+			return(FALSE);
+		}
+
+		if (!fts_in_dict_index(table, index)) {
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number from fts_drop_index_tables() */
+UNIV_INTERN
+dberr_t
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx)	/*!< in: Transaction for the drop */
+{
+	ib_vector_t*	indexes = table->fts->indexes;
+	dberr_t		err = DB_SUCCESS;
+
+	ut_a(indexes);
+
+	if ((ib_vector_size(indexes) == 1
+	    && (index == static_cast<dict_index_t*>(
+			ib_vector_getp(table->fts->indexes, 0))))
+	   || ib_vector_is_empty(indexes)) {
+		doc_id_t	current_doc_id;
+		doc_id_t	first_doc_id;
+
+		/* If we are dropping the only FTS index of the table,
+		remove it from optimize thread */
+		fts_optimize_remove_table(table);
+
+		DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+
+		/* If Doc ID column is not added internally by FTS index,
+		we can drop all FTS auxiliary tables. Otherwise, we will
+		need to keep some common table such as CONFIG table, so
+		as to keep track of incrementing Doc IDs */
+		if (!DICT_TF2_FLAG_IS_SET(
+			table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+			err = fts_drop_tables(trx, table);
+			/* NOTE(review): the err above is overwritten here. */
+			err = fts_drop_index_tables(trx, index);
+
+			fts_free(table);
+
+			return(err);
+		}
+		/* Recreate the cache, carrying over the two doc id values. */
+		current_doc_id = table->fts->cache->next_doc_id;
+		first_doc_id = table->fts->cache->first_doc_id;
+		fts_cache_clear(table->fts->cache, TRUE);
+		fts_cache_destroy(table->fts->cache);
+		table->fts->cache = fts_cache_create(table);
+		table->fts->cache->next_doc_id = current_doc_id;
+		table->fts->cache->first_doc_id = first_doc_id;
+	} else {
+		fts_cache_t*            cache = table->fts->cache;
+		fts_index_cache_t*      index_cache;
+
+		rw_lock_x_lock(&cache->init_lock);
+
+		index_cache = fts_find_index_cache(cache, index);
+		/* NOTE(review): index_cache is not checked for NULL here. */
+		if (index_cache->words) {
+			fts_words_free(index_cache->words);
+			rbt_free(index_cache->words);
+		}
+		/* Passes the first pointer-sized field of the index cache. */
+		ib_vector_remove(cache->indexes, *(void**) index_cache);
+
+		if (cache->get_docs) {
+			fts_reset_get_doc(cache);
+		}
+
+		rw_lock_x_unlock(&cache->init_lock);
+	}
+
+	err = fts_drop_index_tables(trx, index);
+
+	ib_vector_remove(indexes, (const void*) index);
+
+	return(err);
+}
+
+/****************************************************************//**
+Free the query graph under dict_sys->mutex; the mutex is taken here
+unless the FTS status has TABLE_DICT_LOCKED set */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+	fts_table_t*		fts_table,	/*!< in: FTS table */
+	const fts_index_cache_t*index_cache,	/*!< in: FTS index cache */
+	que_t*			graph)		/*!< in: query graph */
+{
+	ibool	has_dict = FALSE;
+	/* fts_table is consulted first; index_cache only as a fallback. */
+	if (fts_table && fts_table->table) {
+		ut_ad(fts_table->table->fts);
+
+		has_dict = fts_table->table->fts->fts_status
+			 & TABLE_DICT_LOCKED;
+	} else if (index_cache) {
+		ut_ad(index_cache->index->table->fts);
+
+		has_dict = index_cache->index->table->fts->fts_status
+			 & TABLE_DICT_LOCKED;
+	}
+
+	if (!has_dict) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	que_graph_free(graph);
+
+	if (!has_dict) {
+		mutex_exit(&dict_sys->mutex);
+	}
+}
+
+/****************************************************************//**
+Get the charset of an FTS index, as derived from its first field.
+@return charset returned by innobase_get_fts_charset() */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*		index)		/*!< in: FTS index */
+{
+	CHARSET_INFO*		charset = NULL;
+	dict_field_t*		field;
+	ulint			prtype;
+
+	field = dict_index_get_nth_field(index, 0);
+	prtype = field->col->prtype;
+
+	charset = innobase_get_fts_charset(
+		(int) (prtype & DATA_MYSQL_TYPE_MASK),
+		(uint) dtype_get_charset_coll(prtype));
+
+#ifdef FTS_DEBUG
+	/* Verify the remaining fields: all fields of the FTS index
+	should have the same charset as the first one */
+	for (ulint i = 1; i < index->n_fields; i++) {
+		CHARSET_INFO*   fld_charset;
+
+		field = dict_index_get_nth_field(index, i);
+		prtype = field->col->prtype;
+
+		fld_charset = innobase_get_fts_charset(
+			(int)(prtype & DATA_MYSQL_TYPE_MASK),
+			(uint) dtype_get_charset_coll(prtype));
+
+		/* All FTS columns should have the same charset */
+		if (charset) {
+			ut_a(charset == fld_charset);
+		} else {
+			charset = fld_charset;
+		}
+	}
+#endif
+
+	return(charset);
+}
+/****************************************************************//**
+Create an FTS index cache and append it to cache->indexes.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*		table,		/*!< in: table with FTS index */
+	dict_index_t*		index)		/*!< in: FTS index */
+{
+	ulint			n_bytes;
+	fts_index_cache_t*	index_cache;
+	fts_cache_t*		cache = table->fts->cache;
+
+	ut_a(cache != NULL);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+
+	/* Must not already exist in the cache vector. */
+	ut_a(fts_find_index_cache(cache, index) == NULL);
+
+	index_cache = static_cast<fts_index_cache_t*>(
+		ib_vector_push(cache->indexes, NULL));
+
+	memset(index_cache, 0x0, sizeof(*index_cache));
+	index_cache->index = index;
+	index_cache->charset = fts_index_get_charset(index);
+
+	/* One query graph slot per fts_index_selector[] entry. */
+	n_bytes = sizeof(que_t*)
+		* (sizeof(fts_index_selector) / sizeof(*fts_index_selector));
+
+	index_cache->ins_graph = static_cast<que_t**>(
+		mem_heap_zalloc(static_cast<mem_heap_t*>(
+			cache->self_heap->arg), n_bytes));
+
+	index_cache->sel_graph = static_cast<que_t**>(
+		mem_heap_zalloc(static_cast<mem_heap_t*>(
+			cache->self_heap->arg), n_bytes));
+
+	fts_index_cache_init(cache->sync_heap, index_cache);
+
+	if (cache->get_docs) {
+		fts_reset_get_doc(cache);
+	}
+
+	return(index_cache);
+}
+
+/****************************************************************//**
+Release all resources held by the words rb tree e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+	ib_rbt_t*	words)			/*!< in: rb tree of words */
+{
+	const ib_rbt_node_t*	rbt_node;
+
+	/* Free the resources held by a word. Each pass removes the first
+	node, so the loop ends once the tree is empty. */
+	for (rbt_node = rbt_first(words);
+	     rbt_node != NULL;
+	     rbt_node = rbt_first(words)) {
+
+		ulint			i;
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Free the ilists of this word. */
+		for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+			fts_node_t* fts_node = static_cast<fts_node_t*>(
+				ib_vector_get(word->nodes, i));
+
+			ut_free(fts_node->ilist);
+			fts_node->ilist = NULL;
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(words, rbt_node));
+	}
+}
+
+/*********************************************************************//**
+Clear cache. If the free_words flag is TRUE then the cached words and
+their ilists are freed here. For regular clear as part of normal
+working we assume the caller has freed all resources. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	ibool		free_words)	/*!< in: TRUE if free in memory
+					word cache. */
+{
+	ulint		i;
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		ulint			j;
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (free_words) {
+			fts_words_free(index_cache->words);
+		}
+		/* The word tree must be empty by now, whoever emptied it. */
+		ut_a(rbt_empty(index_cache->words));
+
+		rbt_free(index_cache->words);
+
+		index_cache->words = NULL;
+
+		for (j = 0; fts_index_selector[j].value; ++j) {
+
+			if (index_cache->ins_graph[j] != NULL) {
+
+				fts_que_graph_free_check_lock(
+					NULL, index_cache,
+					index_cache->ins_graph[j]);
+
+				index_cache->ins_graph[j] = NULL;
+			}
+
+			if (index_cache->sel_graph[j] != NULL) {
+
+				fts_que_graph_free_check_lock(
+					NULL, index_cache,
+					index_cache->sel_graph[j]);
+
+				index_cache->sel_graph[j] = NULL;
+			}
+		}
+
+		index_cache->doc_stats = NULL;
+	}
+	/* Free the sync heap and reset the pointers and size counter. */
+	mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+	cache->sync_heap->arg = NULL;
+
+	cache->total_size = 0;
+	cache->deleted_doc_ids = NULL;
+}
+
+/*********************************************************************//**
+Search cache->indexes for the index cache of a particular FTS index.
+@return the index cache, or NULL if there is none for the index */
+UNIV_INLINE
+fts_index_cache_t*
+fts_get_index_cache(
+/*================*/
+	fts_cache_t*		cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	ulint			i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX)
+	      || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index == index) {
+
+			return(index_cache);
+		}
+	}
+
+	return(NULL);
+}
+
+#ifdef FTS_DEBUG
+/*********************************************************************//**
+Search cache->get_docs for the get_doc structure of an FTS index.
+@return the fts_get_doc_t item, or NULL if not found */
+static
+fts_get_doc_t*
+fts_get_index_get_doc(
+/*==================*/
+	fts_cache_t*		cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	ulint			i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+	for (i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+		fts_get_doc_t*	get_doc;
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(cache->get_docs, i));
+
+		if (get_doc->index_cache->index == index) {
+
+			return(get_doc);
+		}
+	}
+
+	return(NULL);
+}
+#endif
+
+/**********************************************************************//**
+Free the FTS cache: its locks, stopword tree, sync heap and cache heap. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+	fts_cache_t*	cache)			/*!< in: cache*/
+{
+	rw_lock_free(&cache->lock);
+	rw_lock_free(&cache->init_lock);
+	mutex_free(&cache->optimize_lock);
+	mutex_free(&cache->deleted_lock);
+	mutex_free(&cache->doc_id_lock);
+
+	if (cache->stopword_info.cached_stopword) {
+		rbt_free(cache->stopword_info.cached_stopword);
+	}
+
+	if (cache->sync_heap->arg) {
+		mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+	}
+	/* The cache struct itself was allocated from cache_heap. */
+	mem_heap_free(cache->cache_heap);
+}
+
+/**********************************************************************//**
+Find an existing word in the index cache, or if not found, create one and
+return it. Stopwords are never indexed: if the text is found in the cached
+stopword tree, nothing is looked up or created.
+@return specified word token, or NULL if the text is a stopword */
+static
+fts_tokenizer_word_t*
+fts_tokenizer_word_get(
+/*===================*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	fts_index_cache_t*
+			index_cache,		/*!< in: index cache */
+	fts_string_t*	text)			/*!< in: node text */
+{
+	fts_tokenizer_word_t*	word;
+	ib_rbt_bound_t		parent;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+	/* If it is a stopword, do not index it. The stopword tree is
+	optional (fts_cache_destroy() also treats it as possibly NULL),
+	so only search it when it exists; without a tree no word is a
+	stopword. */
+	if (cache->stopword_info.cached_stopword != NULL
+	    && rbt_search(cache->stopword_info.cached_stopword,
+			  &parent, text) == 0) {
+
+		return(NULL);
+	}
+
+	/* Check if we found a match, if not then add word to tree.
+	rbt_search() leaves the insert position in "parent". */
+	if (rbt_search(index_cache->words, &parent, text) != 0) {
+		mem_heap_t*		heap;
+		fts_tokenizer_word_t	new_word;
+
+		heap = static_cast<mem_heap_t*>(cache->sync_heap->arg);
+
+		/* The node vector starts with room for 4 fts_node_t
+		entries; the same figure is used in the size accounting
+		below. */
+		new_word.nodes = ib_vector_create(
+			cache->sync_heap, sizeof(fts_node_t), 4);
+
+		fts_utf8_string_dup(&new_word.text, text, heap);
+
+		parent.last = rbt_add_node(
+			index_cache->words, &parent, &new_word);
+
+		/* Take into account the RB tree memory use and the vector. */
+		cache->total_size += sizeof(new_word)
+			+ sizeof(ib_rbt_node_t)
+			+ text->f_len
+			+ (sizeof(fts_node_t) * 4)
+			+ sizeof(*new_word.nodes);
+
+		ut_ad(rbt_validate(index_cache->words));
+	}
+
+	/* Either the node that was found or the one just added. */
+	word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+	return(word);
+}
+
+/**********************************************************************//**
+Add the given doc_id/word positions to the given node's ilist.
+The appended fragment consists of the encoded delta between doc_id and
+node->last_doc_id, followed by each position encoded as the delta to the
+previous position, followed by a terminating 0x00 byte. If the fragment
+does not fit into the space already allocated for the ilist, a larger
+buffer is allocated and the old contents are copied over. The node's
+ilist size, first/last doc id and doc count are updated. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+	fts_cache_t*	cache,		/*!< in: cache, may be NULL; when
+					given, its total_size is increased
+					by the size of the fragment */
+	fts_node_t*	node,		/*!< in: word node */
+	doc_id_t	doc_id,		/*!< in: doc id, must not be smaller
+					than node->last_doc_id */
+	ib_vector_t*	positions)	/*!< in: fts_token_t::positions */
+{
+	ulint		i;
+	byte*		ptr;
+	byte*		ilist;
+	ulint		enc_len;
+	ulint		last_pos;
+	byte*		ptr_start;
+	ulint		doc_id_delta;
+
+#ifdef UNIV_SYNC_DEBUG
+	if (cache) {
+		ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+	}
+#endif
+	ut_ad(doc_id >= node->last_doc_id);
+
+	/* Calculate the space required to store the ilist. */
+	doc_id_delta = (ulint)(doc_id - node->last_doc_id);
+	enc_len = fts_get_encoded_len(doc_id_delta);
+
+	/* First pass over the positions: only sum up the encoded length
+	of each delta. */
+	last_pos = 0;
+	for (i = 0; i < ib_vector_size(positions); i++) {
+		ulint	pos = *(static_cast<ulint*>(
+			ib_vector_get(positions, i)));
+
+		/* Positions are expected in strictly increasing order. */
+		ut_ad(last_pos == 0 || pos > last_pos);
+
+		enc_len += fts_get_encoded_len(pos - last_pos);
+		last_pos = pos;
+	}
+
+	/* The 0x00 byte at the end of the token positions list. */
+	enc_len++;
+
+	if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) {
+		/* No need to allocate more space, we can fit in the new
+		data at the end of the old one. */
+		ilist = NULL;
+		ptr = node->ilist + node->ilist_size;
+	} else {
+		ulint	new_size = node->ilist_size + enc_len;
+
+		/* Over-reserve space by a fixed size for small lengths and
+		by 20% for lengths >= 48 bytes. */
+		if (new_size < 16) {
+			new_size = 16;
+		} else if (new_size < 32) {
+			new_size = 32;
+		} else if (new_size < 48) {
+			new_size = 48;
+		} else {
+			new_size = (ulint)(1.2 * new_size);
+		}
+
+		/* The new fragment is written straight into the new buffer
+		at the offset where the old data will end; the old data
+		itself is copied in further below. */
+		ilist = static_cast<byte*>(ut_malloc(new_size));
+		ptr = ilist + node->ilist_size;
+
+		node->ilist_size_alloc = new_size;
+	}
+
+	ptr_start = ptr;
+
+	/* Encode the new fragment. */
+	ptr += fts_encode_int(doc_id_delta, ptr);
+
+	/* Second pass over the positions: write the encoded deltas. */
+	last_pos = 0;
+	for (i = 0; i < ib_vector_size(positions); i++) {
+		ulint	pos = *(static_cast<ulint*>(
+			 ib_vector_get(positions, i)));
+
+		ptr += fts_encode_int(pos - last_pos, ptr);
+		last_pos = pos;
+	}
+
+	*ptr++ = 0;
+
+	/* The number of bytes written must match the length computed in
+	the first pass. */
+	ut_a(enc_len == (ulint)(ptr - ptr_start));
+
+	if (ilist) {
+		/* Copy old ilist to the start of the new one and switch the
+		new one into place in the node. */
+		if (node->ilist_size > 0) {
+			memcpy(ilist, node->ilist, node->ilist_size);
+			ut_free(node->ilist);
+		}
+
+		node->ilist = ilist;
+	}
+
+	node->ilist_size += enc_len;
+
+	if (cache) {
+		cache->total_size += enc_len;
+	}
+
+	/* The first document added to this node fixes first_doc_id. */
+	if (node->first_doc_id == FTS_NULL_DOC_ID) {
+		node->first_doc_id = doc_id;
+	}
+
+	node->last_doc_id = doc_id;
+	++node->doc_count;
+}
+
+/**********************************************************************//**
+Add document to the cache. Every token of the document is looked up (or
+created) in the index cache and its positions are appended to the word's
+last node; stopwords are skipped. The token tree is consumed: each node is
+removed and freed, so the tree is empty on return. Finally a doc stats
+entry is pushed for the document and cache->sync->max_doc_id is raised if
+doc_id is larger. Does nothing if tokens is NULL. */
+static
+void
+fts_cache_add_doc(
+/*==============*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	fts_index_cache_t*
+			index_cache,		/*!< in: index cache */
+	doc_id_t	doc_id,			/*!< in: doc id to add */
+	ib_rbt_t*	tokens)			/*!< in: document tokens,
+						may be NULL; emptied by
+						this function */
+{
+	const ib_rbt_node_t*	node;
+	ulint			n_words;
+	fts_doc_stats_t*	doc_stats;
+
+	if (!tokens) {
+		return;
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+	/* Remember the token count now, the loop below empties the tree. */
+	n_words = rbt_size(tokens);
+
+	/* Each iteration removes the node it processed, so the next node
+	to handle is always the first one in the tree. */
+	for (node = rbt_first(tokens); node; node = rbt_first(tokens)) {
+
+		fts_tokenizer_word_t*	word;
+		fts_node_t*		fts_node = NULL;
+		fts_token_t*		token = rbt_value(fts_token_t, node);
+
+		/* Find and/or add token to the cache. */
+		word = fts_tokenizer_word_get(
+			cache, index_cache, &token->text);
+
+		/* NULL means the token is a stopword: drop it. */
+		if (!word) {
+			ut_free(rbt_remove_node(tokens, node));
+			continue;
+		}
+
+		if (ib_vector_size(word->nodes) > 0) {
+			fts_node = static_cast<fts_node_t*>(
+				ib_vector_last(word->nodes));
+		}
+
+		/* Start a new node when the word has none yet, when the
+		last node's ilist has grown past FTS_ILIST_MAX_SIZE, or
+		when doc_id is smaller than the last doc id stored in that
+		node (fts_cache_node_add_positions() expects doc ids that
+		do not decrease within a node). */
+		if (fts_node == NULL
+		    || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+		    || doc_id < fts_node->last_doc_id) {
+
+			fts_node = static_cast<fts_node_t*>(
+				ib_vector_push(word->nodes, NULL));
+
+			memset(fts_node, 0x0, sizeof(*fts_node));
+
+			cache->total_size += sizeof(*fts_node);
+		}
+
+		fts_cache_node_add_positions(
+			cache, fts_node, doc_id, token->positions);
+
+		ut_free(rbt_remove_node(tokens, node));
+	}
+
+	ut_a(rbt_empty(tokens));
+
+	/* Add to doc ids processed so far. */
+	doc_stats = static_cast<fts_doc_stats_t*>(
+		ib_vector_push(index_cache->doc_stats, NULL));
+
+	doc_stats->doc_id = doc_id;
+	doc_stats->word_count = n_words;
+
+	/* Add the doc stats memory usage too. */
+	cache->total_size += sizeof(*doc_stats);
+
+	if (doc_id > cache->sync->max_doc_id) {
+		cache->sync->max_doc_id = doc_id;
+	}
+}
+
+/****************************************************************//**
+Drops a table. If the table can't be found we return DB_FAIL, which
+fts_drop_common_tables() and fts_drop_index_split_tables() deliberately
+do not record as an error. A failure of the drop itself is logged.
+@return DB_SUCCESS, DB_FAIL if the table was not found, or the error
+code returned by row_drop_table_for_mysql() */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_table(
+/*===========*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	table_name)		/*!< in: table to drop */
+{
+	dict_table_t*	table;
+	dberr_t		error = DB_SUCCESS;
+
+	/* Check that the table exists in our data dictionary.
+	Similar to regular drop table case, we will open table with
+	DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT option */
+	table = dict_table_open_on_name(
+		table_name, TRUE, FALSE,
+		static_cast<dict_err_ignore_t>(
+                        DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
+
+	if (table != 0) {
+
+		/* The table was only opened to check that it exists;
+		close it again before dropping it by name. */
+		dict_table_close(table, TRUE, FALSE);
+
+		/* Pass nonatomic=false (don't allow data dict unlock),
+		because the transaction may hold locks on SYS_* tables from
+		previous calls to fts_drop_table(). */
+		error = row_drop_table_for_mysql(table_name, trx, true, false);
+
+		if (error != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to drop FTS index aux table %s: %s",
+				table_name, ut_strerr(error));
+		}
+	} else {
+		/* Not found: reported as DB_FAIL, without logging. */
+		error = DB_FAIL;
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Rename a single auxiliary table due to database name change. The new
+name is built from the database part of new_name followed by the part of
+fts_table_old_name that starts at its '/' separator.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_rename_one_aux_table(
+/*=====================*/
+	const char*	new_name,		/*!< in: new parent tbl name */
+	const char*	fts_table_old_name,	/*!< in: old aux tbl name */
+	trx_t*		trx)			/*!< in: transaction */
+{
+	char	fts_table_new_name[MAX_TABLE_NAME_LEN];
+	ulint	new_db_name_len = dict_get_db_name_len(new_name);
+	ulint	old_db_name_len = dict_get_db_name_len(fts_table_old_name);
+	ulint	table_new_name_len = strlen(fts_table_old_name)
+				     + new_db_name_len - old_db_name_len;
+	const char*	old_table_part = strchr(fts_table_old_name, '/');
+
+	/* Check if the new and old database names are the same, if so,
+	nothing to do */
+	ut_ad((new_db_name_len != old_db_name_len)
+	      || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0);
+
+	/* The new name is assembled in a fixed-size stack buffer. Make
+	sure that the old name really contains a '/' separator and that
+	the result, including the terminating NUL byte, fits before
+	anything is copied. */
+	ut_a(old_table_part != NULL);
+	ut_a(table_new_name_len < sizeof(fts_table_new_name));
+
+	/* Get the database name from "new_name", and table name
+	from the fts_table_old_name. Neither strncpy() call is relied
+	upon to terminate the string; that is done explicitly below. */
+	strncpy(fts_table_new_name, new_name, new_db_name_len);
+	strncpy(fts_table_new_name + new_db_name_len,
+	       old_table_part,
+	       table_new_name_len - new_db_name_len);
+	fts_table_new_name[table_new_name_len] = 0;
+
+	return(row_rename_table_for_mysql(
+		fts_table_old_name, fts_table_new_name, trx, false));
+}
+
+/****************************************************************//**
+Rename the auxiliary tables of a table because its database name changed:
+first the common auxiliary tables, then the index specific ones of every
+FTS index. The first rename that fails ends the operation.
+@return DB_SUCCESS or error code */
+
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+	dict_table_t*	table,		/*!< in: user Table */
+	const char*     new_name,       /*!< in: new table name */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+	/* Common auxiliary tables first. */
+	for (ulint n = 0; fts_common_tables[n] != NULL; ++n) {
+		fts_table.suffix = fts_common_tables[n];
+
+		char*	aux_name = fts_get_table_name(&fts_table);
+		dberr_t	err = fts_rename_one_aux_table(
+			new_name, aux_name, trx);
+
+		mem_free(aux_name);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	fts_t*	fts = table->fts;
+
+	/* Then the index specific auxiliary tables, one per selector
+	entry of each index. */
+	for (ulint n = 0;
+	     fts->indexes != 0 && n < ib_vector_size(fts->indexes);
+	     ++n) {
+
+		dict_index_t*	index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, n));
+
+		FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+		for (ulint sel = 0; fts_index_selector[sel].value; ++sel) {
+			fts_table.suffix = fts_get_suffix(sel);
+
+			char*	aux_name = fts_get_table_name(&fts_table);
+			dberr_t	err = fts_rename_one_aux_table(
+				new_name, aux_name, trx);
+
+			/* Debug hook to simulate a failed rename. */
+			DBUG_EXECUTE_IF("fts_rename_failure",
+					err = DB_DEADLOCK;);
+
+			mem_free(aux_name);
+
+			if (err != DB_SUCCESS) {
+				return(err);
+			}
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Drops the common ancillary tables needed for supporting an FTS index
+on the given table, one per entry of fts_common_tables.
+row_mysql_lock_data_dictionary must have been called before this.
+@return DB_SUCCESS, or the code of the last drop that failed */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_common_tables(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: table with an FTS
+						index */
+{
+	dberr_t	last_error = DB_SUCCESS;
+
+	for (ulint n = 0; fts_common_tables[n] != NULL; ++n) {
+		fts_table->suffix = fts_common_tables[n];
+
+		char*		aux_name = fts_get_table_name(fts_table);
+		const dberr_t	err = fts_drop_table(trx, aux_name);
+
+		mem_free(aux_name);
+
+		/* DB_FAIL is what fts_drop_table() reports for a table
+		that does not exist; that is not counted as a failure.
+		Of the real errors only the last one is kept. */
+		if (!(err == DB_SUCCESS || err == DB_FAIL)) {
+			last_error = err;
+		}
+	}
+
+	return(last_error);
+}
+
+/****************************************************************//**
+The index table is split horizontally into one auxiliary table per
+fts_index_selector entry; drop every one of them.
+@return DB_SUCCESS, or the code of the last drop that failed */
+UNIV_INTERN
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index)			/*!< in: fts instance */
+
+{
+	fts_table_t	fts_table;
+	dberr_t		last_error = DB_SUCCESS;
+
+	FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+	for (ulint sel = 0; fts_index_selector[sel].value; ++sel) {
+		fts_table.suffix = fts_get_suffix(sel);
+
+		char*		aux_name = fts_get_table_name(&fts_table);
+		const dberr_t	err = fts_drop_table(trx, aux_name);
+
+		mem_free(aux_name);
+
+		/* DB_FAIL is what fts_drop_table() reports for a table
+		that does not exist; that is not counted as a failure.
+		Of the real errors only the last one is kept. */
+		if (!(err == DB_SUCCESS || err == DB_FAIL)) {
+			last_error = err;
+		}
+	}
+
+	return(last_error);
+}
+
+/****************************************************************//**
+Drops the FTS auxiliary tables of one FTS index: the split index tables
+and, when FTS_DOC_STATS_DEBUG is defined, the "DOC_ID" table as well.
+@return DB_SUCCESS, or the code of the last drop that failed */
+UNIV_INTERN
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index)		/*!< in: Index to drop */
+{
+#ifdef FTS_DOC_STATS_DEBUG
+	fts_table_t		fts_table;
+	static const char*	index_tables[] = {
+		"DOC_ID",
+		NULL
+	};
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	/* The result of dropping the split tables is the starting point;
+	a later failure overrides it. */
+	dberr_t	error = fts_drop_index_split_tables(trx, index);
+
+#ifdef FTS_DOC_STATS_DEBUG
+	FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+	for (ulint n = 0; index_tables[n] != NULL; ++n) {
+		fts_table.suffix = index_tables[n];
+
+		char*		aux_name = fts_get_table_name(&fts_table);
+		const dberr_t	err = fts_drop_table(trx, aux_name);
+
+		mem_free(aux_name);
+
+		/* A table that does not exist (DB_FAIL) is not an error. */
+		if (!(err == DB_SUCCESS || err == DB_FAIL)) {
+			error = err;
+		}
+	}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	return(error);
+}
+
+/****************************************************************//**
+Drops the auxiliary tables of every FTS index listed in fts->indexes.
+All indexes are processed even if a drop fails.
+row_mysql_lock_data_dictionary must have been called before this.
+@return DB_SUCCESS, or the code of the last index whose drop failed */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_all_index_tables(
+/*======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_t*		fts)			/*!< in: fts instance */
+{
+	dberr_t	last_error = DB_SUCCESS;
+	ulint	pos = 0;
+
+	while (fts->indexes != 0 && pos < ib_vector_size(fts->indexes)) {
+
+		dict_index_t*	index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, pos));
+
+		const dberr_t	err = fts_drop_index_tables(trx, index);
+
+		if (err != DB_SUCCESS) {
+			last_error = err;
+		}
+
+		++pos;
+	}
+
+	return(last_error);
+}
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on a
+given table: the common tables first and, only if that succeeded, the
+tables of all FTS indexes. row_mysql_lock_data_dictionary must have been
+called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_tables(
+/*============*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table)		/*!< in: table has the FTS index */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+	/* TODO: This is not atomic and can cause problems during recovery. */
+
+	const dberr_t	common_err = fts_drop_common_tables(trx, &fts_table);
+
+	if (common_err != DB_SUCCESS) {
+		return(common_err);
+	}
+
+	return(fts_drop_all_index_tables(trx, table->fts));
+}
+
+/*********************************************************************//**
+Expand an SQL template by substituting the table name prefix of
+fts_table for "%s" via ut_strreplace().
+@return sql string, use mem_free() to free the memory */
+static
+char*
+fts_prepare_sql(
+/*============*/
+	fts_table_t*	fts_table,	/*!< in: table name info */
+	const char*	my_template)	/*!< in: sql template */
+{
+	char*	prefix = fts_get_table_name_prefix(fts_table);
+	char*	expanded = ut_strreplace(my_template, "%s", prefix);
+
+	/* The prefix was only needed for the substitution. */
+	mem_free(prefix);
+
+	return(expanded);
+}
+
+/*********************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+The steps are: drop any existing common tables, create the common tables
+from fts_create_common_tables_sql, insert the default settings into the
+CONFIG table and, unless skip_doc_id_index is set, create the unique
+FTS_DOC_ID_INDEX_NAME index on the FTS_DOC_ID_COL_NAME column of the
+table. If any step fails, the transaction is rolled back and the table
+itself (table->name) is dropped.
+@return DB_SUCCESS if succeed */
+UNIV_INTERN
+dberr_t
+fts_create_common_tables(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const dict_table_t* table,	/*!< in: table with FTS index */
+	const char*	name,		/*!< in: table name normalized.*/
+	bool		skip_doc_id_index)/*!< in: Skip index on doc id */
+{
+	char*		sql;
+	dberr_t		error;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	mem_heap_t*	heap = mem_heap_create(1024);
+	pars_info_t*	info;
+
+	FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+	/* Get rid of common tables left over from before. */
+	error = fts_drop_common_tables(trx, &fts_table);
+
+	if (error != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	/* Create the FTS tables that are common to an FTS index. */
+	sql = fts_prepare_sql(&fts_table, fts_create_common_tables_sql);
+	graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+	mem_free(sql);
+
+	error = fts_eval_sql(trx, graph);
+
+	que_graph_free(graph);
+
+	if (error != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	/* Write the default settings to the config table. */
+	fts_table.suffix = "CONFIG";
+	graph = fts_parse_sql_no_dict_lock(
+		&fts_table, NULL, fts_config_table_insert_values_sql);
+
+	error = fts_eval_sql(trx, graph);
+
+	que_graph_free(graph);
+
+	/* Done on failure, and also on success when the caller does not
+	want the doc id index. */
+	if (error != DB_SUCCESS || skip_doc_id_index) {
+
+		goto func_exit;
+	}
+
+	info = pars_info_create();
+
+	pars_info_bind_id(info, TRUE, "table_name", name);
+	pars_info_bind_id(info, TRUE, "index_name", FTS_DOC_ID_INDEX_NAME);
+	pars_info_bind_id(info, TRUE, "doc_id_col_name", FTS_DOC_ID_COL_NAME);
+
+	/* Create the FTS DOC_ID index on the hidden column. Currently this
+	is common for any FT index created on the table. */
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		info,
+		mem_heap_printf(
+			heap,
+			"BEGIN\n"
+			""
+			"CREATE UNIQUE INDEX $index_name ON $table_name("
+			"$doc_id_col_name);\n"));
+
+	error = fts_eval_sql(trx, graph);
+	que_graph_free(graph);
+
+func_exit:
+	if (error != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		/* Clear the error state before rolling back and dropping
+		the table, and once more afterwards; the error is reported
+		to the caller through the return value. */
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		row_drop_table_for_mysql(table->name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	mem_heap_free(heap);
+
+	return(error);
+}
+
+/*************************************************************//**
+Create one auxiliary index table for an FTS index. The table is named
+after fts_table and has the columns word, first_doc_id, last_doc_id,
+doc_count and ilist. On failure trx->error_state is set to the error and
+a warning is logged.
+@return the new table, or NULL if it could not be created */
+static
+dict_table_t*
+fts_create_one_index_table(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const dict_index_t*
+			index,		/*!< in: the index instance */
+	fts_table_t*	fts_table,	/*!< in: fts_table structure */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	dict_field_t*		field;
+	dict_table_t*		new_table = NULL;
+	char*			table_name = fts_get_table_name(fts_table);
+	dberr_t			error;
+	CHARSET_INFO*		charset;
+
+	ut_ad(index->type & DICT_FTS);
+
+	new_table = dict_mem_table_create(table_name, 0, 5, 1, 0);
+
+	/* The type of the "word" column follows the charset of the first
+	field of the FTS index. */
+	field = dict_index_get_nth_field(index, 0);
+	charset = innobase_get_fts_charset(
+		(int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
+		(uint) dtype_get_charset_coll(field->col->prtype));
+
+	/* latin1_swedish_ci words are stored as DATA_VARCHAR, everything
+	else as DATA_VARMYSQL. */
+	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+		dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
+				       field->col->prtype, FTS_MAX_WORD_LEN);
+	} else {
+		dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
+				       field->col->prtype, FTS_MAX_WORD_LEN);
+	}
+
+	dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED,
+			       sizeof(doc_id_t));
+
+	dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED,
+			       sizeof(doc_id_t));
+
+	dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED, 4);
+
+	/* NOTE(review): 4130048 is passed as the prtype of the ilist
+	column; its meaning is not visible here - confirm against the
+	column type flags. */
+	dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB,
+			       4130048,	0);
+
+	error = row_create_table_for_mysql(new_table, trx, true);
+
+	if (error != DB_SUCCESS) {
+		trx->error_state = error;
+		dict_mem_table_free(new_table);
+		new_table = NULL;
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Fail to create FTS index table %s", table_name);
+	}
+
+	mem_free(table_name);
+
+	return(new_table);
+}
+
+/*************************************************************//**
+Create the auxiliary tables for an FTS index: one table per
+fts_index_selector entry, created with fts_create_one_index_table() and
+followed by evaluating fts_create_index_sql for it. The loop stops at the
+first failure. On failure the transaction is rolled back and the table
+named table_name is dropped.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables_low(
+/*========================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const dict_index_t*
+			index,		/*!< in: the index instance */
+	const char*	table_name,	/*!< in: the table name */
+	table_id_t	table_id)	/*!< in: the table id */
+
+{
+	ulint		i;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	dberr_t		error = DB_SUCCESS;
+	mem_heap_t*	heap = mem_heap_create(1024);
+
+	/* Filled in by hand, with no table object attached; the suffix is
+	set per auxiliary table below. */
+	fts_table.type = FTS_INDEX_TABLE;
+	fts_table.index_id = index->id;
+	fts_table.table_id = table_id;
+	fts_table.parent = table_name;
+	fts_table.table = NULL;
+
+#ifdef FTS_DOC_STATS_DEBUG
+	char*		sql;
+
+	/* Create the FTS auxiliary tables that are specific
+	to an FTS index. */
+	sql = fts_prepare_sql(&fts_table, fts_create_index_tables_sql);
+
+	graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+	mem_free(sql);
+
+	error = fts_eval_sql(trx, graph);
+	que_graph_free(graph);
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	for (i = 0; fts_index_selector[i].value && error == DB_SUCCESS; ++i) {
+		dict_table_t*	new_table;
+
+		/* Create the FTS auxiliary tables that are specific
+		to an FTS index. We need to preserve the table_id %s
+		which fts_parse_sql_no_dict_lock() will fill in for us. */
+		fts_table.suffix = fts_get_suffix(i);
+
+		new_table = fts_create_one_index_table(
+			trx, index, &fts_table, heap);
+
+		/* The helper only reports success or failure through the
+		returned pointer, so a failure is mapped to DB_FAIL. */
+		if (!new_table) {
+			error = DB_FAIL;
+			break;
+		}
+
+		graph = fts_parse_sql_no_dict_lock(
+			&fts_table, NULL, fts_create_index_sql);
+
+		error = fts_eval_sql(trx, graph);
+		que_graph_free(graph);
+	}
+
+	if (error != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		/* Clear the error state before rolling back and dropping
+		the table, and once more afterwards; the error is reported
+		to the caller through the return value. */
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		row_drop_table_for_mysql(table_name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	mem_heap_free(heap);
+
+	return(error);
+}
+
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table and commits the transaction if that
+succeeded. The table is looked up by index->table_name and must exist.
+row_mysql_lock_data_dictionary must have been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: the index instance */
+{
+	dict_table_t*	table = dict_table_get_low(index->table_name);
+
+	ut_a(table != NULL);
+
+	const dberr_t	err = fts_create_index_tables_low(
+		trx, index, table->name, table->id);
+
+	if (err == DB_SUCCESS) {
+		trx_commit(trx);
+	}
+
+	return(err);
+}
+#if 0
+/******************************************************************//**
+Return string representation of state. This function is currently
+compiled out by the surrounding #if 0.
+@return string representation of state, "UNKNOWN" for a value that is
+not handled below */
+static
+const char*
+fts_get_state_str(
+/*==============*/
+	fts_row_state	state)	/*!< in: state */
+{
+	switch (state) {
+	case FTS_INSERT:
+		return("INSERT");
+
+	case FTS_MODIFY:
+		return("MODIFY");
+
+	case FTS_DELETE:
+		return("DELETE");
+
+	case FTS_NOTHING:
+		return("NOTHING");
+
+	case FTS_INVALID:
+		return("INVALID");
+
+	default:
+		return("UNKNOWN");
+	}
+}
+#endif
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+Both arguments must be below FTS_INVALID, and the combination must be a
+valid one according to the look-up table in the function body; anything
+else fails an assertion.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+	fts_row_state	old_state,		/*!< in: existing state of row */
+	fts_row_state	event)			/*!< in: new event */
+{
+	/* The rules for transforming states:
+
+	I = inserted
+	M = modified
+	D = deleted
+	N = nothing
+
+	M+D -> D:
+
+	If the row existed before the transaction started and it is modified
+	during the transaction, followed by a deletion of the row, only the
+	deletion will be signaled.
+
+	M+ -> M:
+
+	If the row existed before the transaction started and it is modified
+	more than once during the transaction, only the last modification
+	will be signaled.
+
+	IM*D -> N:
+
+	If a new row is added during the transaction (and possibly modified
+	after its initial insertion) but it is deleted before the end of the
+	transaction, nothing will be signaled.
+
+	IM* -> I:
+
+	If a new row is added during the transaction and modified after its
+	initial insertion, only the addition will be signaled.
+
+	M*DI -> M:
+
+	If the row existed before the transaction started and it is deleted,
+	then re-inserted, only a modification will be signaled. Note that
+	this case is only possible if the table is using the row's primary
+	key for FTS row ids, since those can be re-inserted by the user,
+	which is not true for InnoDB generated row ids.
+
+	It is easily seen that the above rules decompose such that we do not
+	need to store the row's entire history of events. Instead, we can
+	store just one state for the row and update that when new events
+	arrive. Then we can implement the above rules as a two-dimensional
+	look-up table, and get checking of invalid combinations "for free"
+	in the process. */
+
+	/* The lookup table for transforming states. old_state is the
+	Y-axis, event is the X-axis.
+	NOTE(review): indexing the table with the enum values assumes that
+	FTS_INSERT, FTS_MODIFY, FTS_DELETE and FTS_NOTHING are 0..3 in this
+	order and that FTS_INVALID follows them - confirm against the
+	definition of fts_row_state. */
+	static const fts_row_state table[4][4] = {
+			/*    I            M            D            N */
+		/* I */	{ FTS_INVALID, FTS_INSERT,  FTS_NOTHING, FTS_INVALID },
+		/* M */	{ FTS_INVALID, FTS_MODIFY,  FTS_DELETE,  FTS_INVALID },
+		/* D */	{ FTS_MODIFY,  FTS_INVALID, FTS_INVALID, FTS_INVALID },
+		/* N */	{ FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+	};
+
+	fts_row_state result;
+
+	/* Keep both indexes inside the table. */
+	ut_a(old_state < FTS_INVALID);
+	ut_a(event < FTS_INVALID);
+
+	result = table[(int) old_state][(int) event];
+
+	/* An FTS_INVALID entry marks a combination that must not occur. */
+	ut_a(result != FTS_INVALID);
+
+	return(result);
+}
+
+/******************************************************************//**
+Append a new, zero-filled savepoint to the given vector, copy its name
+(if one is given) to the heap and create its empty tree of FTS trx
+tables.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+	ib_vector_t*	savepoints,		/*!< in/out: vector that the
+						new savepoint is pushed to */
+	const char*	name,			/*!< in: savepoint name, may
+						be NULL */
+	mem_heap_t*	heap)			/*!< in: heap for the copy of
+						the name; not used when name
+						is NULL */
+{
+	fts_savepoint_t*	sp = static_cast<fts_savepoint_t*>(
+		ib_vector_push(savepoints, NULL));
+
+	memset(sp, 0x0, sizeof(*sp));
+
+	if (name != NULL) {
+		sp->name = mem_heap_strdup(heap, name);
+	}
+
+	sp->tables = rbt_create(
+		sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+	return(sp);
+}
+
+/******************************************************************//**
+Create an FTS trx on a heap of its own. Both the savepoints vector and
+the last_stmt vector are created and each receives one unnamed default
+savepoint.
+@return FTS trx  */
+static
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+	trx_t*	trx)				/*!< in: InnoDB transaction */
+{
+	mem_heap_t*	heap = mem_heap_create(1024);
+	fts_trx_t*	fts_trx = static_cast<fts_trx_t*>(
+		mem_heap_alloc(heap, sizeof(fts_trx_t)));
+
+	fts_trx->trx = trx;
+	fts_trx->heap = heap;
+
+	/* Both vectors are allocated from the heap of the FTS trx. */
+	ib_alloc_t*	allocator = ib_heap_allocator_create(heap);
+
+	fts_trx->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+		allocator, sizeof(fts_savepoint_t), 4));
+
+	fts_trx->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+		allocator, sizeof(fts_savepoint_t), 4));
+
+	/* Default instance has no name and no heap. */
+	fts_savepoint_create(fts_trx->savepoints, NULL, NULL);
+	fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+
+	return(fts_trx);
+}
+
+/******************************************************************//**
+Create an FTS trx table for the given table. It is allocated from the
+heap of the FTS trx, zero-filled, and gets an empty tree of rows.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+	fts_trx_t*	fts_trx,		/*!< in: FTS trx */
+	dict_table_t*	table)			/*!< in: table */
+{
+	fts_trx_table_t*	new_ftt = static_cast<fts_trx_table_t*>(
+		mem_heap_alloc(fts_trx->heap, sizeof(fts_trx_table_t)));
+
+	memset(new_ftt, 0x0, sizeof(fts_trx_table_t));
+
+	new_ftt->table = table;
+	new_ftt->fts_trx = fts_trx;
+	new_ftt->rows = rbt_create(
+		sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+	return(new_ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table: the copy refers to the same table and FTS trx as
+the source and gets a rows tree of its own, into which the rows of the
+source are merged. The source must not have any added_doc_ids.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+	const fts_trx_table_t*	ftt_src)	/*!< in: FTS trx */
+{
+	fts_trx_table_t*	copy = static_cast<fts_trx_table_t*>(
+		mem_heap_alloc(ftt_src->fts_trx->heap,
+			       sizeof(fts_trx_table_t)));
+
+	memset(copy, 0x0, sizeof(fts_trx_table_t));
+
+	copy->table = ftt_src->table;
+	copy->fts_trx = ftt_src->fts_trx;
+	copy->rows = rbt_create(
+		sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+	/* Copy the rb tree values to the new savepoint. */
+	rbt_merge_uniq(ftt_src->rows, copy->rows);
+
+	/* These are only added on commit. At this stage we only have
+	the updated row state. */
+	ut_a(ftt_src->added_doc_ids == NULL);
+
+	return(copy);
+}
+
+/******************************************************************//**
+Get the FTS trx table of the given table from the last savepoint in the
+vector, creating and registering it there if it does not exist yet.
+@return FTS trx table instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+	trx_t*			trx,		/*!< in: transaction */
+	dict_table_t*		table,		/*!< in: FTS table instance */
+	ib_vector_t*		savepoints)	/*!< in: Savepoints */
+{
+	fts_savepoint_t*	latest = static_cast<fts_savepoint_t*>(
+		ib_vector_last(savepoints));
+	ib_rbt_t*		tables = latest->tables;
+	ib_rbt_bound_t		where;
+	fts_trx_table_t*	ftt;
+
+	/* The tree is keyed by table id. */
+	rbt_search_cmp(tables, &where, &table->id, fts_trx_table_id_cmp, NULL);
+
+	if (where.result != 0) {
+		/* Not registered yet. */
+		ftt = fts_trx_table_create(trx->fts_trx, table);
+		rbt_add_node(tables, &where, &ftt);
+	} else {
+		fts_trx_table_t**	fttp;
+
+		fttp = rbt_value(fts_trx_table_t*, where.last);
+		ftt = *fttp;
+	}
+
+	ut_a(ftt->table == table);
+
+	return(ftt);
+}
+
+/******************************************************************//**
+Record an operation on one row (doc id) in the given FTS trx table.
+If the doc id is already present, its state is combined with the new
+event via fts_trx_row_get_new_state(); a row whose state becomes
+FTS_NOTHING is removed from the tree. Otherwise a new row is added with
+the given state and index vector. */
+static
+void
+fts_trx_table_add_op(
+/*=================*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	doc_id_t	doc_id,			/*!< in: doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected */
+{
+	ib_rbt_t*	rows;
+	ib_rbt_bound_t	parent;
+
+	rows = ftt->rows;
+	rbt_search(rows, &parent, &doc_id);
+
+	/* Row id found, update state, and if new state is FTS_NOTHING,
+	we delete the row from our tree. */
+	if (parent.result == 0) {
+		fts_trx_row_t*	row = rbt_value(fts_trx_row_t, parent.last);
+
+		row->state = fts_trx_row_get_new_state(row->state, state);
+
+		if (row->state == FTS_NOTHING) {
+			/* Free the index vector first, the node holding
+			the pointer to it goes away right after. */
+			if (row->fts_indexes) {
+				ib_vector_free(row->fts_indexes);
+			}
+
+			ut_free(rbt_remove_node(rows, parent.last));
+			row = NULL;
+		} else if (row->fts_indexes != NULL) {
+			/* Replace the old index vector with the new one.
+			A row whose fts_indexes is NULL is left as it is,
+			i.e. the passed vector is not stored in that case. */
+			ib_vector_free(row->fts_indexes);
+			row->fts_indexes = fts_indexes;
+		}
+
+	} else { /* Row-id not found, create a new one. */
+		fts_trx_row_t	row;
+
+		row.doc_id = doc_id;
+		row.state = state;
+		row.fts_indexes = fts_indexes;
+
+		rbt_add_node(rows, &parent, &row);
+	}
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. The
+FTS trx of the transaction is created on first use, and the operation is
+recorded both in the last savepoint of the transaction and in the last
+statement savepoint. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,			/*!< in: InnoDB transaction */
+	dict_table_t*	table,			/*!< in: table */
+	doc_id_t	doc_id,			/*!< in: new doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected
+						(NULL=all) */
+{
+	if (trx->fts_trx == NULL) {
+		trx->fts_trx = fts_trx_create(trx);
+	}
+
+	/* Look up (or create) the per-table entries on the transaction
+	level and on the statement level. */
+	fts_trx_table_t*	trx_level = fts_trx_init(
+		trx, table, trx->fts_trx->savepoints);
+	fts_trx_table_t*	stmt_level = fts_trx_init(
+		trx, table, trx->fts_trx->last_stmt);
+
+	/* The same index vector is handed to both entries. */
+	fts_trx_table_add_op(trx_level, doc_id, state, fts_indexes);
+	fts_trx_table_add_op(stmt_level, doc_id, state, fts_indexes);
+}
+
+/******************************************************************//**
+Fetch callback that parses a document id stored as text and writes the
+resulting value to the doc_id_t that user_arg points to.
+@return always FALSE */
+static
+ibool
+fts_fetch_store_doc_id(
+/*===================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in: doc_id_t* to store
+						doc_id in */
+{
+	sel_node_t*	sel = static_cast<sel_node_t*>(row);
+	dfield_t*	field = que_node_get_val(sel->select_list);
+	dtype_t*	type = dfield_get_type(field);
+	ulint		len = dfield_get_len(field);
+	char		buf[32];
+
+	/* Only a non-empty VARCHAR that fits in buf together with the
+	terminating NUL is accepted. */
+	ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+	ut_a(len > 0 && len < sizeof(buf));
+
+	memcpy(buf, dfield_get_data(field), len);
+	buf[len] = '\0';
+
+	int	n_parsed = sscanf(
+		buf, FTS_DOC_ID_FORMAT, static_cast<doc_id_t*>(user_arg));
+	ut_a(n_parsed == 1);
+
+	return(FALSE);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/******************************************************************//**
+Get the max cache size in bytes from the config table, clamped to the
+lower and upper limits. If reading the value fails, an error message is
+printed and the lower limit is used.
+@return max cache size in bytes */
+static
+ulint
+fts_get_max_cache_size(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: table instance */
+{
+	dberr_t		error;
+	fts_string_t	value;
+	ulint		cache_size_in_mb;
+
+	/* Set to the default value. */
+	cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+
+	/* f_len is the max bytes f_str can hold; the callback that reads
+	the value uses it. ut_malloc() returns void*, hence the cast. */
+	value.f_n_char = 0;
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	error = fts_config_get_value(
+		trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+	if (error == DB_SUCCESS) {
+
+		value.f_str[value.f_len] = 0;
+		cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+		if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Warning: FTS max cache size "
+				" (%lu) out of range. Minimum value is "
+				"%luMB and the maximum values is %luMB, "
+				"setting cache size to upper limit\n",
+				cache_size_in_mb,
+				FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+				FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+			cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+		} else if  (cache_size_in_mb
+			    < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Warning: FTS max cache size "
+				" (%lu) out of range. Minimum value is "
+				"%luMB and the maximum values is %luMB, "
+				"setting cache size to lower limit\n",
+				cache_size_in_mb,
+				FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+				FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+			cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+		}
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: Error: (%s) reading max cache "
+			"config value from config table\n", ut_strerr(error));
+	}
+
+	ut_free(value.f_str);
+
+	return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Read the FTS_TOTAL_WORD_COUNT config value of an FTS index into *total.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: for this index */
+	ulint*		total)			/*!< out: total words */
+{
+	fts_string_t	buf;
+
+	*total = 0;
+
+	/* f_len tells the callback that reads the value how many bytes
+	f_str can hold; one more byte is allocated for the terminator. */
+	buf.f_n_char = 0;
+	buf.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	buf.f_str = static_cast<byte*>(ut_malloc(buf.f_len + 1));
+
+	const dberr_t	err = fts_config_get_index_value(
+		trx, index, FTS_TOTAL_WORD_COUNT, &buf);
+
+	if (err != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%s) reading total words "
+			"value from config table\n", ut_strerr(err));
+	} else {
+		/* Terminate at f_len, then parse as a decimal number. */
+		buf.f_str[buf.f_len] = 0;
+		*total = strtoul(
+			reinterpret_cast<char*>(buf.f_str), NULL, 10);
+	}
+
+	ut_free(buf.f_str);
+
+	return(err);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Set the cached synced Doc ID to "doc_id" and the cached next and first
+Doc ID to "doc_id" + 1, then pass the synced value on to
+fts_update_sync_doc_id(), whose result is not reported to the caller. */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+	trx_t*			trx,		/*!< in/out: transaction */
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name, or NULL */
+	doc_id_t		doc_id)		/*!< in: DOC ID to set */
+{
+	fts_cache_t*	cache = table->fts->cache;
+
+	/* first_doc_id is set to the same value as next_doc_id. */
+	cache->synced_doc_id = doc_id;
+	cache->next_doc_id = doc_id + 1;
+	cache->first_doc_id = cache->next_doc_id;
+
+	fts_update_sync_doc_id(table, table_name, cache->synced_doc_id, trx);
+}
+
+/*********************************************************************//**
+Hand out the next document id from the FTS cache of a table.
+@return always DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,		/*!< in: table */
+	doc_id_t*		doc_id)		/*!< out: new document id */
+{
+	fts_cache_t*	cache = table->fts->cache;
+
+	/* fts_init_doc_id() is only called while first_doc_id is still 0.
+	When it is called and returns nonzero, the current next_doc_id
+	is handed out without being incremented. */
+	const bool	take_current = cache->first_doc_id == 0
+		&& fts_init_doc_id(table);
+
+	if (!take_current
+	    && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+		*doc_id = FTS_NULL_DOC_ID;
+		return(DB_SUCCESS);
+	}
+
+	mutex_enter(&cache->doc_id_lock);
+
+	if (!take_current) {
+		++cache->next_doc_id;
+	}
+	*doc_id = cache->next_doc_id;
+	mutex_exit(&cache->doc_id_lock);
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Fetch the Doc ID from the CONFIG table and compare it with the Doc ID
+supplied; the larger one is stored to the CONFIG table. A deadlock is retried.
+@return DB_SUCCESS if OK */
+static __attribute__((nonnull))
+dberr_t
+fts_cmp_set_sync_doc_id(
+/*====================*/
+	const dict_table_t*	table,		/*!< in: table */
+	doc_id_t		doc_id_cmp,	/*!< in: Doc ID to compare */
+	ibool			read_only,	/*!< in: TRUE if read the
+						synced_doc_id only */
+	doc_id_t*		doc_id)		/*!< out: larger document id
+						after comparing "doc_id_cmp"
+						to the one stored in CONFIG
+						table */
+{
+	trx_t*		trx;
+	pars_info_t*	info;
+	dberr_t		error;
+	fts_table_t	fts_table;
+	que_t*		graph = NULL;
+	fts_cache_t*	cache = table->fts->cache;
+retry:
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	fts_table.suffix = "CONFIG";
+	fts_table.table_id = table->id;
+	fts_table.type = FTS_COMMON_TABLE;
+	fts_table.table = table;
+
+	fts_table.parent = table->name;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "update the next FTS document id";
+
+	info = pars_info_create();
+
+	pars_info_bind_function(
+		info, "my_func", fts_fetch_store_doc_id, doc_id);
+
+	graph = fts_parse_sql(
+		&fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM \"%s\""
+		" WHERE key = 'synced_doc_id' FOR UPDATE;\n"
+		"BEGIN\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	*doc_id = 0;
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	if (read_only) {
+		goto func_exit;
+	}
+
+	if (doc_id_cmp == 0 && *doc_id) {
+		cache->synced_doc_id = *doc_id - 1;
+	} else {
+		cache->synced_doc_id = ut_max(doc_id_cmp, *doc_id);
+	}
+
+	mutex_enter(&cache->doc_id_lock);
+	/* For each sync operation, we will add next_doc_id by 1,
+	so to mark a sync operation */
+	if (cache->next_doc_id < cache->synced_doc_id + 1) {
+		cache->next_doc_id = cache->synced_doc_id + 1;
+	}
+	mutex_exit(&cache->doc_id_lock);
+
+	if (doc_id_cmp > *doc_id) {
+		error = fts_update_sync_doc_id(
+			table, table->name, cache->synced_doc_id, trx);
+	}
+
+	*doc_id = cache->next_doc_id;
+
+func_exit:
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		*doc_id = 0;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%s) "
+			"while getting next doc id.\n", ut_strerr(error));
+
+		fts_sql_rollback(trx);
+
+		if (error == DB_DEADLOCK) {
+			/* Each pass allocates its own trx after retry:, so
+			free this one first; otherwise it would be leaked. */
+			trx_free_for_background(trx);
+			os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT);
+			goto retry;
+		}
+	}
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write doc_id + 1 as the 'synced_doc_id' value of the CONFIG table. With
+a NULL trx the update runs, and is committed, in its own transaction.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name, or NULL */
+	doc_id_t		doc_id,		/*!< in: last document id */
+	trx_t*			trx)		/*!< in: update trx, or NULL */
+{
+	byte		id[FTS_MAX_ID_LEN];
+	fts_table_t	fts_table;
+	fts_cache_t*	cache = table->fts->cache;
+	const bool	own_trx = (trx == NULL);
+
+	/* Describe the CONFIG table to update; without a table_name the
+	name stored in the table object is used. */
+	fts_table.suffix = "CONFIG";
+	fts_table.table_id = table->id;
+	fts_table.type = FTS_COMMON_TABLE;
+	fts_table.table = table;
+	fts_table.parent = table_name ? table_name : table->name;
+
+	if (own_trx) {
+		/* No transaction was passed in: use a background one,
+		which is finished and freed before returning. */
+		trx = trx_allocate_for_background();
+
+		trx->op_info = "setting last FTS document id";
+	}
+
+	pars_info_t*	info = pars_info_create();
+
+	/* The value written is doc_id + 1, formatted as text. */
+	ulint		id_len = ut_snprintf(
+		(char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1);
+
+	pars_info_bind_varchar_literal(info, "doc_id", id, id_len);
+
+	que_t*		graph = fts_parse_sql(
+		&fts_table, info,
+		"BEGIN "
+		"UPDATE %s SET value = :doc_id"
+		" WHERE key = 'synced_doc_id';");
+
+	dberr_t		error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+	if (!own_trx) {
+		/* The caller's transaction is left for the caller to end. */
+		return(error);
+	}
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+		/* The cached value is updated only after the commit. */
+		cache->synced_doc_id = doc_id;
+	} else {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"(%s) while updating last doc id.",
+			ut_strerr(error));
+
+		fts_sql_rollback(trx);
+	}
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create a new fts_doc_ids_t together with the heap it is allocated from.
+@return new fts_doc_ids_t */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void)
+/*====================*/
+{
+	/* The object is carved out of its own heap; the same heap backs
+	the allocator that the doc id vector is created with. */
+	mem_heap_t*	heap = mem_heap_create(512);
+	fts_doc_ids_t*	ids = static_cast<fts_doc_ids_t*>(
+		mem_heap_alloc(heap, sizeof(fts_doc_ids_t)));
+
+	ids->self_heap = ib_heap_allocator_create(heap);
+
+	ids->doc_ids = static_cast<ib_vector_t*>(ib_vector_create(
+		ids->self_heap, sizeof(fts_update_t), 32));
+
+	return(ids);
+}
+
+/*********************************************************************//**
+Free a fts_doc_ids_t by freeing the heap behind its self_heap allocator. */
+
+void
+fts_doc_ids_free(
+/*=============*/
+	fts_doc_ids_t*	fts_doc_ids)
+{
+	/* Fetch the heap pointer before the struct is wiped. */
+	void*	arg = fts_doc_ids->self_heap->arg;
+
+	memset(fts_doc_ids, 0, sizeof(fts_doc_ids_t));
+
+	mem_heap_free(static_cast<mem_heap_t*>(arg));
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the insertion of a new row.
+@return DB_SUCCESS (no other value is produced here) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_add(
+/*====*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	fts_trx_row_t*	row)			/*!< in: row */
+{
+	const doc_id_t	doc_id = row->doc_id;
+	dict_table_t*	table = ftt->table;
+	fts_cache_t*	cache = table->fts->cache;
+
+	ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY);
+
+	/* The return value of fts_add_doc_by_id() is not checked. */
+	fts_add_doc_by_id(ftt, doc_id, row->fts_indexes);
+
+	mutex_enter(&cache->deleted_lock);
+	++cache->added;
+	mutex_exit(&cache->deleted_lock);
+
+	/* Without the HAS_DOC_ID flag, keep next_doc_id above doc_id. */
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+	    && doc_id >= cache->next_doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the deletion of a row.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_delete(
+/*=======*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	fts_trx_row_t*	row)			/*!< in: row */
+{
+	que_t*		graph;
+	fts_table_t	fts_table;
+	dberr_t		error = DB_SUCCESS;
+	doc_id_t	write_doc_id;
+	dict_table_t*	table = ftt->table;
+	doc_id_t	doc_id = row->doc_id;
+	trx_t*		trx = ftt->fts_trx->trx;
+	pars_info_t*	info;
+	fts_cache_t*	cache = table->fts->cache;
+
+	/* we do not index Documents whose Doc ID value is 0 */
+	if (doc_id == FTS_NULL_DOC_ID) {
+		ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID));
+		return(error);
+	}
+
+	ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+
+	FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table);
+
+	/* Allocated only past the early return above, so that it cannot
+	leak. Convert the doc id to "storage" byte order and bind it. */
+	info = pars_info_create();
+	fts_write_doc_id((byte*) &write_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+	/* It is possible we update a record that has not yet been sync-ed
+	into cache from last crash (delete Doc will not initialize the
+	sync). Avoid any added counter accounting until the FTS cache
+	is re-established and sync-ed */
+	if (table->fts->fts_status & ADDED_TABLE_SYNCED
+	    && doc_id > cache->synced_doc_id) {
+		mutex_enter(&table->fts->cache->deleted_lock);
+
+		/* The Doc ID could belong to those left in
+		ADDED table from last crash. So need to check
+		if it is less than first_doc_id when we initialize
+		the Doc ID system after reboot */
+		if (doc_id >= table->fts->cache->first_doc_id
+		    && table->fts->cache->added > 0) {
+			--table->fts->cache->added;
+		}
+
+		mutex_exit(&table->fts->cache->deleted_lock);
+
+		/* Only if the row was really deleted. */
+		ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+	}
+
+	/* Note the deleted document for OPTIMIZE to purge. */
+	if (error == DB_SUCCESS) {
+
+		trx->op_info = "adding doc id to FTS DELETED";
+		info->graph_owns_us = TRUE;
+		fts_table.suffix = "DELETED";
+
+		graph = fts_parse_sql(
+			&fts_table,
+			info,
+			"BEGIN INSERT INTO \"%s\" VALUES (:doc_id);");
+
+		error = fts_eval_sql(trx, graph);
+
+		fts_que_graph_free(graph);
+	} else {
+		pars_info_free(info);
+	}
+
+	/* Increment the total deleted count, this is used to calculate the
+	number of documents indexed. */
+	if (error == DB_SUCCESS) {
+		mutex_enter(&table->fts->cache->deleted_lock);
+
+		++table->fts->cache->deleted;
+
+		mutex_exit(&table->fts->cache->deleted_lock);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the modification of a row.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_modify(
+/*=======*/
+	fts_trx_table_t*	ftt,		/*!< in: FTS trx table */
+	fts_trx_row_t*		row)		/*!< in: row */
+{
+	ut_a(row->state == FTS_MODIFY);
+
+	/* Handled as fts_delete() followed by fts_add() on the same row;
+	the add is skipped if the delete failed. */
+	const dberr_t	del_err = fts_delete(ftt, row);
+
+	if (del_err != DB_SUCCESS) {
+		return(del_err);
+	}
+
+	return(fts_add(ftt, row));
+}
+
+/*********************************************************************//**
+Create a new document id and, for tables with DICT_TF2_FTS_HAS_DOC_ID
+set, store it in the Doc ID field of the row.
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+dberr_t
+fts_create_doc_id(
+/*==============*/
+	dict_table_t*	table,		/*!< in: row is of this table. */
+	dtuple_t*	row,		/*!< in/out: add doc id value to this
+					row. This is the current row that is
+					being inserted. */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	doc_id_t	doc_id;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+		/* The row is not touched in this case; a doc id is only
+		requested, and discarded, while first_doc_id is unset. */
+		if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+			return(DB_SUCCESS);
+		}
+		return(fts_get_next_doc_id(table, &doc_id));
+	}
+
+	const dberr_t	error = fts_get_next_doc_id(table, &doc_id);
+
+	if (error != DB_SUCCESS) {
+		return(error);
+	}
+
+	ut_a(doc_id > 0);
+
+	dfield_t*	dfield = dtuple_get_nth_field(
+		row, table->fts->doc_col);
+	doc_id_t*	stored = static_cast<doc_id_t*>(
+		mem_heap_alloc(heap, sizeof(doc_id_t)));
+
+	ut_a(doc_id != FTS_NULL_DOC_ID);
+	ut_a(sizeof(doc_id) == dfield->type.len);
+	fts_write_doc_id((byte*) stored, doc_id);
+	dfield_set_data(dfield, stored, sizeof(doc_id_t));
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Apply the row operations recorded for one FTS table, using a background
+transaction that is allocated, committed and freed here.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_commit_table(
+/*=============*/
+	fts_trx_table_t*	ftt)		/*!< in: FTS table to commit*/
+{
+	dberr_t			error = DB_SUCCESS;
+	ib_rbt_t*		rows = ftt->rows;
+	fts_cache_t*		cache = ftt->table->fts->cache;
+	trx_t*			trx = trx_allocate_for_background();
+
+	/* fts_delete() takes its transaction from ftt->fts_trx->trx. */
+	ftt->fts_trx->trx = trx;
+
+	/* Create get_docs on first use; the test is repeated once
+	init_lock is held. */
+	if (cache->get_docs == NULL) {
+		rw_lock_x_lock(&cache->init_lock);
+		if (cache->get_docs == NULL) {
+			cache->get_docs = fts_get_docs_create(cache);
+		}
+		rw_lock_x_unlock(&cache->init_lock);
+	}
+
+	const ib_rbt_node_t*	node = rbt_first(rows);
+
+	while (node != NULL && error == DB_SUCCESS) {
+		fts_trx_row_t*	row = rbt_value(fts_trx_row_t, node);
+
+		switch (row->state) {
+		case FTS_INSERT:
+			error = fts_add(ftt, row);
+			break;
+		case FTS_MODIFY:
+			error = fts_modify(ftt, row);
+			break;
+		case FTS_DELETE:
+			error = fts_delete(ftt, row);
+			break;
+		default:
+			ut_error;
+		}
+
+		node = rbt_next(rows, node);
+	}
+
+	/* The background transaction is committed and freed whether or
+	not a row handler reported an error. */
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; run fts_commit_table()
+for each FTS table it has recorded.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_commit(
+/*=======*/
+	trx_t*	trx)				/*!< in: transaction */
+{
+	dberr_t			error = DB_SUCCESS;
+
+	/* Only the tables recorded in the last element of the savepoints
+	vector are committed here. */
+	fts_savepoint_t*	last = static_cast<fts_savepoint_t*>(
+		ib_vector_last(trx->fts_trx->savepoints));
+	ib_rbt_t*		tables = last->tables;
+	const ib_rbt_node_t*	node = rbt_first(tables);
+
+	/* Commit one table at a time; the first failure ends the walk
+	and is returned to the caller. */
+	while (node != NULL && error == DB_SUCCESS) {
+		fts_trx_table_t*	ftt;
+
+		ftt = *rbt_value(fts_trx_table_t*, node);
+		error = fts_commit_table(ftt);
+
+		node = rbt_next(tables, node);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Initialize a document: zero it and give it a heap allocator of its own. */
+UNIV_INTERN
+void
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)			/*!< in: doc to initialize */
+{
+	/* Start from an all-zero struct, then attach a heap allocator. */
+	memset(doc, 0, sizeof(fts_doc_t));
+
+	mem_heap_t*	heap = mem_heap_create(32);
+	doc->self_heap = ib_heap_allocator_create(heap);
+}
+
+/*********************************************************************//**
+Free document: its token tree, if any, and the heap behind self_heap. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)			/*!< in: document */
+{
+	/* Save the heap first: debug builds wipe the struct below. */
+	void*	arg = doc->self_heap->arg;
+
+	if (doc->tokens != NULL) {
+		rbt_free(doc->tokens);
+	}
+
+#ifdef UNIV_DEBUG
+	memset(doc, 0, sizeof(fts_doc_t));
+#endif /* UNIV_DEBUG */
+	mem_heap_free(static_cast<mem_heap_t*>(arg));
+}
+
+/*********************************************************************//**
+Fetch callback that copies an 8-byte row id to the location user_arg
+points to. The column must be DATA_FIXBINARY, DATA_BINARY_TYPE, length 8.
+@return always returns NULL */
+UNIV_INTERN
+void*
+fts_fetch_row_id(
+/*=============*/
+	void*	row,				/*!< in: sel_node_t* */
+	void*	user_arg)			/*!< in: data pointer */
+{
+	sel_node_t*	sel = static_cast<sel_node_t*>(row);
+	dfield_t*	field = que_node_get_val(sel->select_list);
+	dtype_t*	type = dfield_get_type(field);
+	ulint		len = dfield_get_len(field);
+
+	/* Only an 8-byte DATA_FIXBINARY value flagged DATA_BINARY_TYPE
+	is accepted. */
+	ut_a(dtype_get_mtype(type) == DATA_FIXBINARY);
+	ut_a(dtype_get_prtype(type) & DATA_BINARY_TYPE);
+	ut_a(len == 8);
+
+	memcpy(user_arg, dfield_get_data(field), 8);
+	return(NULL);
+}
+
+/*********************************************************************//**
+Fetch callback that hands every non-NULL, non-external column of the
+selected row to the tokenizer, together with the fts_doc_t in user_arg.
+@return always FALSE */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in: fts_doc_t* */
+{
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_doc_t*	result_doc = static_cast<fts_doc_t*>(user_arg);
+	dfield_t*	dfield;
+	ulint		len;
+	ulint		doc_len;
+	fts_doc_t	doc;
+	CHARSET_INFO*	doc_charset = NULL;
+	ulint		field_no = 0;
+
+	len = 0;
+
+	fts_doc_init(&doc);
+	doc.found = TRUE;
+
+	exp = node->select_list;
+	doc_len = 0;
+
+	doc_charset  = result_doc->charset;
+
+	/* Point doc.text at each selected column in turn and tokenize it */
+	while (exp) {
+		dfield = que_node_get_val(exp);
+		len = dfield_get_len(dfield);
+
+		/* NULL column: nothing to tokenize */
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		if (!doc_charset) {
+			ulint   prtype = dfield->type.prtype;
+			doc_charset = innobase_get_fts_charset(
+					(int)(prtype & DATA_MYSQL_TYPE_MASK),
+					(uint) dtype_get_charset_coll(prtype));
+		}
+
+		doc.charset = doc_charset;
+
+		if (dfield_is_ext(dfield)) {
+			/* We ignore columns that are stored externally, this
+			could result in too many words to search */
+			exp = que_node_get_next(exp);
+			continue;
+		} else {
+			doc.text.f_n_char = 0;
+
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 0) {
+			fts_tokenize_document(&doc, result_doc);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, result_doc);
+		}
+
+		exp = que_node_get_next(exp);
+		/* One extra byte is counted unless this was the last column */
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	ut_ad(doc_charset);
+
+	if (!result_doc->charset) {
+		result_doc->charset = doc_charset;
+	}
+
+	fts_doc_free(&doc);
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Tokenize the columns of an FTS index from a clustered index record. */
+static
+void
+fts_fetch_doc_from_rec(
+/*===================*/
+	fts_get_doc_t*  get_doc,	/*!< in: FTS index's get_doc struct */
+	dict_index_t*	clust_index,	/*!< in: cluster index */
+	btr_pcur_t*	pcur,		/*!< in: cursor whose position
+					has been stored */
+	ulint*		offsets,	/*!< in: offsets */
+	fts_doc_t*	doc)		/*!< out: fts doc to hold parsed
+					documents */
+{
+	dict_index_t*		index;
+	dict_table_t*		table;
+	const rec_t*		clust_rec;
+	ulint			num_field;
+	const dict_field_t*	ifield;
+	const dict_col_t*	col;
+	ulint			clust_pos;
+	ulint			i;
+	ulint			doc_len = 0;
+	ulint			processed_doc = 0;
+
+	if (!get_doc) {
+		return;
+	}
+
+	index = get_doc->index_cache->index;
+	table = get_doc->index_cache->index->table;
+
+	clust_rec = btr_pcur_get_rec(pcur);
+
+	num_field = dict_index_get_n_fields(index);
+
+	for (i = 0; i < num_field; i++) {
+		ifield = dict_index_get_nth_field(index, i);
+		col = dict_field_get_col(ifield);
+		clust_pos = dict_col_get_clust_pos(col, clust_index);
+		/* The charset is looked up once and kept in index_cache */
+		if (!get_doc->index_cache->charset) {
+			ulint   prtype = ifield->col->prtype;
+
+			get_doc->index_cache->charset =
+				innobase_get_fts_charset(
+					(int) (prtype & DATA_MYSQL_TYPE_MASK),
+					(uint) dtype_get_charset_coll(prtype));
+		}
+
+		if (rec_offs_nth_extern(offsets, clust_pos)) {
+			doc->text.f_str =
+				btr_rec_copy_externally_stored_field(
+					clust_rec, offsets,
+					dict_table_zip_size(table),
+					clust_pos, &doc->text.f_len,
+					static_cast<mem_heap_t*>(
+						doc->self_heap->arg));
+		} else {
+			doc->text.f_str = (byte*) rec_get_nth_field(
+				clust_rec, offsets, clust_pos,
+				&doc->text.f_len);
+		}
+
+		doc->found = TRUE;
+		doc->charset = get_doc->index_cache->charset;
+
+		/* NULL field: not tokenized and not counted in doc_len */
+		if (doc->text.f_len == UNIV_SQL_NULL) {
+			continue;
+		}
+
+		if (processed_doc == 0) {
+			fts_tokenize_document(doc, NULL);
+		} else {
+			fts_tokenize_document_next(doc, doc_len, NULL);
+		}
+
+		processed_doc++;
+		doc_len += doc->text.f_len + 1;
+	}
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, and tokenize the inserted text data and insert into
+FTS auxiliary table and its cache.
+@return always TRUE */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+	fts_trx_table_t*ftt,		/*!< in: FTS trx table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	fts_indexes __attribute__((unused)))
+					/*!< in: affected fts indexes */
+{
+	mtr_t		mtr;
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	dict_table_t*	table;
+	dtuple_t*	tuple;
+	dfield_t*       dfield;
+	fts_get_doc_t*	get_doc;
+	doc_id_t        temp_doc_id;
+	dict_index_t*   clust_index;
+	dict_index_t*	fts_id_index;
+	ibool		is_id_cluster;
+	fts_cache_t*   	cache = ftt->table->fts->cache;
+
+	ut_ad(cache->get_docs);
+
+	/* If Doc ID has been supplied by the user, then the table
+	might not yet be sync-ed */
+
+	if (!(ftt->table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+		fts_init_index(ftt->table, FALSE);
+	}
+
+	/* Get the first FTS index's get_doc */
+	get_doc = static_cast<fts_get_doc_t*>(
+		ib_vector_get(cache->get_docs, 0));
+	ut_ad(get_doc);
+
+	table = get_doc->index_cache->index->table;
+
+	heap = mem_heap_create(512);
+
+	clust_index = dict_table_get_first_index(table);
+	fts_id_index = dict_table_get_index_on_name(
+				table, FTS_DOC_ID_INDEX_NAME);
+
+	/* Check whether the index on FTS_DOC_ID is cluster index */
+	is_id_cluster = (clust_index == fts_id_index);
+
+	mtr_start(&mtr);
+	btr_pcur_init(&pcur);
+
+	/* Search based on Doc ID. Here, we'll need to consider the case
+	when there is no primary index on Doc ID */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+	dfield->type.mtype = DATA_INT;
+	dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE;
+
+	mach_write_to_8((byte*) &temp_doc_id, doc_id);
+	dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id));
+
+	btr_pcur_open_with_no_init(
+		fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+		&pcur, 0, &mtr);
+
+	/* If we have a match, add the data to doc structure */
+	if (btr_pcur_get_low_match(&pcur) == 1) {
+		const rec_t*	rec;
+		btr_pcur_t*	doc_pcur;
+		const rec_t*	clust_rec;
+		btr_pcur_t	clust_pcur;
+		ulint*		offsets = NULL;
+		ulint		num_idx = ib_vector_size(cache->get_docs);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* Doc could be deleted */
+		if (page_rec_is_infimum(rec)
+		    || rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+
+			goto func_exit;
+		}
+
+		if (is_id_cluster) {
+			clust_rec = rec;
+			doc_pcur = &pcur;
+		} else {
+			dtuple_t*	clust_ref;
+			ulint		n_fields;
+
+			btr_pcur_init(&clust_pcur);
+			n_fields = dict_index_get_n_unique(clust_index);
+
+			clust_ref = dtuple_create(heap, n_fields);
+			dict_index_copy_types(clust_ref, clust_index, n_fields);
+
+			row_build_row_ref_in_tuple(
+				clust_ref, rec, fts_id_index, NULL, NULL);
+
+			btr_pcur_open_with_no_init(
+				clust_index, clust_ref, PAGE_CUR_LE,
+				BTR_SEARCH_LEAF, &clust_pcur, 0, &mtr);
+
+			doc_pcur = &clust_pcur;
+			clust_rec = btr_pcur_get_rec(&clust_pcur);
+
+		}
+
+		offsets = rec_get_offsets(clust_rec, clust_index,
+					  NULL, ULINT_UNDEFINED, &heap);
+
+		 for (ulint i = 0; i < num_idx; ++i) {
+			fts_doc_t       doc;
+			dict_table_t*   table;
+			fts_get_doc_t*  get_doc;
+			/* NOTE: table and get_doc shadow the outer ones. */
+			get_doc = static_cast<fts_get_doc_t*>(
+				ib_vector_get(cache->get_docs, i));
+
+			table = get_doc->index_cache->index->table;
+
+			fts_doc_init(&doc);
+
+			fts_fetch_doc_from_rec(
+				get_doc, clust_index, doc_pcur, offsets, &doc);
+
+			if (doc.found) {
+				ibool	success __attribute__((unused));
+				/* Commit the mtr before x-locking the cache. */
+				btr_pcur_store_position(doc_pcur, &mtr);
+				mtr_commit(&mtr);
+
+				rw_lock_x_lock(&table->fts->cache->lock);
+
+				fts_cache_add_doc(
+					table->fts->cache,
+					get_doc->index_cache,
+					doc_id, doc.tokens);
+
+				rw_lock_x_unlock(&table->fts->cache->lock);
+
+				DBUG_EXECUTE_IF(
+					"fts_instrument_sync",
+					fts_sync(cache->sync);
+				);
+
+				if (cache->total_size > fts_max_cache_size
+				    || fts_need_sync) {
+					fts_sync(cache->sync);
+				}
+
+				mtr_start(&mtr);
+
+				if (i < num_idx - 1) {
+					/* More indexes follow: restore. */
+					success = btr_pcur_restore_position(
+						BTR_SEARCH_LEAF, doc_pcur,
+						&mtr);
+
+					ut_ad(success);
+				}
+			}
+
+			fts_doc_free(&doc);
+		}
+
+		if (!is_id_cluster) {
+			btr_pcur_close(doc_pcur);
+		}
+	}
+func_exit:
+	mtr_commit(&mtr);
+
+	btr_pcur_close(&pcur);
+
+	mem_heap_free(heap);
+	return(TRUE);
+}
+
+
+/*********************************************************************//**
+Callback function to read a single ulint column.
+@return always TRUE */
+static
+ibool
+fts_read_ulint(
+/*===========*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ulint */
+{
+	sel_node_t*	sel = static_cast<sel_node_t*>(row);
+	dfield_t*	field = que_node_get_val(sel->select_list);
+	const byte*	raw = static_cast<const byte*>(
+		dfield_get_data(field));
+
+	/* The column is read as a 4-byte value and widened to ulint. */
+	ulint*		out = static_cast<ulint*>(user_arg);
+
+	*out = static_cast<ulint>(mach_read_from_4(raw));
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table)		/*!< in: user table */
+{
+	dict_index_t*	index;
+	dict_field_t*	dfield __attribute__((unused)) = NULL;
+	doc_id_t	doc_id = 0;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+
+	index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+
+	if (!index) {
+		return(0);
+	}
+
+	dfield = dict_index_get_nth_field(index, 0);
+
+#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */
+	ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+#endif
+
+	mtr_start(&mtr);
+
+	/* fetch the largest indexed value */
+	btr_pcur_open_at_index_side(
+		false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+	if (!page_is_empty(btr_pcur_get_page(&pcur))) {
+		const rec_t*    rec = NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		offsets = offsets_;
+		mem_heap_t*	heap = NULL;
+		ulint		len;
+		const void*	data;
+
+		rec_offs_init(offsets_);
+
+		do {
+			rec = btr_pcur_get_rec(&pcur);
+
+			if (page_rec_is_user_rec(rec)) {
+				break;
+			}
+		} while (btr_pcur_move_to_prev(&pcur, &mtr));
+		/* NOTE(review): rec is set on every pass; check looks dead. */
+		if (!rec) {
+			goto func_exit;
+		}
+
+		offsets = rec_get_offsets(
+			rec, index, offsets, ULINT_UNDEFINED, &heap);
+		/* NOTE(review): heap, if set by this call, is not freed. */
+		data = rec_get_nth_field(rec, offsets, 0, &len);
+
+		doc_id = static_cast<doc_id_t>(fts_read_doc_id(
+			static_cast<const byte*>(data)));
+	}
+
+func_exit:
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+UNIV_INTERN
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in/out: state */
+	doc_id_t	doc_id,		/*!< in: id of document to
+					fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index,
+					or NULL */
+	ulint		option,		/*!< in: FTS_FETCH_DOC_BY_ID_EQUAL
+					or FTS_FETCH_DOC_BY_ID_LARGE */
+	fts_sql_callback
+			callback,	/*!< in: callback to read */
+	void*		arg)		/*!< in: callback arg */
+{
+	pars_info_t*	info;
+	dberr_t		error;
+	const char*	select_str;
+	doc_id_t	write_doc_id;
+	dict_index_t*	index;
+	trx_t*		trx = trx_allocate_for_background();
+	que_t*          graph;
+
+	trx->op_info = "fetching indexed FTS document";
+
+	/* The FTS index can be supplied by caller directly with
+	"index_to_use", otherwise "get_doc" must be non-NULL to provide it */
+	index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+	if (get_doc && get_doc->get_document_graph) {
+		info = get_doc->get_document_graph->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &write_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &write_doc_id);
+	pars_info_bind_function(info, "my_func", callback, arg);
+
+	select_str = fts_get_select_columns_str(index, info, info->heap);
+	pars_info_bind_id(info, TRUE, "table_name", index->table_name);
+
+	if (!get_doc || !get_doc->get_document_graph) {
+		if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+			graph = fts_parse_sql(
+				NULL,
+				info,
+				mem_heap_printf(info->heap,
+					"DECLARE FUNCTION my_func;\n"
+					"DECLARE CURSOR c IS"
+					" SELECT %s FROM $table_name"
+					" WHERE %s = :doc_id;\n"
+					"BEGIN\n"
+					""
+					"OPEN c;\n"
+					"WHILE 1 = 1 LOOP\n"
+					"  FETCH c INTO my_func();\n"
+					"  IF c %% NOTFOUND THEN\n"
+					"    EXIT;\n"
+					"  END IF;\n"
+					"END LOOP;\n"
+					"CLOSE c;",
+					select_str, FTS_DOC_ID_COL_NAME));
+		} else {
+			ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+			/* This is used for crash recovery of table with
+			hidden DOC ID or FTS indexes. We will scan the table
+			to re-process user table rows whose DOC ID or
+			FTS indexed documents have not been synced to disk
+			during a recent crash.
+			In the case that all fulltext indexes are dropped
+			for a table, we will keep the "hidden" FTS_DOC_ID
+			column, and this scan is to retrieve the largest
+			DOC ID being used in the table to determine the
+			appropriate next DOC ID.
+			In the case that fulltext index(es) exist, this
+			operation will re-tokenize any docs that have not
+			been synced to the disk, and re-prime the FTS
+			cache */
+			graph = fts_parse_sql(
+				NULL,
+				info,
+				mem_heap_printf(info->heap,
+					"DECLARE FUNCTION my_func;\n"
+					"DECLARE CURSOR c IS"
+					" SELECT %s, %s FROM $table_name"
+					" WHERE %s > :doc_id;\n"
+					"BEGIN\n"
+					""
+					"OPEN c;\n"
+					"WHILE 1 = 1 LOOP\n"
+					"  FETCH c INTO my_func();\n"
+					"  IF c %% NOTFOUND THEN\n"
+					"    EXIT;\n"
+					"  END IF;\n"
+					"END LOOP;\n"
+					"CLOSE c;",
+					FTS_DOC_ID_COL_NAME,
+					select_str, FTS_DOC_ID_COL_NAME));
+		}
+		if (get_doc) {
+			get_doc->get_document_graph = graph;
+		}
+	} else {
+		graph = get_doc->get_document_graph;
+	}
+
+	error = fts_eval_sql(trx, graph);
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		fts_sql_rollback(trx);
+	}
+
+	trx_free_for_background(trx);
+
+	if (!get_doc) {
+		fts_que_graph_free(graph);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fts_write_node(
+/*===========*/
+	trx_t*		trx,			/*!< in: transaction */
+	que_t**		graph,			/*!< in/out: query graph */
+	fts_table_t*	fts_table,		/*!< in: aux table */
+	fts_string_t*	word,			/*!< in: word in UTF-8 */
+	fts_node_t*	node)			/*!< in: node columns */
+{
+	pars_info_t*	info;
+	dberr_t		error;
+	ib_uint32_t	doc_count;
+	ib_time_t	start_time;
+	doc_id_t	last_doc_id;
+	doc_id_t	first_doc_id;
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+	fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+	fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+	ut_a(node->last_doc_id >= node->first_doc_id);
+
+	/* Convert to "storage" byte order. */
+	mach_write_to_4((byte*) &doc_count, node->doc_count);
+	pars_info_bind_int4_literal(
+		info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+	/* Bind the ilist bytes as a binary BLOB literal. */
+	pars_info_bind_literal(
+		info, "ilist", node->ilist, node->ilist_size,
+		DATA_BLOB, DATA_BINARY_TYPE);
+
+	if (!*graph) {
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"BEGIN\n"
+			"INSERT INTO \"%s\" VALUES "
+			"(:token, :first_doc_id,"
+			" :last_doc_id, :doc_count, :ilist);");
+	}
+
+	start_time = ut_time();
+	error = fts_eval_sql(trx, *graph);
+	elapsed_time += ut_time() - start_time;
+	++n_nodes;
+
+	return(error);
+}
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
+@return DB_SUCCESS if all went well else error code*/
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_add_deleted_cache(
+/*=======================*/
+	fts_sync_t*	sync,			/*!< in: sync state */
+	ib_vector_t*	doc_ids)		/*!< in: doc ids to add */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	doc_id_t	dummy = 0;
+	dberr_t		error = DB_SUCCESS;
+	ulint		n_elems = ib_vector_size(doc_ids);
+
+	ut_a(ib_vector_size(doc_ids) > 0);
+
+	ib_vector_sort(doc_ids, fts_update_doc_id_cmp);
+
+	info = pars_info_create();
+
+	fts_bind_doc_id(info, "doc_id", &dummy);
+
+	FTS_INIT_FTS_TABLE(
+		&fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table);
+
+	graph = fts_parse_sql(
+		&fts_table,
+		info,
+		"BEGIN INSERT INTO \"%s\" VALUES (:doc_id);");
+
+	for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) {
+		fts_update_t*	update;
+		doc_id_t	write_doc_id;
+
+		update = static_cast<fts_update_t*>(ib_vector_get(doc_ids, i));
+
+		/* Convert to "storage" byte order. */
+		fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+		fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+		error = fts_eval_sql(sync->trx, graph);
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write the words and ilist to disk.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_write_words(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_index_cache_t*
+			index_cache)		/*!< in: index cache */
+{
+	fts_table_t	fts_table;
+	ulint		n_nodes = 0;
+	ulint		n_words = 0;
+	const ib_rbt_node_t* rbt_node;
+	dberr_t		error = DB_SUCCESS;
+	ibool		print_error = FALSE;
+#ifdef FTS_DOC_STATS_DEBUG
+	dict_table_t*	table = index_cache->index->table;
+	ulint		n_new_words = 0;
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	FTS_INIT_INDEX_TABLE(
+		&fts_table, NULL, FTS_INDEX_TABLE, index_cache->index);
+
+	n_words = rbt_size(index_cache->words);
+
+	/* We iterate over the entire tree, even if there is an error,
+	since we want to free the memory used during caching. */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_first(index_cache->words)) {
+
+		ulint			i;
+		ulint			selected;
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		selected = fts_select_index(
+			index_cache->charset, word->text.f_str,
+			word->text.f_len);
+
+		fts_table.suffix = fts_get_suffix(selected);
+
+#ifdef FTS_DOC_STATS_DEBUG
+		/* Check if the word exists in the FTS index and if not
+		then we need to increment the total word count stats. */
+		if (error == DB_SUCCESS && fts_enable_diag_print) {
+			ibool	found = FALSE;
+
+			error = fts_is_word_in_index(
+				trx,
+				&index_cache->sel_graph[selected],
+				&fts_table,
+				&word->text, &found);
+
+			if (error == DB_SUCCESS && !found) {
+
+				++n_new_words;
+			}
+		}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+		n_nodes += ib_vector_size(word->nodes);
+
+		/* We iterate over all the nodes even if there was an error,
+		this is to free the memory of the fts_node_t elements. */
+		for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+			fts_node_t* fts_node = static_cast<fts_node_t*>(
+				ib_vector_get(word->nodes, i));
+
+			if (error == DB_SUCCESS) {
+
+				error = fts_write_node(
+					trx,
+					&index_cache->ins_graph[selected],
+					&fts_table, &word->text, fts_node);
+			}
+
+			ut_free(fts_node->ilist);
+			fts_node->ilist = NULL;
+		}
+
+		if (error != DB_SUCCESS && !print_error) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Error (%s) writing "
+				"word node to FTS auxiliary index "
+				"table.\n", ut_strerr(error));
+
+			print_error = TRUE;
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(index_cache->words, rbt_node));
+	}
+
+#ifdef FTS_DOC_STATS_DEBUG
+	if (error == DB_SUCCESS && n_new_words > 0 && fts_enable_diag_print) {
+		fts_table_t	fts_table;
+
+		FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+		/* Increment the total number of words in the FTS index */
+		error = fts_config_increment_index_value(
+			trx, index_cache->index, FTS_TOTAL_WORD_COUNT,
+			n_new_words);
+	}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	if (fts_enable_diag_print) {
+		printf("Avg number of nodes: %lf\n",
+		       (double) n_nodes / (double) (n_words > 1 ? n_words : 1));
+	}
+
+	return(error);
+}
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Write a single documents statistics to disk.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_write_doc_stat(
+/*====================*/
+	trx_t*			trx,		/*!< in: transaction */
+	dict_index_t*		index,		/*!< in: index */
+	que_t**			graph,		/* out: query graph */
+	const fts_doc_stats_t*	doc_stat)	/*!< in: doc stats to write */
+{
+	pars_info_t*	info;
+	doc_id_t	doc_id;
+	dberr_t		error = DB_SUCCESS;
+	ib_uint32_t	word_count;
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	/* Convert to "storage" byte order. */
+	mach_write_to_4((byte*) &word_count, doc_stat->word_count);
+	pars_info_bind_int4_literal(
+		info, "count", (const ib_uint32_t*) &word_count);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &doc_id, doc_stat->doc_id);
+	fts_bind_doc_id(info, "doc_id", &doc_id);
+
+	if (!*graph) {
+		fts_table_t	fts_table;
+
+		FTS_INIT_INDEX_TABLE(
+			&fts_table, "DOC_ID", FTS_INDEX_TABLE, index);
+
+		*graph = fts_parse_sql(
+			&fts_table,
+			info,
+			"BEGIN INSERT INTO \"%s\" VALUES (:doc_id, :count);");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout writing to FTS doc_id. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: (%s) "
+					"while writing to FTS doc_id.\n",
+					ut_strerr(error));
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write document statistics to disk, popping entries off the doc_stats vector.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_sync_write_doc_stats(
+/*=====================*/
+	trx_t*			trx,		/*!< in: transaction */
+	const fts_index_cache_t*index_cache)	/*!< in: index cache */
+{
+	dberr_t		error = DB_SUCCESS;
+	que_t*		graph = NULL;
+	fts_doc_stats_t*  doc_stat;
+
+	if (ib_vector_is_empty(index_cache->doc_stats)) {
+		return(DB_SUCCESS);
+	}
+
+	doc_stat = static_cast<fts_doc_stats_t*>(
+		ib_vector_pop(index_cache->doc_stats));
+
+	while (doc_stat) {
+		error = fts_sync_write_doc_stat(
+			trx, index_cache->index, &graph, doc_stat);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		if (ib_vector_is_empty(index_cache->doc_stats)) {
+			break;
+		}
+
+		doc_stat = static_cast<fts_doc_stats_t*>(
+			ib_vector_pop(index_cache->doc_stats));
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free_check_lock(NULL, index_cache, graph);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Callback to check the existence of a word.
+@return always returns FALSE */
+static
+ibool
+fts_lookup_word(
+/*============*/
+	void*	row,				/*!< in:  sel_node_t* */
+	void*	user_arg)			/*!< in/out: ibool* found flag */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	ibool*		found = static_cast<ibool*>(user_arg);
+
+	exp = node->select_list;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+
+		if (len != UNIV_SQL_NULL && len != 0) {
+			*found = TRUE;
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went well else error code */
+static
+dberr_t
+fts_is_word_in_index(
+/*=================*/
+	trx_t*		trx,			/*!< in: FTS query state */
+	que_t**		graph,			/* out: Query graph */
+	fts_table_t*	fts_table,		/*!< in: table instance */
+	const fts_string_t*
+			word,			/*!< in: the word to check */
+	ibool*		found)			/* out: TRUE if exists */
+{
+	pars_info_t*	info;
+	dberr_t		error;
+
+	trx->op_info = "looking up word in FTS index";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_function(info, "my_func", fts_lookup_word, found);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	if (*graph == NULL) {
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT doc_count\n"
+			" FROM %s\n"
+			" WHERE word = :word "
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: (%s) "
+					"while reading FTS index.\n",
+					ut_strerr(error));
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Begin Sync, create transaction, acquire locks, etc. */
+static
+void
+fts_sync_begin(
+/*===========*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	n_nodes = 0;
+	elapsed_time = 0;
+
+	sync->start_time = ut_time();
+
+	sync->trx = trx_allocate_for_background();
+
+	if (fts_enable_diag_print) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"FTS SYNC for table %s, deleted count: %ld size: "
+			"%lu bytes",
+			sync->table->name,
+			ib_vector_size(cache->deleted_doc_ids),
+			cache->total_size);
+	}
+}
+
+/*********************************************************************//**
+Run SYNC on the table, i.e., write out data from the index specific
+cache to the FTS aux INDEX table and FTS aux doc id stats table.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_index(
+/*===========*/
+	fts_sync_t*		sync,		/*!< in: sync state */
+	fts_index_cache_t*	index_cache)	/*!< in: index cache */
+{
+	trx_t*		trx = sync->trx;
+	dberr_t		error = DB_SUCCESS;
+
+	trx->op_info = "doing SYNC index";
+
+	if (fts_enable_diag_print) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"SYNC words: %ld", rbt_size(index_cache->words));
+	}
+
+	ut_ad(rbt_validate(index_cache->words));
+
+	error = fts_sync_write_words(trx, index_cache);
+
+#ifdef FTS_DOC_STATS_DEBUG
+	/* FTS_RESOLVE: the word counter info in auxiliary table "DOC_ID"
+	is not used currently for ranking. We disable fts_sync_write_doc_stats()
+	for now */
+	/* Write the per doc statistics that will be used for ranking. */
+	if (error == DB_SUCCESS) {
+
+		error = fts_sync_write_doc_stats(trx, index_cache);
+	}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	return(error);
+}
+
+/*********************************************************************//**
+Commit the SYNC, change state of processed doc ids etc.
+@return DB_SUCCESS if all OK */
+static  __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_commit(
+/*============*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	dberr_t		error;
+	trx_t*		trx = sync->trx;
+	fts_cache_t*	cache = sync->table->fts->cache;
+	doc_id_t	last_doc_id;
+
+	trx->op_info = "doing SYNC commit";
+
+	/* After each Sync, update the CONFIG table about the max doc id
+	we just sync-ed to index table */
+	error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE,
+					&last_doc_id);
+
+	/* Get the list of deleted documents that are either in the
+	cache or were headed there but were deleted before the add
+	thread got to them. */
+
+	if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) {
+
+		error = fts_sync_add_deleted_cache(
+			sync, cache->deleted_doc_ids);
+	}
+
+	/* We need to do this within the deleted lock since fts_delete() can
+	attempt to add a deleted doc id to the cache deleted id array. Set
+	the shutdown flag to FALSE, signifying that we don't want to release
+	all resources. */
+	fts_cache_clear(cache, FALSE);
+	fts_cache_init(cache);
+	rw_lock_x_unlock(&cache->lock);
+
+	if (error == DB_SUCCESS) {
+
+		fts_sql_commit(trx);
+
+	} else if (error != DB_SUCCESS) {
+
+		fts_sql_rollback(trx);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%s) during SYNC.\n",
+			ut_strerr(error));
+	}
+
+	if (fts_enable_diag_print && elapsed_time) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"SYNC for table %s: SYNC time : %lu secs: "
+			"elapsed %lf ins/sec",
+			sync->table->name,
+			(ulong) (ut_time() - sync->start_time),
+			(double) n_nodes/ (double) elapsed_time);
+	}
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Rollback a sync operation */
+static
+void
+fts_sync_rollback(
+/*==============*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	trx_t*		trx = sync->trx;
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	rw_lock_x_unlock(&cache->lock);
+
+	fts_sql_rollback(trx);
+	trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync(
+/*=====*/
+	fts_sync_t*	sync)		/*!< in: sync state */
+{
+	ulint		i;
+	dberr_t		error = DB_SUCCESS;
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	rw_lock_x_lock(&cache->lock);
+
+	fts_sync_begin(sync);
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		error = fts_sync_index(sync, index_cache);
+
+		if (error != DB_SUCCESS && !sync->interrupted) {
+
+			break;
+		}
+	}
+
+	DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
+			 sync->interrupted = true;
+	);
+
+	if (error == DB_SUCCESS && !sync->interrupted) {
+		error = fts_sync_commit(sync);
+	}  else {
+		fts_sync_rollback(sync);
+	}
+
+	/* We need to check whether an optimize is required, for that
+	we make copies of the two variables that control the trigger. These
+	variables can change behind our back and we don't want to hold the
+	lock for longer than is needed. */
+	mutex_enter(&cache->deleted_lock);
+
+	cache->added = 0;
+	cache->deleted = 0;
+
+	mutex_exit(&cache->deleted_lock);
+
+	return(error);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end. */
+UNIV_INTERN
+void
+fts_sync_table(
+/*===========*/
+	dict_table_t*	table)		/*!< in: table */
+{
+	ut_ad(table->fts);
+
+	if (table->fts->cache) {
+		fts_sync(table->fts->cache->sync);
+	}
+}
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions.
+@return number of characters handled in this call */
+static
+ulint
+fts_process_token(
+/*==============*/
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result,		/* out: if provided, save
+					result here */
+	ulint		start_pos,	/*!< in: start position in text */
+	ulint		add_pos)	/*!< in: add this position to all
+					tokens from this tokenization */
+{
+	ulint		ret;
+	fts_string_t	str;
+	ulint		offset = 0;
+	fts_doc_t*	result_doc;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+
+	str.f_str = buf;
+
+	/* Determine where to save the result. */
+	result_doc = (result) ? result : doc;
+
+	/* The length of a string in characters is set here only. */
+
+	ret = innobase_mysql_fts_get_token(
+		doc->charset, doc->text.f_str + start_pos,
+		doc->text.f_str + doc->text.f_len, &str, &offset);
+
+	/* Ignore string whose character number is less than
+	"fts_min_token_size" or more than "fts_max_token_size" */
+
+	if (str.f_n_char >= fts_min_token_size
+	    && str.f_n_char <= fts_max_token_size) {
+
+		mem_heap_t*	heap;
+		fts_string_t	t_str;
+		fts_token_t*	token;
+		ib_rbt_bound_t	parent;
+		ulint		newlen;
+
+		heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+		t_str.f_n_char = str.f_n_char;
+
+		t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1;
+
+		t_str.f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, t_str.f_len));
+
+		newlen = innobase_fts_casedn_str(
+			doc->charset, (char*) str.f_str, str.f_len,
+			(char*) t_str.f_str, t_str.f_len);
+
+		t_str.f_len = newlen;
+
+		/* Add the word to the document statistics. If the word
+		hasn't been seen before we create a new entry for it. */
+		if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+			fts_token_t	new_token;
+
+			new_token.text.f_len = newlen;
+			new_token.text.f_str = t_str.f_str;
+			new_token.text.f_n_char = t_str.f_n_char;
+
+			new_token.positions = ib_vector_create(
+				result_doc->self_heap, sizeof(ulint), 32);
+
+			ut_a(new_token.text.f_n_char >= fts_min_token_size);
+			ut_a(new_token.text.f_n_char <= fts_max_token_size);
+
+			parent.last = rbt_add_node(
+				result_doc->tokens, &parent, &new_token);
+
+			ut_ad(rbt_validate(result_doc->tokens));
+		}
+
+#ifdef	FTS_CHARSET_DEBUG
+		offset += start_pos + add_pos;
+#endif /* FTS_CHARSET_DEBUG */
+
+		offset += start_pos + ret - str.f_len + add_pos;
+
+		token = rbt_value(fts_token_t, parent.last);
+		ib_vector_push(token->positions, &offset);
+	}
+
+	return(ret);
+}
+
+/******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result)		/* out: if provided, save
+					the result token here */
+{
+	ulint		inc;
+
+	ut_a(!doc->tokens);
+	ut_a(doc->charset);
+
+	doc->tokens = rbt_create_arg_cmp(
+		sizeof(fts_token_t), innobase_fts_text_cmp, (void*) doc->charset);
+
+	for (ulint i = 0; i < doc->text.f_len; i += inc) {
+		inc = fts_process_token(doc, result, i, 0);
+		ut_a(inc > 0);
+	}
+}
+
+/******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	ulint		add_pos,	/*!< in: add this position to all
+					tokens from this tokenization */
+	fts_doc_t*	result)		/*!< out: if provided, save
+					the result token here */
+{
+	ulint		inc;
+
+	ut_a(doc->tokens);
+
+	for (ulint i = 0; i < doc->text.f_len; i += inc) {
+		inc = fts_process_token(doc, result, i, add_pos);
+		ut_a(inc > 0);
+	}
+}
+
+/********************************************************************
+Create the vector of fts_get_doc_t instances. */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+						/* out: vector of
+						fts_get_doc_t instances */
+	fts_cache_t*	cache)			/*!< in: fts cache */
+{
+	ulint		i;
+	ib_vector_t*	get_docs;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+	/* We need one instance of fts_get_doc_t per index. */
+	get_docs = ib_vector_create(
+		cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+	/* One get_doc per FTS index. NOTE(review): elements are read as
+	dict_index_t** here but fts_index_cache_t* in fts_sync() - confirm. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+		dict_index_t**	index;
+		fts_get_doc_t*	get_doc;
+
+		index = static_cast<dict_index_t**>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(get_docs, NULL));
+
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = fts_get_index_cache(cache, *index);
+		get_doc->cache = cache;
+
+		/* Must find the index cache. */
+		ut_a(get_doc->index_cache != NULL);
+	}
+
+	return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+	ib_vector_t*	get_docs)		/*!< in: Doc retrieval vector */
+{
+	ulint		i;
+
+	/* Release the get doc graphs if any. */
+	for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+		fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(get_docs, i));
+
+		if (get_doc->get_document_graph != NULL) {
+
+			ut_a(get_doc->index_cache);
+
+			fts_que_graph_free(get_doc->get_document_graph);
+			get_doc->get_document_graph = NULL;
+		}
+	}
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID, or 0 if the cache was already initialized */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table)		/*!< in: table */
+{
+	doc_id_t	max_doc_id = 0;
+
+	rw_lock_x_lock(&table->fts->cache->lock);
+
+	/* Return if the table is already initialized for DOC ID */
+	if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+		rw_lock_x_unlock(&table->fts->cache->lock);
+		return(0);
+	}
+
+	DEBUG_SYNC_C("fts_initialize_doc_id");
+
+	/* Compare the supplied value (0 here) with the ID value stored in
+	the CONFIG table. The larger one will be our new initial Doc ID */
+	fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating the index (and adding the doc id column), so there is
+	no need to recover documents */
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		fts_init_index((dict_table_t*) table, TRUE);
+	}
+
+	table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+	table->fts->cache->first_doc_id = max_doc_id;
+
+	rw_lock_x_unlock(&table->fts->cache->lock);
+
+	ut_ad(max_doc_id > 0);
+
+	return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+	const ib_vector_t*	fts_indexes,	/*!< in: affected FTS indexes */
+	const fts_get_doc_t*	get_doc)	/*!< in: info for reading
+						document */
+{
+	ulint		i;
+	dict_index_t*	index = get_doc->index_cache->index;
+
+	for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+		const dict_index_t*	updated_fts_index;
+
+		updated_fts_index = static_cast<const dict_index_t*>(
+			ib_vector_getp_const(fts_indexes, i));
+
+		ut_a(updated_fts_index != NULL);
+
+		if (updated_fts_index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table)	/*!< in: fts table to read */
+{
+	trx_t*		trx;
+	pars_info_t*	info;
+	que_t*		graph;
+	dberr_t		error;
+	ulint		count = 0;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "fetching FT table rows count";
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_read_ulint, &count);
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT COUNT(*) "
+		" FROM \"%s\";\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;				/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading FTS table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: (%s) "
+					"while reading FTS table.\n",
+					ut_strerr(error));
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	trx_free_for_background(trx);
+
+	return(count);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/*********************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	trx_t*		trx;
+	fts_table_t	fts_table;
+
+	trx = trx_allocate_for_background();
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table);
+
+	/* The size returned is in bytes. */
+	sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table);
+
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+}
+#endif /* FTS_CACHE_SIZE_DEBUG */
+
+/*********************************************************************//**
+Free the modified rows of a table: every row node is removed from the
+tree and freed, then the tree itself is freed. */
+UNIV_INLINE
+void
+fts_trx_table_rows_free(
+/*====================*/
+	ib_rbt_t*	rows)			/*!< in: rbt of rows to free */
+{
+	const ib_rbt_node_t*	cur;
+
+	/* Always take the first node: it is removed at the end of each
+	pass, so the loop terminates once the tree has been drained. */
+	while ((cur = rbt_first(rows)) != NULL) {
+		fts_trx_row_t*	trx_row = rbt_value(fts_trx_row_t, cur);
+
+		if (trx_row->fts_indexes != NULL) {
+			/* This vector shouldn't be using the
+			heap allocator.  */
+			ut_a(trx_row->fts_indexes->allocator->arg == NULL);
+
+			ib_vector_free(trx_row->fts_indexes);
+			trx_row->fts_indexes = NULL;
+		}
+
+		/* The removed node is owned by us and must be freed. */
+		ut_free(rbt_remove_node(rows, cur));
+	}
+
+	ut_a(rbt_empty(rows));
+	rbt_free(rows);
+}
+
+/*********************************************************************//**
+Free an FTS savepoint instance: release the per-table state stored in
+savepoint->tables, free the tree itself and reset savepoint->tables to
+NULL. The savepoint object itself is not freed here. */
+UNIV_INLINE
+void
+fts_savepoint_free(
+/*===============*/
+	fts_savepoint_t*	savepoint)	/*!< in: savepoint instance */
+{
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		tables = savepoint->tables;
+
+	/* Nothing to free! */
+	if (tables == NULL) {
+		return;
+	}
+
+	/* The current first node is removed at the end of every pass, so
+	rbt_first() is re-evaluated until the tree is empty. */
+	for (node = rbt_first(tables); node; node = rbt_first(tables)) {
+		fts_trx_table_t*	ftt;
+		fts_trx_table_t**	fttp;
+
+		/* The tree stores pointers to fts_trx_table_t. */
+		fttp = rbt_value(fts_trx_table_t*, node);
+		ftt = *fttp;
+
+		/* This can be NULL if a savepoint was released. */
+		if (ftt->rows != NULL) {
+			fts_trx_table_rows_free(ftt->rows);
+			ftt->rows = NULL;
+		}
+
+		/* This can be NULL if a savepoint was released. */
+		if (ftt->added_doc_ids != NULL) {
+			fts_doc_ids_free(ftt->added_doc_ids);
+			ftt->added_doc_ids = NULL;
+		}
+
+		/* Free the docs_added_graph query graph if it is set. */
+		if (ftt->docs_added_graph) {
+			fts_que_graph_free(ftt->docs_added_graph);
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(tables, node));
+	}
+
+	ut_a(rbt_empty(tables));
+	rbt_free(tables);
+	savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx: every savepoint in the savepoints vector and in the
+last_stmt vector is freed, followed by the heap of the FTS trx (if it
+is set). */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx)		/* in, own: FTS trx */
+{
+	ib_vector_t*	stacks[2];
+
+	/* Both vectors are processed identically, savepoints first. */
+	stacks[0] = fts_trx->savepoints;
+	stacks[1] = fts_trx->last_stmt;
+
+	for (ulint s = 0; s < 2; ++s) {
+
+		for (ulint i = 0; i < ib_vector_size(stacks[s]); ++i) {
+			fts_savepoint_t*	entry;
+
+			entry = static_cast<fts_savepoint_t*>(
+				ib_vector_get(stacks[s], i));
+
+			/* The default savepoint name must be NULL. */
+			ut_a(i != 0 || entry->name == NULL);
+
+			fts_savepoint_free(entry);
+		}
+	}
+
+	if (fts_trx->heap) {
+		mem_heap_free(fts_trx->heap);
+	}
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column of a row tuple. The
+column position is taken from table->fts->doc_col; the field must be a
+DATA_INT of sizeof(doc_id_t) bytes.
+@return doc id that was extracted from row */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	dtuple_t*	row)			/*!< in: row whose FTS doc id we
+						want to extract.*/
+{
+	dfield_t*	doc_field;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	doc_field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+	/* Sanity check the stored field before decoding it. */
+	ut_a(dfield_get_len(doc_field) == sizeof(doc_id_t));
+	ut_a(dfield_get_type(doc_field)->mtype == DATA_INT);
+
+	return(fts_read_doc_id(
+		static_cast<const byte*>(dfield_get_data(doc_field))));
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column of a clustered index
+record. The field is located through the record offsets and must be
+exactly 8 bytes long.
+@return doc id that was extracted from rec */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	const rec_t*	rec,			/*!< in: rec */
+	mem_heap_t*	heap)			/*!< in: heap */
+{
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets = offsets_;
+	mem_heap_t*	tmp_heap = heap;
+	dict_index_t*	clust_index;
+	const byte*	field;
+	ulint		field_len;
+	ulint		clust_pos;
+	doc_id_t	doc_id;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	rec_offs_init(offsets_);
+
+	clust_index = dict_table_get_first_index(table);
+
+	offsets = rec_get_offsets(
+		rec, clust_index, offsets, ULINT_UNDEFINED, &tmp_heap);
+
+	/* Translate the table column number into its position in the
+	clustered index record. */
+	clust_pos = dict_col_get_clust_pos(
+		&table->cols[table->fts->doc_col], clust_index);
+	ut_ad(clust_pos != ULINT_UNDEFINED);
+
+	field = rec_get_nth_field(rec, offsets, clust_pos, &field_len);
+
+	ut_a(field_len == 8);
+	ut_ad(8 == sizeof(doc_id));
+	doc_id = static_cast<doc_id_t>(mach_read_from_8(field));
+
+	/* Free the heap only when the caller passed none and one was
+	set through &tmp_heap above. */
+	if (heap == NULL && tmp_heap != NULL) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index. This is a
+const-accepting wrapper that forwards to fts_get_index_cache().
+@return the index specific cache else NULL */
+UNIV_INTERN
+fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+	const fts_cache_t*	cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	/* We cast away the const because our internal function, takes
+	non-const cache arg and returns a non-const pointer. */
+	return(static_cast<fts_index_cache_t*>(
+		fts_get_index_cache((fts_cache_t*) cache, index)));
+}
+
+/*********************************************************************//**
+Look up a word in the word tree of an index cache. With UNIV_SYNC_DEBUG
+it is asserted that the cache lock is held in exclusive mode.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	text)		/*!< in: word to search for */
+{
+	ib_rbt_bound_t			bound;
+	const fts_tokenizer_word_t*	found;
+#ifdef UNIV_SYNC_DEBUG
+	dict_table_t*		table = index_cache->index->table;
+	fts_cache_t*		cache = table->fts->cache;
+
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX));
+#endif
+
+	/* A non-zero search result means there is no exact match. */
+	if (rbt_search(index_cache->words, &bound, text) != 0) {
+
+		return(NULL);
+	}
+
+	found = rbt_value(fts_tokenizer_word_t, bound.last);
+
+	return(found->nodes);
+}
+
+/*********************************************************************//**
+Check whether a doc id is present in the cache's vector of deleted doc
+ids (linear scan). With UNIV_SYNC_DEBUG it is asserted that the caller
+owns cache->deleted_lock.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+	const fts_cache_t*	cache,		/*!< in: cache to search */
+	doc_id_t		doc_id)		/*!< in: doc id to search for */
+{
+	const ib_vector_t*	deleted = cache->deleted_doc_ids;
+	ulint			pos = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&cache->deleted_lock));
+#endif
+
+	while (pos < ib_vector_size(deleted)) {
+		const fts_update_t*	entry;
+
+		entry = static_cast<const fts_update_t*>(
+			ib_vector_get_const(deleted, pos));
+
+		if (entry->doc_id == doc_id) {
+
+			return(TRUE);
+		}
+
+		++pos;
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Append the doc id of every entry in the cache's deleted doc ids vector
+to the given vector. cache->deleted_lock is held while copying. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	const fts_cache_t*	cache,		/*!< in: cache to use */
+	ib_vector_t*		vector)		/*!< in: append to this vector */
+{
+	ib_vector_t*		deleted;
+	ulint			pos = 0;
+
+	/* The mutex lives in a const object, hence the cast. */
+	mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+
+	deleted = cache->deleted_doc_ids;
+
+	while (pos < ib_vector_size(deleted)) {
+		fts_update_t*	entry;
+
+		entry = static_cast<fts_update_t*>(
+			ib_vector_get(deleted, pos));
+
+		ib_vector_push(vector, &entry->doc_id);
+
+		++pos;
+	}
+
+	mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup. Each poll checks BG_THREAD_READY in fts_status
+under bg_threads_mutex and, if it is not set, sleeps for
+FTS_MAX_BACKGROUND_THREAD_WAIT before trying again.
+@return TRUE if the thread started else FALSE (i.e timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+	dict_table_t*		table,		/*!< in: table to which the thread
+						is attached */
+	ulint			max_wait)	/*!< in: time in microseconds, if
+						set to 0 then it disables
+						timeout checking */
+{
+	ulint			count = 0;
+	ibool			done = FALSE;
+
+	/* A non-zero timeout must cover at least one sleep interval,
+	otherwise the subtraction below would wrap around. */
+	ut_a(max_wait == 0 || max_wait >= FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+	for (;;) {
+		fts_t*		fts = table->fts;
+
+		mutex_enter(&fts->bg_threads_mutex);
+
+		if (fts->fts_status & BG_THREAD_READY) {
+
+			done = TRUE;
+		}
+
+		mutex_exit(&fts->bg_threads_mutex);
+
+		if (!done) {
+			os_thread_sleep(FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+			if (max_wait > 0) {
+
+				max_wait -= FTS_MAX_BACKGROUND_THREAD_WAIT;
+
+				/* We ignore the residual value. */
+				if (max_wait < FTS_MAX_BACKGROUND_THREAD_WAIT) {
+					/* Timed out: done is still FALSE;
+					the status is not re-checked after
+					this last sleep. */
+					break;
+				}
+			}
+
+			++count;
+		} else {
+			break;
+		}
+
+		/* Report every FTS_BACKGROUND_THREAD_WAIT_COUNT
+		unsuccessful polls, then restart the counter. */
+		if (count >= FTS_BACKGROUND_THREAD_WAIT_COUNT) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Error the background thread "
+				"for the FTS table %s refuses to start\n",
+				table->name);
+
+			count = 0;
+		}
+	}
+
+	return(done);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column to the table definition and set
+the DICT_TF2_FTS_HAS_DOC_ID flag on the table. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap)	/*!< in: temporary memory heap, or NULL */
+{
+	/* The column is a NOT NULL unsigned binary integer that is
+	tagged as the FTS Doc ID. */
+	const ulint	doc_id_prtype = dtype_form_prtype(
+		DATA_NOT_NULL | DATA_UNSIGNED
+		| DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0);
+
+	dict_mem_table_add_col(
+		table, heap, FTS_DOC_ID_COL_NAME, DATA_INT,
+		doc_id_prtype, sizeof(doc_id_t));
+
+	DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
+
+/*********************************************************************//**
+Update the query graph with a new document id. If *next_doc_id is
+non-zero it is used as the Doc ID, otherwise a new one is requested
+from fts_get_next_doc_id(). When a Doc ID is available the update
+field is pointed at next_doc_id, whose content is overwritten with the
+Doc ID in storage byte order.
+@return Doc ID used; FTS_NULL_DOC_ID if fts_get_next_doc_id() failed
+without producing one */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	ufield,		/*!< out: update node */
+	doc_id_t*	next_doc_id)	/*!< in/out: buffer for writing */
+{
+	/* Initialized so that a failing fts_get_next_doc_id() can never
+	make us return an indeterminate value. */
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	dberr_t		error = DB_SUCCESS;
+
+	if (*next_doc_id) {
+		doc_id = *next_doc_id;
+	} else {
+		/* Get the new document id that will be added. */
+		error = fts_get_next_doc_id(table, &doc_id);
+	}
+
+	if (error == DB_SUCCESS) {
+		dict_index_t*	clust_index;
+
+		ufield->exp = NULL;
+
+		ufield->new_val.len = sizeof(doc_id);
+
+		clust_index = dict_table_get_first_index(table);
+
+		/* The update is applied to the Doc ID column's position
+		in the clustered index. */
+		ufield->field_no = dict_col_get_clust_pos(
+			&table->cols[table->fts->doc_col], clust_index);
+
+		/* It is possible we update record that has
+		not yet be sync-ed from last crash. */
+
+		/* Convert to storage byte order. */
+		ut_a(doc_id != FTS_NULL_DOC_ID);
+		fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+		/* The caller's buffer now backs the new field value, so
+		it has to outlive the update node. */
+		ufield->new_val.data = next_doc_id;
+	}
+
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Check if the table has an FTS index. This is the non-inline version
+of dict_table_has_fts_index() and simply forwards to it.
+@return TRUE if table has an FTS index */
+UNIV_INTERN
+ibool
+fts_dict_table_has_fts_index(
+/*=========================*/
+	dict_table_t*	table)		/*!< in: table */
+{
+	return(dict_table_has_fts_index(table));
+}
+
+/*********************************************************************//**
+Create an instance of fts_t. The instance is allocated from a heap of
+its own (kept in fts_heap), zero filled, given a background threads
+mutex and populated with the FTS indexes of the table. The table must
+not have an fts_t yet; the new instance is returned and not assigned to
+table->fts here.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table)		/*!< in/out: table with FTS indexes */
+{
+	mem_heap_t*	fts_heap;
+	ib_alloc_t*	allocator;
+	fts_t*		new_fts;
+
+	ut_a(!table->fts);
+
+	fts_heap = mem_heap_create(512);
+
+	new_fts = static_cast<fts_t*>(
+		mem_heap_alloc(fts_heap, sizeof(*new_fts)));
+
+	memset(new_fts, 0x0, sizeof(*new_fts));
+
+	new_fts->fts_heap = fts_heap;
+	new_fts->doc_col = ULINT_UNDEFINED;
+
+	mutex_create(
+		fts_bg_threads_mutex_key, &new_fts->bg_threads_mutex,
+		SYNC_FTS_BG_THREADS);
+
+	/* The index vector is backed by the same heap as the fts_t. */
+	allocator = ib_heap_allocator_create(fts_heap);
+
+	new_fts->indexes = ib_vector_create(
+		allocator, sizeof(dict_index_t*), 4);
+
+	dict_table_get_all_fts_indexes(table, new_fts->indexes);
+
+	return(new_fts);
+}
+
+/*********************************************************************//**
+Free the FTS resources of a table: the background threads mutex, the
+cache (if one exists) and the heap referenced by fts->fts_heap. On
+return table->fts is NULL. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+	dict_table_t*	table)	/*!< in/out: table with FTS indexes */
+{
+	fts_t*		fts = table->fts;
+
+	mutex_free(&fts->bg_threads_mutex);
+
+	/* Debug builds assert that no add work queue is attached. */
+	ut_ad(!fts->add_wq);
+
+	if (fts->cache) {
+		fts_cache_clear(fts->cache, TRUE);
+		fts_cache_destroy(fts->cache);
+		fts->cache = NULL;
+	}
+
+	/* fts_create() allocates the fts_t itself from this heap, so
+	fts must not be dereferenced after this call. */
+	mem_heap_free(fts->fts_heap);
+
+	table->fts = NULL;
+}
+
+/*********************************************************************//**
+Signal FTS threads to initiate shutdown by setting BG_THREAD_STOP in
+fts->fts_status under bg_threads_mutex. This function only sets the
+flag; it does not wait for any thread. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+	dict_table_t*	table,		/*!< in: table with FTS indexes
+					(not used by this function) */
+	fts_t*		fts)		/*!< in: fts instance that needs
+					to be informed about shutdown */
+{
+	mutex_enter(&fts->bg_threads_mutex);
+
+	fts->fts_status |= BG_THREAD_STOP;
+
+	mutex_exit(&fts->bg_threads_mutex);
+
+}
+
+/*********************************************************************//**
+Wait for FTS threads to shutdown. BG_THREAD_STOP must already be set in
+fts->fts_status (see fts_start_shutdown()), otherwise the assertion
+below fails. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+	dict_table_t*	table,		/*!< in: table with FTS indexes */
+	fts_t*		fts)		/*!< in: fts instance to shutdown */
+{
+	mutex_enter(&fts->bg_threads_mutex);
+
+	ut_a(fts->fts_status & BG_THREAD_STOP);
+
+	/* NOTE(review): the wait is performed while bg_threads_mutex is
+	still held; the meaning and unit of the 20000 argument are defined
+	by dict_table_wait_for_bg_threads_to_exit() - confirm there. */
+	dict_table_wait_for_bg_threads_to_exit(table, 20000);
+
+	mutex_exit(&fts->bg_threads_mutex);
+}
+
+/*********************************************************************//**
+Copy the per-table FTS state of one savepoint into another: every
+table instance in src->tables is cloned and the clone is inserted into
+dst->tables. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+	const fts_savepoint_t*	src,	/*!< in: source savepoint */
+	fts_savepoint_t*	dst)	/*!< out: destination savepoint */
+{
+	const ib_rbt_t*		src_tables = src->tables;
+	const ib_rbt_node_t*	cur = rbt_first(src_tables);
+
+	while (cur != NULL) {
+		const fts_trx_table_t**	src_ftt;
+		fts_trx_table_t*	clone;
+
+		src_ftt = rbt_value(const fts_trx_table_t*, cur);
+
+		clone = fts_trx_table_clone(*src_ftt);
+
+		/* The tree stores the pointer itself, which serves as
+		both key and value. */
+		rbt_insert(dst->tables, &clone, &clone);
+
+		cur = rbt_next(src_tables, cur);
+	}
+}
+
+/*********************************************************************//**
+Take a FTS savepoint: create a new named savepoint in the transaction's
+savepoints vector and, if the previously last savepoint has a tables
+tree, copy its per-table state into the new savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name, must not
+					be NULL */
+{
+	mem_heap_t*		heap;
+	fts_trx_t*		fts_trx;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_savepoint;
+
+	ut_a(name != NULL);
+
+	fts_trx = trx->fts_trx;
+	heap = fts_trx->heap;
+
+	/* The implied savepoint must exist. */
+	ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+	/* Remember the current top of the stack before the new
+	savepoint is created. */
+	last_savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->savepoints));
+	savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+	if (last_savepoint->tables != NULL) {
+		fts_savepoint_copy(last_savepoint, savepoint);
+	}
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name. The implied savepoint (first
+element) is never considered, and entries whose name is NULL (released
+savepoints) are skipped.
+@return index of the savepoint in the vector, ULINT_UNDEFINED if not
+found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+	ib_vector_t*	savepoints,	/*!< in: savepoints */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint			i;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	/* Skip the implied savepoint (first element). */
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		/* A released savepoint has its name set to NULL. Such an
+		entry can never match and must not be passed to strcmp(),
+		which requires valid strings. */
+		if (savepoint->name != NULL
+		    && strcmp(name, savepoint->name) == 0) {
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by name. All savepoints created
+after the named savepoint are also released. If no savepoint with the
+given name exists, nothing is changed. On a match the savepoints vector
+is truncated so that the last still-named savepoint that precedes the
+match (or the implied savepoint) becomes the top of the stack. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint			i;
+	ib_vector_t*		savepoints;
+	ulint			top_of_stack = 0;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	/* Skip the implied savepoint (first element). */
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		/* Even though we release the resources that are part
+		of the savepoint, we don't (always) actually delete the
+		entry.  We simply set the savepoint name to NULL. Therefore
+		we have to skip deleted/released entries. */
+		if (savepoint->name != NULL
+		    && strcmp(name, savepoint->name) == 0) {
+			break;
+
+		/* Track the previous savepoint instance that will
+		be at the top of the stack after the release. */
+		} else if (savepoint->name != NULL) {
+			/* We need to delete all entries
+			greater than this element. */
+			top_of_stack = i;
+		}
+	}
+
+	/* Only if we found an element to release. */
+	if (i < ib_vector_size(savepoints)) {
+
+		ut_a(top_of_stack < ib_vector_size(savepoints));
+
+		/* Free and pop every entry above top_of_stack, walking
+		down from the end of the vector. The implied savepoint
+		(index 0) is never reached because top_of_stack >= 0. */
+		for (i = ib_vector_size(savepoints) - 1;
+		     i > top_of_stack;
+		     --i) {
+
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_get(savepoints, i));
+
+			/* Skip savepoints that were released earlier. */
+			if (savepoint->name != NULL) {
+				savepoint->name = NULL;
+				fts_savepoint_free(savepoint);
+			}
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+
+		/* This must hold. */
+		ut_a(ib_vector_size(savepoints) == (top_of_stack + 1));
+	}
+}
+
+/**********************************************************************//**
+Refresh last statement savepoint: the current entry of the last_stmt
+vector is popped and freed, and a fresh unnamed savepoint is created in
+its place. */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*			trx)	/*!< in: transaction */
+{
+
+	fts_trx_t*              fts_trx;
+	fts_savepoint_t*        savepoint;
+
+	fts_trx = trx->fts_trx;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_pop(fts_trx->last_stmt));
+	fts_savepoint_free(savepoint);
+
+	/* Debug builds check that last_stmt held exactly one entry. */
+	ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+
+	/* The returned pointer is not used any further here. */
+	savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations in last stmt. For every row
+recorded in the last statement table, the row with the same doc id is
+looked up in the transaction table and adjusted according to the
+operation the last statement performed on it. Rows without a match in
+the transaction table are left alone. */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+	fts_trx_table_t*	s_ftt,	/*!< in: Transaction FTS table */
+	fts_trx_table_t*	l_ftt)	/*!< in: last stmt FTS table */
+{
+	ib_rbt_t*		s_rows;
+	ib_rbt_t*		l_rows;
+	const ib_rbt_node_t*	node;
+
+	l_rows = l_ftt->rows;
+	s_rows = s_ftt->rows;
+
+	/* Only s_rows is modified below; l_rows is iterated read-only. */
+	for (node = rbt_first(l_rows);
+	     node;
+	     node = rbt_next(l_rows, node)) {
+		fts_trx_row_t*	l_row = rbt_value(fts_trx_row_t, node);
+		ib_rbt_bound_t	parent;
+
+		rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+		/* A zero result means the doc id exists in s_rows. */
+		if (parent.result == 0) {
+			fts_trx_row_t*	s_row = rbt_value(
+				fts_trx_row_t, parent.last);
+
+			switch (l_row->state) {
+			case FTS_INSERT:
+				/* Undo an insert: drop the row.
+				NOTE(review): the row's fts_indexes vector is
+				not freed here, unlike in
+				fts_trx_table_rows_free() - confirm who owns
+				it. */
+				ut_free(rbt_remove_node(s_rows, parent.last));
+				break;
+
+			case FTS_DELETE:
+				/* Undo a delete: a row that ended up as
+				FTS_NOTHING goes back to FTS_INSERT, a row
+				in state FTS_DELETE is dropped. */
+				if (s_row->state == FTS_NOTHING) {
+					s_row->state = FTS_INSERT;
+				} else if (s_row->state == FTS_DELETE) {
+					ut_free(rbt_remove_node(
+						s_rows, parent.last));
+				}
+				break;
+
+			/* FIXME: Check if FTS_MODIFY need to be addressed */
+			case FTS_MODIFY:
+			case FTS_NOTHING:
+				break;
+			default:
+				ut_error;
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Roll back the FTS changes of the last statement. For every table that
+is present in the last statement savepoint, the table with the same id
+is looked up in the most recent savepoint of the transaction and, if
+found, the last statement's row operations are undone on it. */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	ib_vector_t*		savepoints;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_stmt;
+	fts_trx_t*		fts_trx;
+	ib_rbt_bound_t		parent;
+	const ib_rbt_node_t*    node;
+	ib_rbt_t*		l_tables;
+	ib_rbt_t*		s_tables;
+
+	fts_trx = trx->fts_trx;
+	savepoints = fts_trx->savepoints;
+
+	/* Work on the top entries of both vectors. */
+	savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+	last_stmt = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->last_stmt));
+
+	l_tables = last_stmt->tables;
+	s_tables = savepoint->tables;
+
+	for (node = rbt_first(l_tables);
+	     node;
+	     node = rbt_next(l_tables, node)) {
+
+		fts_trx_table_t**	l_ftt;
+
+		l_ftt = rbt_value(fts_trx_table_t*, node);
+
+		/* Find the same table, by table id, in the savepoint. */
+		rbt_search_cmp(
+			s_tables, &parent, &(*l_ftt)->table->id,
+			fts_trx_table_id_cmp, NULL);
+
+		if (parent.result == 0) {
+			fts_trx_table_t**	s_ftt;
+
+			s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+			fts_undo_last_stmt(*s_ftt, *l_ftt);
+		}
+	}
+}
+
+/**********************************************************************//**
+Rollback to savepoint identified by name. The named savepoint and all
+savepoints above it on the stack are popped (and freed if still named),
+followed by any released (NULL named) entries that are left on top of
+the stack. The implied savepoint is never removed. If no savepoint with
+the given name exists, nothing is changed. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint		i;
+	ib_vector_t*	savepoints;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the top of the stack up to
+	and including the instance that was found. */
+	i = fts_savepoint_lookup(savepoints, name);
+
+	if (i != ULINT_UNDEFINED) {
+		fts_savepoint_t*	savepoint;
+
+		ut_a(i > 0);
+
+		while (ib_vector_size(savepoints) > i) {
+			/* Note: this declaration shadows the outer
+			savepoint variable for the duration of the loop. */
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_pop(savepoints));
+
+			if (savepoint->name != NULL) {
+				/* Since name was allocated on the heap, the
+				memory will be released when the transaction
+				completes. */
+				savepoint->name = NULL;
+
+				fts_savepoint_free(savepoint);
+			}
+		}
+
+		/* Pop all elements from the top of the stack that may
+		have been released. We have to be careful that we don't
+		delete the implied savepoint. */
+
+		for (savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints));
+		     ib_vector_size(savepoints) > 1
+		     && savepoint->name == NULL;
+		     savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints))) {
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+	}
+}
+
+/**********************************************************************//**
+Check if a table is an FTS auxiliary table name. Only the part after
+the first '/' is examined. It must start with "FTS_" followed by the
+parent table id and '_', and then either a common table suffix or an
+index id, '_' and an index specific suffix (or "DOC_ID"). The ids that
+could be parsed are stored in table->parent_id and table->index_id,
+also when the name is eventually rejected. A malformed name is never
+treated as an error, it simply does not match.
+@return TRUE if the name matches an auxiliary table name pattern */
+static
+ibool
+fts_is_aux_table_name(
+/*==================*/
+	fts_aux_table_t*table,		/*!< out: table info */
+	const char*	name,		/*!< in: table name */
+	ulint		len)		/*!< in: length of table name */
+{
+	const char*	ptr;
+	char*		end;
+	char		my_name[MAX_FULL_NAME_LEN + 1];
+
+	ut_ad(len <= MAX_FULL_NAME_LEN);
+
+	/* ut_ad() is a debug-only assertion: without this check an
+	over-long name would overflow my_name in non-debug builds. Such
+	a name cannot be a valid auxiliary table name anyway. */
+	if (len > MAX_FULL_NAME_LEN) {
+		return(FALSE);
+	}
+
+	/* Work on a NUL terminated copy, name need not be terminated. */
+	ut_memcpy(my_name, name, len);
+	my_name[len] = 0;
+	end = my_name + len;
+
+	ptr =  static_cast<const char*>(memchr(my_name, '/', len));
+
+	if (ptr != NULL) {
+		/* We will start the match after the '/' */
+		++ptr;
+		len = end - ptr;
+	}
+
+	/* All auxiliary tables are prefixed with "FTS_" and the name
+	length will be at the very least greater than 20 bytes. */
+	if (ptr != NULL && len > 20 && strncmp(ptr, "FTS_", 4) == 0) {
+		ulint		i;
+
+		/* Skip the prefix. */
+		ptr += 4;
+		len -= 4;
+
+		/* Try and read the table id. */
+		if (!fts_read_object_id(&table->parent_id, ptr)) {
+			return(FALSE);
+		}
+
+		/* Skip the table id. */
+		ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+		if (ptr == NULL) {
+			return(FALSE);
+		}
+
+		/* Skip the underscore. */
+		++ptr;
+
+		/* Nothing follows the table id: not an auxiliary table
+		name. Do not assert here, the name may belong to an
+		arbitrary user table. */
+		if (ptr >= end) {
+			return(FALSE);
+		}
+
+		len = end - ptr;
+
+		/* First search the common table suffix array. */
+		for (i = 0; fts_common_tables[i] != NULL; ++i) {
+
+			if (strncmp(ptr, fts_common_tables[i], len) == 0) {
+				return(TRUE);
+			}
+		}
+
+		/* Try and read the index id. */
+		if (!fts_read_object_id(&table->index_id, ptr)) {
+			return(FALSE);
+		}
+
+		/* Skip the index id. */
+		ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+		if (ptr == NULL) {
+			return(FALSE);
+		}
+
+		/* Skip the underscore. */
+		++ptr;
+
+		/* Nothing follows the index id: see above. */
+		if (ptr >= end) {
+			return(FALSE);
+		}
+
+		len = end - ptr;
+
+		/* Search the FT index specific array. */
+		for (i = 0; fts_index_selector[i].value; ++i) {
+
+			if (strncmp(ptr, fts_get_suffix(i), len) == 0) {
+				return(TRUE);
+			}
+		}
+
+		/* Other FT index specific table(s). */
+		if (strncmp(ptr, "DOC_ID", len) == 0) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Callback function to read a single row of (NAME, ID) columns. A new
+fts_aux_table_t is pushed onto the vector for every row; it is popped
+again if the name is not an FTS auxiliary table name, so that only
+auxiliary tables remain in the vector.
+@return Always return TRUE */
+static
+ibool
+fts_read_tables(
+/*============*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	int		i;
+	fts_aux_table_t*table;
+	mem_heap_t*	heap;
+	ibool		done = FALSE;
+	ib_vector_t*	tables = static_cast<ib_vector_t*>(user_arg);
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = sel_node->select_list;
+
+	/* Must be a heap allocated vector. */
+	ut_a(tables->allocator->arg != NULL);
+
+	/* We will use this heap for allocating strings. */
+	heap = static_cast<mem_heap_t*>(tables->allocator->arg);
+
+	/* Reserve a slot up front; it is filled in column by column. */
+	table = static_cast<fts_aux_table_t*>(ib_vector_push(tables, NULL));
+
+	memset(table, 0x0, sizeof(*table));
+
+	/* Iterate over the columns and read the values. */
+	for (i = 0; exp && !done; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 0: /* NAME */
+
+			/* Not an auxiliary table: give the slot back and
+			stop reading the remaining columns of this row. */
+			if (!fts_is_aux_table_name(
+				table, static_cast<const char*>(data), len)) {
+				ib_vector_pop(tables);
+				done = TRUE;
+				break;
+			}
+
+			/* Keep a NUL terminated copy of the name on the
+			vector's heap. */
+			table->name = static_cast<char*>(
+				mem_heap_alloc(heap, len + 1));
+			memcpy(table->name, data, len);
+			table->name[len] = 0;
+			break;
+
+		case 1: /* ID */
+			ut_a(len == 8);
+			table->id = mach_read_from_8(
+				static_cast<const byte*>(data));
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Check and drop all orphaned FTS auxiliary tables, those that don't have
+a parent table or FTS index defined on them. An auxiliary table is
+dropped when its parent table cannot be opened or has no FTS state, or
+when it names an index id that is not among the parent's FTS indexes.
+If dropping fails with DB_FAIL the .ibd file is deleted directly. */
+static __attribute__((nonnull))
+void
+fts_check_and_drop_orphaned_tables(
+/*===============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	ib_vector_t*	tables)			/*!< in: tables to check */
+{
+	for (ulint i = 0; i < ib_vector_size(tables); ++i) {
+		dict_table_t*		table;
+		fts_aux_table_t*	aux_table;
+		bool			drop = false;
+
+		aux_table = static_cast<fts_aux_table_t*>(
+			ib_vector_get(tables, i));
+
+		table = dict_table_open_on_id(
+			aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+		if (table == NULL || table->fts == NULL) {
+
+			drop = true;
+
+		} else if (aux_table->index_id != 0) {
+			index_id_t	id;
+			fts_t*		fts;
+
+			/* Index specific auxiliary table: assume it is
+			orphaned until the index is found. */
+			drop = true;
+			fts = table->fts;
+			id = aux_table->index_id;
+
+			/* Search for the FT index in the table's list. */
+			for (ulint j = 0;
+			     j < ib_vector_size(fts->indexes);
+			     ++j) {
+
+				const dict_index_t*	index;
+
+				index = static_cast<const dict_index_t*>(
+					ib_vector_getp_const(fts->indexes, j));
+
+				if (index->id == id) {
+
+					drop = false;
+					break;
+				}
+			}
+		}
+
+		/* Release the reference taken by dict_table_open_on_id(). */
+		if (table) {
+			dict_table_close(table, TRUE, FALSE);
+		}
+
+		if (drop) {
+
+			/* NOTE(review): this message is also printed when
+			the parent exists but the FTS index does not. */
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Parent table of FTS auxiliary table %s not "
+				"found.", aux_table->name);
+
+			dberr_t	err = fts_drop_table(trx, aux_table->name);
+
+			/* Fall back to removing the data file itself. */
+			if (err == DB_FAIL) {
+				char*	path;
+
+				path = fil_make_ibd_name(
+					aux_table->name, false);
+
+				os_file_delete_if_exists(innodb_file_data_key,
+							 path);
+
+				mem_free(path);
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. Candidates are collected from the
+list of tablespace names and from SYS_TABLES, and then handed to
+fts_check_and_drop_orphaned_tables(). The data dictionary is locked
+while SYS_TABLES is read and the tables are dropped. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void)
+/*==========================*/
+{
+	trx_t*			trx;
+	pars_info_t*		info;
+	mem_heap_t*		heap;
+	que_t*			graph;
+	ib_vector_t*		tables;
+	ib_alloc_t*		heap_alloc;
+	space_name_list_t	space_name_list;
+	dberr_t			error = DB_SUCCESS;
+
+	/* Note: We have to free the memory after we are done with the list. */
+	error = fil_get_space_names(space_name_list);
+
+	if (error == DB_OUT_OF_MEMORY) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "Out of memory");
+		ut_error;
+	}
+
+	heap = mem_heap_create(1024);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* We store the table ids of all the FTS indexes that were found. */
+	tables = ib_vector_create(heap_alloc, sizeof(fts_aux_table_t), 128);
+
+	/* Get the list of all known .ibd files and check for orphaned
+	FTS auxiliary files in that list. We need to remove them because
+	users can't map them back to table names and this will create
+	unnecessary clutter. */
+
+	for (space_name_list_t::iterator it = space_name_list.begin();
+	     it != space_name_list.end();
+	     ++it) {
+
+		fts_aux_table_t*	fts_aux_table;
+
+		/* Reserve a slot; it is given back below if the name
+		is not an auxiliary table name. */
+		fts_aux_table = static_cast<fts_aux_table_t*>(
+			ib_vector_push(tables, NULL));
+
+		memset(fts_aux_table, 0x0, sizeof(*fts_aux_table));
+
+		if (!fts_is_aux_table_name(fts_aux_table, *it, strlen(*it))) {
+			ib_vector_pop(tables);
+		} else {
+			ulint	len = strlen(*it);
+
+			fts_aux_table->id = fil_get_space_id_for_table(*it);
+
+			/* We got this list from fil0fil.cc. The tablespace
+			with this name must exist. */
+			ut_a(fts_aux_table->id != ULINT_UNDEFINED);
+
+			fts_aux_table->name = static_cast<char*>(
+				mem_heap_dup(heap, *it, len + 1));
+
+			fts_aux_table->name[len] = 0;
+		}
+	}
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping orphaned FTS tables";
+	row_mysql_lock_data_dictionary(trx);
+
+	info = pars_info_create();
+
+	/* fts_read_tables() appends every auxiliary table found in
+	SYS_TABLES to the same vector. */
+	pars_info_bind_function(info, "my_func", fts_read_tables, tables);
+
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT NAME, ID "
+		" FROM SYS_TABLES;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	/* Retry for as long as the read fails with a lock wait timeout;
+	any other error ends the loop without dropping anything. */
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_check_and_drop_orphaned_tables(trx, tables);
+			fts_sql_commit(trx);
+			break;				/* Exit the loop. */
+		} else {
+			/* NOTE(review): this also discards the entries
+			that were collected from the tablespace list above,
+			so a retry only sees SYS_TABLES entries. */
+			ib_vector_reset(tables);
+
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"lock wait timeout reading SYS_TABLES. "
+					"Retrying!");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"(%s) while reading SYS_TABLES.",
+					ut_strerr(error));
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	que_graph_free(graph);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_background(trx);
+
+	/* Frees the vector and the name copies allocated from it. */
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+
+	/** Free the memory allocated to store the .ibd names. */
+	for (space_name_list_t::iterator it = space_name_list.begin();
+	     it != space_name_list.end();
+	     ++it) {
+
+		delete[] *it;
+	}
+}
+
+/**********************************************************************//**
+Check whether user supplied stopword table is of the right format: the
+table must exist and its first column must be named "value" and be of
+type DATA_VARCHAR or DATA_VARMYSQL. A message is written to stderr for
+every failed check. Caller is responsible to hold dictionary locks.
+@return the stopword column charset if qualifies, NULL otherwise */
+UNIV_INTERN
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+	 const char*	stopword_table_name)	/*!< in: Stopword table
+						name */
+{
+	dict_table_t*	stopword_tbl;
+	dict_col_t*	value_col;
+
+	if (stopword_table_name == NULL) {
+		return(NULL);
+	}
+
+	stopword_tbl = dict_table_get_low(stopword_table_name);
+
+	if (stopword_tbl == NULL) {
+		fprintf(stderr,
+			"InnoDB: user stopword table %s does not exist.\n",
+			stopword_table_name);
+
+		return(NULL);
+	}
+
+	/* The first column has to be called "value" ... */
+	if (ut_strcmp(dict_table_get_col_name(stopword_tbl, 0),
+		      "value") != 0) {
+		fprintf(stderr,
+			"InnoDB: invalid column name for stopword "
+			"table %s. Its first column must be named as "
+			"'value'.\n", stopword_table_name);
+
+		return(NULL);
+	}
+
+	/* ... and has to be one of the varchar types. */
+	value_col = dict_table_get_nth_col(stopword_tbl, 0);
+
+	if (value_col->mtype != DATA_VARCHAR
+	    && value_col->mtype != DATA_VARMYSQL) {
+		fprintf(stderr,
+			"InnoDB: invalid column type for stopword "
+			"table %s. Its first column must be of "
+			"varchar type\n", stopword_table_name);
+
+		return(NULL);
+	}
+
+	ut_ad(value_col);
+
+	return(innobase_get_fts_charset(
+		static_cast<int>(value_col->prtype & DATA_MYSQL_TYPE_MASK),
+		static_cast<ulint>(
+			dtype_get_charset_coll(value_col->prtype))));
+}
+
+/**********************************************************************//**
+Load the stopword list into the FTS cache of a table. When creating the
+FTS (reload == FALSE) the stopword configuration is recorded in the FTS
+CONFIG table, when reloading it is fetched from there. With stopwords
+enabled, the default list is used when no user stopword table loads.
+@return TRUE if load operation is successful */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction or NULL */
+	const char*	global_stopword_table,	/*!< in: Global stopword table
+						name */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	ibool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	ibool		reload)			/*!< in: Whether it is
+						for reloading FTS table */
+{
+	fts_table_t	fts_table;
+	fts_string_t	str;
+	dberr_t		error = DB_SUCCESS;
+	ulint		use_stopword;
+	fts_cache_t*	cache;
+	const char*	stopword_to_use = NULL;
+	ibool		new_trx = FALSE;
+	byte		str_buffer[MAX_FULL_NAME_LEN + 1];
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+	cache = table->fts->cache;
+
+	if (!reload && !(cache->stopword_info.status
+			 & STOPWORD_NOT_INIT)) {
+		return(TRUE);
+	}
+	/* No caller transaction: use a private background transaction */
+	if (!trx) {
+		trx = trx_allocate_for_background();
+		trx->op_info = "upload FTS stopword";
+		new_trx = TRUE;
+	}
+
+	/* First check whether stopword filtering is turned off */
+	if (reload) {
+		error = fts_config_get_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+	} else {
+		use_stopword = (ulint) stopword_is_on;
+
+		error = fts_config_set_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+	}
+
+	if (error != DB_SUCCESS) {
+		goto cleanup;
+	}
+
+	/* If stopword is turned off, no need to continue to load the
+	stopword into cache, but still need to do initialization */
+	if (!use_stopword) {
+		cache->stopword_info.status = STOPWORD_OFF;
+		goto cleanup;
+	}
+
+	if (reload) {
+		/* Fetch the stopword table name from FTS config
+		table */
+		str.f_n_char = 0;
+		str.f_str = str_buffer;
+		str.f_len = sizeof(str_buffer) - 1;
+
+		error = fts_config_get_value(
+			trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+		if (error != DB_SUCCESS) {
+			goto cleanup;
+		}
+
+		if (strlen((char*) str.f_str) > 0) {
+			stopword_to_use = (const char*) str.f_str;
+		}
+	} else {
+		stopword_to_use = (session_stopword_table)
+			? session_stopword_table : global_stopword_table;
+	}
+
+	if (stopword_to_use
+	    && fts_load_user_stopword(table->fts, stopword_to_use,
+				      &cache->stopword_info)) {
+		/* Save the stopword table name to the configure
+		table */
+		if (!reload) {
+			str.f_n_char = 0;
+			str.f_str = (byte*) stopword_to_use;
+			str.f_len = ut_strlen(stopword_to_use);
+
+			error = fts_config_set_value(
+				trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+		}
+	} else {
+		/* No usable user table: load system default stopword list */
+		fts_load_default_stopword(&cache->stopword_info);
+	}
+
+cleanup:
+	if (new_trx) {
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+		} else {
+			fts_sql_rollback(trx);
+		}
+
+		trx_free_for_background(trx);
+	}
+	/* Every path through cleanup leaves the cache with a stopword tree */
+	if (!cache->stopword_info.cached_stopword) {
+		cache->stopword_info.cached_stopword = rbt_create(
+			sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp);
+	}
+
+	return(error == DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Row callback used when the FTS is initialized at start up time. It raises
+cache->next_doc_id above the Doc ID found in the fetched row, if needed.
+@return: always returns TRUE */
+static
+ibool
+fts_init_get_doc_id(
+/*================*/
+	void*	row,			/*!< in: sel_node_t* */
+	void*	user_arg)		/*!< in: fts cache */
+{
+	fts_cache_t*	cache = static_cast<fts_cache_t*>(user_arg);
+	que_node_t*	first_exp = static_cast<sel_node_t*>(row)->select_list;
+
+	ut_ad(ib_vector_is_empty(cache->get_docs));
+
+	if (first_exp == NULL) {
+		return(TRUE);
+	}
+
+	/* The first selected column carries the Doc ID as a DATA_INT. */
+	dfield_t*	field = que_node_get_val(first_exp);
+	dtype_t*	type = dfield_get_type(field);
+	const byte*	raw = static_cast<const byte*>(dfield_get_data(field));
+
+	ut_a(dtype_get_mtype(type) == DATA_INT);
+
+	const doc_id_t	doc_id = static_cast<doc_id_t>(mach_read_from_8(raw));
+
+	/* Keep next_doc_id strictly above every Doc ID seen so far. */
+	if (cache->next_doc_id <= doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Row callback used when the FTS is initialized at start up time. It
+tokenizes a document that has not been synced to the auxiliary tables
+and adds it back to the FTS cache, raising next_doc_id if needed.
+@return: always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+	void*	row,			/*!< in: sel_node_t* */
+	void*	user_arg)		/*!< in: fts_get_doc_t* */
+{
+
+	fts_doc_t       doc;
+	ulint		doc_len = 0;
+	ulint		field_no = 0;
+	fts_get_doc_t*  get_doc = static_cast<fts_get_doc_t*>(user_arg);
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	fts_cache_t*	cache = get_doc->cache;
+
+	fts_doc_init(&doc);
+	doc.found = TRUE;
+
+	ut_ad(cache);
+
+	/* Copy each indexed column content into doc->text.f_str */
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+		/* The first column of the row is the Doc ID, not text */
+		if (field_no == 0) {
+			dtype_t*        type = dfield_get_type(dfield);
+			void*           data = dfield_get_data(dfield);
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+
+			doc_id = static_cast<doc_id_t>(mach_read_from_8(
+				static_cast<const byte*>(data)));
+
+			field_no++;
+			exp = que_node_get_next(exp);
+			continue;
+		}
+		/* SQL NULL columns are skipped without counting them */
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		ut_ad(get_doc);
+
+		if (!get_doc->index_cache->charset) {
+			ulint   prtype = dfield->type.prtype;
+
+			get_doc->index_cache->charset =
+				innobase_get_fts_charset(
+				(int)(prtype & DATA_MYSQL_TYPE_MASK),
+				(uint) dtype_get_charset_coll(prtype));
+		}
+
+		doc.charset = get_doc->index_cache->charset;
+
+		if (dfield_is_ext(dfield)) {
+			dict_table_t*	table = cache->sync->table;
+			ulint		zip_size = dict_table_zip_size(table);
+
+			doc.text.f_str = btr_copy_externally_stored_field(
+				&doc.text.f_len,
+				static_cast<byte*>(dfield_get_data(dfield)),
+				zip_size, len,
+				static_cast<mem_heap_t*>(doc.self_heap->arg));
+		} else {
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 1) {
+			fts_tokenize_document(&doc, NULL);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, NULL);
+		}
+
+		exp = que_node_get_next(exp);
+		/* One extra position is counted when another column follows */
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+	fts_doc_free(&doc);
+
+	cache->added++;
+
+	if (doc_id >= cache->next_doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Bring the FTS cache of a table in sync when the FTS index is first used.
+Documents that were not synced to the auxiliary tables before the last
+abnormal server shutdown are read back into the FTS cache, and the table
+is then registered with the optimize thread.
+@return always TRUE */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	ibool		has_cache_lock)	/*!< in: Whether we already have
+					cache lock */
+{
+	dict_index_t*   index;
+	doc_id_t        start_doc;
+	fts_get_doc_t*  get_doc = NULL;
+	fts_cache_t*    cache = table->fts->cache;
+	bool		need_init = false;
+
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	/* First check cache->get_docs is initialized */
+	if (!has_cache_lock) {
+		rw_lock_x_lock(&cache->lock);
+	}
+
+	rw_lock_x_lock(&cache->init_lock);
+	if (cache->get_docs == NULL) {
+		cache->get_docs = fts_get_docs_create(cache);
+	}
+	rw_lock_x_unlock(&cache->init_lock);
+
+	if (table->fts->fts_status & ADDED_TABLE_SYNCED) {
+		goto func_exit;
+	}
+	/* Not synced yet: the table is added to the optimize thread on exit */
+	need_init = true;
+
+	start_doc = cache->synced_doc_id;
+
+	if (!start_doc) {
+		fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+		cache->synced_doc_id = start_doc;
+	}
+
+	/* No FTS index, this is the case when previous FTS index
+	dropped, and we re-initialize the Doc ID system for subsequent
+	insertion */
+	if (ib_vector_is_empty(cache->get_docs)) {
+		index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+
+		ut_a(index);
+
+		fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+					FTS_FETCH_DOC_BY_ID_LARGE,
+					fts_init_get_doc_id, cache);
+	} else {
+		if (table->fts->cache->stopword_info.status
+		    & STOPWORD_NOT_INIT) {
+			fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE);
+		}
+
+		for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+			get_doc = static_cast<fts_get_doc_t*>(
+				ib_vector_get(cache->get_docs, i));
+
+			index = get_doc->index_cache->index;
+
+			fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+						FTS_FETCH_DOC_BY_ID_LARGE,
+						fts_init_recover_doc, get_doc);
+		}
+	}
+
+	table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+	fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+	if (!has_cache_lock) {
+		rw_lock_x_unlock(&cache->lock);
+	}
+
+	if (need_init) {
+		mutex_enter(&dict_sys->mutex);
+		/* Register the table with the optimize thread. */
+		fts_optimize_add_table(table);
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	return(TRUE);
+}
diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc
new file mode 100644
index 00000000000..0fd577c5767
--- /dev/null
+++ b/storage/xtradb/fts/fts0opt.cc
@@ -0,0 +1,3169 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "zlib.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** The FTS optimize thread's work queue. */
+static ib_wqueue_t* fts_optimize_wq;
+
+/** The number of document ids to delete in one statement. */
+static const ulint FTS_MAX_DELETE_DOC_IDS = 1000;
+
+/** Time to wait for a message, in microseconds (5 seconds). */
+static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Server is shutting down, so we are exiting the optimize thread too */
+static bool fts_opt_start_shutdown = false;
+
+/** Last time we checked whether the system needs a sync */
+static ib_time_t	last_check_sync_time;
+
+#if 0
+/** Check each table in round robin to see whether they'd
+need to be "optimized" */
+static	ulint	fts_optimize_sync_iterator = 0;
+#endif
+
+/** State of a table within the optimization sub system. */
+enum fts_state_t {
+	FTS_STATE_LOADED,
+	FTS_STATE_RUNNING,
+	FTS_STATE_SUSPENDED,
+	FTS_STATE_DONE,
+	FTS_STATE_EMPTY
+};
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+	FTS_MSG_START,			/*!< Start optimizing thread */
+
+	FTS_MSG_PAUSE,			/*!< Pause optimizing thread */
+
+	FTS_MSG_STOP,			/*!< Stop optimizing and exit thread */
+
+	FTS_MSG_ADD_TABLE,		/*!< Add table to the optimize thread's
+					work queue */
+
+	FTS_MSG_OPTIMIZE_TABLE,		/*!< Optimize a table */
+
+	FTS_MSG_DEL_TABLE,		/*!< Remove a table from the optimize
+					thread's work queue */
+};
+
+/** Compressed list of words that have been read from FTS INDEX
+that needs to be optimized. */
+struct fts_zip_t {
+	ulint		status;		/*!< Status of (un)/zip operation */
+
+	ulint		n_words;	/*!< Number of words compressed */
+
+	ulint		block_sz;	/*!< Size of a block in bytes */
+
+	ib_vector_t*	blocks;		/*!< Vector of compressed blocks */
+
+	ib_alloc_t*	heap_alloc;	/*!< Heap to use for allocations */
+
+	ulint		pos;		/*!< Offset into blocks */
+
+	ulint		last_big_block;	/*!< Offset of last block in the
+					blocks array that is of size
+					block_sz. Blocks beyond this offset
+					are of size FTS_MAX_WORD_LEN */
+
+	z_streamp	zp;		/*!< ZLib state */
+
+					/*!< The value of the last word read
+					from the FTS INDEX table. This is
+					used to discard duplicates */
+
+	fts_string_t	word;		/*!< UTF-8 string */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+					/*!< Delete a word from FTS INDEX */
+	que_t*		delete_nodes_graph;
+					/*!< Insert a word into FTS INDEX */
+	que_t*		write_nodes_graph;
+					/*!< COMMIT a transaction */
+	que_t*		commit_graph;
+					/*!< Read the nodes from FTS_INDEX */
+	que_t*		read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+	trx_t*		trx;		/*!< The transaction used for all SQL */
+
+	ib_alloc_t*	self_heap;	/*!< Heap to use for allocations */
+
+	char*		name_prefix;	/*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< Common table definition */
+
+					/*!< Common table definition */
+	fts_table_t	fts_common_table;
+
+	dict_table_t*	table;		/*!< Table that has to be queried */
+
+	dict_index_t*	index;		/*!< The FTS index to be optimized */
+
+	fts_doc_ids_t*	to_delete;	/*!< doc ids to delete, we check against
+					this vector and purge the matching
+					entries during the optimizing
+					process. The vector entries are
+					sorted on doc id */
+
+	ulint		del_pos;	/*!< Offset within to_delete vector,
+					this is used to keep track of where
+					we are up to in the vector */
+
+	ibool		done;		/*!< TRUE when optimize finishes */
+
+	ib_vector_t*	words;		/*!< Word + Nodes read from FTS_INDEX,
+					it contains instances of fts_word_t */
+
+	fts_zip_t*	zip;		/*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t		/*!< Prepared statements used during */
+			graph;		/*!< optimize */
+
+	ulint		n_completed;	/*!< Number of FTS indexes that have
+					been optimized */
+	ibool		del_list_regenerated;
+					/*!< BEING_DELETED list regenerated */
+};
+
+/** Used by optimize to keep state while compacting nodes. */
+struct fts_encode_t {
+	doc_id_t	src_last_doc_id;/*!< Last doc id read from src node */
+	byte*		src_ilist_ptr;	/*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+	dict_table_t*	table;		/*!< Table to optimize */
+
+	fts_state_t	state;		/*!< State of this slot */
+
+	ulint		added;		/*!< Number of doc ids added since the
+					last time this table was optimized */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since the
+					last time this table was optimized */
+
+	ib_time_t	last_run;	/*!< Time last run completed */
+
+	ib_time_t	completed;	/*!< Optimize finish time */
+
+	ib_time_t	interval_time;	/*!< Minimum time to wait before
+					optimizing the table again. */
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t {
+	dict_table_t*	table;		/*!< The table to remove */
+
+	os_event_t	event;		/*!< Event to synchronize acknowledgement
+					of receipt and processing of
+					this message by the consumer */
+};
+
+/** A table optimize message for the FTS optimize thread. */
+struct fts_msg_optimize_t {
+	dict_table_t*	table;		/*!< Table to optimize */
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+	fts_msg_type_t	type;		/*!< Message type */
+
+	void*		ptr;		/*!< The message contents */
+
+	mem_heap_t*	heap;		/*!< The heap used to allocate this
+					message, the message consumer will
+					free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass. */
+UNIV_INTERN ulong	fts_num_word_optimize;
+
+// FIXME
+UNIV_INTERN char	fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE	= 1024;
+
+/** The amount of time optimizing in a single pass, in milliseconds. */
+static ib_time_t fts_optimize_time_limit = 0;
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static	const char* fts_init_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO %s_BEING_DELETED\n"
+		"SELECT doc_id FROM \"%s_DELETED\";\n"
+	"\n"
+	"INSERT INTO %s_BEING_DELETED_CACHE\n"
+		"SELECT doc_id FROM \"%s_DELETED_CACHE\";\n";
+/** SQL Statement deleting a doc id from the DELETED(_CACHE) tables. */
+static const char* fts_delete_doc_ids_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM \"%s_DELETED\" WHERE doc_id = :doc_id1;\n"
+	"DELETE FROM \"%s_DELETED_CACHE\" WHERE doc_id = :doc_id2;\n";
+/** SQL Statement emptying the BEING_DELETED(_CACHE) tables. */
+static const char* fts_end_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM \"%s_BEING_DELETED\";\n"
+	"DELETE FROM \"%s_BEING_DELETED_CACHE\";\n";
+
+/**********************************************************************//**
+Reset an existing fts_zip_t so that it can be used for another pass. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+	fts_zip_t*	zip)		/*!< out: zip instance to initialize */
+{
+	/* Nothing has been compressed or read back yet. */
+	zip->n_words = 0;
+	zip->pos = 0;
+	zip->last_big_block = 0;
+	zip->status = Z_OK;
+
+	/* Clear the last word; the block vector is emptied as well. */
+	memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN);
+	zip->word.f_len = 0;
+	ib_vector_reset(zip->blocks);
+
+	/* Start over with a zeroed ZLib stream. */
+	memset(zip->zp, 0, sizeof(z_stream));
+}
+
+/**********************************************************************//**
+Allocate a zero filled fts_zip_t and its buffers from the given heap.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+	mem_heap_t*	heap,		/*!< in: heap */
+	ulint		block_sz,	/*!< in: size of a zip block.*/
+	ulint		max_words)	/*!< in: max words to read */
+{
+	fts_zip_t*	new_zip = static_cast<fts_zip_t*>(
+		mem_heap_zalloc(heap, sizeof(fts_zip_t)));
+
+	new_zip->block_sz = block_sz;
+	new_zip->max_words = max_words;
+
+	/* The word buffer is one byte longer than FTS_MAX_WORD_LEN. */
+	new_zip->word.f_str = static_cast<byte*>(
+		mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+	/* The vector of compressed blocks stores pointers to the blocks. */
+	new_zip->heap_alloc = ib_heap_allocator_create(heap);
+	new_zip->blocks = ib_vector_create(
+		new_zip->heap_alloc, sizeof(void*), 128);
+
+	new_zip->zp = static_cast<z_stream*>(
+		mem_heap_zalloc(heap, sizeof(z_stream)));
+
+	return(new_zip);
+}
+
+/**********************************************************************//**
+Reset the ZLib state and the last word of an fts_zip_t to empty. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+	fts_zip_t*	zip)		/*!< in: zip instance to init */
+{
+	/* Forget the last word seen, then zero the ZLib stream. */
+	zip->word.f_str[0] = '\0';
+	zip->word.f_len = 0;
+	memset(zip->zp, 0, sizeof(z_stream));
+}
+
+/**********************************************************************//**
+Initialize a fts_word_t with its own heap and a copy of the given string.
+@return the initialized word (the pointer that was passed in) */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,		/*!< in: word to initialize */
+	byte*		utf8,		/*!< in: UTF-8 string */
+	ulint		len)		/*!< in: length of string in bytes */
+{
+	mem_heap_t*	heap = mem_heap_create(sizeof(fts_node_t));
+
+	memset(word, 0, sizeof(*word));
+
+	/* Private copy of the string, with a NUL appended after len bytes. */
+	byte*	text = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+	memcpy(text, utf8, len);
+	text[len] = 0;
+
+	word->text.f_len = len;
+	word->text.f_str = text;
+
+	word->heap_alloc = ib_heap_allocator_create(heap);
+	word->nodes = ib_vector_create(
+		word->heap_alloc, sizeof(fts_node_t), 64);
+
+	return(word);
+}
+
+/**********************************************************************//**
+Append a node to word->nodes and fill it from the FTS INDEX row columns.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+	fts_word_t*	word,		/*!< in: */
+	que_node_t*	exp)		/*!< in: */
+{
+	int		col = 1;
+	fts_node_t*	node = static_cast<fts_node_t*>(
+		ib_vector_push(word->nodes, NULL));
+
+	/* The caller has consumed column 0 (the word); resume at column 1. */
+	while (exp != NULL) {
+		dfield_t*	field = que_node_get_val(exp);
+		byte*		bytes = static_cast<byte*>(
+			dfield_get_data(field));
+		ulint		n_bytes = dfield_get_len(field);
+
+		ut_a(n_bytes != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (col) {
+		case 1: /* DOC_COUNT */
+			node->doc_count = mach_read_from_4(bytes);
+			break;
+		case 2: /* FIRST_DOC_ID */
+			node->first_doc_id = fts_read_doc_id(bytes);
+			break;
+		case 3: /* LAST_DOC_ID */
+			node->last_doc_id = fts_read_doc_id(bytes);
+			break;
+		case 4: /* ILIST */
+			node->ilist_size = n_bytes;
+			node->ilist_size_alloc = n_bytes;
+			node->ilist = static_cast<byte*>(ut_malloc(n_bytes));
+			memcpy(node->ilist, bytes, n_bytes);
+			break;
+
+		default:
+			ut_error;
+		}
+
+		exp = que_node_get_next(exp);
+		++col;
+	}
+
+	/* Make sure all columns were read. */
+	ut_a(col == 5);
+
+	return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return always returns TRUE */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	ib_vector_t*	words = static_cast<ib_vector_t*>(fetch->read_arg);
+	que_node_t*	exp = static_cast<sel_node_t*>(row)->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	byte*		text = static_cast<byte*>(dfield_get_data(dfield));
+	ulint		text_len = dfield_get_len(dfield);
+	fts_word_t*	word = NULL;
+
+	ut_a(text_len <= FTS_MAX_WORD_LEN);
+
+	/* Reuse the last word only when this row carries the same text. */
+	if (ib_vector_size(words) > 0) {
+		word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+		if (word->text.f_len != text_len
+		    || memcmp(word->text.f_str, text, text_len) != 0) {
+			word = NULL;
+		}
+	}
+
+	/* Otherwise start a new word at the end of the vector. */
+	if (word == NULL) {
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, text, text_len);
+	}
+
+	fts_optimize_read_node(word, que_node_get_next(exp));
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows matching a word from the FTS index via fetch->read_record.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in/out: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: table of the FTS INDEX */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+{
+	pars_info_t*	info;
+	dberr_t		error;
+
+	trx->op_info = "fetching FTS index nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+	/* First use: pick the auxiliary index table and parse the query */
+	if (!*graph) {
+		ulint	selected;
+
+		ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+		selected = fts_select_index(fts_table->charset,
+					    word->f_str, word->f_len);
+
+		fts_table->suffix = fts_get_suffix(selected);
+
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word, doc_count, first_doc_id, last_doc_id, "
+				"ilist\n"
+			" FROM \"%s\"\n"
+			" WHERE word LIKE :word\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+	/* Evaluate the query; only a lock wait timeout is retried */
+	for(;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;				/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: (%s) "
+					"while reading FTS index.\n",
+					ut_strerr(error));
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Inflate the next word from the zip blocks. @return word->f_str or NULL */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+	fts_zip_t*	zip,		/*!< in: Zip state + data */
+	fts_string_t*	word)		/*!< out: uncompressed word */
+{
+#ifdef UNIV_DEBUG
+	ulint		i;
+#endif
+	byte		len = 0;
+	void*		null = NULL;
+	byte*		ptr = word->f_str;
+	int		flush = Z_NO_FLUSH;
+
+	/* Either there was an error or we are at the Z_STREAM_END. */
+	if (zip->status != Z_OK) {
+		return(NULL);
+	}
+	/* Each word is stored as a one byte length followed by the word */
+	zip->zp->next_out = &len;
+	zip->zp->avail_out = sizeof(len);
+
+	while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+		/* Finished decompressing block. */
+		if (zip->zp->avail_in == 0) {
+
+			/* Free the block thats been decompressed. */
+			if (zip->pos > 0) {
+				ulint	prev = zip->pos - 1;
+
+				ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+				ut_free(ib_vector_getp(zip->blocks, prev));
+				ib_vector_set(zip->blocks, prev, &null);
+			}
+
+			/* Any more blocks to decompress. */
+			if (zip->pos < ib_vector_size(zip->blocks)) {
+
+				zip->zp->next_in = static_cast<byte*>(
+					ib_vector_getp(
+						zip->blocks, zip->pos));
+
+				if (zip->pos > zip->last_big_block) {
+					zip->zp->avail_in =
+						FTS_MAX_WORD_LEN;
+				} else {
+					zip->zp->avail_in = zip->block_sz;
+				}
+
+				++zip->pos;
+			} else {
+				flush = Z_FINISH;
+			}
+		}
+
+		switch (zip->status = inflate(zip->zp, flush)) {
+		case Z_OK:
+			if (zip->zp->avail_out == 0 && len > 0) {
+				/* Length byte read: now inflate the word */
+				ut_a(len <= FTS_MAX_WORD_LEN);
+				ptr[len] = 0;
+
+				zip->zp->next_out = ptr;
+				zip->zp->avail_out = len;
+
+				word->f_len = len;
+				len = 0;
+			}
+			break;
+
+		case Z_BUF_ERROR:	/* No progress possible. */
+		case Z_STREAM_END:
+			inflateEnd(zip->zp);
+			break;
+
+		case Z_STREAM_ERROR:
+		default:
+			ut_error;
+		}
+	}
+
+#ifdef UNIV_DEBUG
+	/* All blocks must be freed at end of inflate. */
+	if (zip->status != Z_OK) {
+		for (i = 0; i < ib_vector_size(zip->blocks); ++i) {
+			if (ib_vector_getp(zip->blocks, i)) {
+				ut_free(ib_vector_getp(zip->blocks, i));
+				ib_vector_set(zip->blocks, i, &null);
+			}
+		}
+	}
+
+	if (ptr != NULL) {
+		ut_ad(word->f_len == strlen((char*) ptr));
+	}
+#endif /* UNIV_DEBUG */
+
+	return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL);
+}
+
+/**********************************************************************//**
+Callback that compresses the word of an FTS INDEX record into the zip
+blocks as a one byte length followed by the word; duplicates are skipped.
+@return FALSE once zip->max_words words were stored, else TRUE */
+static
+ibool
+fts_fetch_index_words(
+/*==================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_zip_t */
+{
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_zip_t*	zip = static_cast<fts_zip_t*>(user_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	ulint		full_len = dfield_get_len(dfield);
+	byte		len = static_cast<byte>(full_len);
+	void*		data = dfield_get_data(dfield);
+
+	/* Check the real length, not the value narrowed to one byte. */
+	ut_a(full_len <= FTS_MAX_WORD_LEN);
+
+	/* Skip the duplicate words. */
+	if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) {
+		return(TRUE);
+	}
+
+	memcpy(zip->word.f_str, data, len);
+	zip->word.f_len = len;
+
+	ut_a(zip->zp->avail_in == 0);
+	ut_a(zip->zp->next_in == NULL);
+
+	/* The string is prefixed by len. */
+	zip->zp->next_in = &len;
+	zip->zp->avail_in = sizeof(len);
+
+	/* Compress the word, create output blocks as necessary. */
+	while (zip->zp->avail_in > 0) {
+
+		/* No space left in output buffer, create a new one. */
+		if (zip->zp->avail_out == 0) {
+			byte*		block;
+
+			block = static_cast<byte*>(ut_malloc(zip->block_sz));
+			ib_vector_push(zip->blocks, &block);
+
+			zip->zp->next_out = block;
+			zip->zp->avail_out = zip->block_sz;
+		}
+
+		switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) {
+		case Z_OK:
+			/* Length prefix consumed: feed the word itself,
+			len is zeroed so that this happens only once. */
+			if (zip->zp->avail_in == 0) {
+				zip->zp->next_in = static_cast<byte*>(data);
+				zip->zp->avail_in = len;
+				len = 0;
+			}
+			break;
+		case Z_STREAM_END:
+		case Z_BUF_ERROR:
+		case Z_STREAM_ERROR:
+		default:
+			ut_error;
+			break;
+		}
+	}
+
+	/* All data should have been compressed. */
+	ut_a(zip->zp->avail_in == 0);
+	zip->zp->next_in = NULL;
+
+	++zip->n_words;
+
+	return(zip->n_words >= zip->max_words ? FALSE : TRUE);
+}
+
+/**********************************************************************//**
+Finish the deflate stream, adding trailing blocks for pending output. */
+static
+void
+fts_zip_deflate_end(
+/*================*/
+	fts_zip_t*	zip)		/*!< in: instance that should be closed*/
+{
+	ut_a(zip->zp->avail_in == 0);
+	ut_a(zip->zp->next_in == NULL);
+
+	zip->status = deflate(zip->zp, Z_FINISH);
+	/* Blocks added below this point use the smaller trailing size */
+	ut_a(ib_vector_size(zip->blocks) > 0);
+	zip->last_big_block = ib_vector_size(zip->blocks) - 1;
+
+	/* Allocate smaller block(s), since this is trailing data. */
+	while (zip->status == Z_OK) {
+		byte*		block;
+
+		ut_a(zip->zp->avail_out == 0);
+
+		block = static_cast<byte*>(ut_malloc(FTS_MAX_WORD_LEN + 1));
+		ib_vector_push(zip->blocks, &block);
+
+		zip->zp->next_out = block;
+		zip->zp->avail_out = FTS_MAX_WORD_LEN;
+
+		zip->status = deflate(zip->zp, Z_FINISH);
+	}
+
+	ut_a(zip->status == Z_STREAM_END);
+
+	zip->status = deflateEnd(zip->zp);
+	ut_a(zip->status == Z_OK);
+
+	/* Reset the ZLib data structure. */
+	memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Read the words from the FTS INDEX.
+
+Starting with the auxiliary index table selected by "word", each table is
+scanned for words greater than "word" and every word is handed to the
+fts_fetch_index_words callback together with optim->zip. Scanning moves on
+to the following auxiliary tables until optim->zip->n_words reaches
+n_words, the selector list ends, or an error occurs.
+
+A lock wait timeout is retried on the same table after the ZLib state has
+been reset. Any other error stops the scan and is returned to the caller;
+it is never overwritten by the result of a later table.
+
+On success with at least one word collected the deflate stream is finished
+with fts_zip_deflate_end(); in every other case an initialised deflate
+stream is released with deflateEnd().
+@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes
+        to search else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_index_fetch_words(
+/*==================*/
+	fts_optimize_t*		optim,	/*!< in: optimize scratch pad */
+	const fts_string_t*	word,	/*!< in: get words greater than this
+					 word */
+	ulint			n_words)/*!< in: max words to read */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		selected;
+	fts_zip_t*	zip;
+	dberr_t		error = DB_SUCCESS;
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+	ibool		inited = FALSE;	/* TRUE while zip->zp holds a live
+					deflate stream */
+
+	optim->trx->op_info = "fetching FTS index words";
+
+	if (optim->zip == NULL) {
+		optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words);
+	} else {
+		fts_zip_initialize(optim->zip);
+	}
+
+	/* Bind the compression state before the loop so that the code
+	after the loop never sees a NULL pointer, even if the loop body
+	is not entered at all. */
+	zip = optim->zip;
+
+	for (selected = fts_select_index(
+		optim->fts_index_table.charset, word->f_str, word->f_len);
+	     fts_index_selector[selected].value;
+	     selected++) {
+
+		optim->fts_index_table.suffix = fts_get_suffix(selected);
+
+		/* We've search all indexes. Leave through the common
+		exit so that an initialised deflate stream is released. */
+		if (optim->fts_index_table.suffix == NULL) {
+			error = DB_TABLE_NOT_FOUND;
+			break;
+		}
+
+		info = pars_info_create();
+
+		pars_info_bind_function(
+			info, "my_func", fts_fetch_index_words, optim->zip);
+
+		pars_info_bind_varchar_literal(
+			info, "word", word->f_str, word->f_len);
+
+		graph = fts_parse_sql(
+			&optim->fts_index_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word\n"
+			" FROM \"%s\"\n"
+			" WHERE word > :word\n"
+			" ORDER BY word;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+
+		/* Evaluate the query, retrying on lock wait timeout. */
+		for(;;) {
+			int	err;
+
+			if (!inited && ((err = deflateInit(zip->zp, 9))
+					!= Z_OK)) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Error: ZLib deflateInit() "
+					"failed: %d\n", err);
+
+				error = DB_ERROR;
+				break;
+			} else {
+				inited = TRUE;
+				error = fts_eval_sql(optim->trx, graph);
+			}
+
+			if (error == DB_SUCCESS) {
+				//FIXME fts_sql_commit(optim->trx);
+				break;
+			} else {
+				//FIXME fts_sql_rollback(optim->trx);
+
+				ut_print_timestamp(stderr);
+
+				if (error == DB_LOCK_WAIT_TIMEOUT) {
+					fprintf(stderr, " InnoDB: "
+						"Warning: lock wait "
+						"timeout reading document. "
+						"Retrying!\n");
+
+					/* We need to reset the ZLib state. */
+					inited = FALSE;
+					deflateEnd(zip->zp);
+					fts_zip_init(zip);
+
+					optim->trx->error_state = DB_SUCCESS;
+				} else {
+					fprintf(stderr, " InnoDB: Error: (%s) "
+						"while reading document.\n",
+						ut_strerr(error));
+
+					break;	/* Exit the loop. */
+				}
+			}
+		}
+
+		fts_que_graph_free(graph);
+
+		/* Do not move on to the next auxiliary table after a
+		failure: that would overwrite the error with the result
+		of the next table and keep feeding a stream that may be
+		incomplete. */
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		/* Check if max word to fetch is exceeded */
+		if (optim->zip->n_words >= n_words) {
+			break;
+		}
+	}
+
+	if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) {
+
+		/* All data should have been read. */
+		ut_a(zip->zp->avail_in == 0);
+
+		fts_zip_deflate_end(zip);
+	} else if (inited) {
+		/* Only a stream that deflateInit() set up successfully
+		has anything to release. */
+		deflateEnd(zip->zp);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Row callback that stores the doc id of the fetched record. A new
+fts_update_t slot is appended to the doc id vector for every row; the
+select list is expected to hold exactly one, non-NULL column (DOC_ID).
+@return always returns TRUE */
+static
+ibool
+fts_fetch_doc_ids(
+/*==============*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: pointer to fts_doc_ids_t */
+{
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_doc_ids_t*	ids = static_cast<fts_doc_ids_t*>(user_arg);
+
+	/* Reserve the slot that this row is going to fill in. */
+	fts_update_t*	slot = static_cast<fts_update_t*>(
+		ib_vector_push(ids->doc_ids, NULL));
+
+	que_node_t*	col = node->select_list;
+	int		col_no = 0;
+
+	while (col) {
+		dfield_t*	field = que_node_get_val(col);
+		void*		data = dfield_get_data(field);
+		ulint		len = dfield_get_len(field);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT,
+		which only has the DOC_ID column. */
+		if (col_no != 0) {
+			ut_error;
+		}
+
+		slot->fts_indexes = NULL;
+		slot->doc_id = fts_read_doc_id(static_cast<byte*>(data));
+
+		col = que_node_get_next(col);
+		++col_no;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Read every doc_id stored in an FTS common auxiliary table into doc_ids.
+When no transaction is passed in, a background transaction is allocated
+for the duration of the call. The statement is committed on success, in
+which case the collected ids are sorted with fts_update_doc_id_cmp, and
+rolled back otherwise.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction, or NULL to use
+					a temporary background one */
+	fts_table_t*	fts_table,	/*!< in: table */
+	fts_doc_ids_t*	doc_ids)	/*!< in: For collecting doc ids */
+{
+	pars_info_t*	info = pars_info_create();
+	const ibool	own_trx = !trx;
+
+	ut_a(fts_table->suffix != NULL);
+	ut_a(fts_table->type == FTS_COMMON_TABLE);
+
+	if (own_trx) {
+		trx = trx_allocate_for_background();
+	}
+
+	trx->op_info = "fetching FTS doc ids";
+
+	pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids);
+
+	que_t*	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT doc_id FROM \"%s\";\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	const dberr_t	error = fts_eval_sql(trx, graph);
+
+	/* The query graph is released under the dictionary mutex. */
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	if (error != DB_SUCCESS) {
+		fts_sql_rollback(trx);
+	} else {
+		fts_sql_commit(trx);
+
+		ib_vector_sort(doc_ids->doc_ids, fts_update_doc_id_cmp);
+	}
+
+	/* Only release the transaction if it was created here. */
+	if (own_trx) {
+		trx_free_for_background(trx);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Do a binary search for a doc id in the array. The comparisons assume the
+entries between "lower" and "upper" are ordered by ascending doc_id.
+
+Return value encoding:
+- index of the matching element (>= 0) when the doc id is present;
+- -1 when "upper" is 0, and also whenever the search ends with lower == 0,
+  so -1 on its own does not tell whether the doc id sorts before element 0
+  or before element 1;
+- otherwise -lower, the negated position at which the search ended.
+@return +ve index if found -ve index where it should be inserted
+        if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+	fts_update_t*	array,	/*!< in: array to search */
+	int		lower,	/*!< in: the array lower bound */
+	int		upper,	/*!< in: the array upper bound */
+	doc_id_t	doc_id)	/*!< in: the doc id to search for */
+{
+	/* Remember the initial upper bound: "upper" itself is modified
+	below and is needed to range check the final probe. */
+	int	orig_size = upper;
+
+	if (upper == 0) {
+		/* Nothing to search */
+		return(-1);
+	} else {
+		while (lower < upper) {
+			int	i = (lower + upper) >> 1;
+
+			if (doc_id > array[i].doc_id) {
+				lower = i + 1;
+			} else if (doc_id < array[i].doc_id) {
+				/* Element i - 1 is not compared inside
+				the loop once lower catches up with it;
+				the check after the loop covers it. */
+				upper = i - 1;
+			} else {
+				return(i); /* Found. */
+			}
+		}
+	}
+
+	/* The loop can stop on an element that was never compared.
+	Probe it, provided it lies below the initial upper bound. */
+	if (lower == upper && lower < orig_size) {
+		if (doc_id == array[lower].doc_id) {
+			return(lower);
+		} else if (lower == 0) {
+			return(-1);
+		}
+	}
+
+	/* Not found. Position 0 cannot be negated, hence -1. */
+	return( (lower == 0) ? -1 : -lower);
+}
+
+/**********************************************************************//**
+Search in the to delete array whether any of the doc ids within
+the [first, last] range are to be deleted. The array is searched for
+first_doc_id; if that id is absent, the entry following the position
+reported by fts_bsearch() is accepted as long as it does not exceed
+last_doc_id.
+@return +ve index if found -ve index where it should be inserted
+        if not found */
+static
+int
+fts_optimize_lookup(
+/*================*/
+	ib_vector_t*	doc_ids,	/*!< in: array to search */
+	ulint		lower,		/*!< in: lower limit of array */
+	doc_id_t	first_doc_id,	/*!< in: doc id to lookup */
+	doc_id_t	last_doc_id)	/*!< in: doc id to lookup */
+{
+	const int	n_ids = ib_vector_size(doc_ids);
+	fts_update_t*	ids = (fts_update_t*) doc_ids->data;
+	const int	found = fts_bsearch(ids, lower, n_ids, first_doc_id);
+
+	ut_a(abs(found) <= n_ids + 1);
+
+	/* Exact hit on first_doc_id. */
+	if (found >= 0) {
+		return(found);
+	}
+
+	const int	next = abs(found);
+
+	/* A result of -1 is ambiguous: first_doc_id may sort before
+	either the first or the second array item, so double check
+	against the first one. */
+	if (next == 1 && ids[0].doc_id <= last_doc_id
+	    && first_doc_id < ids[0].doc_id) {
+		return(0);
+	}
+
+	/* Check if the "next" doc id is within the first & last doc id
+	of the node. */
+	if (next < n_ids && ids[next].doc_id <= last_doc_id) {
+		return(next);
+	}
+
+	/* Nothing in the range: hand back the negative search result. */
+	return(found);
+}
+
+/**********************************************************************//**
+Append one document entry to the node's ilist: the doc id, encoded as the
+delta to the node's previous last_doc_id, followed by a verbatim copy of
+the encoded word position list that enc->src_ilist_ptr points at (up to
+and including its terminating 0x00 byte). The ilist buffer is allocated
+or enlarged as needed and enc->src_ilist_ptr is moved past the copied
+position list.
+@return DB_SUCCESS or error code*/
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_encode_node(
+/*=====================*/
+	fts_node_t*	node,		/*!< in/out: node to fill*/
+	doc_id_t	doc_id,		/*!< in: doc id to encode */
+	fts_encode_t*	enc)		/*!< in/out: encoding state.*/
+{
+	byte* const	pos_begin = enc->src_ilist_ptr;
+	byte*		pos_end = pos_begin;
+
+	/* The very first document defines the node's first doc id. */
+	if (node->first_doc_id == 0) {
+		ut_a(node->last_doc_id == 0);
+
+		node->first_doc_id = doc_id;
+	}
+
+	/* Space needed for the delta encoded doc id. */
+	const doc_id_t	delta = doc_id - node->last_doc_id;
+	const ulint	id_len = fts_get_encoded_len(
+		static_cast<ulint>(delta));
+
+	/* Walk the encoded positions up to the 0x00 end marker ... */
+	while (*pos_end) {
+		fts_decode_vlc(&pos_end);
+	}
+
+	/* ... and step over the marker itself, it is copied as well. */
+	++pos_end;
+
+	const ulint	pos_len = pos_end - pos_begin;
+	const ulint	total_len = id_len + pos_len;
+
+	/* Make sure the destination buffer can take total_len bytes. */
+	if (!node->ilist) {
+		ulint	capacity = FTS_ILIST_MAX_SIZE;
+
+		ut_a(node->ilist_size == 0);
+
+		if (total_len > FTS_ILIST_MAX_SIZE) {
+			capacity = total_len;
+		}
+
+		node->ilist = static_cast<byte*>(ut_malloc(capacity));
+		node->ilist_size_alloc = capacity;
+
+	} else if ((node->ilist_size + total_len) > node->ilist_size_alloc) {
+		const ulint	capacity = node->ilist_size + total_len;
+		byte*		grown = static_cast<byte*>(
+			ut_malloc(capacity));
+
+		memcpy(grown, node->ilist, node->ilist_size);
+
+		ut_free(node->ilist);
+
+		node->ilist = grown;
+		node->ilist_size_alloc = capacity;
+	}
+
+	byte*	out = node->ilist + node->ilist_size;
+
+	/* Encode the doc id. Cast to ulint, the delta should be small and
+	therefore no loss of precision. */
+	out += fts_encode_int((ulint) delta, out);
+
+	/* The position list is already encoded, copy it as it is. */
+	memcpy(out, pos_begin, pos_len);
+
+	node->last_doc_id = doc_id;
+
+	/* Account for what was written and consumed. */
+	node->ilist_size += total_len;
+	enc->src_ilist_ptr = pos_end;
+
+	ut_a(node->ilist_size <= node->ilist_size_alloc);
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Optimize the data contained in a node.
+
+Decodes document entries from src_node's ilist, starting at
+enc->src_ilist_ptr (or at the beginning of the ilist if that is NULL), and
+re-encodes them into dst_node with fts_optimize_encode_node(). Entries
+whose doc id equals the deleted doc id at *del_pos are skipped. The loop
+stops when the source ilist is exhausted or when dst_node->ilist_size has
+reached FTS_ILIST_MAX_SIZE; the decoding position and the last decoded
+doc id are kept in "enc" so that a later call can resume.
+@return DB_SUCCESS or error code*/
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_node(
+/*==============*/
+	ib_vector_t*	del_vec,	/*!< in: vector of doc ids to delete*/
+	int*		del_pos,	/*!< in/out: offset into above vector;
+					advanced past every deleted doc id
+					that has been dealt with */
+	fts_node_t*	dst_node,	/*!< in/out: node to fill*/
+	fts_node_t*	src_node,	/*!< in: source node for data*/
+	fts_encode_t*	enc)		/*!< in/out: encoding state */
+{
+	ulint		copied;
+	dberr_t		error = DB_SUCCESS;
+	/* Doc ids are stored as deltas: continue from the last doc id
+	decoded from this source node. */
+	doc_id_t	doc_id = enc->src_last_doc_id;
+
+	if (!enc->src_ilist_ptr) {
+		enc->src_ilist_ptr = src_node->ilist;
+	}
+
+	/* Number of source bytes consumed so far. */
+	copied = enc->src_ilist_ptr - src_node->ilist;
+
+	/* While there is data in the source node and space to copy
+	into in the destination node. */
+	while (copied < src_node->ilist_size
+	       && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
+
+		doc_id_t	delta;
+		doc_id_t	del_doc_id = FTS_NULL_DOC_ID;
+
+		delta = fts_decode_vlc(&enc->src_ilist_ptr);
+
+test_again:
+		/* Check whether the doc id is in the delete list, if
+		so then we skip the entries but we need to track the
+		delta for decoding the entries following this document's
+		entries. */
+		if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) {
+			fts_update_t*	update;
+
+			update = (fts_update_t*) ib_vector_get(
+				del_vec, *del_pos);
+
+			del_doc_id = update->doc_id;
+		}
+
+		/* NOTE(review): src_ilist_ptr is passed by address to
+		fts_decode_vlc() above and has presumably been advanced
+		by it already, in which case this condition never holds
+		- confirm whether the check is still effective. */
+		if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) {
+			ut_a(delta == src_node->first_doc_id);
+		}
+
+		/* On the "test_again" path delta was reset to 0, so the
+		doc id is not advanced a second time. */
+		doc_id += delta;
+
+		if (del_doc_id > 0 && doc_id == del_doc_id) {
+
+			++*del_pos;
+
+			/* Skip the entries for this document. */
+			while (*enc->src_ilist_ptr) {
+				fts_decode_vlc(&enc->src_ilist_ptr);
+			}
+
+			/* Skip the end of word position marker. */
+			++enc->src_ilist_ptr;
+
+		} else {
+
+			/* DOC ID already becomes larger than
+			del_doc_id, check the next del_doc_id */
+			if (del_doc_id > 0 && doc_id > del_doc_id) {
+				del_doc_id = 0;
+				++*del_pos;
+				delta = 0;
+				goto test_again;
+			}
+
+			/* Decode and copy the word positions into
+			the dest node. The return value is not checked
+			here. */
+			fts_optimize_encode_node(dst_node, doc_id, enc);
+
+			++dst_node->doc_count;
+
+			ut_a(dst_node->last_doc_id == doc_id);
+		}
+
+		/* Bytes copied so for from source. */
+		copied = enc->src_ilist_ptr - src_node->ilist;
+	}
+
+	/* Once the whole source ilist has been consumed, the running doc
+	id must match the last doc id recorded in the source node. */
+	if (copied >= src_node->ilist_size) {
+		ut_a(doc_id == src_node->last_doc_id);
+	}
+
+	/* Remember where decoding stopped for a possible follow-up call. */
+	enc->src_last_doc_id = doc_id;
+
+	return(error);
+}
+
+/**********************************************************************//**
+Determine the starting pos within the deleted doc id vector for a word.
+The doc id range covered by the word is taken from the first doc id of
+its first node and the last doc id of its last node.
+@return delete position, -1 if the deleted doc id vector is empty */
+static __attribute__((nonnull, warn_unused_result))
+int
+fts_optimize_deleted_pos(
+/*=====================*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	fts_word_t*	word)		/*!< in: the word data to check */
+{
+	ib_vector_t*	deleted = optim->to_delete->doc_ids;
+
+	if (ib_vector_size(deleted) == 0) {
+
+		return(-1); /* Note that there is nothing to delete. */
+	}
+
+	/* Get the first and last doc ids for the word, we will use
+	these values to determine which doc ids need to be removed
+	when we coalesce the nodes. This reduces the number of
+	elements that need to be searched in the deleted doc ids
+	vector, and the doc ids can then be removed during the
+	coalescing phase. */
+	const ulint	n_nodes = ib_vector_size(word->nodes);
+
+	const fts_node_t*	head = (fts_node_t*) ib_vector_get(
+		word->nodes, 0);
+	const doc_id_t		first_id = head->first_doc_id;
+
+	const fts_node_t*	tail = (fts_node_t*) ib_vector_get(
+		word->nodes, n_nodes - 1);
+	const doc_id_t		last_id = tail->last_doc_id;
+
+	ut_a(first_id <= last_id);
+
+	return(fts_optimize_lookup(
+		       deleted, optim->del_pos, first_id, last_id));
+}
+
+/* NOTE(review): no use of this macro is visible in the surrounding
+functions - confirm whether it is still needed. */
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word, we also remove any doc ids during the
+compaction pass.
+
+The source nodes in word->nodes are processed in order and their content
+is re-encoded into newly created destination nodes. A new destination
+node is started whenever the current one has reached FTS_ILIST_MAX_SIZE.
+The ilist of every source node is freed as soon as the node has been
+consumed completely. The ilists of the returned nodes are allocated
+separately and are not released here.
+@return vector of compacted nodes, created with word->heap_alloc */
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	fts_word_t*	word)		/*!< in/out: the word to optimize; the
+					ilists of its nodes are freed */
+{
+	fts_encode_t	enc;
+	ib_vector_t*	nodes;
+	ulint		i = 0;
+	int		del_pos;
+	fts_node_t*	dst_node = NULL;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+	ulint		size = ib_vector_size(word->nodes);
+
+	/* Where to start looking in the deleted doc id vector. */
+	del_pos = fts_optimize_deleted_pos(optim, word);
+	nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+	/* Start decoding at the beginning of the first source node. */
+	enc.src_last_doc_id = 0;
+	enc.src_ilist_ptr = NULL;
+
+	if (fts_enable_diag_print) {
+		/* Writes a terminator at f_len so that the word can be
+		printed with %s. */
+		word->text.f_str[word->text.f_len] = 0;
+		fprintf(stderr, "FTS_OPTIMIZE: optimize \"%s\"\n",
+			word->text.f_str);
+	}
+
+	while (i < size) {
+		ulint		copied;
+		fts_node_t*	src_node;
+
+		src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+		/* Start a new, zeroed destination node if there is no
+		current one. */
+		if (!dst_node) {
+
+			dst_node = static_cast<fts_node_t*>(
+				ib_vector_push(nodes, NULL));
+			memset(dst_node, 0, sizeof(*dst_node));
+		}
+
+		/* Copy from the src to the dst node. */
+		fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+		ut_a(enc.src_ilist_ptr != NULL);
+
+		/* Determine the number of bytes consumed from src_node. */
+		copied = enc.src_ilist_ptr - src_node->ilist;
+
+		/* Can't copy more than whats in the vlc array. */
+		ut_a(copied <= src_node->ilist_size);
+
+		/* We are done with this node release the resources. */
+		if (copied == src_node->ilist_size) {
+
+			/* The next source node is decoded from scratch. */
+			enc.src_last_doc_id = 0;
+			enc.src_ilist_ptr = NULL;
+
+			ut_free(src_node->ilist);
+
+			src_node->ilist = NULL;
+			src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+			src_node = NULL;
+
+			++i; /* Get next source node to OPTIMIZE. */
+		}
+
+		/* Close the destination node when it is full or when all
+		source nodes have been processed. */
+		if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+			dst_node = NULL;
+		}
+	}
+
+	/* All dst nodes created should have been added to the vector. */
+	ut_a(dst_node == NULL);
+
+	/* Return the OPTIMIZED nodes. */
+	return(nodes);
+}
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+
+All rows for "word" are deleted from the auxiliary index table selected
+by the word, then every node in "nodes" is written with fts_write_node().
+Writing stops at the first error, but the ilist of every node is freed in
+any case, so the nodes hold no ilist once this function returns.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_write_word(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in/out: table of FTS index; the
+					suffix is set to the auxiliary table
+					that stores the word */
+	fts_string_t*	word,		/*!< in: word data to write */
+	ib_vector_t*	nodes)		/*!< in/out: the nodes to write;
+					their ilists are freed */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		selected;
+	dberr_t		error = DB_SUCCESS;
+
+	info = pars_info_create();
+
+	ut_ad(fts_table->charset);
+
+	if (fts_enable_diag_print) {
+		/* The word is a length-counted byte string: print exactly
+		f_len bytes instead of relying on a terminator. */
+		fprintf(stderr, "FTS_OPTIMIZE: processed \"%.*s\"\n",
+			(int) word->f_len, word->f_str);
+	}
+
+	pars_info_bind_varchar_literal(
+		info, "word", word->f_str, word->f_len);
+
+	/* Pick the auxiliary index table that stores this word. */
+	selected = fts_select_index(fts_table->charset,
+				    word->f_str, word->f_len);
+
+	fts_table->suffix = fts_get_suffix(selected);
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"BEGIN DELETE FROM \"%s\" WHERE word = :word;");
+
+	error = fts_eval_sql(trx, graph);
+
+	if (error != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: (%s) during optimize, "
+			"when deleting a word from the FTS index.\n",
+			ut_strerr(error));
+	}
+
+	fts_que_graph_free(graph);
+	graph = NULL;
+
+	/* Even if the operation needs to be rolled back and redone,
+	we iterate over the nodes in order to free the ilist. */
+	for (i = 0; i < ib_vector_size(nodes); ++i) {
+
+		fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i);
+
+		if (error == DB_SUCCESS) {
+			/* "graph" is passed by address and handed back
+			to the following calls. */
+			error = fts_write_node(
+				trx, &graph, fts_table, word, node);
+
+			if (error != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr, " InnoDB: Error: (%s) "
+					"during optimize, while adding a "
+					"word to the FTS index.\n",
+					ut_strerr(error));
+			}
+		}
+
+		ut_free(node->ilist);
+		node->ilist = NULL;
+		node->ilist_size = node->ilist_size_alloc = 0;
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free(graph);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Free an fts_word_t instance by releasing the memory heap behind its heap
+allocator. In debug builds the word structure is zeroed before the heap
+is freed. */
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)		/*!< in: instance to free.*/
+{
+	/* Fetch the heap first: the debug build wipes the structure that
+	holds the pointer to it. */
+	void*	heap_arg = word->heap_alloc->arg;
+
+#ifdef UNIV_DEBUG
+	memset(word, 0, sizeof(*word));
+#endif /* UNIV_DEBUG */
+
+	mem_heap_free(static_cast<mem_heap_t*>(heap_arg));
+}
+
+/**********************************************************************//**
+Optimize the word ilist and rewrite data to the FTS index. The words
+collected in optim->words are processed one by one: each is compacted,
+written back and recorded as the last optimized word, then freed.
+Processing stops at the first error or once optim->done is set, which
+happens here when the optimize time limit has been exceeded.
+NOTE(review): words that are not reached because of an early stop are not
+freed by this function - confirm that the caller takes care of them.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_compact(
+/*=================*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	dict_index_t*	index,		/*!< in: current FTS being optimized */
+	ib_time_t	start_time)	/*!< in: optimize start time */
+{
+	const ulint	n_words = ib_vector_size(optim->words);
+	dberr_t		error = DB_SUCCESS;
+	ulint		next = 0;
+
+	while (next < n_words && error == DB_SUCCESS && !optim->done) {
+		fts_word_t*	word = (fts_word_t*) ib_vector_get(
+			optim->words, next);
+
+		/* The node vector is allocated from the word heap and
+		will be destroyed when the word is freed. The ilists are
+		freed explicitly when the word is written. */
+		ib_vector_t*	nodes = fts_optimize_word(optim, word);
+
+		/* Update the data on disk. */
+		error = fts_optimize_write_word(
+			optim->trx, &optim->fts_index_table,
+			&word->text, nodes);
+
+		if (error == DB_SUCCESS) {
+			/* Write the last word optimized to the config table,
+			we use this value for restarting optimize. */
+			error = fts_config_set_index_value(
+				optim->trx, index,
+				FTS_LAST_OPTIMIZED_WORD, &word->text);
+		}
+
+		/* Free the word that was optimized. */
+		fts_word_free(word);
+
+		/* A limit of 0 means there is no time limit. */
+		if (fts_optimize_time_limit > 0
+		    && (ut_time() - start_time) > fts_optimize_time_limit) {
+
+			optim->done = TRUE;
+		}
+
+		++next;
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Create an instance of fts_optimize_t. Also create a new
+background transaction. The instance lives in its own memory heap, which
+is reachable through optim->self_heap.
+@return the new optimize instance */
+static
+fts_optimize_t*
+fts_optimize_create(
+/*================*/
+	dict_table_t*	table)		/*!< in: table with FTS indexes */
+{
+	mem_heap_t*	heap = mem_heap_create(128);
+	fts_optimize_t*	optim = static_cast<fts_optimize_t*>(
+		mem_heap_zalloc(heap, sizeof(fts_optimize_t)));
+
+	optim->self_heap = ib_heap_allocator_create(heap);
+
+	optim->to_delete = fts_doc_ids_create();
+
+	optim->words = ib_vector_create(
+		optim->self_heap, sizeof(fts_word_t), 256);
+
+	optim->table = table;
+
+	optim->trx = trx_allocate_for_background();
+
+	/* Descriptor for the common auxiliary tables of the table. */
+	fts_table_t*	common = &optim->fts_common_table;
+
+	common->parent = table->name;
+	common->table_id = table->id;
+	common->type = FTS_COMMON_TABLE;
+
+	/* Descriptor for the auxiliary index tables; the index specific
+	members are not set here. */
+	fts_table_t*	aux_index = &optim->fts_index_table;
+
+	aux_index->parent = table->name;
+	aux_index->table_id = table->id;
+	aux_index->type = FTS_INDEX_TABLE;
+
+	/* The common prefix for all this parent table's aux tables. */
+	optim->name_prefix = fts_get_table_name_prefix(common);
+
+	return(optim);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/**********************************************************************//**
+Get optimize start time of an FTS index. The value is read from the
+config table as a ulint and converted to ib_time_t; *start_time is only
+written when the read succeeds.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_start_time(
+/*==============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t*	start_time)		/*!< out: time in secs */
+{
+	/* Read into a real ulint: ib_time_t and ulint are distinct types
+	and need not have the same size, so the caller's variable must
+	not be written through a ulint pointer. */
+	ulint	value = 0;
+	dberr_t	error = fts_config_get_index_ulint(
+		trx, index, FTS_OPTIMIZE_START_TIME, &value);
+
+	if (error == DB_SUCCESS) {
+		*start_time = static_cast<ib_time_t>(value);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Store the optimize start time of an FTS index in the config table under
+FTS_OPTIMIZE_START_TIME.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_start_time(
+/*==============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t	start_time)		/*!< in: start time */
+{
+	/* The config table stores the time as a ulint. */
+	const ulint	value = (ulint) start_time;
+
+	return(fts_config_set_index_ulint(
+		       trx, index, FTS_OPTIMIZE_START_TIME, value));
+}
+
+/**********************************************************************//**
+Get optimize end time of an FTS index. The value is read from the config
+table as a ulint and converted to ib_time_t; *end_time is only written
+when the read succeeds.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_end_time(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t*	end_time)		/*!< out: time in secs */
+{
+	/* Read into a real ulint: ib_time_t and ulint are distinct types
+	and need not have the same size, so the caller's variable must
+	not be written through a ulint pointer. */
+	ulint	value = 0;
+	dberr_t	error = fts_config_get_index_ulint(
+		trx, index, FTS_OPTIMIZE_END_TIME, &value);
+
+	if (error == DB_SUCCESS) {
+		*end_time = static_cast<ib_time_t>(value);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Store the optimize end time of an FTS index in the config table under
+FTS_OPTIMIZE_END_TIME.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_end_time(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t	end_time)		/*!< in: end time */
+{
+	/* The config table stores the time as a ulint. */
+	const ulint	value = (ulint) end_time;
+
+	return(fts_config_set_index_ulint(
+		       trx, index, FTS_OPTIMIZE_END_TIME, value));
+}
+#endif
+
+/**********************************************************************//**
+Free the optimize prepared statements. Every query graph that is set is
+freed and its pointer is cleared, so calling this again is harmless. */
+static
+void
+fts_optimize_graph_free(
+/*====================*/
+	fts_optimize_graph_t*	graph)	/*!< in/out: The graph instances
+					to free */
+{
+	/* Same order as the members were released one by one before. */
+	que_t** const	slots[] = {
+		&graph->commit_graph,
+		&graph->write_nodes_graph,
+		&graph->delete_nodes_graph,
+		&graph->read_nodes_graph
+	};
+
+	for (ulint i = 0; i < sizeof(slots) / sizeof(slots[0]); ++i) {
+
+		if (*slots[i]) {
+			que_graph_free(*slots[i]);
+			*slots[i] = NULL;
+		}
+	}
+}
+
+/**********************************************************************//**
+Free all optimize resources: the background transaction, the deleted doc
+id list, the prepared query graphs, the table name prefix and finally the
+heap that holds the instance itself. */
+static
+void
+fts_optimize_free(
+/*==============*/
+	fts_optimize_t*	optim)		/*!< in: optimize instance to free */
+{
+	trx_free_for_background(optim->trx);
+
+	fts_doc_ids_free(optim->to_delete);
+	fts_optimize_graph_free(&optim->graph);
+
+	mem_free(optim->name_prefix);
+
+	/* optim was allocated from this heap, so this has to be the last
+	step: nothing in optim may be touched afterwards. */
+	mem_heap_t*	self = static_cast<mem_heap_t*>(optim->self_heap->arg);
+
+	mem_heap_free(self);
+}
+
+/**********************************************************************//**
+Get the max time optimize should run. The FTS_OPTIMIZE_LIMIT_IN_SECS
+value is read from the config table and multiplied by 1000. If the value
+cannot be read, 0 is returned.
+NOTE(review): fts_optimize_compact() compares the result with a difference
+of ut_time() values - confirm that both use the same unit.
+@return max optimize time limit in millisecs. */
+static
+ib_time_t
+fts_optimize_get_time_limit(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: aux table */
+{
+	/* Read into a real ulint: ib_time_t and ulint are distinct types
+	and need not have the same size, so an ib_time_t must not be
+	filled in through a ulint pointer. */
+	ulint	limit_in_secs = 0;
+
+	if (fts_config_get_ulint(
+		    trx, fts_table,
+		    FTS_OPTIMIZE_LIMIT_IN_SECS, &limit_in_secs)
+	    != DB_SUCCESS) {
+
+		/* Without a readable setting there is no limit. */
+		limit_in_secs = 0;
+	}
+
+	return(static_cast<ib_time_t>(limit_in_secs) * 1000);
+}
+
+
+/**********************************************************************//**
+Run OPTIMIZE on the given table. Note: this can take a very long time
+(hours).
+
+Starting with "word", the index nodes of one word at a time are fetched
+into optim->words, compacted and written back; the transaction is
+committed after a successful compaction and rolled back otherwise. The
+next word is then read from optim->zip. The loop ends when optim->done is
+set: no more words in the zip buffer, the time limit was hit inside
+fts_optimize_compact(), or an error other than a lock wait timeout or a
+deadlock occurred. On lock wait timeout and deadlock the same word is
+retried. */
+static
+void
+fts_optimize_words(
+/*===============*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index,	/*!< in: current FTS being optimized */
+	fts_string_t*	word)	/*!< in/out: the starting word to optimize;
+				overwritten with each following word */
+{
+	fts_fetch_t	fetch;
+	ib_time_t	start_time;
+	que_t*		graph = NULL;
+	CHARSET_INFO*	charset = optim->fts_index_table.charset;
+
+	ut_a(!optim->done);
+
+	/* Get the time limit from the config table. Note that this sets
+	a variable that is not local to this function. */
+	fts_optimize_time_limit = fts_optimize_get_time_limit(
+		optim->trx, &optim->fts_common_table);
+
+	start_time = ut_time();
+
+	/* Setup the callback to use for fetching the word ilist etc. */
+	fetch.read_arg = optim->words;
+	fetch.read_record = fts_optimize_index_fetch_node;
+
+	/* The starting word is printed unconditionally. */
+	fprintf(stderr, "%.*s\n", (int) word->f_len, word->f_str);
+
+	while(!optim->done) {
+		dberr_t	error;
+		trx_t*	trx = optim->trx;
+		ulint	selected;
+
+		/* Every pass starts with an empty word vector. */
+		ut_a(ib_vector_size(optim->words) == 0);
+
+		/* Remember which auxiliary index the current word maps
+		to; compared with that of the next word further down. */
+		selected = fts_select_index(charset, word->f_str, word->f_len);
+
+		/* Read the index records to optimize. */
+		error = fts_index_fetch_nodes(
+			trx, &graph, &optim->fts_index_table, word,
+			&fetch);
+
+		if (error == DB_SUCCESS) {
+			/* There must be some nodes to read. */
+			ut_a(ib_vector_size(optim->words) > 0);
+
+			/* Optimize the nodes that were read and write
+			back to DB. */
+			error = fts_optimize_compact(optim, index, start_time);
+
+			if (error == DB_SUCCESS) {
+				fts_sql_commit(optim->trx);
+			} else {
+				fts_sql_rollback(optim->trx);
+			}
+		}
+
+		/* Empty the vector again, whatever the outcome was. */
+		ib_vector_reset(optim->words);
+
+		if (error == DB_SUCCESS) {
+			if (!optim->done) {
+				if (!fts_zip_read_word(optim->zip, word)) {
+					/* No more words to process. */
+					optim->done = TRUE;
+				} else if (selected
+					   != fts_select_index(
+						charset, word->f_str,
+						word->f_len)
+					  && graph) {
+					/* The next word maps to a different
+					auxiliary index: drop the cached
+					graph (presumably so that it is
+					rebuilt for the other table -
+					confirm in fts_index_fetch_nodes). */
+					fts_que_graph_free(graph);
+					graph = NULL;
+				}
+			}
+		} else if (error == DB_LOCK_WAIT_TIMEOUT) {
+			fprintf(stderr, "InnoDB: Warning: lock wait timeout "
+				"during optimize. Retrying!\n");
+
+			/* Clear the error and retry with the same word. */
+			trx->error_state = DB_SUCCESS;
+		} else if (error == DB_DEADLOCK) {
+			fprintf(stderr, "InnoDB: Warning: deadlock "
+				"during optimize. Retrying!\n");
+
+			/* Clear the error and retry with the same word. */
+			trx->error_state = DB_SUCCESS;
+		} else {
+			optim->done = TRUE;		/* Exit the loop. */
+		}
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free(graph);
+	}
+}
+
+/**********************************************************************//**
+Move "word" on to the start of the next FTS auxiliary index: the word
+becomes the single character stored for the next selector entry, or the
+empty string when there is no further entry.
+@return TRUE if last index */
+static
+ibool
+fts_optimize_set_next_word(
+/*=======================*/
+	CHARSET_INFO*	charset,	/*!< in: charset */
+	fts_string_t*	word)		/*!< in/out: current last word */
+{
+	const ulint	slot = fts_select_next_index(
+		charset, word->f_str, word->f_len);
+
+	if (fts_index_selector[slot].value == 0) {
+		/* This was the last index: reset the last optimized
+		word to '' since no more words could be read from the
+		FTS index. */
+		word->f_len = 0;
+		word->f_str[0] = 0;
+
+		return(TRUE);
+	}
+
+	const ulint	first_char = fts_index_selector[slot].value;
+
+	ut_a(first_char <= 0xff);
+
+	/* Set to the first character of the next slot. */
+	word->f_len = 1;
+	word->f_str[0] = (byte) first_char;
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Optimize is complete. Reset the optimize start string for this FTS index
+to "". When FTS_OPTIMIZE_DEBUG is defined the completion time is stored
+first.
+NOTE(review): the status of storing the completion time is overwritten by
+the status of the reset that follows - confirm that this is intended.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_completed(
+/*=========================*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index)	/*!< in: table with one FTS index */
+{
+	dberr_t		error;
+	byte		empty[sizeof(ulint)];
+	fts_string_t	word;
+#ifdef FTS_OPTIMIZE_DEBUG
+	ib_time_t	end_time = ut_time();
+
+	error = fts_optimize_set_index_end_time(optim->trx, index, end_time);
+#endif
+
+	/* The end of the index has been reached: the next optimize run
+	starts from the empty string again. */
+
+	word.f_len = 0;
+	word.f_str = empty;
+	empty[0] = '\0';
+
+	error = fts_config_set_index_value(
+		optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word);
+
+	if (error == DB_SUCCESS) {
+
+		return(error);
+	}
+
+	fprintf(stderr, "InnoDB: Error: (%s) while "
+		"updating last optimized word!\n", ut_strerr(error));
+
+	return(error);
+}
+
+
+/**********************************************************************//**
+Read the list of words from the FTS auxiliary index that will be
+optimized in this pass. Reading starts after the last optimized word
+stored in the config table, or from the top when the delete list has been
+regenerated or no such word is stored. Auxiliary indexes that yield no
+words are skipped until words are found or the last index has been
+tried.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_read_words(
+/*==========================*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index,	/*!< in: table with one FTS index */
+	fts_string_t*	word)	/*!< in/out: buffer to use */
+{
+	dberr_t	error;
+
+	if (optim->del_list_regenerated) {
+		word->f_len = 0;
+		error = DB_SUCCESS;
+	} else {
+		/* Get the last word that was optimized from
+		the config table. */
+		error = fts_config_get_index_value(
+			optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word);
+
+		/* If record not found then we start from the top. */
+		if (error == DB_RECORD_NOT_FOUND) {
+			word->f_len = 0;
+			error = DB_SUCCESS;
+		}
+	}
+
+	if (error != DB_SUCCESS) {
+
+		return(error);
+	}
+
+	for (;;) {
+		error = fts_index_fetch_words(
+			optim, word, fts_num_word_optimize);
+
+		if (error != DB_SUCCESS) {
+
+			return(error);
+		}
+
+		/* Some words were found: this pass has work to do. */
+		if (optim->zip->n_words > 0) {
+
+			return(DB_SUCCESS);
+		}
+
+		/* The search returned an empty set, try the next
+		index in the horizontal split. */
+		fts_optimize_set_next_word(
+			optim->fts_index_table.charset, word);
+
+		/* An empty word means that there is no next index. */
+		if (word->f_len == 0) {
+
+			return(DB_SUCCESS);
+		}
+	}
+}
+
+/**********************************************************************//**
+Run OPTIMIZE on the given FTS index. Note: this can take a very long
+time (hours).
+
+The words to optimize in this pass are first collected into optim->zip,
+then read back one at a time and optimized by fts_optimize_words(). If no
+words were collected at all, the index is marked as completed and
+optim->n_completed is incremented.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index(
+/*===============*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index)	/*!< in: table with one FTS index */
+{
+	fts_string_t	word;
+	dberr_t		error;
+	byte		str[FTS_MAX_WORD_LEN + 1];
+
+	/* Set the current index that we have to optimize. */
+	optim->fts_index_table.index_id = index->id;
+	optim->fts_index_table.charset = fts_index_get_charset(index);
+
+	optim->done = FALSE; /* Optimize until !done */
+
+	/* Clear the whole buffer, including the extra byte kept for a
+	terminator, so that no byte of it is ever read uninitialised. */
+	memset(str, 0x0, sizeof(str));
+
+	/* We need to read the last word optimized so that we start from
+	the next word. */
+	word.f_str = str;
+
+	/* We set the length of word to the size of str since we
+	need to pass the max len info to the fts_get_config_value() function. */
+	word.f_len = sizeof(str) - 1;
+
+	/* Read the words that will be optimized in this pass. */
+	error = fts_optimize_index_read_words(optim, index, &word);
+
+	if (error == DB_SUCCESS) {
+		int	zip_error;
+
+		/* Nothing may have been read back from the zip buffer
+		so far. */
+		ut_a(optim->zip->pos == 0);
+		ut_a(optim->zip->zp->total_in == 0);
+		ut_a(optim->zip->zp->total_out == 0);
+
+		zip_error = inflateInit(optim->zip->zp);
+		ut_a(zip_error == Z_OK);
+
+		word.f_len = 0;
+		word.f_str = str;
+
+		/* Read the first word to optimize from the Zip buffer. */
+		if (!fts_zip_read_word(optim->zip, &word)) {
+
+			optim->done = TRUE;
+		} else {
+			fts_optimize_words(optim, index, &word);
+		}
+
+		/* If we couldn't read any records then optimize is
+		complete. Increment the number of indexes that have
+		been optimized and set FTS index optimize state to
+		completed. "error" is still DB_SUCCESS at this point,
+		nothing above has changed it. */
+		if (optim->zip->n_words == 0) {
+
+			error = fts_optimize_index_completed(optim, index);
+
+			if (error == DB_SUCCESS) {
+				++optim->n_completed;
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the delete, and delete cache tables.
+The statement built from fts_delete_doc_ids_sql is parsed once and then
+evaluated once for every entry of optim->to_delete->doc_ids, which must
+not be empty. On the first failing evaluation the transaction is rolled
+back and the loop stops.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_ids(
+/*===============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	fts_update_t*	update;
+	char*		sql_str;
+	doc_id_t	write_doc_id;
+	dberr_t		error = DB_SUCCESS;
+
+	info = pars_info_create();
+
+	/* The caller must only invoke this with at least one doc id. */
+	ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0);
+
+	update = static_cast<fts_update_t*>(
+		ib_vector_get(optim->to_delete->doc_ids, 0));
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+	/* This is required for the SQL parser to work. It must be able
+	to find the following variables. So we do it twice. */
+	fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+	fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+	/* Since we only replace the table_id and don't construct the full
+	name, we do substitution ourselves. Remember to free sql_str. */
+	sql_str = ut_strreplace(
+		fts_delete_doc_ids_sql, "%s", optim->name_prefix);
+
+	graph = fts_parse_sql(NULL, info, sql_str);
+
+	mem_free(sql_str);
+
+	/* Delete the doc ids that were copied at the start. */
+	for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) {
+
+		update = static_cast<fts_update_t*>(ib_vector_get(
+			optim->to_delete->doc_ids, i));
+
+		/* Convert to "storage" byte order. */
+		fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+		/* Both variables are bound to the address of the same
+		local, write_doc_id, which was just overwritten above. */
+		fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+
+		fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+		error = fts_eval_sql(optim->trx, graph);
+
+		// FIXME: Check whether delete actually succeeded!
+		if (error != DB_SUCCESS) {
+
+			fts_sql_rollback(optim->trx);
+			break;
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the pending delete, and delete tables, by
+evaluating the statement built from fts_end_delete_sql in optim->trx.
+The transaction is neither committed nor rolled back here.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_id_snapshot(
+/*=======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	/* Only the table_id part of the name is substituted, so the
+	'%s' replacement is done by hand. */
+	char*	stmt = ut_strreplace(
+		fts_end_delete_sql, "%s", optim->name_prefix);
+
+	/* Statement that removes the doc ids which were moved to the
+	delete pending state when optimize started. */
+	que_t*	query = fts_parse_sql(NULL, NULL, stmt);
+
+	/* The statement text is released right after parsing. */
+	mem_free(stmt);
+
+	const dberr_t	err = fts_eval_sql(optim->trx, query);
+
+	fts_que_graph_free(query);
+
+	return(err);
+}
+
+/**********************************************************************//**
+Count the rows of the BEING_DELETED auxiliary table that belongs to the
+table being optimized.
+@return the value returned by fts_get_rows_count() for that table */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	fts_table_t	being_deleted;
+
+	/* Describe the common "BEING_DELETED" auxiliary table of
+	optim->table. */
+	FTS_INIT_FTS_TABLE(&being_deleted, "BEING_DELETED", FTS_COMMON_TABLE,
+			   optim->table);
+
+	return(fts_get_rows_count(&being_deleted));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables, by evaluating the statement
+built from fts_init_delete_sql. The transaction is committed when the
+statement succeeds and rolled back on any error. In both cases
+optim->del_list_regenerated is set to TRUE.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	/* Only the table_id part of the name is substituted, so the
+	'%s' replacement is done by hand. */
+	char*	stmt = ut_strreplace(
+		fts_init_delete_sql, "%s", optim->name_prefix);
+
+	/* Statement that moves the doc_ids which are to be deleted to
+	the being deleted state. */
+	que_t*	query = fts_parse_sql(NULL, NULL, stmt);
+
+	mem_free(stmt);
+
+	const dberr_t	err = fts_eval_sql(optim->trx, query);
+
+	fts_que_graph_free(query);
+
+	if (err == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	/* Set regardless of the outcome of the statement. */
+	optim->del_list_regenerated = TRUE;
+
+	return(err);
+}
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize, first
+from the BEING_DELETED and then from the BEING_DELETED_CACHE table, into
+optim->to_delete. This function does not commit or roll back optim->trx
+itself. If either read fails, optim->to_delete is freed and set to NULL.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		err;
+
+	/* First source: the doc_ids in BEING_DELETED. */
+	optim->fts_common_table.suffix = "BEING_DELETED";
+
+	err = fts_table_fetch_doc_ids(
+		optim->trx, &optim->fts_common_table, optim->to_delete);
+
+	if (err == DB_SUCCESS) {
+
+		/* Second source: the additional doc_ids in
+		BEING_DELETED_CACHE. */
+		optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+		err = fts_table_fetch_doc_ids(
+			optim->trx, &optim->fts_common_table, optim->to_delete);
+	}
+
+	if (err == DB_SUCCESS) {
+		return(err);
+	}
+
+	/* A read failed: drop the list that was being filled. */
+	fts_doc_ids_free(optim->to_delete);
+	optim->to_delete = NULL;
+
+	return(err);
+}
+
+/*********************************************************************//**
+Optimize all the FTS indexes, skipping those that have already been
+optimized, since the FTS auxiliary indexes are not guaranteed to be
+of the same cardinality.
+The loop stops at the first index that fails, so that a failure is not
+hidden by a later index that succeeds. The transaction is committed if
+every index was processed successfully and rolled back otherwise.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_indexes(
+/*=================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		i;
+	dberr_t		error = DB_SUCCESS;
+	fts_t*		fts = optim->table->fts;
+
+	/* Optimize the FTS indexes. */
+	for (i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+		/* Look the index up first: every statement below,
+		including the debug-only ones, needs it. */
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+#ifdef	FTS_OPTIMIZE_DEBUG
+		ib_time_t	end_time;
+		ib_time_t	start_time;
+
+		/* Get the start and end optimize times for this index. */
+		error = fts_optimize_get_index_start_time(
+			optim->trx, index, &start_time);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		error = fts_optimize_get_index_end_time(
+			optim->trx, index, &end_time);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		/* Start time will be 0 only for the first time or after
+		completing the optimization of all FTS indexes. */
+		if (start_time == 0) {
+			start_time = ut_time();
+
+			error = fts_optimize_set_index_start_time(
+				optim->trx, index, start_time);
+
+			if (error != DB_SUCCESS) {
+				break;
+			}
+		}
+
+		/* Check if this index needs to be optimized or not. */
+		if (ut_difftime(end_time, start_time) < 0) {
+			error = fts_optimize_index(optim, index);
+
+			if (error != DB_SUCCESS) {
+				break;
+			}
+		} else {
+			++optim->n_completed;
+		}
+#else
+		error = fts_optimize_index(optim, index);
+
+		/* Do not let a later index overwrite this failure. */
+		if (error != DB_SUCCESS) {
+			break;
+		}
+#endif
+	}
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Cleanup the snapshot tables and the master deleted table. The
+transaction is committed when both purge steps succeed and rolled back
+as soon as one of them fails.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_snapshot(
+/*========================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	/* Step 1: remove from the master deleted tables the doc ids
+	that were in the snapshot taken at the start of optimize. */
+	dberr_t	err = fts_optimize_purge_deleted_doc_ids(optim);
+
+	/* Step 2: destroy the deleted doc id snapshot itself. */
+	if (err == DB_SUCCESS) {
+		err = fts_optimize_purge_deleted_doc_id_snapshot(optim);
+	}
+
+	if (err != DB_SUCCESS) {
+		fts_sql_rollback(optim->trx);
+
+		return(err);
+	}
+
+	fts_sql_commit(optim->trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Reset the start time to 0 so that a new optimize can be started.
+The per-index reset is only compiled in when FTS_OPTIMIZE_DEBUG is
+defined; without it the function only commits the transaction. The
+transaction is committed on success and rolled back on error.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_reset_start_time(
+/*==========================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		error = DB_SUCCESS;
+#ifdef FTS_OPTIMIZE_DEBUG
+	fts_t*		fts = optim->table->fts;
+
+	/* Optimization should have been completed for all indexes. */
+	ut_a(optim->n_completed == ib_vector_size(fts->indexes));
+
+	for (ulint i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+		ib_time_t	start_time = 0;
+
+		/* Look the index up before it is passed on below. */
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+		/* Reset the start time to 0 for this index. */
+		error = fts_optimize_set_index_start_time(
+			optim->trx, index, start_time);
+
+		/* Stop at the first failure so that it is not
+		overwritten by a later index. */
+		if (error != DB_SUCCESS) {
+			break;
+		}
+	}
+#endif
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table by a background thread. The table is
+skipped when its slot ran less than slot->interval_time ago, and it is
+only optimized when its FTS cache has recorded at least
+FTS_OPTIMIZE_THRESHOLD deletions. Unless the slot was skipped,
+slot->last_run is updated to the current time.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_table_bk(
+/*==================*/
+	fts_slot_t*	slot)	/*!< in: table to optimize */
+{
+	dict_table_t*	table = slot->table;
+	fts_t*		fts = table->fts;
+	dberr_t		err = DB_SUCCESS;
+
+	/* Avoid optimizing tables that were optimized recently. */
+	if (slot->last_run > 0
+	    && (ut_time() - slot->last_run) < slot->interval_time) {
+
+		return(DB_SUCCESS);
+	}
+
+	if (fts && fts->cache
+	    && fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) {
+
+		err = fts_optimize_table(table);
+
+		if (err == DB_SUCCESS) {
+			slot->state = FTS_STATE_DONE;
+			slot->last_run = 0;
+			slot->completed = ut_time();
+		}
+	}
+
+	/* Note time this run completed. */
+	slot->last_run = ut_time();
+
+	return(err);
+}
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+The steps are: create the optimize instance; take a snapshot of the
+deleted doc ids if the BEING_DELETED table is empty; read the snapshot
+into optim->to_delete; optimize the FTS indexes if there is anything to
+delete; and, once every index has been completed, purge the snapshot and
+reset the per-index start times. The start and the end of the run are
+logged to stderr.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)	/*!< in: table to optimize */
+{
+	dberr_t		error = DB_SUCCESS;
+	fts_optimize_t*	optim = NULL;
+	fts_t*		fts = table->fts;
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: FTS start optimize %s\n", table->name);
+
+	optim = fts_optimize_create(table);
+
+	// FIXME: Call this only at the start of optimize, currently we
+	// rely on DB_DUPLICATE_KEY to handle corrupting the snapshot.
+
+	/* Check whether there are still records in BEING_DELETED table */
+	if (fts_optimize_being_deleted_count(optim) == 0) {
+		/* Take a snapshot of the deleted document ids, they are copied
+		to the BEING_ tables. */
+		error = fts_optimize_create_deleted_doc_id_snapshot(optim);
+	}
+
+	/* A duplicate error is OK, since we don't erase the
+	doc ids from the being deleted state until all FTS
+	indexes have been optimized. */
+	if (error == DB_DUPLICATE_KEY) {
+		error = DB_SUCCESS;
+	}
+
+	if (error == DB_SUCCESS) {
+
+		/* These document ids will be filtered out during the
+		index optimization phase. They are in the snapshot that we
+		took above, at the start of the optimize. */
+		error = fts_optimize_read_deleted_doc_id_snapshot(optim);
+
+		if (error == DB_SUCCESS) {
+
+			/* Commit the read of being deleted
+			doc ids transaction. */
+			fts_sql_commit(optim->trx);
+
+			/* We would do optimization only if there
+			are deleted records to be cleaned up */
+			if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+				error = fts_optimize_indexes(optim);
+			}
+
+		} else {
+			/* The failed read has already released the list. */
+			ut_a(optim->to_delete == NULL);
+		}
+
+		/* Only after all indexes have been optimized can we
+		delete the (snapshot) doc ids in the pending delete,
+		and master deleted tables. */
+		if (error == DB_SUCCESS
+		    && optim->n_completed == ib_vector_size(fts->indexes)) {
+
+			if (fts_enable_diag_print) {
+				fprintf(stderr, "FTS_OPTIMIZE: Completed "
+						"Optimize, cleanup DELETED "
+						"table\n");
+			}
+
+			if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+
+				/* Purge the doc ids that were in the
+				snapshot from the snapshot tables and
+				the master deleted table. */
+				error = fts_optimize_purge_snapshot(optim);
+			}
+
+			if (error == DB_SUCCESS) {
+				/* Reset the start time of all the FTS indexes
+				so that optimize can be restarted. */
+				error = fts_optimize_reset_start_time(optim);
+			}
+		}
+	}
+
+	fts_optimize_free(optim);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: FTS end optimize %s\n", table->name);
+
+	return(error);
+}
+
+/********************************************************************//**
+Create a message for the optimize work queue. The message is allocated
+from a memory heap created here; the heap is stored in msg->heap.
+@return new message instance */
+static
+fts_msg_t*
+fts_optimize_create_msg(
+/*====================*/
+	fts_msg_type_t	type,		/*!< in: type of message */
+	void*		ptr)		/*!< in: message payload */
+{
+	/* The heap is sized for the message, one list node and 16
+	additional bytes. */
+	mem_heap_t*	msg_heap = mem_heap_create(
+		sizeof(fts_msg_t) + sizeof(ib_list_node_t) + 16);
+
+	fts_msg_t*	new_msg = static_cast<fts_msg_t*>(
+		mem_heap_alloc(msg_heap, sizeof(fts_msg_t)));
+
+	new_msg->heap = msg_heap;
+	new_msg->type = type;
+	new_msg->ptr = ptr;
+
+	return(new_msg);
+}
+
+/**********************************************************************//**
+Queue an FTS_MSG_ADD_TABLE message for the table on the optimize work
+queue. Nothing is done when the work queue does not exist. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+	dict_table_t*	table)			/*!< in: table to add */
+{
+	if (fts_optimize_wq == NULL) {
+		return;
+	}
+
+	/* Make sure table with FTS index cannot be evicted */
+	if (table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(table);
+	}
+
+	fts_msg_t*	add_msg = fts_optimize_create_msg(
+		FTS_MSG_ADD_TABLE, table);
+
+	ib_wqueue_add(fts_optimize_wq, add_msg, add_msg->heap);
+}
+
+/**********************************************************************//**
+Optimize a table: queue an FTS_MSG_OPTIMIZE_TABLE message for it on the
+optimize work queue. Nothing is done when the work queue does not exist. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+	dict_table_t*	table)			/*!< in: table to optimize */
+{
+	/* Optimizer thread could be shutdown */
+	if (fts_optimize_wq == NULL) {
+		return;
+	}
+
+	fts_msg_t*	optimize_msg = fts_optimize_create_msg(
+		FTS_MSG_OPTIMIZE_TABLE, table);
+
+	ib_wqueue_add(fts_optimize_wq, optimize_msg, optimize_msg->heap);
+}
+
+/**********************************************************************//**
+Remove the table from the OPTIMIZER's list. An FTS_MSG_DEL_TABLE message
+is queued and this function then waits until the consumer of the message
+signals the event carried in the message. Nothing is done when the work
+queue does not exist or when shutdown of the optimizer has started. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table)			/*!< in: table to remove */
+{
+	/* if the optimize system not yet initialized, return */
+	if (fts_optimize_wq == NULL) {
+		return;
+	}
+
+	/* FTS optimizer thread is already exited */
+	if (fts_opt_start_shutdown) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Try to remove table %s after FTS optimize"
+			" thread exiting.", table->name);
+		return;
+	}
+
+	fts_msg_t*	del_msg = fts_optimize_create_msg(
+		FTS_MSG_DEL_TABLE, NULL);
+
+	/* We will wait on this event until signalled by the consumer. */
+	os_event_t	ack_event = os_event_create();
+
+	/* The payload is allocated from the heap of the message. */
+	fts_msg_del_t*	payload = static_cast<fts_msg_del_t*>(
+		mem_heap_alloc(del_msg->heap, sizeof(fts_msg_del_t)));
+
+	payload->table = table;
+	payload->event = ack_event;
+	del_msg->ptr = payload;
+
+	ib_wqueue_add(fts_optimize_wq, del_msg, del_msg->heap);
+
+	os_event_wait(ack_event);
+
+	os_event_free(ack_event);
+}
+
+/**********************************************************************//**
+Find the slot for a particular table. Slots in state FTS_STATE_EMPTY
+are skipped: their table pointer is reset to NULL when a table is
+removed, so it must not be dereferenced.
+@return slot if found else NULL. */
+static
+fts_slot_t*
+fts_optimize_find_slot(
+/*===================*/
+	ib_vector_t*		tables,		/*!< in: vector of tables */
+	const dict_table_t*	table)		/*!< in: table to find */
+{
+	ulint		i;
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		fts_slot_t*	slot;
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i));
+
+		/* Check the state first, as fts_optimize_del_table()
+		does, so that an empty slot is never dereferenced. */
+		if (slot->state != FTS_STATE_EMPTY
+		    && slot->table->id == table->id) {
+			return(slot);
+		}
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Start optimizing table: reset the last_run and completed times of its
+slot to 0. An error is printed to stderr when the table has no slot. */
+static
+void
+fts_optimize_start_table(
+/*=====================*/
+	ib_vector_t*		tables,		/*!< in/out: vector of tables */
+	dict_table_t*		table)		/*!< in: table to optimize */
+{
+	fts_slot_t*	found = fts_optimize_find_slot(tables, table);
+
+	if (found != NULL) {
+		found->last_run = 0;
+		found->completed = 0;
+
+		return;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Error: table %s not registered "
+		"with the optimize thread.\n", table->name);
+}
+
+/**********************************************************************//**
+Add the table to the vector if it doesn't already exist. The last empty
+slot found is reused; a new slot is pushed only when there is none.
+@return TRUE if the table was added, FALSE if it was already there */
+static
+ibool
+fts_optimize_new_table(
+/*===================*/
+	ib_vector_t*	tables,			/*!< in/out: vector of tables */
+	dict_table_t*	table)			/*!< in: table to add */
+{
+	fts_slot_t*	slot;
+	ulint		reuse_pos = ULINT_UNDEFINED;
+
+	/* One pass both rejects duplicates and remembers a free slot. */
+	for (ulint pos = 0; pos < ib_vector_size(tables); ++pos) {
+
+		slot = static_cast<fts_slot_t*>(
+			ib_vector_get(tables, pos));
+
+		if (slot->state == FTS_STATE_EMPTY) {
+			reuse_pos = pos;
+			continue;
+		}
+
+		if (slot->table->id == table->id) {
+			/* Already exists in our optimize queue. */
+			return(FALSE);
+		}
+	}
+
+	if (reuse_pos == ULINT_UNDEFINED) {
+
+		/* No free slot: create a new one. */
+		slot = static_cast<fts_slot_t*>(ib_vector_push(tables, NULL));
+
+	} else {
+
+		/* Reuse old slot. */
+		slot = static_cast<fts_slot_t*>(
+			ib_vector_get(tables, reuse_pos));
+
+		ut_a(slot->state == FTS_STATE_EMPTY);
+	}
+
+	/* Start from an all-zero slot before filling it in. */
+	memset(slot, 0x0, sizeof(*slot));
+
+	slot->table = table;
+	slot->state = FTS_STATE_LOADED;
+	slot->interval_time = FTS_OPTIMIZE_INTERVAL_IN_SECS;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Remove the table from the vector if it exists. The matching slot is not
+erased from the vector: its table pointer is reset to NULL and its state
+set to FTS_STATE_EMPTY.
+@return TRUE if a slot was emptied, FALSE if the table was not found */
+static
+ibool
+fts_optimize_del_table(
+/*===================*/
+	ib_vector_t*	tables,			/*!< in/out: vector of tables */
+	fts_msg_del_t*	msg)			/*!< in: table to delete */
+{
+	dict_table_t*	victim = msg->table;
+
+	for (ulint pos = 0; pos < ib_vector_size(tables); ++pos) {
+
+		fts_slot_t*	slot = static_cast<fts_slot_t*>(
+			ib_vector_get(tables, pos));
+
+		/* FIXME: Should we assert on this ? */
+		if (slot->state == FTS_STATE_EMPTY
+		    || slot->table->id != victim->id) {
+
+			continue;
+		}
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: FTS Optimize Removing "
+			"table %s\n", victim->name);
+
+		slot->table = NULL;
+		slot->state = FTS_STATE_EMPTY;
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Calculate how many of the registered tables need to be optimized.
+A slot in state DONE or LOADED counts when at least interval_time has
+passed since slot->completed; a slot in state RUNNING counts when more
+than interval_time has passed since slot->last_run. Slots in any other
+state are not counted.
+@return no. of tables to optimize */
+static
+ulint
+fts_optimize_how_many(
+/*==================*/
+	const ib_vector_t*	tables)		/*!< in: registered tables
+						vector*/
+{
+	ulint		i;
+	ib_time_t	delta;
+	ulint		n_tables = 0;
+	ib_time_t	current_time;
+
+	/* A single timestamp is used for every slot in this pass. */
+	current_time = ut_time();
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		const fts_slot_t*	slot;
+
+		slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(tables, i));
+
+		switch (slot->state) {
+		case FTS_STATE_DONE:
+		case FTS_STATE_LOADED:
+			/* The completion time must not lie in the future. */
+			ut_a(slot->completed <= current_time);
+
+			delta = current_time - slot->completed;
+
+			/* Skip slots that have been optimized recently. */
+			if (delta >= slot->interval_time) {
+				++n_tables;
+			}
+			break;
+
+		case FTS_STATE_RUNNING:
+			/* The last run time must not lie in the future. */
+			ut_a(slot->last_run <= current_time);
+
+			delta = current_time - slot->last_run;
+
+			/* Note the strict comparison here, unlike the
+			DONE/LOADED case above. */
+			if (delta > slot->interval_time) {
+				++n_tables;
+			}
+			break;
+
+			/* Slots in a state other than the above
+			are ignored. */
+		case FTS_STATE_EMPTY:
+		case FTS_STATE_SUSPENDED:
+			break;
+		}
+
+	}
+
+	return(n_tables);
+}
+
+/**********************************************************************//**
+Check if the total memory used by all FTS table exceeds the maximum limit.
+The check is skipped (false is returned) when fts_need_sync is already
+set or when less than 5 seconds have passed since the previous check;
+otherwise last_check_sync_time is updated before the caches are summed.
+@return true if a sync is needed, false otherwise */
+static
+bool
+fts_is_sync_needed(
+/*===============*/
+	const ib_vector_t*	tables)		/*!< in: registered tables
+						vector*/
+{
+	const double	elapsed = difftime(ut_time(), last_check_sync_time);
+
+	if (fts_need_sync || elapsed < 5) {
+		return(false);
+	}
+
+	last_check_sync_time = ut_time();
+
+	ulint		cache_bytes = 0;
+
+	for (ulint pos = 0; pos < ib_vector_size(tables); ++pos) {
+
+		const fts_slot_t*	slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(tables, pos));
+
+		/* Only slots that carry a table with FTS data add to
+		the running total. */
+		if (slot->table != NULL && slot->table->fts != NULL) {
+			cache_bytes += slot->table->fts->cache->total_size;
+		}
+
+		/* Stop as soon as the limit is exceeded. */
+		if (cache_bytes > fts_max_total_cache_size) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+#if 0
+/* This function is excluded from compilation by the enclosing #if 0. */
+/*********************************************************************//**
+Check whether a table needs to be optimized. One slot is examined per
+call, selected by fts_optimize_sync_iterator, which is advanced at the
+end and wraps to 0 once it reaches the number of slots. */
+static
+void
+fts_optimize_need_sync(
+/*===================*/
+	ib_vector_t*	tables)	/*!< in: list of tables */
+{
+	dict_table_t*	table = NULL;
+	fts_slot_t*	slot;
+	ulint		num_table = ib_vector_size(tables);
+
+	if (!num_table) {
+		return;
+	}
+
+	/* Wrap the iterator around. */
+	if (fts_optimize_sync_iterator >= num_table) {
+		fts_optimize_sync_iterator = 0;
+	}
+
+	/* NOTE(review): unlike the other callers in this file, the result
+	of ib_vector_get() is assigned without a static_cast here. */
+	slot = ib_vector_get(tables, fts_optimize_sync_iterator);
+	table = slot->table;
+
+	/* A slot without a table returns without advancing the iterator. */
+	if (!table) {
+		return;
+	}
+
+	ut_ad(table->fts);
+
+	if (table->fts->cache) {
+		ulint	deleted = table->fts->cache->deleted;
+
+		if (table->fts->cache->added
+		    >= fts_optimize_add_threshold) {
+			fts_sync_table(table);
+		} else if (deleted >= fts_optimize_delete_threshold) {
+			fts_optimize_do_table(table);
+
+			/* Subtract the count sampled above under the
+			deleted_lock mutex. */
+			mutex_enter(&table->fts->cache->deleted_lock);
+			table->fts->cache->deleted -= deleted;
+			mutex_exit(&table->fts->cache->deleted_lock);
+		}
+	}
+
+	fts_optimize_sync_iterator++;
+
+	return;
+}
+#endif
+
+/**********************************************************************//**
+Optimize all FTS tables.
+Main function of the optimize thread. It keeps a vector of slots, one per
+registered table, and loops until an FTS_MSG_STOP message arrives or
+srv_shutdown_state leaves SRV_SHUTDOWN_NONE. In each iteration it either
+optimizes the table in the current slot (when the work queue is empty and
+there are tables due for optimization) or waits for and processes one
+message from the work queue. After the loop the FTS data of every
+non-empty slot is synced and freed, and the exit event is signalled.
+@return Dummy return */
+UNIV_INTERN
+os_thread_ret_t
+fts_optimize_thread(
+/*================*/
+	void*		arg)			/*!< in: work queue*/
+{
+	mem_heap_t*	heap;
+	ib_vector_t*	tables;
+	ib_alloc_t*	heap_alloc;
+	ulint		current = 0;
+	ibool		done = FALSE;
+	ulint		n_tables = 0;
+	os_event_t	exit_event = 0;
+	ulint		n_optimize = 0;
+	ib_wqueue_t*	wq = (ib_wqueue_t*) arg;
+
+	ut_ad(!srv_read_only_mode);
+
+	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* The vector stores fts_slot_t values, not pointers. */
+	tables = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	while(!done && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		/* If there is no message in the queue and we have tables
+		to optimize then optimize the tables. */
+
+		if (!done
+		    && ib_wqueue_is_empty(wq)
+		    && n_tables > 0
+		    && n_optimize > 0) {
+
+			fts_slot_t*	slot;
+
+			ut_a(ib_vector_size(tables) > 0);
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, current));
+
+			/* Handle the case of empty slots. */
+			if (slot->state != FTS_STATE_EMPTY) {
+
+				slot->state = FTS_STATE_RUNNING;
+
+				/* The returned error code is not examined. */
+				fts_optimize_table_bk(slot);
+			}
+
+			++current;
+
+			/* Wrap around the counter. */
+			if (current >= ib_vector_size(tables)) {
+				/* A full pass is over: recount the tables
+				that are due. */
+				n_optimize = fts_optimize_how_many(tables);
+
+				current = 0;
+			}
+
+		} else if (n_optimize == 0 || !ib_wqueue_is_empty(wq)) {
+			fts_msg_t*	msg;
+
+			msg = static_cast<fts_msg_t*>(
+				ib_wqueue_timedwait(wq,
+						    FTS_QUEUE_WAIT_IN_USECS));
+
+			/* Timeout ? */
+			if (msg == NULL) {
+				if (fts_is_sync_needed(tables)) {
+					fts_need_sync = true;
+				}
+
+				continue;
+			}
+
+			switch (msg->type) {
+			case FTS_MSG_START:
+				break;
+
+			case FTS_MSG_PAUSE:
+				break;
+
+			case FTS_MSG_STOP:
+				/* The payload of a stop message is the event
+				to signal when this thread exits. */
+				done = TRUE;
+				exit_event = (os_event_t) msg->ptr;
+				break;
+
+			case FTS_MSG_ADD_TABLE:
+				ut_a(!done);
+				if (fts_optimize_new_table(
+					tables,
+					static_cast<dict_table_t*>(
+					msg->ptr))) {
+					++n_tables;
+				}
+				break;
+
+			case FTS_MSG_OPTIMIZE_TABLE:
+				if (!done) {
+					fts_optimize_start_table(
+						tables,
+						static_cast<dict_table_t*>(
+						msg->ptr));
+				}
+				break;
+
+			case FTS_MSG_DEL_TABLE:
+				if (fts_optimize_del_table(
+					tables, static_cast<fts_msg_del_t*>(
+						msg->ptr))) {
+					--n_tables;
+				}
+
+				/* Signal the producer that we have
+				removed the table. */
+				os_event_set(
+					((fts_msg_del_t*) msg->ptr)->event);
+				break;
+
+			default:
+				ut_error;
+			}
+
+			/* The message and any payload allocated from its
+			heap are released together. */
+			mem_heap_free(msg->heap);
+
+			if (!done) {
+				n_optimize = fts_optimize_how_many(tables);
+			} else {
+				n_optimize = 0;
+			}
+		}
+	}
+
+	/* Server is being shutdown, sync the data from FTS cache to disk
+	if needed */
+	if (n_tables > 0) {
+		ulint	i;
+
+		for (i = 0; i < ib_vector_size(tables); i++) {
+			fts_slot_t*	slot;
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, i));
+
+			if (slot->state != FTS_STATE_EMPTY) {
+				dict_table_t*	table = NULL;
+
+				/* The table is looked up again by name
+				rather than through slot->table. */
+			        table = dict_table_open_on_name(
+					slot->table->name, FALSE, FALSE,
+					DICT_ERR_IGNORE_INDEX_ROOT);
+
+				if (table) {
+
+					if (dict_table_has_fts_index(table)) {
+						fts_sync_table(table);
+					}
+
+					if (table->fts) {
+						fts_free(table);
+					}
+
+					dict_table_close(table, FALSE, FALSE);
+				}
+			}
+		}
+	}
+
+	ib_vector_free(tables);
+
+	ib_logf(IB_LOG_LEVEL_INFO, "FTS optimize thread exiting.");
+
+	/* NOTE(review): exit_event is only assigned by FTS_MSG_STOP. If the
+	loop ended because srv_shutdown_state changed, it is still 0 here -
+	confirm that os_event_set() tolerates that, or that this path cannot
+	happen. */
+	os_event_set(exit_event);
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. The work queue
+must not exist yet; last_check_sync_time is set to the current time. */
+UNIV_INTERN
+void
+fts_optimize_init(void)
+/*===================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* For now we only support one optimize thread. */
+	ut_a(fts_optimize_wq == NULL);
+
+	last_check_sync_time = ut_time();
+
+	fts_optimize_wq = ib_wqueue_create();
+	ut_a(fts_optimize_wq != NULL);
+
+	/* The new thread receives the work queue as its argument. */
+	os_thread_create(fts_optimize_thread, fts_optimize_wq, NULL);
+}
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if optimize queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void)
+/*======================*/
+{
+	if (fts_optimize_wq == NULL) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. Sets
+fts_opt_start_shutdown, queues an FTS_MSG_STOP message carrying a newly
+created event, waits for that event to be signalled and then frees the
+event and the work queue. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void)
+/*=============================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	fts_msg_t*	msg;
+	os_event_t	event;
+
+	/* If there is an ongoing activity on dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it */
+	dict_mutex_enter_for_mysql();
+
+	/* Tell the FTS optimizer system that the optimizer thread is
+	exiting; messages sent after this point will not be
+	processed. */
+	fts_opt_start_shutdown = true;
+	dict_mutex_exit_for_mysql();
+
+	/* We tell the OPTIMIZE thread to switch to state done, we
+	can't delete the work queue here because the add thread needs
+	to deregister the FTS tables. */
+	event = os_event_create();
+
+	/* The event travels as the payload of the stop message. */
+	msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+	msg->ptr = event;
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+	os_event_wait(event);
+	os_event_free(event);
+
+	/* NOTE(review): the queue is freed here but fts_optimize_wq is not
+	reset in this function, so the pointer is stale from here on. */
+	ib_wqueue_free(fts_optimize_wq);
+
+}
+
+/**********************************************************************//**
+Reset the work queue: the global work queue pointer is set to NULL.
+The queue itself is not freed by this function. */
+UNIV_INTERN
+void
+fts_optimize_end(void)
+/*==================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	// FIXME: Potential race condition here: We should wait for
+	// the optimize thread to confirm shutdown.
+	fts_optimize_wq = NULL;
+}
diff --git a/storage/xtradb/fts/fts0pars.cc b/storage/xtradb/fts/fts0pars.cc
new file mode 100644
index 00000000000..a4009106c83
--- /dev/null
+++ b/storage/xtradb/fts/fts0pars.cc
@@ -0,0 +1,1972 @@
+/* A Bison parser, made by GNU Bison 2.5.  */
+
+/* Bison implementation for Yacc-like parsers in C
+   
+      Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+   
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+   
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.5"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 1
+
+/* Push parsers.  */
+#define YYPUSH 0
+
+/* Pull parsers.  */
+#define YYPULL 1
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names.  */
+#define yyparse         ftsparse
+#define yylex           ftslex
+#define yyerror         ftserror
+#define yylval          ftslval
+#define yychar          ftschar
+#define yydebug         ftsdebug
+#define yynerrs         ftsnerrs
+
+
+/* Copy the first part of user declarations.  */
+
+/* Line 268 of yacc.c  */
+#line 26 "fts0pars.y"
+
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern	int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern	int fts_blexer(YYSTYPE*, yyscan_t);
+extern	int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+typedef	int	(*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef	int	(*fts_scanner)();
+
+struct fts_lexer_t {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+
+
+/* Line 268 of yacc.c  */
+#line 115 "fts0pars.cc"
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 293 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 293 of yacc.c  */
+#line 165 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 343 of yacc.c  */
+#line 177 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# else
+#  define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+#  if ENABLE_NLS
+#   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#  endif
+# endif
+# ifndef YY_
+#  define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E.  */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions.  */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int yyi)
+#else
+static int
+YYID (yyi)
+    int yyi;
+#endif
+{
+  return yyi;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   elif defined __BUILTIN_VA_ARG_INCR
+#    include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+#   elif defined _AIX
+#    define YYSTACK_ALLOC __alloca
+#   elif defined _MSC_VER
+#    include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+#    define alloca _alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#    if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#     include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#     ifndef EXIT_SUCCESS
+#      define EXIT_SUCCESS 0
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning.  */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+    /* The OS might guarantee only one guard page at the bottom of the stack,
+       and a page size can be as small as 4096 bytes.  So we cannot safely
+       invoke alloca (N) if N exceeds 4096.  Use a slightly smaller number
+       to allow for a few compiler-allocated temporary stack slots.  */
+#   define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+#  endif
+# else
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+#   define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+#  endif
+#  if (defined __cplusplus && ! defined EXIT_SUCCESS \
+       && ! ((defined YYMALLOC || defined malloc) \
+	     && (defined YYFREE || defined free)))
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   ifndef EXIT_SUCCESS
+#    define EXIT_SUCCESS 0
+#   endif
+#  endif
+#  ifndef YYMALLOC
+#   define YYMALLOC malloc
+#   if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+#  ifndef YYFREE
+#   define YYFREE free
+#   if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void free (void *); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+     && (! defined __cplusplus \
+	 || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  */
+union yyalloc
+{
+  yytype_int16 yyss_alloc;
+  YYSTYPE yyvs_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+      + YYSTACK_GAP_MAXIMUM)
+
+# define YYCOPY_NEEDED 1
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack)				\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack_alloc, Stack, yysize);			\
+	Stack = &yyptr->Stack_alloc;					\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (YYID (0))
+
+#endif
+
+#if defined YYCOPY_NEEDED && YYCOPY_NEEDED
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined __GNUC__ && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  YYSIZE_T yyi;				\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (YYID (0))
+#  endif
+# endif
+#endif /* !YYCOPY_NEEDED */
+
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL  3
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   52
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS  16
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS  8
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES  24
+/* YYNSTATES -- Number of states.  */
+#define YYNSTATES  33
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   261
+
+#define YYTRANSLATE(YYX)						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const yytype_uint8 yytranslate[] =
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      12,    13,    14,     7,     2,     8,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      10,     2,    11,     2,    15,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     9,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const yytype_uint8 yyprhs[] =
+{
+       0,     0,     3,     5,     6,     9,    12,    16,    21,    23,
+      25,    28,    32,    36,    39,    44,    47,    49,    51,    53,
+      55,    57,    59,    61,    64
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS.  */
+static const yytype_int8 yyrhs[] =
+{
+      17,     0,    -1,    18,    -1,    -1,    18,    20,    -1,    18,
+      19,    -1,    12,    18,    13,    -1,    21,    12,    18,    13,
+      -1,    22,    -1,    23,    -1,    22,    14,    -1,    23,    15,
+       6,    -1,    21,    22,    14,    -1,    21,    22,    -1,    21,
+      23,    15,     6,    -1,    21,    23,    -1,     8,    -1,     7,
+      -1,     9,    -1,    10,    -1,    11,    -1,     5,    -1,     6,
+      -1,    14,    22,    -1,     4,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const yytype_uint8 yyrline[] =
+{
+       0,    79,    79,    85,    89,    99,   111,   115,   124,   128,
+     132,   136,   141,   147,   152,   159,   165,   169,   173,   177,
+     181,   186,   191,   197,   202
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM",
+  "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'",
+  "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix",
+  "term", "text", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const yytype_uint16 yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,    43,    45,   126,
+      60,    62,    40,    41,    42,    64
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_uint8 yyr1[] =
+{
+       0,    16,    17,    18,    18,    18,    19,    19,    20,    20,
+      20,    20,    20,    20,    20,    20,    21,    21,    21,    21,
+      21,    22,    22,    22,    23
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const yytype_uint8 yyr2[] =
+{
+       0,     2,     1,     0,     2,     2,     3,     4,     1,     1,
+       2,     3,     3,     2,     4,     2,     1,     1,     1,     1,
+       1,     1,     1,     2,     1
+};
+
+/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM.
+   Performed when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const yytype_uint8 yydefact[] =
+{
+       3,     0,     2,     1,    24,    21,    22,    17,    16,    18,
+      19,    20,     3,     0,     5,     4,     0,     8,     9,     0,
+      23,     3,    13,    15,    10,     0,     6,     0,    12,     0,
+      11,     7,    14
+};
+
+/* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int8 yydefgoto[] =
+{
+      -1,     1,     2,    14,    15,    16,    17,    18
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -5
+static const yytype_int8 yypact[] =
+{
+      -5,    38,    18,    -5,    -5,    -5,    -5,    -5,    -5,    -5,
+      -5,    -5,    -5,    31,    -5,    -5,    29,    30,    32,    -4,
+      -5,    -5,    34,    35,    -5,    40,    -5,     7,    -5,    43,
+      -5,    -5,    -5
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const yytype_int8 yypgoto[] =
+{
+      -5,    -5,    19,    -5,    -5,    -5,    26,    36
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule whose
+   number is the opposite.  If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint8 yytable[] =
+{
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    26,
+      13,     4,     5,     6,     7,     8,     9,    10,    11,    12,
+      31,    13,     4,     5,     6,     7,     8,     9,    10,    11,
+      12,    19,    13,     4,     5,     6,     5,     6,     3,    20,
+      27,    21,    22,    13,    24,    13,    30,    25,    28,    32,
+      29,     0,    23
+};
+
+#define yypact_value_is_default(yystate) \
+  ((yystate) == (-5))
+
+#define yytable_value_is_error(yytable_value) \
+  YYID (0)
+
+static const yytype_int8 yycheck[] =
+{
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    13,
+      14,     4,     5,     6,     7,     8,     9,    10,    11,    12,
+      13,    14,     4,     5,     6,     7,     8,     9,    10,    11,
+      12,    12,    14,     4,     5,     6,     5,     6,     0,    13,
+      21,    12,    16,    14,    14,    14,     6,    15,    14,     6,
+      15,    -1,    16
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const yytype_uint8 yystos[] =
+{
+       0,    17,    18,     0,     4,     5,     6,     7,     8,     9,
+      10,    11,    12,    14,    19,    20,    21,    22,    23,    18,
+      22,    12,    22,    23,    14,    15,    13,    18,    14,    15,
+       6,    13,     6
+};
+
+#define yyerrok		(yyerrstatus = 0)
+#define yyclearin	(yychar = YYEMPTY)
+#define YYEMPTY		(-2)
+#define YYEOF		0
+
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  However,
+   YYFAIL appears to be in use.  Nevertheless, it is formally deprecated
+   in Bison 2.4.2's NEWS entry, where a plan to phase it out is
+   discussed.  */
+
+#define YYFAIL		goto yyerrlab
+#if defined YYFAIL
+  /* This is here to suppress warnings from the GCC cpp's
+     -Wunused-macros.  Normally we don't worry about that warning, but
+     some users do, and we want to make it easy for users to remove
+     YYFAIL uses, which will produce warnings from Bison 2.5.  */
+#endif
+
+#define YYRECOVERING()  (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      YYPOPSTACK (1);						\
+      goto yybackup;						\
+    }								\
+  else								\
+    {								\
+      yyerror (YY_("syntax error: cannot back up")); \
+      YYERROR;							\
+    }								\
+while (YYID (0))
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (YYID (N))                                                    \
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (YYID (0))
+#endif
+
+
+/* This macro is provided for backward compatibility. */
+
+#ifndef YY_LOCATION_PRINT
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (&yylval, YYLEX_PARAM)
+#else
+# define YYLEX yylex (&yylval)
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)			  \
+do {									  \
+  if (yydebug)								  \
+    {									  \
+      YYFPRINTF (stderr, "%s ", Title);					  \
+      yy_symbol_print (stderr,						  \
+		  Type, Value); \
+      YYFPRINTF (stderr, "\n");						  \
+    }									  \
+} while (YYID (0))
+
+
+/*----------------------------------------.
+| Print this symbol's value on YYOUTPUT.  |
+`----------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (!yyvaluep)
+    return;
+# ifdef YYPRINT
+  if (yytype < YYNTOKENS)
+    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+  YYUSE (yyoutput);
+# endif
+  switch (yytype)
+    {
+      default:
+	break;
+    }
+}
+
+
+/*-------------------------------------------------.
+| Print this symbol's name and value on YYOUTPUT.  |
+`-------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (yytype < YYNTOKENS)
+    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+  else
+    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+  yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+  YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+#else
+static void
+yy_stack_print (yybottom, yytop)
+    yytype_int16 *yybottom;
+    yytype_int16 *yytop;
+#endif
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (; yybottom <= yytop; yybottom++)
+    {
+      int yybot = *yybottom;
+      YYFPRINTF (stderr, " %d", yybot);
+    }
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+    YYSTYPE *yyvsp;
+    int yyrule;
+#endif
+{
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  unsigned long int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+	     yyrule - 1, yylno);
+  /* The symbols being reduced.  */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      YYFPRINTF (stderr, "   $%d = ", yyi + 1);
+      yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+		       &(yyvsp[(yyi + 1) - (yynrhs)])
+		       		       );
+      YYFPRINTF (stderr, "\n");
+    }
+}
+
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined __GLIBC__ && defined _STRING_H
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+    const char *yystr;
+#endif
+{
+  YYSIZE_T yylen;
+  for (yylen = 0; yystr[yylen]; yylen++)
+    continue;
+  return yylen;
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+    char *yydest;
+    const char *yysrc;
+#endif
+{
+  char *yyd = yydest;
+  const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0')
+    continue;
+
+  return yyd - 1;
+}
+#  endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+   quotes and backslashes, so that it's suitable for yyerror.  The
+   heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or backslash (other than
+   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
+   null, do not copy; instead, return the length of what the result
+   would have been.  */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
+   about the unexpected token YYTOKEN for the state stack whose top is
+   YYSSP.
+
+   Return 0 if *YYMSG was successfully written.  Return 1 if *YYMSG is
+   not large enough to hold the message.  In that case, also set
+   *YYMSG_ALLOC to the required number of bytes.  Return 2 if the
+   required number of bytes is too large to store.  */
+static int
+yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
+                yytype_int16 *yyssp, int yytoken)
+{
+  YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]);
+  YYSIZE_T yysize = yysize0;
+  YYSIZE_T yysize1;
+  enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+  /* Internationalized format string. */
+  const char *yyformat = 0;
+  /* Arguments of yyformat. */
+  char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+  /* Number of reported tokens (one for the "unexpected", one per
+     "expected"). */
+  int yycount = 0;
+
+  /* There are many possibilities here to consider:
+     - Assume YYFAIL is not used.  It's too flawed to consider.  See
+       <http://lists.gnu.org/archive/html/bison-patches/2009-12/msg00024.html>
+       for details.  YYERROR is fine as it does not invoke this
+       function.
+     - If this state is a consistent state with a default action, then
+       the only way this function was invoked is if the default action
+       is an error action.  In that case, don't check for expected
+       tokens because there are none.
+     - The only way there can be no lookahead present (in yychar) is if
+       this state is a consistent state with a default action.  Thus,
+       detecting the absence of a lookahead is sufficient to determine
+       that there is no unexpected or expected token to report.  In that
+       case, just report a simple "syntax error".
+     - Don't assume there isn't a lookahead just because this state is a
+       consistent state with a default action.  There might have been a
+       previous inconsistent state, consistent state with a non-default
+       action, or user semantic action that manipulated yychar.
+     - Of course, the expected token list depends on states to have
+       correct lookahead information, and it depends on the parser not
+       to perform extra reductions after fetching a lookahead from the
+       scanner and before detecting a syntax error.  Thus, state merging
+       (from LALR or IELR) and default reductions corrupt the expected
+       token list.  However, the list is correct for canonical LR with
+       one exception: it will still contain any token that will not be
+       accepted due to an error action in a later state.
+  */
+  if (yytoken != YYEMPTY)
+    {
+      int yyn = yypact[*yyssp];
+      yyarg[yycount++] = yytname[yytoken];
+      if (!yypact_value_is_default (yyn))
+        {
+          /* Start YYX at -YYN if negative to avoid negative indexes in
+             YYCHECK.  In other words, skip the first -YYN actions for
+             this state because they are default actions.  */
+          int yyxbegin = yyn < 0 ? -yyn : 0;
+          /* Stay within bounds of both yycheck and yytname.  */
+          int yychecklim = YYLAST - yyn + 1;
+          int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+          int yyx;
+
+          for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+            if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
+                && !yytable_value_is_error (yytable[yyx + yyn]))
+              {
+                if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+                  {
+                    yycount = 1;
+                    yysize = yysize0;
+                    break;
+                  }
+                yyarg[yycount++] = yytname[yyx];
+                yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+                if (! (yysize <= yysize1
+                       && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+                  return 2;
+                yysize = yysize1;
+              }
+        }
+    }
+
+  switch (yycount)
+    {
+# define YYCASE_(N, S)                      \
+      case N:                               \
+        yyformat = S;                       \
+      break
+      YYCASE_(0, YY_("syntax error"));
+      YYCASE_(1, YY_("syntax error, unexpected %s"));
+      YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
+      YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
+      YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
+      YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
+# undef YYCASE_
+    }
+
+  yysize1 = yysize + yystrlen (yyformat);
+  if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+    return 2;
+  yysize = yysize1;
+
+  if (*yymsg_alloc < yysize)
+    {
+      *yymsg_alloc = 2 * yysize;
+      if (! (yysize <= *yymsg_alloc
+             && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
+        *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
+      return 1;
+    }
+
+  /* Avoid sprintf, as that infringes on the user's name space.
+     Don't have undefined behavior even if the translation
+     produced a string with the wrong number of "%s"s.  */
+  {
+    char *yyp = *yymsg;
+    int yyi = 0;
+    while ((*yyp = *yyformat) != '\0')
+      if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
+        {
+          yyp += yytnamerr (yyp, yyarg[yyi++]);
+          yyformat += 2;
+        }
+      else
+        {
+          yyp++;
+          yyformat++;
+        }
+  }
+  return 0;
+}
+#endif /* YYERROR_VERBOSE */
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  YYUSE (yyvaluep);
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+  switch (yytype)
+    {
+
+      default:
+	break;
+    }
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+/*----------.
+| yyparse.  |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+    void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+/* The lookahead symbol.  */
+int yychar;
+
+/* The semantic value of the lookahead symbol.  */
+YYSTYPE yylval;
+
+    /* Number of syntax errors so far.  */
+    int yynerrs;
+
+    int yystate;
+    /* Number of tokens to shift before error messages enabled.  */
+    int yyerrstatus;
+
+    /* The stacks and their tools:
+       `yyss': related to states.
+       `yyvs': related to semantic values.
+
+       Refer to the stacks thru separate pointers, to allow yyoverflow
+       to reallocate them elsewhere.  */
+
+    /* The state stack.  */
+    yytype_int16 yyssa[YYINITDEPTH];
+    yytype_int16 *yyss;
+    yytype_int16 *yyssp;
+
+    /* The semantic value stack.  */
+    YYSTYPE yyvsa[YYINITDEPTH];
+    YYSTYPE *yyvs;
+    YYSTYPE *yyvsp;
+
+    YYSIZE_T yystacksize;
+
+  int yyn;
+  int yyresult;
+  /* Lookahead token as an internal (translated) token number.  */
+  int yytoken;
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+
+#if YYERROR_VERBOSE
+  /* Buffer for error messages, and its allocated size.  */
+  char yymsgbuf[128];
+  char *yymsg = yymsgbuf;
+  YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N))
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped.  */
+  int yylen = 0;
+
+  yytoken = 0;
+  yyss = yyssa;
+  yyvs = yyvsa;
+  yystacksize = YYINITDEPTH;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY; /* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+  yyssp = yyss;
+  yyvsp = yyvs;
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed.  So pushing a state here evens the stacks.  */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack.  Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	yytype_int16 *yyss1 = yyss;
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow (YY_("memory exhausted"),
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+		    &yystacksize);
+
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyexhaustedlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	yytype_int16 *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyexhaustedlab;
+	YYSTACK_RELOCATE (yyss_alloc, yyss);
+	YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  if (yystate == YYFINAL)
+    YYACCEPT;
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+  /* Do appropriate processing given the current state.  Read a
+     lookahead token if we need one and don't already have one.  */
+
+  /* First try to decide what to do without reference to lookahead token.  */
+  yyn = yypact[yystate];
+  if (yypact_value_is_default (yyn))
+    goto yydefault;
+
+  /* Not known => get a lookahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yytable_value_is_error (yyn))
+        goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the lookahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the shifted token.  */
+  yychar = YYEMPTY;
+
+  yystate = yyn;
+  *++yyvsp = yylval;
+
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 2:
+
+/* Line 1806 of yacc.c  */
+#line 79 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+		((fts_ast_state_t*) state)->root = (yyval.node);
+	}
+    break;
+
+  case 3:
+
+/* Line 1806 of yacc.c  */
+#line 85 "fts0pars.y"
+    {
+		(yyval.node) = NULL;
+	}
+    break;
+
+  case 4:
+
+/* Line 1806 of yacc.c  */
+#line 89 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (2)].node);
+
+		if (!(yyval.node)) {
+			(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node));
+		} else {
+			fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+		}
+	}
+    break;
+
+  case 5:
+
+/* Line 1806 of yacc.c  */
+#line 99 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (2)].node);
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+
+		if (!(yyval.node)) {
+			(yyval.node) = fts_ast_create_node_subexp_list(state, (yyvsp[(2) - (2)].node));
+		} else {
+			fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+		}
+	}
+    break;
+
+  case 6:
+
+/* Line 1806 of yacc.c  */
+#line 111 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(2) - (3)].node);
+	}
+    break;
+
+  case 7:
+
+/* Line 1806 of yacc.c  */
+#line 115 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_subexp_list(state, (yyvsp[(1) - (4)].node));
+
+		if ((yyvsp[(3) - (4)].node)) {
+			fts_ast_add_node((yyval.node), (yyvsp[(3) - (4)].node));
+		}
+	}
+    break;
+
+  case 8:
+
+/* Line 1806 of yacc.c  */
+#line 124 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+	}
+    break;
+
+  case 9:
+
+/* Line 1806 of yacc.c  */
+#line 128 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+	}
+    break;
+
+  case 10:
+
+/* Line 1806 of yacc.c  */
+#line 132 "fts0pars.y"
+    {
+		fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node));
+	}
+    break;
+
+  case 11:
+
+/* Line 1806 of yacc.c  */
+#line 136 "fts0pars.y"
+    {
+		fts_ast_term_set_distance((yyvsp[(1) - (3)].node), strtoul((yyvsp[(3) - (3)].token), NULL, 10));
+		free((yyvsp[(3) - (3)].token));
+	}
+    break;
+
+  case 12:
+
+/* Line 1806 of yacc.c  */
+#line 141 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node));
+		fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node));
+	}
+    break;
+
+  case 13:
+
+/* Line 1806 of yacc.c  */
+#line 147 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+	}
+    break;
+
+  case 14:
+
+/* Line 1806 of yacc.c  */
+#line 152 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node));
+		fts_ast_term_set_distance((yyvsp[(2) - (4)].node), strtoul((yyvsp[(4) - (4)].token), NULL, 10));
+		free((yyvsp[(4) - (4)].token));
+	}
+    break;
+
+  case 15:
+
+/* Line 1806 of yacc.c  */
+#line 159 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+	}
+    break;
+
+  case 16:
+
+/* Line 1806 of yacc.c  */
+#line 165 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE);
+	}
+    break;
+
+  case 17:
+
+/* Line 1806 of yacc.c  */
+#line 169 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST);
+	}
+    break;
+
+  case 18:
+
+/* Line 1806 of yacc.c  */
+#line 173 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+    break;
+
+  case 19:
+
+/* Line 1806 of yacc.c  */
+#line 177 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	}
+    break;
+
+  case 20:
+
+/* Line 1806 of yacc.c  */
+#line 181 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	}
+    break;
+
+  case 21:
+
+/* Line 1806 of yacc.c  */
+#line 186 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	}
+    break;
+
+  case 22:
+
+/* Line 1806 of yacc.c  */
+#line 191 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	}
+    break;
+
+  case 23:
+
+/* Line 1806 of yacc.c  */
+#line 197 "fts0pars.y"
+    {
+		(yyval.node)  = (yyvsp[(2) - (2)].node);
+	}
+    break;
+
+  case 24:
+
+/* Line 1806 of yacc.c  */
+#line 202 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	}
+    break;
+
+
+
+/* Line 1806 of yacc.c  */
+#line 1658 "fts0pars.cc"
+      default: break;
+    }
+  /* User semantic actions sometimes alter yychar, and that requires
+     that yytoken be updated with the new translation.  We take the
+     approach of translating immediately before every use of yytoken.
+     One alternative is translating here after every semantic action,
+     but that translation would be missed if the semantic action invokes
+     YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
+     if it invokes YYBACKUP.  In the case of YYABORT or YYACCEPT, an
+     incorrect destructor might then be invoked immediately.  In the
+     case of YYERROR or YYBACKUP, subsequent parser actions might lead
+     to an incorrect destructor call or verbose syntax error message
+     before the lookahead is translated.  */
+  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* Make sure we have latest lookahead translation.  See comments at
+     user semantic actions for why this is necessary.  */
+  yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
+
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if ! YYERROR_VERBOSE
+      yyerror (YY_("syntax error"));
+#else
+# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
+                                        yyssp, yytoken)
+      {
+        char const *yymsgp = YY_("syntax error");
+        int yysyntax_error_status;
+        yysyntax_error_status = YYSYNTAX_ERROR;
+        if (yysyntax_error_status == 0)
+          yymsgp = yymsg;
+        else if (yysyntax_error_status == 1)
+          {
+            if (yymsg != yymsgbuf)
+              YYSTACK_FREE (yymsg);
+            yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
+            if (!yymsg)
+              {
+                yymsg = yymsgbuf;
+                yymsg_alloc = sizeof yymsgbuf;
+                yysyntax_error_status = 2;
+              }
+            else
+              {
+                yysyntax_error_status = YYSYNTAX_ERROR;
+                yymsgp = yymsg;
+              }
+          }
+        yyerror (yymsgp);
+        if (yysyntax_error_status == 2)
+          goto yyexhaustedlab;
+      }
+# undef YYSYNTAX_ERROR
+#endif
+    }
+
+
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse lookahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+	{
+	  /* Return failure if at end of input.  */
+	  if (yychar == YYEOF)
+	    YYABORT;
+	}
+      else
+	{
+	  yydestruct ("Error: discarding",
+		      yytoken, &yylval);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse lookahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+  /* Pacify compilers like GCC when the user code never invokes
+     YYERROR and the label yyerrorlab therefore never appears in user
+     code.  */
+  if (/*CONSTCOND*/ 0)
+     goto yyerrorlab;
+
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYERROR.  */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (!yypact_value_is_default (yyn))
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+
+      yydestruct ("Error: popping",
+		  yystos[yystate], yyvsp);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  *++yyvsp = yylval;
+
+
+  /* Shift the error token.  */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEMPTY)
+    {
+      /* Make sure we have latest lookahead translation.  See comments at
+         user semantic actions for why this is necessary.  */
+      yytoken = YYTRANSLATE (yychar);
+      yydestruct ("Cleanup: discarding lookahead",
+                  yytoken, &yylval);
+    }
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+		  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+
+/* Line 2067 of yacc.c  */
+#line 207 "fts0pars.y"
+
+
+/********************************************************************
+Report message p via my_printf_error(ER_PARSE_ERROR); always returns 0.*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner_alt func_ptr;
+
+	func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
+
diff --git a/storage/xtradb/fts/fts0pars.y b/storage/xtradb/fts/fts0pars.y
new file mode 100644
index 00000000000..73d71bc87c5
--- /dev/null
+++ b/storage/xtradb/fts/fts0pars.y
@@ -0,0 +1,289 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013,  Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern	int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern	int fts_blexer(YYSTYPE*, yyscan_t);
+extern	int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+typedef	int	(*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef	int	(*fts_scanner)();
+
+struct fts_lexer_struct {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+%}
+
+%union {
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper>	FTS_OPER
+%token<token>	FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node>	prefix term text expr sub_expr expr_lst query
+
+%nonassoc	'+' '-' '~' '<' '>'
+
+%%
+
+query	: expr_lst	{
+		$$ = $1;
+		((fts_ast_state_t*) state)->root = $$;
+	}
+	;
+
+expr_lst: /* Empty */	{
+		$$ = NULL;
+	}
+
+	| expr_lst expr	{
+		$$ = $1;
+
+		if (!$$) {
+			$$ = fts_ast_create_node_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+
+	| expr_lst sub_expr		{
+		$$ = $1;
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if (!$$) {
+			$$ = fts_ast_create_node_subexp_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+	;
+
+sub_expr: '(' expr_lst ')'		{
+		$$ = $2;
+	}
+
+	| prefix '(' expr_lst ')'	{
+		$$ = fts_ast_create_node_subexp_list(state, $1);
+
+		if ($3) {
+			fts_ast_add_node($$, $3);
+		}
+	}
+	;
+
+expr	: term		{
+		$$ = $1;
+	}
+
+	| text		{
+		$$ = $1;
+	}
+
+	| term '*' {
+		fts_ast_term_set_wildcard($1);
+	}
+
+	| text '@' FTS_NUMB {
+		fts_ast_term_set_distance($1, strtoul($3, NULL, 10));
+		free($3);
+	}
+
+	| prefix term '*' {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_wildcard($2);
+	}
+
+	| prefix term	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+
+	| prefix text '@' FTS_NUMB {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_distance($2, strtoul($4, NULL, 10));
+		free($4);
+	}
+
+	| prefix text {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+	;
+
+prefix	: '-'		{
+		$$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+	}
+
+	| '+'		{
+		$$ = fts_ast_create_node_oper(state, FTS_EXIST);
+	}
+
+	| '~'		{
+		$$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	| '<'		{
+		$$ = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	}
+
+	| '>'		{
+		$$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	}
+	;
+
+term	: FTS_TERM	{
+		$$  = fts_ast_create_node_term(state, $1);
+		free($1);
+	}
+
+	| FTS_NUMB	{
+		$$  = fts_ast_create_node_term(state, $1);
+		free($1);
+	}
+
+	/* Ignore leading '*' */
+	| '*' term {
+		$$  = $2;
+	}
+	;
+
+text	: FTS_TEXT	{
+		$$  = fts_ast_create_node_text(state, $1);
+		free($1);
+	}
+	;
+%%
+
+/********************************************************************
+Print message p followed by a newline to stderr; always returns 0.*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	fprintf(stderr, "%s\n", p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner_alt func_ptr;
+
+	func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
diff --git a/storage/xtradb/fts/fts0que.cc b/storage/xtradb/fts/fts0que.cc
new file mode 100644
index 00000000000..c2922d993bf
--- /dev/null
+++ b/storage/xtradb/fts/fts0que.cc
@@ -0,0 +1,4435 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h" /* dict_table_get_n_rows() */
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "ha_prototypes.h"
+#include <ctype.h>
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#include <string>
+#include <vector>
+#include <map>
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
+
+#define RANK_DOWNGRADE		(-1.0F)
+#define RANK_UPGRADE		(1.0F)
+
+/* Maximum number of words supported in a proximity search.
+FIXME, this limitation can be removed easily. Need to see
+if we want to enforce such limitation */
+#define MAX_PROXIMITY_ITEM	128
+
+/* Memory used by rbt itself for create and node add */
+#define SIZEOF_RBT_CREATE	(sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2)
+#define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
+
+/*Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN	4
+
+/* Coefficient used to normalize the relevance ranking. */
+static const double FTS_NORMALIZE_COEFF = 0.0115F;
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+typedef std::map<std::string, ulint>	word_map_t;
+typedef std::vector<std::string>	word_vector_t;
+
+struct fts_word_freq_t;
+
+/** State of an FTS query. */
+struct fts_query_t {
+	mem_heap_t*	heap;		/*!< Heap to use for allocations */
+
+	trx_t*		trx;		/*!< The query transaction */
+
+	dict_index_t*	index;		/*!< The FTS index to search */
+					/*!< FTS auxiliary common table def */
+	fts_table_t	fts_common_table;
+
+	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
+
+	ulint		total_size;	/*!< total memory size used by query */
+
+	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
+					filtered from the output */
+
+	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
+
+	fts_ast_node_t* cur_node;	/*!< Current tree node */
+
+	word_map_t*	word_map;	/*!< Matched word map for
+					searching by word*/
+
+	word_vector_t*	word_vector;	/*!< Matched word vector for
+					searching by index */
+
+	ib_rbt_t*       doc_ids;	/*!< The current set of matching
+					doc ids, elements are of
+					type fts_ranking_t */
+
+	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
+					doc_ids, this tree will become
+					the new doc_ids, elements are of type
+					fts_ranking_t */
+
+					/*!< Prepared statement to read the
+					nodes from the FTS INDEX */
+	que_t*		read_nodes_graph;
+
+	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
+
+					/*!< TRUE if we want to collect the
+					word positions within the document */
+	ibool		collect_positions;
+
+	ulint		flags;		/*!< Specify the full text search type,
+					such as  boolean search, phrase
+					search, proximity search etc. */
+
+	ulint		distance;	/*!< The proximity distance of a
+					phrase search. */
+
+					/*!< These doc ids are used as a
+					boundary condition when searching the
+					FTS index rows */
+
+	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
+
+	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
+
+	bool		boolean_mode;	/*!< TRUE if boolean mode query */
+
+	ib_vector_t*	matched;	/*!< Array of matching documents
+					(fts_match_t) to search for a phrase */
+
+	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
+					position info for each matched word
+					in the word list */
+
+	ib_uint64_t	total_docs;	/*!< The total number of documents */
+
+	ulint		total_words;	/*!< The total number of words */
+
+	dberr_t		error;		/*!< Error code if any, that is
+					encountered during query processing */
+
+	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
+					document, its elements are of type
+					fts_word_freq_t */
+
+	bool		multi_exist;	/*!< multiple FTS_EXIST oper */
+};
+
+/** For phrase matching, first we collect the documents and the positions
+then we match. */
+struct fts_match_t {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	ulint		start;		/*!< Start the phrase match from
+					this offset within the positions
+					vector. */
+
+	ib_vector_t*	positions;	/*!< Offsets of a word in a
+					document */
+};
+
+/** For matching tokens in a phrase search. We use this data structure in
+the callback that determines whether a document should be accepted or
+rejected for a phrase search. */
+struct fts_select_t {
+	doc_id_t	doc_id;		/*!< The document id to match */
+
+	ulint		min_pos;	/*!< For found to be TRUE at least
+					one position must be greater than
+					min_pos. */
+
+	ibool		found;		/*!< TRUE if found */
+
+	fts_word_freq_t*
+			word_freq;	/*!< Word frequency instance of the
+					current word being looked up in
+					the FTS index */
+};
+
+/** structure defines a set of ranges for original documents, each of which
+has a minimum position and maximum position. Text in such range should
+contain all words in the proximity search. We will need to count the
+words in such range to make sure it is less than the specified distance
+of the proximity search */
+struct fts_proximity_t {
+	ulint		n_pos;		/*!< number of position set, defines
+					a range (min to max) containing all
+					matching words */
+	ulint*		min_pos;	/*!< the minimum position (in bytes)
+					of the range */
+	ulint*		max_pos;	/*!< the maximum position (in bytes)
+					of the range */
+};
+
+/** The match positions and tokens to match */
+struct fts_phrase_t {
+	ibool		found;		/*!< Match result */
+
+	const fts_match_t*
+			match;		/*!< Positions within text */
+
+	const ib_vector_t*
+			tokens;		/*!< Tokens to match */
+
+	ulint		distance;	/*!< For matching on proximity
+					distance. Can be 0 for exact match */
+	CHARSET_INFO*	charset;	/*!< Phrase match charset */
+	mem_heap_t*     heap;		/*!< Heap for word processing */
+	ulint		zip_size;	/*!< row zip size */
+	fts_proximity_t*proximity_pos;	/*!< position info for proximity
+					search verification. Records the min
+					and max position of words matched */
+};
+
+/** For storing the frequency of a word/term in a document */
+struct fts_doc_freq_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		freq;		/*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_t {
+	byte*		word;		/*!< Word for which we need the freq,
+					it's allocated on the query heap */
+
+	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
+					word frequencies. The elements are
+					of type fts_doc_freq_t */
+	ib_uint64_t	doc_count;	/*!< Total number of documents that
+					contain this word */
+	double		idf;		/*!< Inverse document frequency */
+};
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg);	/*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return DB_SUCCESS or an error code */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word,		/*!< in: the current word */
+	fts_word_freq_t*word_freq,	/*!< in/out: word frequency */
+	const fts_node_t*
+			node,		/*!< in: current FTS node */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len,		/*!< in: doc id ilist size */
+	ibool		calc_doc_count);/*!< in: whether to remember doc
+					count */
+
+#if 0
+/*****************************************************************//***
+Find a doc_id in a word's ilist.
+@return TRUE if found. */
+static
+ibool
+fts_query_find_doc_id(
+/*==================*/
+	fts_select_t*	select,		/*!< in/out: search the doc id selected,
+					update the frequency if found. */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len);		/*!< in: doc id ilist size */
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the document again, thus "expand"
+the search result set.
+@return DB_SUCCESS if success, otherwise the error code */
+static
+dberr_t
+fts_expand_query(
+/*=============*/
+	dict_index_t*	index,		/*!< in: FTS index to search */
+	fts_query_t*	query)		/*!< in: query result, to be freed
+					by the client */
+	__attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. And if proximity search, verify
+the words are close enough to each other, as in specified distance.
+This function is called for phrase and proximity search.
+@return TRUE if documents are found, FALSE if otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+	fts_query_t*	query,		/*!< in/out:  query instance
+					query->doc_ids might be instantiated
+					with qualified doc IDs */
+	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false if otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+	fts_match_t**		match,		/*!< in: query instance */
+	ulint			num_match,	/*!< in: number of matching
+						items */
+	ulint			distance,	/*!< in: distance value
+						for proximity search */
+	fts_proximity_t*	qualified_pos);	/*!< out: the position info
+						records ranges containing
+						all matching words. */
+#if 0
+/********************************************************************
+Get the total number of words in a documents. */
+static
+ulint
+fts_query_terms_in_document(
+/*========================*/
+					/*!< out: DB_SUCCESS if all go well
+					else error code */
+	fts_query_t*	query,		/*!< in: FTS query state */
+	doc_id_t	doc_id,		/*!< in: the word to check */
+	ulint*		total);		/*!< out: total words in document */
+#endif
+
+/********************************************************************
+Compare two fts_doc_freq_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_freq_doc_id_cmp(
+/*================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
+	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
+	/* Not (int) (id1 - id2): narrowing could zero or flip the result. */
+	return((fq1->doc_id > fq2->doc_id) - (fq1->doc_id < fq2->doc_id));
+}
+
+#if 0
+/*******************************************************************//**
+Print the table used for calculating LCS. */
+static
+void
+fts_print_lcs_table(
+/*================*/
+	const ulint*	table,		/*!< in: array to print */
+	ulint		n_rows,		/*!< in: total no. of rows */
+	ulint		n_cols)		/*!< in: total no. of cols */
+{
+	ulint		i;
+
+	for (i = 0; i < n_rows; ++i) {
+		ulint	j;
+
+		printf("\n");
+
+		for (j = 0; j < n_cols; ++j) {
+
+			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
+		}
+	}
+}
+
+/********************************************************************
+Find the longest common subsequence between the query string and
+the document. */
+static
+ulint
+fts_query_lcs(
+/*==========*/
+					/*!< out: LCS (length) between
+					two ilists */
+	const	ulint*	p1,		/*!< in: word positions of query */
+	ulint	len_p1,			/*!< in: no. of elements in p1 */
+	const	ulint*	p2,		/*!< in: word positions within document */
+	ulint	len_p2)			/*!< in: no. of elements in p2 */
+{
+	int	i;
+	ulint	len = 0;
+	ulint	r = len_p1;
+	ulint	c = len_p2;
+	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
+	ulint*	table = (ulint*) ut_malloc(size);
+
+	/* Traverse the table backwards, from the last row to the first and
+	also from the last column to the first. We compute the smaller
+	common subsequences first, then use the calculated values to determine
+	the longest common subsequence. The result will be in TABLE[0][0]. */
+	for (i = r; i >= 0; --i) {
+		int	j;
+
+		for (j = c; j >= 0; --j) {
+
+			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
+
+				FTS_ELEM(table, c, i, j) = 0;
+
+			} else if (p1[i] == p2[j]) {
+
+				FTS_ELEM(table, c, i, j) = FTS_ELEM(
+					table, c, i + 1, j + 1) + 1;
+
+			} else {
+
+				ulint	value;
+
+				value = ut_max(
+					FTS_ELEM(table, c, i + 1, j),
+					FTS_ELEM(table, c, i, j + 1));
+
+				FTS_ELEM(table, c, i, j) = value;
+			}
+		}
+	}
+
+	len = FTS_ELEM(table, c, 0, 0);
+
+	fts_print_lcs_table(table, r, c);
+	printf("\nLen=%lu\n", len);
+
+	ut_free(table);
+
+	return(len);
+}
+#endif
+
+/*******************************************************************//**
+Compare two fts_ranking_t instances: descending order on the rank and, for
+equal ranks, ascending order on the doc id.
+@return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
+static
+int
+fts_query_compare_rank(
+/*===================*/
+	const void*	p1,		/*!< in: pointer to elem */
+	const void*	p2)		/*!< in: pointer to elem */
+{
+	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
+
+	if (r2->rank < r1->rank) {
+		return(-1);
+	} else if (r2->rank == r1->rank) {
+		/* Equal rank: the smaller doc id sorts first. */
+		if (r1->doc_id < r2->doc_id) {
+			return(-1);
+		} else if (r1->doc_id > r2->doc_id) {
+			return(1);
+		}
+
+		return(0);
+	}
+
+	return(1);
+}
+
+#ifdef FTS_UTF8_DEBUG
+/*******************************************************************//**
+Convert string to lowercase.
+@return lower case string, callers responsibility to delete using
+ut_free() */
+static
+byte*
+fts_tolower(
+/*========*/
+	const byte*	src,		/*!< in: src string */
+	ulint		len)		/*!< in: src string length */
+{
+	fts_string_t	str;
+	byte*		lc_str = ut_malloc(len + 1);
+
+	str.f_len = len;
+	str.f_str = lc_str;
+
+	memcpy(str.f_str, src, len);
+
+	/* Make sure the last byte is NUL terminated */
+	str.f_str[len] = '\0';
+
+	fts_utf8_tolower(&str);
+
+	return(lc_str);
+}
+
+/*******************************************************************//**
+Do a case insensitive search. Doesn't check for NUL byte end marker
+only relies on len. Convert str2 to lower case before comparing.
+@return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
+static
+int
+fts_utf8_strcmp(
+/*============*/
+	const fts_string_t*
+			str1,		/*!< in: should be lower case*/
+
+	fts_string_t*	str2)		/*!< in: any case. We will use the length
+					of this string during compare as it
+					should be the min of the two strings */
+{
+	byte		b = str2->f_str[str2->f_len];
+
+	ut_a(str2->f_len <= str1->f_len);
+
+	/* We need to write a NUL byte at the end of the string because the
+	string is converted to lowercase by a MySQL function which doesn't
+	care about the length. */
+	str2->f_str[str2->f_len] = 0;
+
+	fts_utf8_tolower(str2);
+
+	/* Restore the value we replaced above. */
+	str2->f_str[str2->f_len] = b;
+
+	return(memcmp(str1->f_str, str2->f_str, str2->f_len));
+}
+#endif
+
+/*******************************************************************//**
+Create words in ranking */
+static
+void
+fts_ranking_words_create(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	fts_ranking_t*	ranking)	/*!< in: ranking instance */
+{
+	ranking->words = static_cast<byte*>(
+		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
+	ranking->words_len = RANKING_WORDS_INIT_LEN;
+}
+
+/*
+The optimization here is using a char array(bitmap) to replace words rb tree
+in fts_ranking_t.
+
+It can save lots of memory except in some cases of QUERY EXPANSION.
+
+'word_map' is used as a word dictionary, in which the key is a word, the value
+is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
+if not, we add it into 'word_map', and give it a position(actually a number).
+then we set the corresponding bit to '1' at the position in the char array 'words'.
+
+'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
+more quickly than searching by value in 'word_map'. we use 'word_vector'
+in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
+to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
+by the word.
+*/
+
+/*******************************************************************//**
+Add a word into ranking: set the word's bit in the ranking's bitmap */
+static
+void
+fts_ranking_words_add(
+/*==================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	fts_ranking_t*	ranking,	/*!< in: ranking instance */
+	const char*	word)		/*!< in: term/word to add */
+{
+	ulint	pos;
+	ulint	byte_offset;
+	ulint	bit_offset;
+	word_map_t::iterator it;
+
+	/* Note: we suppose the word map and vector are append-only */
+	/* Reuse the word's position if it is known, else assign the next */
+	it = query->word_map->lower_bound(word);
+	if (it != query->word_map->end()
+	    && !query->word_map->key_comp()(word, it->first)) {
+		pos = it->second;
+	} else {
+		pos = query->word_map->size();
+		query->word_map->insert(it,
+			std::pair<std::string, ulint>(word, pos));
+
+		query->word_vector->push_back(word);
+	}
+
+	/* Grow the bitmap by doubling until byte_offset fits into it */
+	byte_offset = pos / CHAR_BIT;
+	if (byte_offset >= ranking->words_len) {
+		byte*	words = ranking->words;
+		ulint	words_len = ranking->words_len;
+
+		while (byte_offset >= words_len) {
+			words_len *= 2;
+		}
+
+		ranking->words = static_cast<byte*>(
+			mem_heap_zalloc(query->heap, words_len));
+		ut_memcpy(ranking->words, words, ranking->words_len);
+		ranking->words_len = words_len;
+	}
+
+	/* Set the bit that stands for this word */
+	ut_ad(byte_offset < ranking->words_len);
+	bit_offset = pos % CHAR_BIT;
+	ranking->words[byte_offset] |= 1 << bit_offset;
+}
+
+/*******************************************************************//**
+Get the next word whose bit is set in a ranking, scanning from *pos
+@return true if a word was found and stored in *word */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+	const	fts_query_t*	query,	/*!< in: query instance */
+	fts_ranking_t*		ranking,/*!< in: ranking instance */
+	ulint*			pos,	/*!< in/out: bit pos to scan from */
+	byte**			word)	/*!< out: the word that was found */
+{
+	bool	ret = false;
+	ulint	max_pos = ranking->words_len * CHAR_BIT;
+
+	/* Scan the bitmap for the next set bit */
+	while (*pos < max_pos) {
+		ulint	byte_offset = *pos / CHAR_BIT;
+		ulint	bit_offset = *pos % CHAR_BIT;
+
+		if (ranking->words[byte_offset] & (1 << bit_offset)) {
+			ret = true;
+			break;
+		}
+
+		*pos += 1;
+	};
+
+	/* Point *word at the vector's string and step past the found bit */
+	if (ret) {
+		ut_ad(*pos < query->word_vector->size());
+		*word = (byte*)query->word_vector->at((size_t)*pos).c_str();
+		*pos += 1;
+	}
+
+	return ret;
+}
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word)		/*!< in: term/word to add */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the word in our rb tree and add if it doesn't exist. */
+	if (rbt_search(query->word_freqs, &parent, word) != 0) {
+		fts_word_freq_t	word_freq;
+		ulint		len = ut_strlen((char*) word) + 1;
+
+		memset(&word_freq, 0, sizeof(word_freq));
+
+		word_freq.word = static_cast<byte*>(
+			mem_heap_alloc(query->heap, len));
+
+		/* Need to copy the NUL character too. */
+		memcpy(word_freq.word, word, len);
+
+		word_freq.doc_count = 0;
+
+		word_freq.doc_freqs = rbt_create(
+			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+		parent.last = rbt_add_node(
+			query->word_freqs, &parent, &word_freq);
+
+		query->total_size += len
+			+ SIZEOF_RBT_CREATE
+			+ SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_word_freq_t);
+	}
+
+	return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to the doc freq entry */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance	*/
+	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
+	doc_id_t	doc_id)		/*!< in: doc id to add */
+{
+	ib_rbt_bound_t	parent;
+
+	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
+	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
+		fts_doc_freq_t	doc_freq;
+
+		memset(&doc_freq, 0, sizeof(doc_freq));
+
+		doc_freq.freq = 0;
+		doc_freq.doc_id = doc_id;
+
+		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
+
+		query->total_size += SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_doc_freq_t);
+	}
+
+	return(rbt_value(fts_doc_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add the doc id to the query set only if it's not in the
+deleted array. */
+static
+void
+fts_query_union_doc_id(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Check if the doc id is deleted and it's not already in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+
+		fts_ranking_t	ranking;
+
+		ranking.rank = rank;
+		ranking.doc_id = doc_id;
+		fts_ranking_words_create(query, &ranking);
+
+		rbt_add_node(query->doc_ids, &parent, &ranking);
+
+		query->total_size += SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
+	}
+}
+
+/*******************************************************************//**
+Remove the doc id from the query set only if it's not in the
+deleted set. */
+static
+void
+fts_query_remove_doc_id(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id)		/*!< in: the doc id to remove */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Act only if the doc id is not deleted and it's in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+		ut_free(rbt_remove_node(query->doc_ids, parent.last));
+
+		ut_ad(query->total_size >
+		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+		query->total_size -= SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_ranking_t);
+	}
+}
+
+/*******************************************************************//**
+Find the doc id in the query set but not in the deleted set, artificially
+downgrade or upgrade its ranking by a value and make/initialize its ranking
+under or above its normal range 0 to 1. This is used for Boolean Search
+operator such as Negation operator, which makes word's contribution to the
+row's relevance to be negative */
+static
+void
+fts_query_change_ranking(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to adjust */
+	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Act only if the doc id is not deleted and it's in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
+
+		/* RANK_DOWNGRADE is -1.0 and RANK_UPGRADE is 1.0; clamp the
+		adjusted rank to the range [-1.0, 1.0]. */
+		if (ranking->rank >= 1.0F) {
+			ranking->rank = 1.0F;
+		} else if (ranking->rank <= -1.0F) {
+			ranking->rank = -1.0F;
+		}
+	}
+}
+
+/*******************************************************************//**
+Check the doc id in the query set only if it's not in the
+deleted array. The doc ids that were found are stored in
+another rb tree (fts_query_t::intersection). */
+static
+void
+fts_query_intersect_doc_id(
+/*=======================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+	fts_ranking_t*	ranking= NULL;
+
+	/* There are three types of intersect:
+	   1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
+	   2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
+	      if it matches 'b'. if the doc is also in  doc_ids, then change the
+	      doc's rank, and add 'a' in doc's words.
+	   3. '+a +b': docs matching '+a' is in doc_ids, add doc into intersect
+	      if it matches 'b' and it's in doc_ids.(multi_exist = true). */
+
+	/* Check if the doc id is deleted and it's in our set */
+	if (fts_bsearch(array, 0, size, doc_id) < 0) {
+		fts_ranking_t	new_ranking;
+
+		if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+			if (query->multi_exist) {
+				return;
+			} else {
+				new_ranking.words = NULL;
+			}
+		} else {
+			ranking = rbt_value(fts_ranking_t, parent.last);
+
+			/* We've just checked the doc id before */
+			if (ranking->words == NULL) {
+				ut_ad(rbt_search(query->intersection, &parent,
+					ranking) == 0);
+				return;
+			}
+
+			/* Merge rank */
+			rank += ranking->rank;
+			if (rank >= 1.0F) {
+				rank = 1.0F;
+			} else if (rank <= -1.0F) {
+				rank = -1.0F;
+			}
+
+			/* Take words */
+			new_ranking.words = ranking->words;
+			new_ranking.words_len = ranking->words_len;
+		}
+
+		new_ranking.rank = rank;
+		new_ranking.doc_id = doc_id;
+
+		if (rbt_search(query->intersection, &parent,
+			       &new_ranking) != 0) {
+			if (new_ranking.words == NULL) {
+				fts_ranking_words_create(query, &new_ranking);
+
+				query->total_size += RANKING_WORDS_INIT_LEN;
+			} else {
+				/* Note that the intersection has taken
+				ownership of the ranking data. */
+				ranking->words = NULL;
+			}
+
+			rbt_add_node(query->intersection,
+				     &parent, &new_ranking);
+
+			query->total_size += SIZEOF_RBT_NODE_ADD
+				+ sizeof(fts_ranking_t);
+		}
+	}
+}
+
+/*******************************************************************//**
+Free the document ranking rb tree. */
+static
+void
+fts_query_free_doc_ids(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
+{
+	const ib_rbt_node_t*	node;
+
+	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		if (ranking->words) {
+			ranking->words = NULL;
+		}
+
+		ut_free(rbt_remove_node(doc_ids, node));
+
+		ut_ad(query->total_size >
+		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+		query->total_size -= SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_ranking_t);
+	}
+
+	rbt_free(doc_ids);
+
+	ut_ad(query->total_size > SIZEOF_RBT_CREATE);
+	query->total_size -= SIZEOF_RBT_CREATE;
+}
+
+/*******************************************************************//**
+Add the word to the documents "list" of matching words from
+the query. We make a copy of the word from the query heap. */
+static
+void
+fts_query_add_word_to_document(
+/*===========================*/
+	fts_query_t*		query,	/*!< in: query to update */
+	doc_id_t		doc_id,	/*!< in: the document to update */
+	const byte*		word)	/*!< in: the token to add */
+{
+	ib_rbt_bound_t		parent;
+	fts_ranking_t*		ranking = NULL;
+
+	if (query->flags == FTS_OPT_RANKING) {
+		return;
+	}
+
+	/* First we search the intersection RB tree as it could have
+	taken ownership of the words rb tree instance. */
+	if (query->intersection
+	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+	}
+
+	if (ranking == NULL
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+	}
+
+	if (ranking != NULL) {
+		fts_ranking_words_add(query, ranking, (char*)word);
+	}
+}
+
+/*******************************************************************//**
+Check the node ilist. */
+static
+void
+fts_query_check_node(
+/*=================*/
+	fts_query_t*		query,	/*!< in: query to update */
+	const fts_string_t*	token,	/*!< in: the token to search */
+	const fts_node_t*	node)	/*!< in: node to check */
+{
+	/* Skip nodes whose doc ids are out range. */
+	if (query->oper == FTS_EXIST
+	    && ((query->upper_doc_id > 0
+		&& node->first_doc_id > query->upper_doc_id)
+		|| (query->lower_doc_id > 0
+		    && node->last_doc_id < query->lower_doc_id))) {
+
+		/* Ignore */
+
+	} else {
+		int		ret;
+		ib_rbt_bound_t	parent;
+		ulint		ilist_size = node->ilist_size;
+		fts_word_freq_t*word_freqs;
+
+		/* The word must exist. */
+		ret = rbt_search(query->word_freqs, &parent, token->f_str);
+		ut_a(ret == 0);
+
+		word_freqs = rbt_value(fts_word_freq_t, parent.last);
+
+		query->error = fts_query_filter_doc_ids(
+					query, token->f_str, word_freqs, node,
+					node->ilist, ilist_size, TRUE);
+	}
+}
+
+/*****************************************************************//**
+Search index cache for word with wildcard match.
+The token, minus one trailing '%' if present, is used as a prefix. The
+nodes of every cached word that compares equal to the prefix under
+innobase_fts_text_cmp_prefix() are filtered into the query result.
+Filter errors are reported through query->error.
+@return number of words matched; 0 if the token is empty, if the prefix
+does not fit the local term buffer, or if filtering failed */
+static
+ulint
+fts_cache_find_wildcard(
+/*====================*/
+	fts_query_t*		query,		/*!< in: query instance */
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	token)		/*!< in: token to search */
+{
+	ib_rbt_bound_t		parent;
+	const ib_vector_t*	nodes = NULL;
+	fts_string_t		srch_text;
+	byte			term[FTS_MAX_WORD_LEN + 1];
+	ulint			num_word = 0;
+
+	/* An empty token has no last byte to inspect: f_len - 1 would
+	wrap around and index far outside the string. There is nothing
+	to search for either. */
+	if (token->f_len == 0) {
+		return(0);
+	}
+
+	/* Drop the trailing wildcard character, if any. */
+	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
+			? token->f_len - 1
+			: token->f_len;
+
+	/* term[] holds FTS_MAX_WORD_LEN bytes plus the terminator.
+	Refuse a longer prefix instead of writing past the buffer.
+	NOTE(review): assumes no cached word is longer than
+	FTS_MAX_WORD_LEN bytes, so such a prefix could not match. */
+	if (srch_text.f_len > FTS_MAX_WORD_LEN) {
+		return(0);
+	}
+
+	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
+	term[srch_text.f_len] = '\0';
+	srch_text.f_str = term;
+
+	/* Lookup the word in the rb tree */
+	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
+			   innobase_fts_text_cmp_prefix) == 0) {
+		const fts_tokenizer_word_t*	word;
+		ulint				i;
+		const ib_rbt_node_t*		cur_node;
+		ibool				forward = FALSE;
+
+		word = rbt_value(fts_tokenizer_word_t, parent.last);
+		cur_node = parent.last;
+
+		/* First pass: start at the node found above and walk
+		backwards (rbt_prev) while the words still match the
+		prefix. Second pass: entered through the goto below,
+		walks forwards (rbt_next) from the same start node. */
+		while (innobase_fts_text_cmp_prefix(
+			index_cache->charset, &srch_text, &word->text) == 0) {
+
+			nodes = word->nodes;
+
+			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+				int			ret;
+				const fts_node_t*	node;
+				ib_rbt_bound_t		freq_parent;
+				fts_word_freq_t*	word_freqs;
+
+				node = static_cast<const fts_node_t*>(
+					ib_vector_get_const(nodes, i));
+
+				/* Frequencies are tracked under the search
+				prefix, not under the matched word. */
+				ret = rbt_search(query->word_freqs,
+						 &freq_parent,
+						 srch_text.f_str);
+
+				ut_a(ret == 0);
+
+				word_freqs = rbt_value(
+					fts_word_freq_t,
+					freq_parent.last);
+
+				query->error = fts_query_filter_doc_ids(
+					query, srch_text.f_str,
+					word_freqs, node,
+					node->ilist, node->ilist_size, TRUE);
+
+				if (query->error != DB_SUCCESS) {
+					return(0);
+				}
+			}
+
+			num_word++;
+
+			if (!forward) {
+				cur_node = rbt_prev(
+					index_cache->words, cur_node);
+			} else {
+cont_search:
+				cur_node = rbt_next(
+					index_cache->words, cur_node);
+			}
+
+			if (!cur_node) {
+				break;
+			}
+
+			word = rbt_value(fts_tokenizer_word_t, cur_node);
+		}
+
+		/* The backward pass is done; restart at the originally
+		matched node and continue in the forward direction. The
+		jump lands inside the loop body so that the start node
+		itself is not processed a second time. */
+		if (!forward) {
+			forward = TRUE;
+			cur_node = parent.last;
+			goto cont_search;
+		}
+	}
+
+	return(num_word);
+}
+
+/*****************************************************************//**
+Set difference.
+@return DB_SUCCESS if all go well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_difference(
+/*=================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	token)	/*!< in: token to search */
+{
+	ulint			n_doc_ids= 0;
+	trx_t*			trx = query->trx;
+	dict_table_t*		table = query->index->table;
+
+	ut_a(query->oper == FTS_IGNORE);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	/* Remember the size of the set before removing anything, for
+	the sanity check at the end. */
+	if (query->doc_ids) {
+		n_doc_ids = rbt_size(query->doc_ids);
+	}
+
+	/* There is nothing we can subtract from an empty set. */
+	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
+		ulint			i;
+		fts_fetch_t		fetch;
+		const ib_vector_t*	nodes;
+		const fts_index_cache_t*index_cache;
+		que_t*			graph = NULL;
+		fts_cache_t*		cache = table->fts->cache;
+		dberr_t			error;
+
+		rw_lock_x_lock(&cache->lock);
+
+		index_cache = fts_find_index_cache(cache, query->index);
+
+		/* Must find the index cache */
+		ut_a(index_cache != NULL);
+
+		/* Search the cache for a matching word first. */
+		if (query->cur_node->term.wildcard
+		    && query->flags != FTS_PROXIMITY
+		    && query->flags != FTS_PHRASE) {
+			fts_cache_find_wildcard(query, index_cache, token);
+		} else {
+			nodes = fts_cache_find_word(index_cache, token);
+
+			for (i = 0; nodes && i < ib_vector_size(nodes)
+			     && query->error == DB_SUCCESS; ++i) {
+				const fts_node_t*	node;
+
+				node = static_cast<const fts_node_t*>(
+					ib_vector_get_const(nodes, i));
+
+				fts_query_check_node(query, token, node);
+			}
+		}
+
+		rw_lock_x_unlock(&cache->lock);
+
+		/* error is passed by 'query->error' */
+		if (query->error != DB_SUCCESS) {
+			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+			return(query->error);
+		}
+
+		/* Setup the callback args for filtering and
+		consolidating the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		error = fts_index_fetch_nodes(
+			trx, &graph, &query->fts_index_table, token, &fetch);
+
+		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+		if (error != DB_SUCCESS) {
+			query->error = error;
+		}
+
+		fts_que_graph_free(graph);
+	}
+
+	/* The size can't increase. Guard the dereference the same way
+	as the checks above do: without a doc id set there is nothing
+	to compare. */
+	ut_a(query->doc_ids == NULL
+	     || rbt_size(query->doc_ids) <= n_doc_ids);
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Intersect the token doc ids with the current set.
+@return DB_SUCCESS if all go well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_intersect(
+/*================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	token)	/*!< in: the token to search */
+{
+	trx_t*			trx = query->trx;
+	dict_table_t*		table = query->index->table;
+
+	ut_a(query->oper == FTS_EXIST);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	/* If the current doc id set is empty and multi exist is true,
+	we know the intersection set is empty in advance, so the whole
+	search is skipped in that case. */
+	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
+		ulint                   n_doc_ids = 0;
+		ulint			i;
+		fts_fetch_t		fetch;
+		const ib_vector_t*	nodes;
+		const fts_index_cache_t*index_cache;
+		que_t*			graph = NULL;
+		fts_cache_t*		cache = table->fts->cache;
+		dberr_t			error;
+
+		ut_a(!query->intersection);
+
+		/* Size of the set before intersecting, used by the
+		sanity check at the end. */
+		n_doc_ids = rbt_size(query->doc_ids);
+
+		/* Create the rb tree that will hold the doc ids of
+		the intersection. */
+		query->intersection = rbt_create(
+			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+		query->total_size += SIZEOF_RBT_CREATE;
+
+		/* This is to avoid decompressing the ilist if the
+		node's ilist doc ids are out of range. The bounds are
+		taken from the first and last entries of the current
+		doc id set; 0 is stored when no bounds apply. */
+		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
+			const ib_rbt_node_t*	node;
+			doc_id_t*		doc_id;
+
+			node = rbt_first(query->doc_ids);
+			doc_id = rbt_value(doc_id_t, node);
+			query->lower_doc_id = *doc_id;
+
+			node = rbt_last(query->doc_ids);
+			doc_id = rbt_value(doc_id_t, node);
+			query->upper_doc_id = *doc_id;
+
+		} else {
+			query->lower_doc_id = 0;
+			query->upper_doc_id = 0;
+		}
+
+		/* Search the cache for a matching word first. */
+
+		rw_lock_x_lock(&cache->lock);
+
+		/* Search for the index specific cache. */
+		index_cache = fts_find_index_cache(cache, query->index);
+
+		/* Must find the index cache. */
+		ut_a(index_cache != NULL);
+
+		if (query->cur_node->term.wildcard) {
+			/* Wildcard search the index cache */
+			fts_cache_find_wildcard(query, index_cache, token);
+		} else {
+			nodes = fts_cache_find_word(index_cache, token);
+
+			/* Stop at the first node that reports an error
+			through query->error. */
+			for (i = 0; nodes && i < ib_vector_size(nodes)
+			     && query->error == DB_SUCCESS; ++i) {
+				const fts_node_t*	node;
+
+				node = static_cast<const fts_node_t*>(
+					ib_vector_get_const(nodes, i));
+
+				fts_query_check_node(query, token, node);
+			}
+		}
+
+		rw_lock_x_unlock(&cache->lock);
+
+		/* error is passed by 'query->error' */
+		/* NOTE(review): query->intersection is still set on this
+		early return; presumably the caller's cleanup frees it -
+		confirm. */
+		if (query->error != DB_SUCCESS) {
+			ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+			return(query->error);
+		}
+
+		/* Setup the callback args for filtering and
+		consolidating the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		error = fts_index_fetch_nodes(
+			trx, &graph, &query->fts_index_table, token, &fetch);
+
+		/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+		ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+		if (error != DB_SUCCESS) {
+			query->error = error;
+		}
+
+		fts_que_graph_free(graph);
+
+		if (query->error == DB_SUCCESS) {
+			/* Make the intersection (rb tree) the current doc id
+			set and free the old set. */
+			fts_query_free_doc_ids(query, query->doc_ids);
+			query->doc_ids = query->intersection;
+			query->intersection = NULL;
+
+			/* With multi exist set, intersecting can never
+			produce more doc ids than there were before. */
+			ut_a(!query->multi_exist || (query->multi_exist
+			     && rbt_size(query->doc_ids) <= n_doc_ids));
+		}
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Query index cache.
+@return DB_SUCCESS if all go well */
+static
+dberr_t
+fts_query_cache(
+/*============*/
+	fts_query_t*		query,	/*!< in/out: query instance */
+	const fts_string_t*	token)	/*!< in: token to search */
+{
+	dict_table_t*		table = query->index->table;
+	fts_cache_t*		cache = table->fts->cache;
+	const fts_index_cache_t*index_cache;
+	bool			wildcard_search;
+
+	/* The whole cache lookup runs under the cache lock. */
+	rw_lock_x_lock(&cache->lock);
+
+	/* Every index being queried must have its own cache. */
+	index_cache = fts_find_index_cache(cache, query->index);
+	ut_a(index_cache != NULL);
+
+	/* Wildcard terms get a prefix search, except inside proximity
+	and phrase queries where the term is looked up as is. */
+	wildcard_search = query->cur_node->term.wildcard
+		&& query->flags != FTS_PROXIMITY
+		&& query->flags != FTS_PHRASE;
+
+	if (wildcard_search) {
+		fts_cache_find_wildcard(query, index_cache, token);
+	} else {
+		const ib_vector_t*	nodes;
+
+		nodes = fts_cache_find_word(index_cache, token);
+
+		if (nodes) {
+			/* Check node after node until one of them
+			reports an error through query->error. */
+			for (ulint i = 0;
+			     i < ib_vector_size(nodes)
+			     && query->error == DB_SUCCESS;
+			     ++i) {
+
+				fts_query_check_node(
+					query, token,
+					static_cast<const fts_node_t*>(
+						ib_vector_get_const(
+							nodes, i)));
+			}
+		}
+	}
+
+	rw_lock_x_unlock(&cache->lock);
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Set union.
+@return DB_SUCCESS if all go well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_union(
+/*============*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_string_t*		token)	/*!< in: token to search */
+{
+	fts_fetch_t		fetch;
+	ulint			n_doc_ids = 0;
+	trx_t*			trx = query->trx;
+	que_t*			graph = NULL;
+	dberr_t			error;
+
+	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
+	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "UNION: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	/* Remember the size of the set before adding anything, for the
+	sanity check at the end. */
+	if (query->doc_ids) {
+		n_doc_ids = rbt_size(query->doc_ids);
+	}
+
+	/* Nothing to search for. */
+	if (token->f_len == 0) {
+		return(query->error);
+	}
+
+	/* Single '%' would confuse parser in pars_like_rebind(). In addition,
+	our wildcard search only supports prefix search */
+	ut_ad(*token->f_str != '%');
+
+	/* Search the cache first. As in the difference and intersect
+	operations, an error raised while filtering the cached nodes
+	ends the search here instead of continuing with the disk read. */
+	if (fts_query_cache(query, token) != DB_SUCCESS) {
+		return(query->error);
+	}
+
+	/* Setup the callback args for filtering and
+	consolidating the ilist. */
+	fetch.read_arg = query;
+	fetch.read_record = fts_query_index_fetch_nodes;
+
+	/* Read the nodes from disk. */
+	error = fts_index_fetch_nodes(
+		trx, &graph, &query->fts_index_table, token, &fetch);
+
+	/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+	if (error != DB_SUCCESS) {
+		query->error = error;
+	}
+
+	fts_que_graph_free(graph);
+
+	/* The size can't decrease. Only checked when there is a doc id
+	set to look at. */
+	if (query->error == DB_SUCCESS && query->doc_ids) {
+		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Apply the current query operator (query->oper) to one doc id.
+@return DB_SUCCESS if all go well, or
+DB_FTS_EXCEED_RESULT_CACHE_LIMIT if query->total_size has grown
+beyond fts_result_cache_limit */
+static
+dberr_t
+fts_query_process_doc_id(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: doc id to process */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	/* Nothing is processed here for FTS_OPT_RANKING. */
+	if (query->flags == FTS_OPT_RANKING) {
+		return(DB_SUCCESS);
+	}
+
+	switch (query->oper) {
+	case FTS_NONE:
+		fts_query_union_doc_id(query, doc_id, rank);
+		break;
+
+	case FTS_EXIST:
+		fts_query_intersect_doc_id(query, doc_id, rank);
+		break;
+
+	case FTS_IGNORE:
+		fts_query_remove_doc_id(query, doc_id);
+		break;
+
+	case FTS_NEGATE:
+		fts_query_change_ranking(query, doc_id, TRUE);
+		break;
+
+	case FTS_DECR_RATING:
+	case FTS_INCR_RATING:
+		/* Both rating operators add the doc id first and then
+		adjust its ranking; only the direction differs. */
+		fts_query_union_doc_id(query, doc_id, rank);
+		fts_query_change_ranking(
+			query, doc_id,
+			(query->oper == FTS_DECR_RATING) ? TRUE : FALSE);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return((query->total_size > fts_result_cache_limit)
+	       ? DB_FTS_EXCEED_RESULT_CACHE_LIMIT
+	       : DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Merge two result sets. */
+static
+dberr_t
+fts_merge_doc_ids(
+/*==============*/
+	fts_query_t*	query,		/*!< in,out: query instance */
+	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
+{
+	const ib_rbt_node_t*	cur;
+
+	ut_a(!rbt_empty(doc_ids));
+	ut_a(!query->intersection);
+
+	/* An intersection (FTS_EXIST) is collected in a separate result
+	set, which replaces query->doc_ids once the merge is done. */
+	if (query->oper == FTS_EXIST) {
+
+		query->intersection = rbt_create(
+			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+		query->total_size += SIZEOF_RBT_CREATE;
+	}
+
+	/* Feed every entry of the incoming set through the current
+	operator, then carry its words over. */
+	cur = rbt_first(doc_ids);
+
+	while (cur) {
+		fts_ranking_t*	entry = rbt_value(fts_ranking_t, cur);
+		ulint		word_pos = 0;
+		byte*		word = NULL;
+
+		query->error = fts_query_process_doc_id(
+			query, entry->doc_id, entry->rank);
+
+		if (query->error != DB_SUCCESS) {
+			return(query->error);
+		}
+
+		/* The words are merged regardless of the operator. */
+		ut_a(entry->words);
+
+		while (fts_ranking_words_get_next(
+				query, entry, &word_pos, &word)) {
+
+			fts_query_add_word_to_document(
+				query, entry->doc_id, word);
+		}
+
+		cur = rbt_next(doc_ids, cur);
+	}
+
+	/* For an intersection, make the collected set the current
+	result and release the previous one. */
+	if (query->oper == FTS_EXIST && query->intersection != NULL) {
+		fts_query_free_doc_ids(query, query->doc_ids);
+		query->doc_ids = query->intersection;
+		query->intersection = NULL;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Skip non-whitespace in a string. Move ptr to the next word boundary.
+@return pointer to first whitespace character or end */
+UNIV_INLINE
+byte*
+fts_query_skip_word(
+/*================*/
+	byte*		ptr,		/*!< in: start of scan */
+	const byte*	end)		/*!< in: pointer to end of string */
+{
+	/* TODO: Does this have to be UTF-8 too ? */
+	/* Advance byte by byte; stop on the first punctuation or
+	whitespace byte, or when the end of the string is reached. */
+	for (; ptr < end; ++ptr) {
+		if (ispunct(*ptr) || isspace(*ptr)) {
+			break;
+		}
+	}
+
+	return(ptr);
+}
+
+/*****************************************************************//**
+Check whether the remaining terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms(
+/*=========================*/
+	fts_phrase_t*	phrase,		/*!< in: phrase to match */
+	byte**		start,		/*!< in/out: text to search, we can't
+					make this const because we need to
+					first convert the string to
+					lowercase; on return it points to
+					where the scan stopped */
+	const byte*	end,		/*!< in: pointer to the end of
+					the string to search */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	ulint			i;
+	byte*			ptr = *start;
+	const ib_vector_t*	tokens = phrase->tokens;
+	ulint			distance = phrase->distance;
+
+	/* We check only from the second term onwards, since the first
+	must have matched otherwise we wouldn't be here. The index i is
+	advanced inside the loop, and only when a term has matched. */
+	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
+		fts_string_t		match;
+		fts_string_t		cmp_str;
+		const fts_string_t*	token;
+		int			result;
+		ulint			ret;
+		ulint			offset;
+
+		/* Extract the next token from the text; ret is the number
+		of bytes by which the text pointer is advanced below. */
+		ret = innobase_mysql_fts_get_token(
+			phrase->charset, ptr, (byte*) end,
+			&match, &offset);
+
+		if (match.f_len > 0) {
+			/* Get next token to match. */
+			token = static_cast<const fts_string_t*>(
+				ib_vector_get_const(tokens, i));
+
+			fts_utf8_string_dup(&cmp_str, &match, heap);
+
+			result = innobase_fts_text_case_cmp(
+				phrase->charset, token, &cmp_str);
+
+			/* Skip the rest of the tokens if this one doesn't
+			match and no further words may be skipped: either
+			no distance was given (ULINT_UNDEFINED) or the
+			remaining distance is used up (0). */
+			if (result
+			    && (distance == ULINT_UNDEFINED
+				|| distance == 0)) {
+
+				break;
+			}
+
+			/* This token matched move to the next token. */
+			if (result == 0) {
+				/* Advance the text to search by the length
+				of the last token. */
+				ptr += ret;
+
+				/* Advance to the next token. */
+				++i;
+			} else {
+
+				/* A mismatch can only get here with a
+				distance that is defined and not yet used
+				up, see the break above. */
+				ut_a(distance != ULINT_UNDEFINED);
+
+				ptr = fts_query_skip_word(ptr, end);
+			}
+
+			/* Distance can be 0 for exact matches. */
+			if (distance != ULINT_UNDEFINED && distance > 0) {
+				--distance;
+			}
+		} else {
+			/* No token was found in this stretch of text, just
+			move past it. */
+			ptr += ret;
+		}
+	}
+
+	*start = ptr;
+
+	/* Can't be greater than the number of elements. */
+	ut_a(i <= ib_vector_size(tokens));
+
+	/* This is the case for multiple words. */
+	if (i == ib_vector_size(tokens)) {
+		phrase->found = TRUE;
+	}
+
+	return(phrase->found);
+}
+
+/*****************************************************************//**
+Count the words inside each position range recorded in
+"phrase->proximity_pos" and check the count against
+"phrase->distance".
+@return true if some range contains at least one word and no more
+than "phrase->distance" words */
+static
+bool
+fts_proximity_is_word_in_range(
+/*===========================*/
+	const fts_phrase_t*
+			phrase,		/*!< in: phrase with the search info */
+	byte*		start,		/*!< in: text to search */
+	ulint		total_len)	/*!< in: length of text */
+{
+	fts_proximity_t*	ranges = phrase->proximity_pos;
+	byte*			text_end = start + total_len;
+	ulint			idx = 0;
+
+	/* Each range is a (min, max) position pair of a candidate
+	match. */
+	while (idx < ranges->n_pos) {
+		ulint	scan_pos = ranges->min_pos[idx];
+		ulint	words_seen = 0;
+
+		ut_ad(ranges->max_pos[idx] <= total_len);
+
+		/* Tokenize from the lower to the upper bound of the
+		range, counting the words found on the way. */
+		for (;;) {
+			fts_string_t	tok;
+			ulint		tok_offset = 0;
+			ulint		consumed;
+
+			if (scan_pos > ranges->max_pos[idx]) {
+				break;
+			}
+
+			consumed = innobase_mysql_fts_get_token(
+				phrase->charset,
+				start + scan_pos,
+				text_end, &tok, &tok_offset);
+
+			if (consumed == 0) {
+				break;
+			}
+
+			/* Move on by the number of bytes consumed. */
+			scan_pos += consumed;
+
+			if (tok.f_n_char > 0) {
+				++words_seen;
+			}
+
+			/* Already too many words; stop counting. */
+			if (words_seen > phrase->distance) {
+				break;
+			}
+		}
+
+		/* A range qualifies when it holds at least one word and
+		not more than "distance" words. */
+		if (words_seen != 0 && words_seen <= phrase->distance) {
+			return(true);
+		}
+
+		++idx;
+	}
+
+	return(false);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase(
+/*===================*/
+	fts_phrase_t*	phrase,		/*!< in: phrase to match */
+	byte*		start,		/*!< in: text to search, we can't make
+					this const because we need to first
+					convert the string to lowercase */
+	ulint		cur_len,	/*!< in: length of text */
+	ulint		prev_len,	/*!< in: total length for searched
+					doc fields*/
+	mem_heap_t*	heap)		/* heap */
+{
+	ulint			i;
+	const fts_string_t*	first;
+	const byte*		end = start + cur_len;
+	const ib_vector_t*	tokens = phrase->tokens;
+	const ib_vector_t*	positions = phrase->match->positions;
+
+	ut_a(!phrase->found);
+	ut_a(phrase->match->doc_id > 0);
+	ut_a(ib_vector_size(tokens) > 0);
+	ut_a(ib_vector_size(positions) > 0);
+
+	/* The first term of the phrase; the recorded positions are
+	checked against it before the remaining terms are tried. */
+	first = static_cast<const fts_string_t*>(
+		ib_vector_get_const(tokens, 0));
+
+	ut_a(phrase->match->start < ib_vector_size(positions));
+
+	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
+		ulint		pos;
+		fts_string_t	match;
+		fts_string_t	cmp_str;
+		byte*		ptr = start;
+		ulint		ret;
+		ulint		offset;
+
+		pos = *(ulint*) ib_vector_get_const(positions, i);
+
+		/* ULINT_UNDEFINED ends the scan of the position list. */
+		if (pos == ULINT_UNDEFINED) {
+			break;
+		}
+
+		/* Positions smaller than prev_len lie before this
+		field's text; skip them. */
+		if (pos < prev_len) {
+			continue;
+		}
+
+		/* Document positions are calculated from the beginning
+		of the first field, need to save the length for each
+		searched field to adjust the doc position when search
+		phrases. */
+		pos -= prev_len;
+		ptr = match.f_str = start + pos;
+
+		/* Within limits ? */
+		if (ptr >= end) {
+			break;
+		}
+
+		ret = innobase_mysql_fts_get_token(
+			phrase->charset, start + pos, (byte*) end,
+			&match, &offset);
+
+		if (match.f_len == 0) {
+			break;
+		}
+
+		fts_utf8_string_dup(&cmp_str, &match, heap);
+
+		if (innobase_fts_text_case_cmp(
+			phrase->charset, first, &cmp_str) == 0) {
+
+			/* This is the case for the single word
+			in the phrase. */
+			if (ib_vector_size(phrase->tokens) == 1) {
+				phrase->found = TRUE;
+				break;
+			}
+
+			/* Step over the first term by the number of bytes
+			the tokenizer consumed. */
+			ptr += ret;
+
+			/* Match the remaining terms in the phrase. */
+			if (fts_query_match_phrase_terms(phrase, &ptr,
+							 end, heap)) {
+				break;
+			}
+		}
+	}
+
+	return(phrase->found);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+For a phrase search every fetched field is matched on its own. For a
+proximity search (phrase->proximity_pos is set) the fields are first
+copied into one buffer, which is then checked as a whole.
+@return whether the phrase is found */
+static
+ibool
+fts_query_fetch_document(
+/*=====================*/
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  fts_phrase_t* */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
+	ulint		prev_len = 0;
+	ulint		total_len = 0;
+	byte*		document_text = NULL;
+
+	exp = node->select_list;
+
+	phrase->found = FALSE;
+
+	/* For proximity search, we will need to get the whole document
+	from all fields, so first count the total length of the document
+	from all the fields */
+	if (phrase->proximity_pos) {
+		 while (exp) {
+			ulint		field_len;
+			dfield_t*	dfield = que_node_get_val(exp);
+			byte*		data = static_cast<byte*>(
+						dfield_get_data(dfield));
+
+			if (dfield_is_ext(dfield)) {
+				/* Externally stored: the length is read
+				from the field reference at the end of
+				the locally stored part. */
+				ulint	local_len = dfield_get_len(dfield);
+
+				local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+				field_len = mach_read_from_4(
+					data + local_len + BTR_EXTERN_LEN + 4);
+			} else {
+				field_len = dfield_get_len(dfield);
+			}
+
+			/* One extra byte per non-NULL field, matching the
+			"+ 1" applied to prev_len in the copy loop. */
+			if (field_len != UNIV_SQL_NULL) {
+				total_len += field_len + 1;
+			}
+
+			exp = que_node_get_next(exp);
+		}
+
+		document_text = static_cast<byte*>(mem_heap_zalloc(
+					phrase->heap, total_len));
+
+		if (!document_text) {
+			return(FALSE);
+		}
+	}
+
+	exp = node->select_list;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+					dfield_get_data(dfield));
+		ulint		cur_len;
+
+		if (dfield_is_ext(dfield)) {
+			data = btr_copy_externally_stored_field(
+				&cur_len, data, phrase->zip_size,
+				dfield_get_len(dfield), phrase->heap);
+		} else {
+			cur_len = dfield_get_len(dfield);
+		}
+
+		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
+			if (phrase->proximity_pos) {
+				memcpy(document_text + prev_len, data, cur_len);
+			} else {
+				/* For phrase search */
+				phrase->found =
+					fts_query_match_phrase(
+						phrase,
+						static_cast<byte*>(data),
+						cur_len, prev_len,
+						phrase->heap);
+			}
+		}
+
+		if (phrase->found) {
+			break;
+		}
+
+		/* Document positions are calculated from the beginning
+		of the first field, need to save the length for each
+		searched field to adjust the doc position when search
+		phrases. A NULL field reports UNIV_SQL_NULL, which is a
+		marker and not a length: adding it would corrupt the
+		offset used for every following field, so NULL fields
+		are left out here just as in the sizing loop above. */
+		if (cur_len != UNIV_SQL_NULL) {
+			prev_len += cur_len + 1;
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	if (phrase->proximity_pos) {
+		ut_ad(prev_len <= total_len);
+
+		phrase->found = fts_proximity_is_word_in_range(
+			phrase, document_text, total_len);
+	}
+
+	return(phrase->found);
+}
+
+#if 0
+/********************************************************************
+Callback function to check whether a record was found or not. */
+static
+ibool
+fts_query_select(
+/*=============*/
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  fts_doc_t* */
+{
+	int		i;
+	que_node_t*	exp;
+	sel_node_t*	node = row;
+	fts_select_t*	select = user_arg;
+
+	ut_a(select->word_freq);
+	ut_a(select->word_freq->doc_freqs);
+
+	exp = node->select_list;
+
+	for (i = 0; exp && !select->found; ++i) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		switch (i) {
+		case 0: /* DOC_COUNT */
+			if (len != UNIV_SQL_NULL && len != 0) {
+
+				select->word_freq->doc_count +=
+					mach_read_from_4(data);
+			}
+			break;
+
+		case 1: /* ILIST */
+			if (len != UNIV_SQL_NULL && len != 0) {
+
+				fts_query_find_doc_id(select, data, len);
+			}
+			break;
+
+		default:
+			ut_error;
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************
+Read the rows from the FTS index, that match word and where the
+doc id is between first and last doc id.
+@return DB_SUCCESS if all go well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_find_term(
+/*================*/
+	fts_query_t*		query,	/*!< in: FTS query state */
+	que_t**			graph,	/*!< in: prepared statement */
+	const fts_string_t*	word,	/*!< in: the word to fetch */
+	doc_id_t		doc_id,	/*!< in: doc id to match */
+	ulint*			min_pos,/*!< in/out: pos found must be
+					 greater than this minimum value. */
+	ibool*			found)	/*!< out: TRUE if found else FALSE */
+{
+	pars_info_t*		info;
+	dberr_t			error;
+	fts_select_t		select;
+	doc_id_t		match_doc_id;
+	trx_t*			trx = query->trx;
+
+	trx->op_info = "fetching FTS index matching nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	select.found = FALSE;
+	select.doc_id = doc_id;
+	select.min_pos = *min_pos;
+	select.word_freq = fts_query_add_word_freq(query, word->f_str);
+
+	pars_info_bind_function(info, "my_func", fts_query_select, &select);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &match_doc_id, doc_id);
+
+	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
+
+	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
+
+	if (!*graph) {
+		ulint		selected;
+
+		selected = fts_select_index(*word->f_str);
+
+		query->fts_index_table.suffix = fts_get_suffix(selected);
+
+		*graph = fts_parse_sql(
+			&query->fts_index_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT doc_count, ilist\n"
+			" FROM %s\n"
+			" WHERE word LIKE :word AND "
+			"	first_doc_id <= :min_doc_id AND "
+			"	last_doc_id >= :max_doc_id\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for(;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	/* Value to return */
+	*found = select.found;
+
+	if (*found) {
+		*min_pos = select.min_pos;
+	}
+
+	return(error);
+}
+
+/********************************************************************
+Callback aggregator for int columns. */
+static
+ibool
+fts_query_sum(
+/*==========*/
+					/*!< out: always returns TRUE */
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  ulint* */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = row;
+	ulint*		total = user_arg;
+
+	exp = node->select_list;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		if (len != UNIV_SQL_NULL && len != 0) {
+			*total += mach_read_from_4(data);
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************
+Calculate the total documents that contain a particular word (term).
+@return DB_SUCCESS if all go well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_total_docs_containing_term(
+/*=================================*/
+	fts_query_t*		query,	/*!< in: FTS query state */
+	const fts_string_t*	word,	/*!< in: the word to check */
+	ulint*			total)	/*!< out: documents containing word */
+{
+	pars_info_t*		info;
+	dberr_t			error;
+	que_t*			graph;
+	ulint			selected;
+	trx_t*			trx = query->trx;
+
+	trx->op_info = "fetching FTS index document count";
+
+	*total = 0;
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_query_sum, total);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	selected = fts_select_index(*word->f_str);
+
+	query->fts_index_table.suffix = fts_get_suffix(selected);
+
+	graph = fts_parse_sql(
+		&query->fts_index_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT doc_count\n"
+		" FROM %s\n"
+		" WHERE word = :word "
+		" ORDER BY first_doc_id;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/********************************************************************
+Get the total number of words in a documents.
+@return DB_SUCCESS if all go well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_terms_in_document(
+/*========================*/
+	fts_query_t*	query,		/*!< in: FTS query state */
+	doc_id_t	doc_id,		/*!< in: the word to check */
+	ulint*		total)		/*!< out: total words in document */
+{
+	pars_info_t*	info;
+	dberr_t		error;
+	que_t*		graph;
+	doc_id_t	read_doc_id;
+	trx_t*		trx = query->trx;
+
+	trx->op_info = "fetching FTS document term count";
+
+	*total = 0;
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_query_sum, total);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &read_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &read_doc_id);
+
+	query->fts_index_table.suffix = "DOC_ID";
+
+	graph = fts_parse_sql(
+		&query->fts_index_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT count\n"
+		" FROM %s\n"
+		" WHERE doc_id = :doc_id "
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS doc id table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS doc id table.\n",
+					error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+#endif
+
+/*****************************************************************//**
+Retrieve the document and match the phrase tokens.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_match_document(
+/*=====================*/
+	ib_vector_t*	tokens,		/*!< in: phrase tokens */
+	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
+	fts_match_t*	match,		/*!< in: doc id and positions */
+	ulint		distance,	/*!< in: proximity distance */
+	ibool*		found)		/*!< out: TRUE if phrase found */
+{
+	fts_phrase_t	phrase;
+	dberr_t		err;
+
+	/* Start from an all-zero search context and fill in what the
+	fetch callback needs. */
+	memset(&phrase, 0x0, sizeof(phrase));
+
+	phrase.match = match;
+	phrase.tokens = tokens;
+	phrase.distance = distance;
+	phrase.charset = get_doc->index_cache->charset;
+	phrase.zip_size = dict_table_zip_size(
+		get_doc->index_cache->index->table);
+	phrase.heap = mem_heap_create(512);
+
+	/* Nothing has been found until the callback says so. */
+	phrase.found = FALSE;
+	*found = FALSE;
+
+	/* Fetch the document with this doc id; the callback does the
+	matching and records the result in phrase.found. */
+	err = fts_doc_fetch_by_doc_id(
+		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+		fts_query_fetch_document, &phrase);
+
+	if (err == DB_SUCCESS) {
+		*found = phrase.found;
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
+			ut_strerr(err));
+	}
+
+	mem_heap_free(phrase.heap);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Fetch the original document and count the words in between the
+matching words to check whether they are within the specified distance.
+@return true if the fetch succeeded and phrase.found was set */
+static __attribute__((nonnull, warn_unused_result))
+bool
+fts_query_is_in_proximity_range(
+/*============================*/
+	const fts_query_t*	query,		/*!< in:  query instance */
+	fts_match_t**		match,		/*!< in: matches; match[0] gives the doc id */
+	fts_proximity_t*	qualified_pos)	/*!< in: position info for
+						qualified ranges */
+{
+	fts_get_doc_t		get_doc;
+	fts_cache_t*		cache = query->index->table->fts->cache;
+	dberr_t			err;
+	fts_phrase_t		phrase;
+
+	memset(&get_doc, 0x0, sizeof(get_doc));
+	memset(&phrase, 0x0, sizeof(phrase));
+
+	rw_lock_x_lock(&cache->lock);
+	get_doc.index_cache = fts_find_index_cache(cache, query->index);
+	rw_lock_x_unlock(&cache->lock);
+	ut_a(get_doc.index_cache != NULL);
+
+	phrase.distance = query->distance;
+	phrase.charset = get_doc.index_cache->charset;
+	phrase.zip_size = dict_table_zip_size(
+		get_doc.index_cache->index->table);
+	phrase.heap = mem_heap_create(512);	/* Freed before returning */
+	phrase.proximity_pos = qualified_pos;
+	phrase.found = FALSE;
+
+	err = fts_doc_fetch_by_doc_id(
+		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+		fts_query_fetch_document, &phrase);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error: (%s) in verification phase of proximity "
+			"search", ut_strerr(err));
+	}
+
+	/* Free the prepared statement, if one was created. */
+	if (get_doc.get_document_graph) {
+		fts_que_graph_free(get_doc.get_document_graph);
+		get_doc.get_document_graph = NULL;
+	}
+
+	mem_heap_free(phrase.heap);
+
+	return(err == DB_SUCCESS && phrase.found);
+}
+
+/*****************************************************************//**
+Iterate over the matched document ids and search for the
+actual phrase in the text of each document.
+@return query->error, DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_search_phrase(
+/*====================*/
+	fts_query_t*		query,		/*!< in: query instance */
+	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
+						with any stopwords in the
+						original phrase */
+	ib_vector_t*		tokens)		/*!< in: tokens that do
+						not include stopwords and
+						can be used to calculate
+						ranking */
+{
+	ulint			i;
+	fts_get_doc_t		get_doc;
+	ulint			n_matched;
+	fts_cache_t*		cache = query->index->table->fts->cache;
+
+	n_matched = ib_vector_size(query->matched);
+
+	/* Setup the doc retrieval infrastructure. */
+	memset(&get_doc, 0x0, sizeof(get_doc));
+
+	rw_lock_x_lock(&cache->lock);
+
+	get_doc.index_cache = fts_find_index_cache(cache, query->index);
+
+	/* Must find the index cache */
+	ut_a(get_doc.index_cache != NULL);
+
+	rw_lock_x_unlock(&cache->lock);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " Start phrase search\n");
+#endif
+
+	/* Fetch each matched document and do the actual
+	match; matching documents will be added to the current
+	doc id set. The loop stops at the first error. */
+	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
+		fts_match_t*	match;
+		ibool		found = FALSE;
+
+		match = static_cast<fts_match_t*>(
+			ib_vector_get(query->matched, i));
+
+		/* Skip the document ids that were filtered out by
+		an earlier pass (their doc_id is 0). */
+		if (match->doc_id != 0) {
+
+			query->error = fts_query_match_document(
+				orig_tokens, &get_doc,
+				match, query->distance, &found);
+
+			if (query->error == DB_SUCCESS && found) {
+				ulint	z;
+
+				query->error = fts_query_process_doc_id(query,
+							 match->doc_id, 0);
+				if (query->error != DB_SUCCESS) {
+					goto func_exit;
+				}
+
+				for (z = 0; z < ib_vector_size(tokens); z++) {
+					fts_string_t*   token;
+					token = static_cast<fts_string_t*>(
+						ib_vector_get(tokens, z));
+					fts_query_add_word_to_document(
+						query, match->doc_id,
+						token->f_str);
+				}
+			}
+		}
+	}
+
+func_exit:
+	/* Free the prepared statement, if one was created. */
+	if (get_doc.get_document_graph) {
+		fts_que_graph_free(get_doc.get_document_graph);
+		get_doc.get_document_graph = NULL;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Text/Phrase search: tokenize the phrase and match it against documents.
+@return query->error, DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_phrase_search(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	phrase)	/*!< in: phrase text to search */
+{
+	ib_vector_t*		tokens;
+	ib_vector_t*		orig_tokens;
+	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
+	ulint			len = phrase->f_len;
+	ulint			cur_pos = 0;
+	ib_alloc_t*		heap_alloc;
+	ulint			num_token;
+	CHARSET_INFO*		charset;
+
+	charset = query->fts_index_table.charset;
+
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+		query->flags = FTS_PROXIMITY;
+	} else {
+		query->flags = FTS_PHRASE;
+	}
+
+	/* Split the phrase into tokens. */
+	while (cur_pos < len) {
+		fts_cache_t*	cache = query->index->table->fts->cache;
+		ib_rbt_bound_t	parent;
+		ulint		offset;
+		ulint		cur_len;
+		fts_string_t	result_str;
+
+                cur_len = innobase_mysql_fts_get_token(
+                        charset,
+                        reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
+                        reinterpret_cast<const byte*>(phrase->f_str) + len,
+			&result_str, &offset);
+
+		if (cur_len == 0) {
+			break;
+		}
+
+		cur_pos += cur_len;
+
+		if (result_str.f_n_char == 0) {
+			continue;
+		}
+
+		fts_string_t*	token = static_cast<fts_string_t*>(
+			ib_vector_push(tokens, NULL));
+
+		token->f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, result_str.f_len + 1));
+		ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
+
+		token->f_len = result_str.f_len;
+		token->f_str[token->f_len] = 0;
+
+		if (cache->stopword_info.cached_stopword
+		    && rbt_search(cache->stopword_info.cached_stopword,
+			       &parent, token) != 0
+		    && result_str.f_n_char >= fts_min_token_size
+		    && result_str.f_n_char <= fts_max_token_size) {
+			/* Add the word to the RB tree so that we can
+			calculate its frequency within a document. */
+			fts_query_add_word_freq(query, token->f_str);
+		} else {
+			ib_vector_pop(tokens);
+		}
+
+		/* we will start to store all words including stopwords
+		in the "orig_tokens" vector, but skip any leading words
+		that are stopwords */
+		if (!ib_vector_is_empty(tokens)) {
+			fts_string_t*	orig_token = static_cast<fts_string_t*>(
+				ib_vector_push(orig_tokens, NULL));
+
+			orig_token->f_str = token->f_str;
+			orig_token->f_len = token->f_len;
+		}
+	}
+
+	num_token = ib_vector_size(tokens);
+	ut_ad(ib_vector_size(orig_tokens) >= num_token);
+
+	/* Ignore empty strings. */
+	if (num_token > 0) {
+		fts_string_t*	token;
+		fts_fetch_t	fetch;
+		trx_t*		trx = query->trx;
+		fts_ast_oper_t	oper = query->oper;
+		que_t*		graph = NULL;
+		ulint		i;
+		dberr_t		error;
+
+		/* Create the vector for storing matching document ids
+		and the positions of the first token of the phrase. */
+		if (!query->matched) {
+			ib_alloc_t*	heap_alloc;	/* shadows the outer heap_alloc */
+
+			heap_alloc = ib_heap_allocator_create(heap);
+
+			if (!(query->flags & FTS_PROXIMITY)
+			    && !(query->flags & FTS_PHRASE)) {
+				query->matched = ib_vector_create(
+					heap_alloc, sizeof(fts_match_t),
+					64);
+			} else {
+				ut_a(num_token < MAX_PROXIMITY_ITEM);
+				query->match_array =
+					(ib_vector_t**) mem_heap_alloc(
+						heap,
+						num_token *
+						sizeof(query->matched));
+
+				for (i = 0; i < num_token; i++) {
+					query->match_array[i] =
+					ib_vector_create(
+						heap_alloc, sizeof(fts_match_t),
+						64);
+				}
+
+				query->matched = query->match_array[0];
+			}
+		}
+
+		/* Setup the callback args for filtering and consolidating
+		the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		for (i = 0; i < num_token; i++) {
+			/* Search for the i-th word of the phrase. */
+			token = static_cast<fts_string_t*>(
+				ib_vector_get(tokens, i));
+
+			if (query->flags & FTS_PROXIMITY
+			    || query->flags & FTS_PHRASE) {
+				query->matched = query->match_array[i];
+			}
+
+			error = fts_index_fetch_nodes(
+				trx, &graph, &query->fts_index_table,
+				token, &fetch);
+
+			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+			if (error != DB_SUCCESS) {
+				query->error = error;
+			}
+
+			fts_que_graph_free(graph);
+			graph = NULL;
+
+			fts_query_cache(query, token);
+
+			if (!(query->flags & FTS_PHRASE)
+			    && !(query->flags & FTS_PROXIMITY)) {
+				break;
+			}
+
+			/* If any of the tokens can't be found (or an error
+			occurred), there is no need to continue matching */
+			if (ib_vector_is_empty(query->match_array[i])
+			    || query->error != DB_SUCCESS) {
+				goto func_exit;
+			}
+		}
+
+		/* Just a single word, no need to fetch the original
+		documents to do phrase matching */
+		if (ib_vector_size(orig_tokens) == 1
+		    && !ib_vector_is_empty(query->match_array[0])) {
+			fts_match_t*    match;
+			ulint		n_matched;
+
+			n_matched = ib_vector_size(query->match_array[0]);
+
+			for (i = 0; i < n_matched; i++) {
+				match = static_cast<fts_match_t*>(
+					ib_vector_get(
+						query->match_array[0], i));
+
+				query->error = fts_query_process_doc_id(
+						query, match->doc_id, 0);
+				if (query->error != DB_SUCCESS) {
+					goto func_exit;
+				}
+
+				fts_query_add_word_to_document(
+					query, match->doc_id, token->f_str);
+			}
+			query->oper = oper;
+			goto func_exit;
+		}
+
+		/* If we are doing proximity search, verify the distance
+		between all words, and check they are in specified distance. */
+		if (query->flags & FTS_PROXIMITY) {
+			fts_phrase_or_proximity_search(query, tokens);
+		} else {
+			ibool	matched;
+
+			/* Phrase Search case:
+			We filter out the doc ids that don't contain
+			all the tokens in the phrase. It's cheaper to
+			search the ilist than bringing the documents in
+			and then doing a search through the text. Isolated
+			testing shows this also helps in mitigating disruption
+			of the buffer cache. */
+			matched = fts_phrase_or_proximity_search(query, tokens);
+			query->matched = query->match_array[0];
+
+			/* Read the actual text in and search for the phrase. */
+			if (matched) {
+				ut_ad(query->error == DB_SUCCESS);
+				query->error = fts_query_search_phrase(
+					query, orig_tokens, tokens);
+			}
+		}
+
+		/* Restore original operation. */
+		query->oper = oper;
+
+		if (query->error != DB_SUCCESS) {
+			goto func_exit;
+		}
+	}
+
+func_exit:
+	mem_heap_free(heap);
+
+	/* Don't need it anymore; it was allocated from the heap just freed. */
+	query->matched = NULL;
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Apply the set operation selected by query->oper to the token.
+@return query->error, DB_SUCCESS if all went well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_execute(
+/*==============*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_string_t*		token)	/*!< in: token to search */
+{
+	switch (query->oper) {
+	case FTS_NONE:
+	case FTS_NEGATE:
+	case FTS_INCR_RATING:
+	case FTS_DECR_RATING:
+		query->error = fts_query_union(query, token);
+		break;
+
+	case FTS_EXIST:
+		query->error = fts_query_intersect(query, token);
+		break;
+
+	case FTS_IGNORE:
+		query->error = fts_query_difference(query, token);
+		break;
+
+	default:
+		ut_error;	/* Any other operator is not expected here */
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Fill in token from an AST term node. For a wildcard term a copy with a
+trailing '%' is allocated with ut_malloc(); the caller must free it.
+@return ptr to allocated memory, or NULL if nothing was allocated */
+static
+byte*
+fts_query_get_token(
+/*================*/
+	fts_ast_node_t*	node,		/*!< in: the current sub tree */
+	fts_string_t*	token)		/*!< out: token to fill in */
+{
+	ulint		str_len;
+	byte*		new_ptr = NULL;
+
+	str_len = ut_strlen((char*) node->term.ptr);
+
+	ut_a(node->type == FTS_AST_TERM);
+
+	token->f_len = str_len;
+	token->f_str = node->term.ptr;
+
+	if (node->term.wildcard) {
+
+		token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
+		token->f_len = str_len + 1;
+
+		/* Copy the term including its NUL; the NUL is then
+		overwritten by '%' and a new NUL is appended. */
+		memcpy(token->f_str, node->term.ptr, str_len + 1);
+
+		token->f_str[str_len] = '%';
+		token->f_str[token->f_len] = 0;
+
+		new_ptr = token->f_str;
+	}
+
+	return(new_ptr);
+}
+
+/*****************************************************************//**
+Callback: evaluate one TEXT or TERM AST node. @return query->error */
+static
+dberr_t
+fts_query_visitor(
+/*==============*/
+	fts_ast_oper_t	oper,		/*!< in: current operator */
+	fts_ast_node_t*	node,		/*!< in: The root of the current subtree*/
+	void*		arg)		/*!< in: callback arg, a fts_query_t* */
+{
+	byte*		ptr;
+	fts_string_t	token;
+	fts_query_t*	query = static_cast<fts_query_t*>(arg);
+
+	ut_a(node);
+
+	token.f_n_char = 0;
+
+	query->oper = oper;
+
+	query->cur_node = node;
+
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		token.f_str = node->text.ptr;
+		token.f_len = ut_strlen((char*) token.f_str);
+
+		if (query->oper == FTS_EXIST) {
+			ut_ad(query->intersection == NULL);
+			query->intersection = rbt_create(
+				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+		}
+
+		/* Set the current proximity distance. */
+		query->distance = node->text.distance;
+
+		/* Force collection of doc ids and the positions. */
+		query->collect_positions = TRUE;
+
+		query->error = fts_query_phrase_search(query, &token);
+
+		query->collect_positions = FALSE;
+
+		if (query->oper == FTS_EXIST) {
+			fts_query_free_doc_ids(query, query->doc_ids);
+			query->doc_ids = query->intersection;
+			query->intersection = NULL;
+		}
+
+		break;
+
+	case FTS_AST_TERM:
+
+		/* Add the word to our RB tree that will be used to
+		calculate this term's per-document frequency. */
+		fts_query_add_word_freq(query, node->term.ptr);
+
+		ptr = fts_query_get_token(node, &token);
+		query->error = fts_query_execute(query, &token);
+
+		if (ptr) {
+			ut_free(ptr);
+		}
+		break;
+
+	default:
+		ut_error;
+	}
+
+	if (query->oper == FTS_EXIST) {
+		query->multi_exist = true;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit_sub_exp(
+/*==================*/
+	fts_ast_node_t*		node,		/*!< in,out: current root node */
+	fts_ast_callback	visitor,	/*!< in: callback function */
+	void*			arg)		/*!< in,out: arg for callback */
+{
+	fts_ast_oper_t		cur_oper;
+	fts_query_t*		query = static_cast<fts_query_t*>(arg);
+	ib_rbt_t*		parent_doc_ids;
+	ib_rbt_t*		subexpr_doc_ids;
+	dberr_t			error = DB_SUCCESS;
+	bool			will_be_ignored = false;
+	bool			multi_exist;
+
+	ut_a(node->type == FTS_AST_SUBEXP_LIST);
+
+	node = node->list.head;
+
+	if (!node || !node->next) {	/* Nothing to process */
+		return(error);
+	}
+
+	cur_oper = node->oper;
+
+	/* Save current result set */
+	parent_doc_ids = query->doc_ids;
+
+	/* Create new result set to store the sub-expression result. We
+	will merge this result set with the parent after processing. */
+	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
+				    fts_ranking_doc_id_cmp);
+
+	query->total_size += SIZEOF_RBT_CREATE;
+
+	multi_exist = query->multi_exist;
+	query->multi_exist = false;
+	/* Process nodes in current sub-expression and store its
+	result set in query->doc_ids we created above. */
+	error = fts_ast_visit(FTS_NONE, node->next, visitor,
+			      arg, &will_be_ignored);
+
+	/* Reinstate parent node state and prepare for merge. */
+	query->multi_exist = multi_exist;
+	query->oper = cur_oper;
+	subexpr_doc_ids = query->doc_ids;
+
+	/* Restore current result set. */
+	query->doc_ids = parent_doc_ids;
+
+	/* Merge the sub-expression result with the parent result set. */
+	if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) {
+		error = fts_merge_doc_ids(query, subexpr_doc_ids);
+	}
+
+	if (query->oper == FTS_EXIST) {
+		query->multi_exist = true;
+	}
+
+	/* Free the sub-expression result set; this is done whether or
+	not it was merged into the parent above. */
+	fts_query_free_doc_ids(query, subexpr_doc_ids);
+
+	return(error);
+}
+
+#if 0
+/*****************************************************************//***
+Check if the doc id exists in the ilist (compiled out by #if 0).
+@return TRUE if doc id found */
+static
+ulint
+fts_query_find_doc_id(
+/*==================*/
+	fts_select_t*	select,		/*!< in/out: contains the doc id to
+					find, we update the word freq if
+					document found */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len)		/*!< in: doc id ilist size */
+{
+	byte*		ptr = data;
+	doc_id_t	doc_id = 0;
+	ulint		decoded = 0;
+
+	/* Decode the ilist and search for selected doc_id. We also
+	calculate the frequency of the word in the document if found. */
+	while (decoded < len && !select->found) {
+		ulint		freq = 0;
+		ulint		min_pos = 0;
+		ulint		last_pos = 0;
+		ulint		pos = fts_decode_vlc(&ptr);
+
+		/* Add the delta. */
+		doc_id += pos;
+
+		while (*ptr) {
+			++freq;
+			last_pos += fts_decode_vlc(&ptr);
+
+			/* Only if min_pos is not set and the current
+			term exists in a position greater than the
+			min_pos of the previous term. */
+			if (min_pos == 0 && last_pos > select->min_pos) {
+				min_pos = last_pos;
+			}
+		}
+
+		/* Skip the end of word position marker. */
+		++ptr;
+
+		/* Bytes decoded so far. */
+		decoded = ptr - (byte*) data;
+
+		/* A word may exist in the document but we only consider a
+		match if it exists in a position that is greater than the
+		position of the previous term. */
+		if (doc_id == select->doc_id && min_pos > 0) {
+			fts_doc_freq_t*	doc_freq;
+
+			/* Add the doc id to the doc freq rb tree, if
+			the doc id doesn't exist it will be created. */
+			doc_freq = fts_query_add_doc_freq(
+				select->word_freq->doc_freqs, doc_id);
+
+			/* Avoid duplicating the frequency tally */
+			if (doc_freq->freq == 0) {
+				doc_freq->freq = freq;
+			}
+
+			select->found = TRUE;
+			select->min_pos = min_pos;
+		}
+	}
+
+	return(select->found);
+}
+#endif
+
+/*****************************************************************//**
+Decode the doc id ilist of a node and collect its doc ids.
+@return DB_SUCCESS if all went well,
+or DB_FTS_EXCEED_RESULT_CACHE_LIMIT if the size limit is exceeded */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word,		/*!< in: the current word */
+	fts_word_freq_t*word_freq,	/*!< in/out: word frequency */
+	const fts_node_t*
+			node,		/*!< in: current FTS node */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len,		/*!< in: doc id ilist size */
+	ibool		calc_doc_count)	/*!< in: whether to remember doc count */
+{
+	byte*		ptr = static_cast<byte*>(data);
+	doc_id_t	doc_id = 0;
+	ulint		decoded = 0;
+	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
+
+	/* Decode the ilist and add the doc ids to the query doc_id set. */
+	while (decoded < len) {
+		ulint		freq = 0;
+		fts_doc_freq_t*	doc_freq;
+		fts_match_t*	match = NULL;
+		ulint		last_pos = 0;
+		ulint		pos = fts_decode_vlc(&ptr);
+
+		/* Sanity check: the first decoded value must be the
+		node's first doc id. */
+		if (doc_id == 0) {
+			ut_a(pos == node->first_doc_id);
+		}
+
+		/* Add the delta. */
+		doc_id += pos;
+
+		if (calc_doc_count) {
+			word_freq->doc_count++;
+		}
+
+		/* We simply collect the matching instances here. */
+		if (query->collect_positions) {
+			ib_alloc_t*	heap_alloc;
+
+			/* Create a new fts_match_t instance. */
+			match = static_cast<fts_match_t*>(
+				ib_vector_push(query->matched, NULL));
+
+			match->start = 0;
+			match->doc_id = doc_id;
+			heap_alloc = ib_vector_allocator(query->matched);
+
+			/* Allocate from the same heap as the
+			parent container. */
+			match->positions = ib_vector_create(
+				heap_alloc, sizeof(ulint), 64);
+
+			query->total_size += sizeof(fts_match_t)
+				+ sizeof(ib_vector_t)
+				+ sizeof(ulint) * 64;
+		}
+
+		/* Unpack the positions within the document. */
+		while (*ptr) {
+			last_pos += fts_decode_vlc(&ptr);
+
+			/* Collect the matching word positions, for phrase
+			matching later. */
+			if (query->collect_positions) {
+				ib_vector_push(match->positions, &last_pos);
+			}
+
+			++freq;
+		}
+
+		/* End of list marker. */
+		last_pos = (ulint) -1;
+
+		if (query->collect_positions) {
+			ut_a(match != NULL);
+			ib_vector_push(match->positions, &last_pos);
+		}
+
+		/* Add the doc id to the doc freq rb tree, if the doc id
+		doesn't exist it will be created. */
+		doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
+
+		/* Avoid duplicating frequency tally. */
+		if (doc_freq->freq == 0) {
+			doc_freq->freq = freq;
+		}
+
+		/* Skip the end of word position marker. */
+		++ptr;
+
+		/* Bytes decoded so far */
+		decoded = ptr - (byte*) data;
+
+		/* We simply collect the matching documents and the
+		positions here and match later. */
+		if (!query->collect_positions) {
+			/* We ignore error here and will check it later */
+			fts_query_process_doc_id(query, doc_id, 0);
+
+			/* Add the word to the document's matched RB tree. */
+			fts_query_add_word_to_document(query, doc_id, word);
+		}
+	}
+
+	/* Sanity check: the last decoded doc id must be the node's last. */
+	ut_a(doc_id == node->last_doc_id);
+
+	if (query->total_size > fts_result_cache_limit) {
+		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+	} else {
+		return(DB_SUCCESS);
+	}
+}
+
+/*****************************************************************//**
+Read the FTS INDEX row.
+@return DB_SUCCESS if all went well, or the ilist filter error */
+static
+dberr_t
+fts_query_read_node(
+/*================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	word,	/*!< in: current word */
+	que_node_t*		exp)	/*!< in: query graph node */
+{
+	int			i;
+	int			ret;
+	fts_node_t		node;
+	ib_rbt_bound_t		parent;
+	fts_word_freq_t*	word_freq;
+	ibool			skip = FALSE;
+	byte			term[FTS_MAX_WORD_LEN + 1];
+	dberr_t			error = DB_SUCCESS;
+
+	ut_a(query->cur_node->type == FTS_AST_TERM ||
+	     query->cur_node->type == FTS_AST_TEXT);
+
+	memset(&node, 0, sizeof(node));
+
+	/* Need to consider the wildcard search case, the word frequency
+	is created on the search string not the actual word. So we need
+	to assign the frequency on search string behalf. */
+	if (query->cur_node->type == FTS_AST_TERM
+	    && query->cur_node->term.wildcard) {
+
+		/* The casts are only for the NUL-terminated string copy.
+		NOTE(review): no length check against sizeof(term) here. */
+		ut_strcpy((char*) term, (char*) query->cur_node->term.ptr);
+	} else {
+		/* Copy the word and NUL-terminate it. */
+		memcpy(term, word->f_str, word->f_len);
+		term[word->f_len] = 0;
+	}
+
+	/* Lookup the word in our rb tree, it must exist. */
+	ret = rbt_search(query->word_freqs, &parent, term);
+
+	ut_a(ret == 0);
+
+	word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+	/* Start from 1 since the first column has been read by the caller.
+	Also, we rely on the order of the columns projected, to filter
+	out ilists that are out of range and we always want to read
+	the doc_count irrespective of the suitability of the row. */
+
+	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT. */
+
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			word_freq->doc_count += mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node.first_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+			if (query->oper == FTS_EXIST
+			    && query->upper_doc_id > 0
+			    && node.first_doc_id > query->upper_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node.last_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+			if (query->oper == FTS_EXIST
+			    && query->lower_doc_id > 0
+			    && node.last_doc_id < query->lower_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 4: /* ILIST */
+
+			error = fts_query_filter_doc_ids(
+					query, word_freq->word, word_freq,
+					&node, data, len, FALSE);
+
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	if (!skip) {
+		/* Make sure all columns were read. */
+
+		ut_a(i == 5);
+	}
+
+	return error;
+}
+
+/*****************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE on success, FALSE if query->error was set */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_string_t	key;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+
+	key.f_str = static_cast<byte*>(data);
+	key.f_len = dfield_len;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	/* Note: we pass error out by 'query->error' */
+	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
+
+	if (query->error != DB_SUCCESS) {
+		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+		return(FALSE);
+	} else {
+		return(TRUE);
+	}
+}
+
+/*****************************************************************//**
+Calculate the inverse document frequency (IDF) for all the terms. */
+static
+void
+fts_query_calculate_idf(
+/*====================*/
+	fts_query_t*	query)	/*!< in: Query state */
+{
+	const ib_rbt_node_t*	node;
+	ib_uint64_t		total_docs = query->total_docs;
+
+	/* Walk all the words collected for this query and compute
+	the IDF of each word that occurs in at least one document. */
+	for (node = rbt_first(query->word_freqs);
+	     node;
+	     node = rbt_next(query->word_freqs, node)) {
+
+		fts_word_freq_t*	word_freq;
+
+		word_freq = rbt_value(fts_word_freq_t, node);
+
+		if (word_freq->doc_count > 0) {
+			if (total_docs == word_freq->doc_count) {
+				/* QP assumes ranking > 0 if we find
+				a match. Since Log10(1) = 0, we cannot
+				make IDF a zero value if we do find a
+				word in all documents. So let's make
+				it an arbitrary very small number */
+				word_freq->idf = log10(1.0001);
+			} else {
+				word_freq->idf = log10(
+					total_docs
+					/ (double) word_freq->doc_count);
+			}
+		}
+
+		if (fts_enable_diag_print) {
+			fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
+				" %6.5lf\n",
+			        word_freq->word,
+			        query->total_docs, word_freq->doc_count,
+			        word_freq->idf);
+		}
+	}
+}
+
+/*****************************************************************//**
+Add freq * idf * idf of every matched word to the document's rank. */
+static
+void
+fts_query_calculate_ranking(
+/*========================*/
+	const fts_query_t*	query,		/*!< in: query state */
+	fts_ranking_t*		ranking)	/*!< in/out: Document to rank */
+{
+	ulint	pos = 0;
+	byte*	word = NULL;
+
+	/* At this stage, ranking->rank should not exceed the 1.0
+	bound */
+	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+	ut_ad(query->word_map->size() == query->word_vector->size());
+
+	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+		int			ret;
+		ib_rbt_bound_t		parent;
+		double			weight;
+		fts_doc_freq_t*		doc_freq;
+		fts_word_freq_t*	word_freq;
+
+		ut_ad(word != NULL);
+		ret = rbt_search(query->word_freqs, &parent, word);
+
+		/* The word must exist in the word frequency tree. */
+		ut_a(ret == 0);
+
+		word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+		ret = rbt_search(
+			word_freq->doc_freqs, &parent, &ranking->doc_id);
+
+		/* The document must exist in the word's doc freq tree. */
+		ut_a(ret == 0);
+
+		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
+
+		weight = (double) doc_freq->freq * word_freq->idf;
+
+		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
+	}
+}
+
+/*****************************************************************//**
+Add a ranking to the tree, or add its rank to an existing entry. */
+static
+void
+fts_query_add_ranking(
+/*==================*/
+	fts_query_t*		query,		/*!< in: query state */
+	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
+	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
+	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		ranking->rank += new_ranking->rank;
+
+		ut_a(ranking->words == NULL);
+	} else {
+		rbt_add_node(ranking_tree, &parent, new_ranking);
+
+		/* Account for the new node in the result size. */
+		query->total_size += SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_ranking_t);
+	}
+}
+
+/*****************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value, 0 if no ranking value
+present (also when result or its rankings_by_id tree is NULL). */
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,	/*!< in: FTS result structure */
+	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
+{
+	ib_rbt_bound_t		parent;
+	fts_ranking_t		new_ranking;
+
+	if (!result || !result->rankings_by_id) {
+		return(0);
+	}
+
+	new_ranking.doc_id = doc_id;	/* Only the doc id is set as key */
+
+	/* Lookup the ranking in our rb tree */
+	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
+		fts_ranking_t*  ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		return(ranking->rank);
+	}
+
+	return(0);
+}
+
+/*****************************************************************//**
+Build or extend the result. @return NULL if the cache limit is hit */
+static
+fts_result_t*
+fts_query_prepare_result(
+/*=====================*/
+	fts_query_t*	query,	/*!< in: Query state */
+	fts_result_t*	result)	/*!< in: result, may be NULL; it can
+				contain data from a previous search
+				on another FTS index */
+{
+	const ib_rbt_node_t*	node;
+	bool			result_is_null = false;
+
+	if (result == NULL) {
+		result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
+
+		memset(result, 0x0, sizeof(*result));
+
+		result->rankings_by_id = rbt_create(
+			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+		query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
+		result_is_null = true;
+	}
+
+	if (query->flags == FTS_OPT_RANKING) {
+		fts_word_freq_t*	word_freq;
+		ulint		size = ib_vector_size(query->deleted->doc_ids);
+		fts_update_t*	array =
+			(fts_update_t*) query->deleted->doc_ids->data;
+
+		node = rbt_first(query->word_freqs);
+		ut_ad(node);
+		word_freq = rbt_value(fts_word_freq_t, node);
+
+		for (node = rbt_first(word_freq->doc_freqs);
+		     node;
+		     node = rbt_next(word_freq->doc_freqs, node)) {
+			fts_doc_freq_t* doc_freq;
+			fts_ranking_t	ranking;
+
+			doc_freq = rbt_value(fts_doc_freq_t, node);
+
+			/* Don't put deleted docs into result */
+			if (fts_bsearch(array, 0, size, doc_freq->doc_id)
+			    >= 0) {
+				continue;
+			}
+
+			ranking.doc_id = doc_freq->doc_id;
+			ranking.rank = doc_freq->freq * word_freq->idf
+				* word_freq->idf;
+			ranking.words = NULL;
+
+			fts_query_add_ranking(query, result->rankings_by_id,
+					      &ranking);
+
+			if (query->total_size > fts_result_cache_limit) {
+				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+				fts_query_free_result(result);
+				return(NULL);
+			}
+		}
+
+		return(result);
+	}
+
+	ut_a(rbt_size(query->doc_ids) > 0);
+
+	for (node = rbt_first(query->doc_ids);
+	     node;
+	     node = rbt_next(query->doc_ids, node)) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+		fts_query_calculate_ranking(query, ranking);
+
+		// FIXME: I think we may require this information to improve the
+		// ranking of doc ids which have more word matches from
+		// different FTS indexes.
+
+		/* We don't need the word list anymore; clear the pointer. */
+		ranking->words = NULL;
+
+		if (!result_is_null) {
+			fts_query_add_ranking(query, result->rankings_by_id, ranking);
+
+			 if (query->total_size > fts_result_cache_limit) {
+				query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+				fts_query_free_result(result);
+				return(NULL);
+                        }
+		}
+	}
+
+	if (result_is_null) {
+		/* Use doc_ids directly: the result takes over the tree */
+		rbt_free(result->rankings_by_id);
+		result->rankings_by_id = query->doc_ids;
+		query->doc_ids = NULL;
+	}
+
+	return(result);
+}
+
+/*****************************************************************//**
+Produce the result for a finished query.
+@return a newly allocated zero-filled result when the query has no doc ids
+and is not flagged FTS_OPT_RANKING; otherwise whatever
+fts_query_prepare_result() returns */
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_result_t*		result)	/*!< in: result */
+{
+	if (rbt_size(query->doc_ids) == 0
+	    && query->flags != FTS_OPT_RANKING) {
+		/* Nothing to copy: hand back an empty result instance. */
+		fts_result_t*	empty;
+
+		empty = static_cast<fts_result_t*>(ut_malloc(sizeof(*empty)));
+		memset(empty, 0, sizeof(*empty));
+
+		return(empty);
+	}
+
+	/* Copy the doc ids (or the single term rankings) to the result. */
+	return(fts_query_prepare_result(query, result));
+}
+
+/*****************************************************************//**
+Release every resource held by the query instance and zero the
+structure. Asserts that query->intersection is not set. */
+static
+void
+fts_query_free(
+/*===========*/
+	fts_query_t*	query)		/*!< in: query instance to free*/
+{
+	if (query->read_nodes_graph != NULL) {
+		fts_que_graph_free(query->read_nodes_graph);
+	}
+
+	if (query->root != NULL) {
+		fts_ast_free_node(query->root);
+	}
+
+	if (query->deleted != NULL) {
+		fts_doc_ids_free(query->deleted);
+	}
+
+	if (query->doc_ids != NULL) {
+		fts_query_free_doc_ids(query, query->doc_ids);
+	}
+
+	if (query->word_freqs != NULL) {
+		const ib_rbt_node_t*	freq_node;
+
+		/* Each word frequency entry carries its own doc_freqs
+		tree; free those before the tree that holds the entries. */
+		freq_node = rbt_first(query->word_freqs);
+
+		while (freq_node != NULL) {
+			fts_word_freq_t*	entry;
+
+			entry = rbt_value(fts_word_freq_t, freq_node);
+			rbt_free(entry->doc_freqs);
+
+			freq_node = rbt_next(query->word_freqs, freq_node);
+		}
+
+		rbt_free(query->word_freqs);
+	}
+
+	ut_a(!query->intersection);
+
+	if (query->heap != NULL) {
+		mem_heap_free(query->heap);
+	}
+
+	/* delete on a NULL pointer does nothing, so no guard is needed. */
+	delete query->word_map;
+	delete query->word_vector;
+
+	memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Run the flex/bison parser over the query string.
+@return state.root as left by the parser (and, after a parse error, by
+fts_ast_state_free()); on success the same node is stored in query->root */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+	fts_query_t*	query,		/*!< in: query instance */
+	byte*		query_str,	/*!< in: query string */
+	ulint		query_len)	/*!< in: query string length */
+{
+	fts_ast_state_t	state;
+	int		parse_err;
+	const bool	boolean_mode = query->boolean_mode;
+
+	memset(&state, 0x0, sizeof(state));
+
+	/* The scanner to use depends on the boolean mode flag. The lexer
+	is only needed while fts_parse() runs. */
+	state.lexer = fts_lexer_create(boolean_mode, query_str, query_len);
+	state.charset = query->fts_index_table.charset;
+
+	parse_err = fts_parse(&state);
+
+	fts_lexer_free(state.lexer);
+	state.lexer = NULL;
+
+	if (parse_err == 0) {
+		query->root = state.root;
+	} else {
+		/* Free the nodes that were allocated during parsing. */
+		fts_ast_state_free(&state);
+	}
+
+	return(state.root);
+}
+
+/*******************************************************************//**
+FTS Query optimization
+Set query->flags to FTS_OPT_RANKING when the parsed query is a list that
+holds exactly one term node. Nothing is changed when FTS_EXPAND is set in
+the search mode. */
+static
+void
+fts_query_can_optimize(
+/*===================*/
+	fts_query_t*	query,		/*!< in/out: query instance */
+	uint		flags)		/*!< In: FTS search mode */
+{
+	const fts_ast_node_t*	root = query->root;
+	const fts_ast_node_t*	first;
+
+	if ((flags & FTS_EXPAND) != 0) {
+		return;
+	}
+
+	ut_ad(root->type == FTS_AST_LIST);
+
+	first = root->list.head;
+
+	if (first == NULL) {
+		return;
+	}
+
+	/* A single term with no sibling, i.e. no operator involved. */
+	if (first->type == FTS_AST_TERM && first->next == NULL) {
+		query->flags = FTS_OPT_RANKING;
+	}
+}
+
+/*******************************************************************//**
+Pre-process the query string
+1) make it lower case
+2) in boolean mode, if there is '-' or '+' that is immediately preceded
+and followed by valid word, make it a space
+@return the processed string, NUL terminated and allocated with
+ut_malloc(); the caller must release it with ut_free() */
+static
+byte*
+fts_query_str_preprocess(
+/*=====================*/
+	const byte*	query_str,	/*!< in: FTS query */
+	ulint		query_len,	/*!< in: FTS query string len */
+	ulint		*result_len,	/*!< out: result string length */
+	CHARSET_INFO*	charset,	/*!< in: string charset */
+	bool		boolean_mode)	/*!< in: is boolean mode */
+{
+	ulint	cur_pos = 0;
+	ulint	str_len;
+	byte*	str_ptr;
+	bool	in_phrase = false;
+
+	/* Convert the query string to lower case before parsing. The
+	buffer is sized for the worst case growth of the conversion
+	(casedn_multiply) plus one byte for the terminating NUL. */
+
+	str_len = query_len * charset->casedn_multiply + 1;
+	str_ptr = static_cast<byte*>(ut_malloc(str_len));
+
+	*result_len = innobase_fts_casedn_str(
+		charset, const_cast<char*>(reinterpret_cast<const char*>(
+			query_str)), query_len,
+		reinterpret_cast<char*>(str_ptr), str_len);
+
+	ut_ad(*result_len < str_len);
+
+	str_ptr[*result_len] = 0;
+
+	/* If it is not boolean mode, there is no need to check for
+	'-/+': the lower-cased string is returned as is. */
+	if (!boolean_mode) {
+		return(str_ptr);
+	}
+
+	/* Otherwise, we traverse the string to find any '-/+' that are
+	immediately preceded and followed by valid search word.
+	NOTE: we should not do so for CJK languages, this should
+	be taken care of in our CJK implementation */
+        while (cur_pos < *result_len) {
+                fts_string_t    str;
+                ulint           offset;
+                ulint           cur_len;
+
+		/* Fetch the next token starting at cur_pos; str.f_str
+		is used below as the start of that token.
+		NOTE(review): cur_len is presumably the number of bytes
+		consumed up to the end of the token - confirm. */
+                cur_len = innobase_mysql_fts_get_token(
+                        charset, str_ptr + cur_pos, str_ptr + *result_len,
+			&str, &offset);
+
+                if (cur_len == 0) {
+                        break;
+                }
+
+		/* Check if we are in a phrase, if so, no need to do
+		replacement of '-/+'. Every double quote found between
+		cur_pos and the start of the token toggles the state. */
+		for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
+			if ((char) (*ptr) == '"' ) {
+				in_phrase = !in_phrase;
+			}
+		}
+
+		/* Find those that are not leading '-/+' and also not in
+		a phrase: exactly one byte lies between cur_pos and the
+		start of this token, and this is not the first token. */
+		if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
+		    && !in_phrase) {
+			char*	last_op = reinterpret_cast<char*>(
+						str_ptr + cur_pos);
+
+			if (*last_op == '-' || *last_op == '+') {
+				*last_op = ' ';
+			}
+		}
+
+                cur_pos += cur_len;
+	}
+
+	return(str_ptr);
+}
+
+/*******************************************************************//**
+FTS Query entry point. The query runs on a background transaction that is
+allocated here and freed before returning.
+On return *result is NULL when an error occurred before or during query
+execution; when the query string could not be parsed, *result is an empty
+(zero-filled) result and DB_SUCCESS is returned.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+dberr_t
+fts_query(
+/*======*/
+	trx_t*		trx,		/*!< in: transaction; only used
+					when FTS_DOC_STATS_DEBUG is
+					defined */
+	dict_index_t*	index,		/*!< in: The FTS index to search */
+	uint		flags,		/*!< in: FTS search mode */
+	const byte*	query_str,	/*!< in: FTS query */
+	ulint		query_len,	/*!< in: FTS query string len
+					in bytes */
+	fts_result_t**	result)		/*!< in/out: result doc ids */
+{
+	fts_query_t	query;
+	dberr_t		error = DB_SUCCESS;
+	byte*		lc_query_str;
+	ulint		result_len;
+	bool		boolean_mode;
+	trx_t*		query_trx;
+	CHARSET_INFO*	charset;
+	ulint		start_time_ms;
+	bool		will_be_ignored = false;
+
+	boolean_mode = flags & FTS_BOOL;
+
+	*result = NULL;
+	memset(&query, 0x0, sizeof(query));
+	query_trx = trx_allocate_for_background();
+	query_trx->op_info = "FTS query";
+
+	start_time_ms = ut_time_ms();
+
+	query.trx = query_trx;
+	query.index = index;
+	query.boolean_mode = boolean_mode;
+	query.deleted = fts_doc_ids_create();
+	query.cur_node = NULL;
+
+	query.fts_common_table.type = FTS_COMMON_TABLE;
+	query.fts_common_table.table_id = index->table->id;
+	query.fts_common_table.parent = index->table->name;
+
+	charset = fts_index_get_charset(index);
+
+	query.fts_index_table.type = FTS_INDEX_TABLE;
+	query.fts_index_table.index_id = index->id;
+	query.fts_index_table.table_id = index->table->id;
+	query.fts_index_table.parent = index->table->name;
+	query.fts_index_table.charset = charset;
+
+	query.word_map = new word_map_t;
+	query.word_vector = new word_vector_t;
+	query.error = DB_SUCCESS;
+
+	/* Setup the RB tree that will be used to collect per term
+	statistics. */
+	query.word_freqs = rbt_create_arg_cmp(
+		sizeof(fts_word_freq_t), innobase_fts_string_cmp, (void*) charset);
+
+	query.total_size += SIZEOF_RBT_CREATE;
+
+	query.total_docs = dict_table_get_n_rows(index->table);
+
+#ifdef FTS_DOC_STATS_DEBUG
+	/* NOTE(review): this block tests ft_enable_diag_print while the
+	diagnostics further down test fts_enable_diag_print - confirm
+	which spelling is intended. */
+	if (ft_enable_diag_print) {
+		error = fts_get_total_word_count(
+			trx, query.index, &query.total_words);
+
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		}
+
+		fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
+			query.total_docs, query.total_words);
+	}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	query.fts_common_table.suffix = "DELETED";
+
+	/* Read the deleted doc_ids, we need these for filtering. */
+	error = fts_table_fetch_doc_ids(
+		NULL, &query.fts_common_table, query.deleted);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	query.fts_common_table.suffix = "DELETED_CACHE";
+
+	error = fts_table_fetch_doc_ids(
+		NULL, &query.fts_common_table, query.deleted);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	/* Get the deleted doc ids that are in the cache. */
+	fts_cache_append_deleted_doc_ids(
+		index->table->fts->cache, query.deleted->doc_ids);
+
+	/* Sort the vector so that we can do a binary search over the ids. */
+	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
+
+	/* Lower-case (and, in boolean mode, clean up) the query string.
+	We own the ut_malloc'ed result and free it below once parsing and
+	execution are done. */
+	lc_query_str = fts_query_str_preprocess(
+		query_str, query_len, &result_len, charset, boolean_mode);
+
+	query.heap = mem_heap_create(128);
+
+	/* Create the rb tree for the doc id (current) set. */
+	query.doc_ids = rbt_create(
+		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+	query.total_size += SIZEOF_RBT_CREATE;
+
+	/* Parse the input query string. */
+	if (fts_query_parse(&query, lc_query_str, result_len)) {
+		fts_ast_node_t*	ast = query.root;
+
+		/* Optimize query to check if it's a single term */
+		fts_query_can_optimize(&query, flags);
+
+		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+			        fts_result_cache_limit = 2048;
+		);
+
+		/* Traverse the Abstract Syntax Tree (AST) and execute
+		the query. */
+		query.error = fts_ast_visit(
+			FTS_NONE, ast, fts_query_visitor,
+			&query, &will_be_ignored);
+
+		/* If query expansion is requested, extend the search
+		with first search pass result */
+		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+			query.error = fts_expand_query(index, &query);
+		}
+
+		/* Calculate the inverse document frequency of the terms. */
+		if (query.error == DB_SUCCESS) {
+			fts_query_calculate_idf(&query);
+		}
+
+		/* Copy the result from the query state, so that we can
+		return it to the caller. */
+		if (query.error == DB_SUCCESS) {
+			*result = fts_query_get_result(&query, *result);
+		}
+
+		error = query.error;
+	} else {
+		/* still return an empty result set */
+		*result = static_cast<fts_result_t*>(
+			ut_malloc(sizeof(**result)));
+		memset(*result, 0, sizeof(**result));
+	}
+
+	ut_free(lc_query_str);
+
+	if (fts_enable_diag_print && (*result)) {
+		ulint	diff_time = ut_time_ms() - start_time_ms;
+
+		/* diff_time is unsigned, hence %lu. */
+		fprintf(stderr, "FTS Search Processing time: %lu secs:"
+				" %lu millisec: row(s) %d \n",
+			diff_time / 1000, diff_time % 1000,
+			(*result)->rankings_by_id
+				? (int) rbt_size((*result)->rankings_by_id)
+				: -1);
+
+		/* Log memory consumption & result size */
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Full Search Memory: "
+			"%lu (bytes),  Row: %lu .",
+			query.total_size,
+			(*result)->rankings_by_id
+				?  rbt_size((*result)->rankings_by_id)
+				: 0);
+	}
+
+func_exit:
+	fts_query_free(&query);
+
+	trx_free_for_background(query_trx);
+
+	return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query(). Frees both ranking trees
+(when present) and then the result structure itself. The argument is
+passed by value, so the caller's own pointer is not reset and must not be
+used afterwards. */
+
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result)		/*!< in: result instance to free,
+					may be NULL (nothing is done) */
+{
+	if (result == NULL) {
+		return;
+	}
+
+	if (result->rankings_by_id != NULL) {
+		rbt_free(result->rankings_by_id);
+		result->rankings_by_id = NULL;
+	}
+
+	if (result->rankings_by_rank != NULL) {
+		rbt_free(result->rankings_by_rank);
+		result->rankings_by_rank = NULL;
+	}
+
+	ut_free(result);
+}
+
+/*****************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank.
+Builds result->rankings_by_rank by inserting every entry of
+result->rankings_by_id into a tree created with fts_query_compare_rank.
+A rankings_by_rank tree from an earlier call is freed first. */
+
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result)		/*!< in/out: result instance to sort.*/
+{
+	ib_rbt_t*		by_rank;
+	const ib_rbt_node_t*	cursor;
+
+	ut_a(result->rankings_by_id != NULL);
+
+	if (result->rankings_by_rank != NULL) {
+		rbt_free(result->rankings_by_rank);
+	}
+
+	by_rank = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
+
+	/* Insert each ranking into the new tree; the ranking serves as
+	both the key and the value. */
+	cursor = rbt_first(result->rankings_by_id);
+
+	while (cursor != NULL) {
+		fts_ranking_t*	entry;
+
+		entry = rbt_value(fts_ranking_t, cursor);
+
+		ut_a(entry->words == NULL);
+
+		rbt_insert(by_rank, entry, entry);
+
+		cursor = rbt_next(result->rankings_by_id, cursor);
+	}
+
+	/* Reset the current node too. */
+	result->current = NULL;
+	result->rankings_by_rank = by_rank;
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+A debug function to print result doc_id set: for every entry of
+query->doc_ids the doc id is written to stderr, followed by each word
+returned by fts_ranking_words_get_next() for that entry. */
+static
+void
+fts_print_doc_id(
+/*=============*/
+	fts_query_t*	query)	/*!< in : tree that stores doc_ids.*/
+{
+	const ib_rbt_node_t*	node;
+
+	/* Iterate each member of the doc_id set */
+	for (node = rbt_first(query->doc_ids);
+	     node;
+	     node = rbt_next(query->doc_ids, node)) {
+		fts_ranking_t*	ranking;
+		ranking = rbt_value(fts_ranking_t, node);
+
+		/* The value is cast to the unsigned ulint, hence %lu. */
+		fprintf(stderr, "doc_ids info, doc_id: %lu \n",
+			(ulint) ranking->doc_id);
+
+		ulint	pos = 0;
+		byte*	value = NULL;
+		while (fts_ranking_words_get_next(query, ranking, &pos, &value)) {
+			fprintf(stderr, "doc_ids info, value: %s \n", value);
+		}
+	}
+}
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the document again, thus "expand"
+the search result set.
+Memory taken by the collected words is added to query->total_size; the
+search is abandoned with DB_FTS_EXCEED_RESULT_CACHE_LIMIT once that grows
+past fts_result_cache_limit.
+@return DB_SUCCESS if success, otherwise the error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_expand_query(
+/*=============*/
+	dict_index_t*	index,		/*!< in: FTS index to search */
+	fts_query_t*	query)		/*!< in/out: FTS query instance */
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_node_t*	token_node;
+	fts_doc_t		result_doc;
+	dberr_t			error = DB_SUCCESS;
+	const fts_index_cache_t*index_cache;
+
+	/* If no doc is found in first search pass, return */
+	if (!rbt_size(query->doc_ids)) {
+		return(error);
+	}
+
+	/* Init "result_doc", to hold words from the first search pass */
+	fts_doc_init(&result_doc);
+
+	rw_lock_x_lock(&index->table->fts->cache->lock);
+	index_cache = fts_find_index_cache(index->table->fts->cache, index);
+	rw_lock_x_unlock(&index->table->fts->cache->lock);
+
+	ut_a(index_cache);
+
+	result_doc.tokens = rbt_create_arg_cmp(
+		sizeof(fts_token_t), innobase_fts_text_cmp,
+		(void *)index_cache->charset);
+
+	result_doc.charset = index_cache->charset;
+
+	query->total_size += SIZEOF_RBT_CREATE;
+#ifdef UNIV_DEBUG
+	fts_print_doc_id(query);
+#endif
+
+	for (node = rbt_first(query->doc_ids);
+	     node;
+	     node = rbt_next(query->doc_ids, node)) {
+
+		fts_ranking_t*	ranking;
+		ulint		pos;
+		byte*		word;
+		ulint		prev_token_size;
+		ulint		estimate_size;
+
+		prev_token_size = rbt_size(result_doc.tokens);
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		/* Fetch the documents with the doc_id from the
+		result of first search pass. Since we do not
+		store document-to-word mapping, we need to
+		fetch the original document and parse them.
+		Future optimization could be done here if we
+		support some forms of document-to-word mapping */
+		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
+					FTS_FETCH_DOC_BY_ID_EQUAL,
+					fts_query_expansion_fetch_doc,
+					&result_doc);
+
+		/* Remove words that have already been searched in the
+		first pass */
+		pos = 0;
+		word = NULL;
+		while (fts_ranking_words_get_next(query, ranking, &pos,
+			&word)) {
+			fts_string_t	str;
+			ibool		ret;
+
+			/* FIXME: We are discarding a const qualifier here. */
+			str.f_str = word;
+			str.f_len = ut_strlen((const char*) str.f_str);
+			ret = rbt_delete(result_doc.tokens, &str);
+
+			/* The word must exist in the doc we found. The
+			doc id is cast to the unsigned ulint, hence %lu. */
+			if (!ret) {
+				fprintf(stderr, " InnoDB: Error: Did not "
+					"find word %s in doc %lu for query "
+					"expansion search.\n", str.f_str,
+					(ulint) ranking->doc_id);
+			}
+		}
+
+		/* Estimate memory used, see fts_process_token and fts_token_t.
+		   We ignore token size here. */
+		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
+			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
+			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
+		query->total_size += estimate_size;
+
+		if (query->total_size > fts_result_cache_limit) {
+			error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+			goto	func_exit;
+		}
+	}
+
+	/* Search the table the second time with expanded search list */
+	for (token_node = rbt_first(result_doc.tokens);
+	     token_node;
+	     token_node = rbt_next(result_doc.tokens, token_node)) {
+		fts_token_t*	mytoken;
+		mytoken = rbt_value(fts_token_t, token_node);
+
+		fts_query_add_word_freq(query, mytoken->text.f_str);
+		error = fts_query_union(query, &mytoken->text);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+	}
+
+func_exit:
+	fts_doc_free(&result_doc);
+
+	return(error);
+}
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. And if proximity search, verify
+the words are close enough to each other, as in specified distance.
+This function is called for phrase and proximity search.
+For a phrase search (FTS_PHRASE set in query->flags), entries of
+query->match_array[0] that turn out not to be shared by all words get
+their doc_id overwritten with 0.
+@return TRUE if documents are found, FALSE if otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+	fts_query_t*	query,		/*!< in/out:  query instance.
+					query->doc_ids might be instantiated
+					with qualified doc IDs */
+	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
+{
+	ulint		n_matched;
+	ulint		i;
+	ibool		matched = FALSE;
+	ulint		num_token = ib_vector_size(tokens);
+	/* NOTE(review): match[] holds MAX_PROXIMITY_ITEM entries and is
+	indexed up to num_token - 1; nothing in this function checks
+	num_token against that size - confirm the callers bound it. */
+	fts_match_t*	match[MAX_PROXIMITY_ITEM];
+	ibool		end_list = FALSE;
+
+	/* Number of matched documents for the first token */
+	n_matched = ib_vector_size(query->match_array[0]);
+
+	/* We have a set of match list for each word, we shall
+	walk through the list and find common documents that
+	contain all the matching words. */
+	for (i = 0; i < n_matched; i++) {
+		ulint		j;
+		ulint		k = 0;
+		fts_proximity_t	qualified_pos;
+		ulint		qualified_pos_buf[MAX_PROXIMITY_ITEM * 2];
+
+		/* The first half of the buffer receives the minimum
+		positions, the second half the maximum positions. */
+		qualified_pos.min_pos = &qualified_pos_buf[0];
+		qualified_pos.max_pos = &qualified_pos_buf[MAX_PROXIMITY_ITEM];
+
+		match[0] = static_cast<fts_match_t*>(
+			ib_vector_get(query->match_array[0], i));
+
+		/* For remaining match list for the token(word), we
+		try to see if there is a document with the same
+		doc id */
+		for (j = 1; j < num_token; j++) {
+			match[j] = static_cast<fts_match_t*>(
+				ib_vector_get(query->match_array[j], k));
+
+			/* Walk forward through word j's list while its
+			doc id is below the one we look for. match[j] is
+			loaded before k is incremented, so after the loop
+			match[j] is the element at index k - 1 (or at 0
+			when the loop body never ran). */
+			while (match[j]->doc_id < match[0]->doc_id
+			       && k < ib_vector_size(query->match_array[j])) {
+				 match[j] = static_cast<fts_match_t*>(
+					ib_vector_get(
+						query->match_array[j], k));
+				k++;
+			}
+
+			/* Word j's list has already moved past the doc:
+			the doc does not contain word j. */
+			if (match[j]->doc_id > match[0]->doc_id) {
+				/* no match */
+				if (query->flags & FTS_PHRASE) {
+					match[0]->doc_id = 0;
+				}
+				break;
+			}
+
+			/* Word j's list has been consumed completely. */
+			if (k == ib_vector_size(query->match_array[j])) {
+				end_list = TRUE;
+
+				if (match[j]->doc_id != match[0]->doc_id) {
+					/* no match: neither this doc nor
+					any later doc of the first word
+					can contain word j. For a phrase
+					search all of them are zeroed. */
+					if (query->flags & FTS_PHRASE) {
+						ulint	s;
+
+						match[0]->doc_id = 0;
+
+						for (s = i + 1; s < n_matched;
+						     s++) {
+							match[0] = static_cast<
+							fts_match_t*>(
+							ib_vector_get(
+							query->match_array[0],
+							s));
+							match[0]->doc_id = 0;
+						}
+					}
+
+					goto func_exit;
+				}
+			}
+
+			/* FIXME: A better solution will be a counter array
+			remember each run's last position. So we don't
+			reset it here every time */
+			k = 0;
+		}
+
+		/* The inner loop was left early: some word is missing
+		from this doc, go on with the next doc. */
+		if (j != num_token) {
+			continue;
+		}
+
+		/* For this matching doc, we need to further
+		verify whether the words in the doc are close
+		to each other, and within the distance specified
+		in the proximity search */
+		if (query->flags & FTS_PHRASE) {
+			matched = TRUE;
+		} else if (fts_proximity_get_positions(
+			match, num_token, ULINT_MAX, &qualified_pos)) {
+
+			/* Fetch the original documents and count the
+			words in between matching words to see that is in
+			specified distance */
+			if (fts_query_is_in_proximity_range(
+				query, match, &qualified_pos)) {
+				/* If so, mark we find a matching doc */
+				query->error = fts_query_process_doc_id(
+					query, match[0]->doc_id, 0);
+				if (query->error != DB_SUCCESS) {
+					matched = FALSE;
+					goto func_exit;
+				}
+
+				matched = TRUE;
+				/* Record every searched word against the
+				matching document. */
+				for (ulint z = 0; z < num_token; z++) {
+					fts_string_t*	token;
+					token = static_cast<fts_string_t*>(
+						ib_vector_get(tokens, z));
+					fts_query_add_word_to_document(
+						query, match[0]->doc_id,
+						token->f_str);
+				}
+			}
+		}
+
+		/* One of the word lists is exhausted, so no further doc
+		of the first word can match. */
+		if (end_list) {
+			break;
+		}
+	}
+
+func_exit:
+	return(matched);
+}
+
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+At most MAX_PROXIMITY_ITEM ranges are recorded, because that is the number
+of slots the min_pos and max_pos buffers are expected to provide; the walk
+stops once they are full, so further qualifying ranges are not reported.
+@return true if words are close to each other, false if otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+	fts_match_t**		match,		/*!< in: query instance */
+	ulint			num_match,	/*!< in: number of matching
+						items */
+	ulint			distance,	/*!< in: distance value
+						for proximity search */
+	fts_proximity_t*	qualified_pos)	/*!< out: the position info
+						records ranges containing
+						all matching words. */
+{
+	ulint	i;
+	ulint	idx[MAX_PROXIMITY_ITEM];
+	ulint	num_pos[MAX_PROXIMITY_ITEM];
+	ulint	min_idx;
+
+	qualified_pos->n_pos = 0;
+
+	ut_a(num_match < MAX_PROXIMITY_ITEM);
+
+	/* Each word could appear multiple times in a doc. So
+	we need to walk through each word's position list, and find
+	closest distance between different words to see if
+	they are in the proximity distance. */
+
+	/* Assume each word's position list is sorted, we
+	will just do a walk through to all words' lists
+	similar to a the merge phase of a merge sort */
+	for (i = 0; i < num_match; i++) {
+		/* idx is the current position we are checking
+		for a particular word */
+		idx[i] = 0;
+
+		/* Number of positions for this word */
+		num_pos[i] = ib_vector_size(match[i]->positions);
+	}
+
+	/* Start with the first word */
+	min_idx = 0;
+
+	while (idx[min_idx] < num_pos[min_idx]) {
+		ulint	position[MAX_PROXIMITY_ITEM];
+		ulint	min_pos = ULINT_MAX;
+		ulint	max_pos = 0;
+
+		/* Check positions in each word position list, and
+		record the max/min position */
+		for (i = 0; i < num_match; i++) {
+			position[i] = *(ulint*) ib_vector_get_const(
+				match[i]->positions, idx[i]);
+
+			if (position[i] == ULINT_UNDEFINED) {
+				break;
+			}
+
+			if (position[i] < min_pos) {
+				min_pos = position[i];
+				min_idx = i;
+			}
+
+			if (position[i] > max_pos) {
+				max_pos = position[i];
+			}
+		}
+
+		/* If max and min position are within range, we
+		find a good match. The second condition rejects the case
+		where the scan above stopped at an undefined position. */
+		if (max_pos - min_pos <= distance
+		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
+			/* The charset has variable character
+			length encoding, record the min_pos and
+			max_pos, we will need to verify the actual
+			number of characters */
+			qualified_pos->min_pos[qualified_pos->n_pos] = min_pos;
+			qualified_pos->max_pos[qualified_pos->n_pos] = max_pos;
+			qualified_pos->n_pos++;
+
+			/* Both output buffers are full: stop here rather
+			than write past their end. */
+			if (qualified_pos->n_pos >= MAX_PROXIMITY_ITEM) {
+				break;
+			}
+		}
+
+		/* Otherwise, move to the next position is the
+		list for the word with the smallest position */
+		idx[min_idx]++;
+	}
+
+	ut_ad(qualified_pos->n_pos <= MAX_PROXIMITY_ITEM);
+
+	return(qualified_pos->n_pos != 0);
+}
diff --git a/storage/xtradb/fts/fts0sql.cc b/storage/xtradb/fts/fts0sql.cc
new file mode 100644
index 00000000000..03c19d93af6
--- /dev/null
+++ b/storage/xtradb/fts/fts0sql.cc
@@ -0,0 +1,355 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0sql.cc
+Full Text Search functionality.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#include "que0que.h"
+#include "trx0roll.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** SQL statements for creating the ancillary FTS tables. %s must be replaced
+with the indexed table's id.
+NOTE(review): no such statements are defined in this file; only the
+wrapper text below is - confirm whether this comment is still needed. */
+
+/** Preamble to all SQL statements: the text placed in front of the
+statement before it is passed to pars_sql(). */
+static const char* fts_sql_begin=
+	"PROCEDURE P() IS\n";
+
+/** Postamble to non-committing SQL statements: the text placed after the
+statement before it is passed to pars_sql(). */
+static const char* fts_sql_end=
+	"\n"
+	"END;\n";
+
+/******************************************************************//**
+Write the id part of an FTS auxiliary table name into table_id: the
+table id for a common table, or the table id, an underscore and the index
+id for an index table. Any other table type is a fatal error.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+	const fts_table_t*
+			fts_table,	/*!< in: FTS Auxiliary table */
+	char*		table_id)	/*!< out: table id, must be at least
+					FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+					long */
+{
+	int		written;
+
+	if (fts_table->type == FTS_COMMON_TABLE) {
+
+		written = fts_write_object_id(fts_table->table_id, table_id);
+
+	} else if (fts_table->type == FTS_INDEX_TABLE) {
+		char*	index_part;
+
+		written = fts_write_object_id(fts_table->table_id, table_id);
+
+		/* Separate the two ids with an underscore and continue
+		writing right behind it. */
+		table_id[written] = '_';
+		++written;
+		index_part = table_id + written;
+
+		written += fts_write_object_id(fts_table->index_id, index_part);
+
+	} else {
+		ut_error;
+	}
+
+	ut_a(written >= 16);
+	ut_a(written < FTS_AUX_MIN_TABLE_ID_LENGTH);
+
+	return(written);
+}
+
+/******************************************************************//**
+Construct the prefix name of an FTS table: the part of the parent table
+name up to and including the first '/' (if there is one), followed by
+"FTS_" and the id written by fts_get_table_id().
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+	const fts_table_t*
+			fts_table)	/*!< in: Auxiliary table type */
+{
+	char		id_buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
+	const char*	separator = strchr(fts_table->parent, '/');
+	int		db_len = 0;
+	int		id_len;
+	int		buf_len;
+	int		printed;
+	char*		prefix;
+
+	if (separator != NULL) {
+		/* Print up to and including the separator. */
+		db_len = static_cast<int>(separator - fts_table->parent) + 1;
+	}
+
+	id_len = fts_get_table_id(fts_table, id_buf);
+
+	/* database part + "FTS_" + id + terminating NUL */
+	buf_len = db_len + 4 + id_len + 1;
+
+	prefix = static_cast<char*>(mem_alloc(buf_len));
+
+	printed = sprintf(prefix, "%.*sFTS_%s",
+			  db_len, fts_table->parent, id_buf);
+
+	ut_a(printed > 0);
+	ut_a(printed == buf_len - 1);
+
+	return(prefix);
+}
+
+/******************************************************************//**
+Construct the name of an ancillary FTS table: the prefix returned by
+fts_get_table_name_prefix(), an underscore and fts_table->suffix.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+	const fts_table_t*	fts_table)
+					/*!< in: Auxiliary table type */
+{
+	char*	prefix = fts_get_table_name_prefix(fts_table);
+	char*	full_name;
+	int	buf_len;
+	int	printed;
+
+	/* prefix + '_' + suffix + terminating NUL */
+	buf_len = static_cast<int>(
+		strlen(prefix) + 1 + strlen(fts_table->suffix) + 1);
+
+	full_name = static_cast<char*>(mem_alloc(buf_len));
+
+	printed = sprintf(full_name, "%s_%s", prefix, fts_table->suffix);
+
+	ut_a(printed > 0);
+	ut_a(printed == buf_len - 1);
+
+	/* The prefix was only needed to build the full name. */
+	mem_free(prefix);
+
+	return(full_name);
+}
+
+/******************************************************************//**
+Parse an SQL string. When fts_table is given, %s in the string is replaced
+with the name of that auxiliary table. The statement is wrapped in
+fts_sql_begin and fts_sql_end before it is parsed. The dictionary mutex is
+taken around the parser call unless the table's fts_status has
+TABLE_DICT_LOCKED set.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS auxiliary table info,
+					or NULL */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+{
+	char*		str;
+	que_t*		graph;
+	ibool		dict_locked;
+
+	if (fts_table != NULL) {
+		char*	table_name;
+		char*	str_tmp;
+
+		table_name = fts_get_table_name(fts_table);
+		str_tmp = ut_strreplace(sql, "%s", table_name);
+		mem_free(table_name);
+
+		str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+		mem_free(str_tmp);
+	} else {
+		/* Nothing to substitute: wrap the statement as it is,
+		without taking a temporary copy of it first. */
+		str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+	}
+
+	dict_locked = (fts_table && fts_table->table
+		       && (fts_table->table->fts->fts_status
+			   & TABLE_DICT_LOCKED));
+
+	if (!dict_locked) {
+		ut_ad(!mutex_own(&(dict_sys->mutex)));
+
+		/* The InnoDB SQL parser is not re-entrant. */
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	graph = pars_sql(info, str);
+	ut_a(graph);
+
+	if (!dict_locked) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	mem_free(str);
+
+	return(graph);
+}
+
+/******************************************************************//**
+Parse an SQL string without touching the dictionary mutex, which the
+caller is expected to hold already (checked with ut_ad()). When fts_table
+is given, %s in the string is replaced with the name of that auxiliary
+table. The statement is wrapped in fts_sql_begin and fts_sql_end before it
+is parsed.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table info,
+					or NULL */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+{
+	char*		str;
+	que_t*		graph;
+	char*		str_tmp = NULL;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	if (fts_table != NULL) {
+		char*		table_name;
+
+		table_name = fts_get_table_name(fts_table);
+		str_tmp = ut_strreplace(sql, "%s", table_name);
+		mem_free(table_name);
+	}
+
+	if (str_tmp != NULL) {
+		str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+		mem_free(str_tmp);
+	} else {
+		/* Nothing was substituted: wrap the statement as is. */
+		str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+	}
+
+	graph = pars_sql(info, str);
+	ut_a(graph);
+
+	mem_free(str);
+
+	return(graph);
+}
+
+/******************************************************************//**
+Evaluate an SQL query graph: attach the transaction to the graph, start
+it and run its threads.
+@return DB_SUCCESS or error code, as found in trx->error_state afterwards */
+UNIV_INTERN
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Query graph to evaluate */
+{
+	que_thr_t*	thr;
+
+	graph->trx = trx;
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	/* Start the graph first and assert on the outcome afterwards, so
+	that the call is not hidden inside the assertion's argument. */
+	thr = que_fork_start_command(graph);
+	ut_a(thr);
+
+	que_run_threads(thr);
+
+	return(trx->error_state);
+}
+
+/******************************************************************//**
+Construct the column list part of the SQL string for selecting the
+indexed FTS columns of the given index. One id per column is bound in
+'info', and the list of "$id" references is returned. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return column list string built with mem_heap_printf() on 'heap'; the
+literal "" when the index has no user defined columns */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*   index,		/*!< in: index */
+	pars_info_t*    info,		/*!< in/out: parser info */
+	mem_heap_t*     heap)		/*!< in: memory heap */
+{
+	const char*	columns = "";
+	ulint		col = 0;
+
+	while (col < index->n_user_defined_cols) {
+		dict_field_t*	field = dict_index_get_nth_field(index, col);
+		char*		bound_id;
+		const char*	separator;
+
+		bound_id = mem_heap_printf(heap, "sel%lu", (ulong) col);
+
+		/* Set copy_name to TRUE since it's dynamic. */
+		pars_info_bind_id(info, TRUE, bound_id, field->name);
+
+		/* Every reference after the first one is preceded by a
+		comma. */
+		separator = (*columns != '\0') ? ", " : "";
+
+		columns = mem_heap_printf(
+			heap, "%s%s$%s", columns, separator, bound_id);
+
+		col++;
+	}
+
+	return(columns);
+}
+
+/******************************************************************//**
+Commit a transaction. A failing commit is treated as a fatal error.
+@return DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+fts_sql_commit(
+/*===========*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	const dberr_t	commit_err = trx_commit_for_mysql(trx);
+
+	/* Commit should always succeed */
+	ut_a(commit_err == DB_SUCCESS);
+
+	return(DB_SUCCESS);
+}
+
+/******************************************************************//**
+Rollback a transaction by calling trx_rollback_to_savepoint() with a NULL
+savepoint.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_sql_rollback(
+/*=============*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	const dberr_t	rollback_err = trx_rollback_to_savepoint(trx, NULL);
+
+	return(rollback_err);
+}
diff --git a/storage/xtradb/fts/fts0tlex.cc b/storage/xtradb/fts/fts0tlex.cc
new file mode 100644
index 00000000000..717ddb8a77e
--- /dev/null
+++ b/storage/xtradb/fts/fts0tlex.cc
@@ -0,0 +1,1951 @@
+#include "univ.i"
+#line 2 "fts0tlex.cc"
+
+#line 4 "fts0tlex.cc"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index.  If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition.  This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state.  The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0trestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+    #define YY_LESS_LINENO(n)
+    
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		*yy_cp = yyg->yy_hold_char; \
+		YY_RESTORE_YY_MORE_OFFSET \
+		yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+		YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+		} \
+	while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+	/* When an EOF's been seen but there's still some text to process
+	 * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+	 * shouldn't try reading from the input source any more.  We might
+	 * still have a bunch of tokens to match, though, because of
+	 * possible backing-up.
+	 *
+	 * When we actually see the EOF, we change the status to "new"
+	 * (via fts0trestart()), so that the user can continue scanning by
+	 * just pointing yyin at a new input file.
+	 */
+#define YY_BUFFER_EOF_PENDING 2
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+                          ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+                          : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0tensure_buffer_stack (yyscan_t yyscanner );
+static void fts0t_load_buffer_state (yyscan_t yyscanner );
+static void fts0t_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0t_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+void *fts0trealloc (void *,yy_size_t ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+void fts0tfree (void * ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+
+#define yy_new_buffer fts0t_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){ \
+        fts0tensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+	}
+
+#define yy_set_bol(at_bol) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){\
+        fts0tensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+	}
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state  ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+	yyg->yytext_ptr = yy_bp; \
+	yyleng = (size_t) (yy_cp - yy_bp); \
+	yyg->yy_hold_char = *yy_cp; \
+	*yy_cp = '\0'; \
+	yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+   but its presence is necessary. */
+struct yy_trans_info
+	{
+	flex_int32_t yy_verify;
+	flex_int32_t yy_nxt;
+	};
+static yyconst flex_int16_t yy_accept[17] =
+    {   0,
+        4,    4,    8,    4,    1,    6,    1,    5,    5,    2,
+        4,    1,    1,    0,    3,    0
+    } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    4,    1,    5,    1,    1,    6,    1,    1,    1,
+        1,    7,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1
+    } ;
+
+static yyconst flex_int32_t yy_meta[8] =
+    {   0,
+        1,    2,    3,    4,    5,    5,    1
+    } ;
+
+static yyconst flex_int16_t yy_base[20] =
+    {   0,
+        0,    0,   18,    0,    6,   21,    0,    9,   21,    0,
+        0,    0,    0,    4,   21,   21,   10,   11,   15
+    } ;
+
+static yyconst flex_int16_t yy_def[20] =
+    {   0,
+       16,    1,   16,   17,   17,   16,   18,   19,   16,   17,
+       17,    5,   18,   19,   16,    0,   16,   16,   16
+    } ;
+
+static yyconst flex_int16_t yy_nxt[29] =
+    {   0,
+        4,    5,    6,    7,    8,    9,   10,   12,   15,   13,
+       11,   11,   13,   15,   13,   14,   14,   16,   14,   14,
+        3,   16,   16,   16,   16,   16,   16,   16
+    } ;
+
+static yyconst flex_int16_t yy_chk[29] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    5,   14,    5,
+       17,   17,   18,    8,   18,   19,   19,    3,   19,   19,
+       16,   16,   16,   16,   16,   16,   16,   16
+    } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0tlex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0tlex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1
+#line 480 "fts0tlex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+    {
+
+    /* User-defined. Not touched by flex. */
+    YY_EXTRA_TYPE yyextra_r;
+
+    /* The rest are the same as the globals declared in the non-reentrant scanner. */
+    FILE *yyin_r, *yyout_r;
+    size_t yy_buffer_stack_top; /**< index of top of stack. */
+    size_t yy_buffer_stack_max; /**< capacity of stack. */
+    YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+    char yy_hold_char;
+    int yy_n_chars;
+    int yyleng_r;
+    char *yy_c_buf_p;
+    int yy_init;
+    int yy_start;
+    int yy_did_buffer_switch_on_eof;
+    int yy_start_stack_ptr;
+    int yy_start_stack_depth;
+    int *yy_start_stack;
+    yy_state_type yy_last_accepting_state;
+    char* yy_last_accepting_cpos;
+
+    int yylineno_r;
+    int yy_flex_debug_r;
+
+    char *yytext_r;
+    int yy_more_flag;
+    int yy_more_len;
+
+    }; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,      yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+	YY_USER_ACTION
+
+/** The main scanner function (fts_tlexer() via YY_DECL): returns the matched
+ * character (val->oper), FTS_TEXT or FTS_TERM, and YY_NULL at end of input. */
+YY_DECL
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp, *yy_bp;
+	register int yy_act;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+#line 44 "fts0tlex.l"
+
+
+#line 707 "fts0tlex.cc"
+
+	if ( !yyg->yy_init )
+		{
+		yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+		YY_USER_INIT;
+#endif
+
+		if ( ! yyg->yy_start )
+			yyg->yy_start = 1;	/* first start state */
+
+		if ( ! yyin )
+			yyin = stdin;
+
+		if ( ! yyout )
+			yyout = stdout;
+
+		if ( ! YY_CURRENT_BUFFER ) {
+			fts0tensure_buffer_stack (yyscanner);
+			YY_CURRENT_BUFFER_LVALUE =
+				fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+		}
+
+		fts0t_load_buffer_state(yyscanner );
+		}
+
+	while ( 1 )		/* loops until end-of-file is reached */
+		{
+		yy_cp = yyg->yy_c_buf_p;
+
+		/* Support of yytext. */
+		*yy_cp = yyg->yy_hold_char;
+
+		/* yy_bp points to the position in yy_ch_buf of the start of
+		 * the current run.
+		 */
+		yy_bp = yy_cp;
+
+		yy_current_state = yyg->yy_start;
+yy_match:
+		do
+			{
+			register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+			if ( yy_accept[yy_current_state] )
+				{
+				yyg->yy_last_accepting_state = yy_current_state;
+				yyg->yy_last_accepting_cpos = yy_cp;
+				}
+			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+				{
+				yy_current_state = (int) yy_def[yy_current_state];
+				if ( yy_current_state >= 17 )
+					yy_c = yy_meta[(unsigned int) yy_c];
+				}
+			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+			++yy_cp;
+			}
+		while ( yy_current_state != 16 );
+		yy_cp = yyg->yy_last_accepting_cpos;
+		yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+		yy_act = yy_accept[yy_current_state];
+
+		YY_DO_BEFORE_ACTION;
+
+do_action:	/* This label is used only to access EOF actions. */
+
+		switch ( yy_act )
+	{ /* beginning of action switch */
+			case 0: /* must back up */
+			/* undo the effects of YY_DO_BEFORE_ACTION */
+			*yy_cp = yyg->yy_hold_char;
+			yy_cp = yyg->yy_last_accepting_cpos;
+			yy_current_state = yyg->yy_last_accepting_state;
+			goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 46 "fts0tlex.l"
+/* Ignore whitespace */ ;
+	YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 48 "fts0tlex.l"
+{
+	val->oper = fts0tget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+	YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 54 "fts0tlex.l"
+{
+	val->token = strdup(fts0tget_text(yyscanner));	/* NOTE(review): strdup() result is not checked for NULL */
+
+	return(FTS_TEXT);
+}
+	YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 60 "fts0tlex.l"
+{
+	val->token = strdup(fts0tget_text(yyscanner));	/* NOTE(review): strdup() result is not checked for NULL */
+
+	return(FTS_TERM);
+}
+	YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 65 "fts0tlex.l"
+; /* empty action: the matched text is discarded */
+	YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 66 "fts0tlex.l"
+/* empty action: the matched text is discarded */
+	YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 68 "fts0tlex.l"
+ECHO; /* writes the matched text to yyout */
+	YY_BREAK
+#line 834 "fts0tlex.cc"
+case YY_STATE_EOF(INITIAL):
+	yyterminate();
+
+	case YY_END_OF_BUFFER:
+		{
+		/* Amount of text matched not including the EOB char. */
+		int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+		/* Undo the effects of YY_DO_BEFORE_ACTION. */
+		*yy_cp = yyg->yy_hold_char;
+		YY_RESTORE_YY_MORE_OFFSET
+
+		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+			{
+			/* We're scanning a new file or input source.  It's
+			 * possible that this happened because the user
+			 * just pointed yyin at a new source and called
+			 * fts0tlex().  If so, then we have to assure
+			 * consistency between YY_CURRENT_BUFFER and our
+			 * globals.  Here is the right place to do so, because
+			 * this is the first action (other than possibly a
+			 * back-up) that will match for the new input source.
+			 */
+			yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+			}
+
+		/* Note that here we test for yy_c_buf_p "<=" to the position
+		 * of the first EOB in the buffer, since yy_c_buf_p will
+		 * already have been incremented past the NUL character
+		 * (since all states make transitions on EOB to the
+		 * end-of-buffer state).  Contrast this with the test
+		 * in input().
+		 */
+		if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			{ /* This was really a NUL. */
+			yy_state_type yy_next_state;
+
+			yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+			yy_current_state = yy_get_previous_state( yyscanner );
+
+			/* Okay, we're now positioned to make the NUL
+			 * transition.  We couldn't have
+			 * yy_get_previous_state() go ahead and do it
+			 * for us because it doesn't know how to deal
+			 * with the possibility of jamming (and we don't
+			 * want to build jamming into it because then it
+			 * will run more slowly).
+			 */
+
+			yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+			yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+			if ( yy_next_state )
+				{
+				/* Consume the NUL. */
+				yy_cp = ++yyg->yy_c_buf_p;
+				yy_current_state = yy_next_state;
+				goto yy_match;
+				}
+
+			else
+				{
+				yy_cp = yyg->yy_last_accepting_cpos;
+				yy_current_state = yyg->yy_last_accepting_state;
+				goto yy_find_action;
+				}
+			}
+
+		else switch ( yy_get_next_buffer( yyscanner ) )
+			{
+			case EOB_ACT_END_OF_FILE:
+				{
+				yyg->yy_did_buffer_switch_on_eof = 0;
+
+				if ( fts0twrap(yyscanner ) )
+					{
+					/* Note: because we've taken care in
+					 * yy_get_next_buffer() to have set up
+					 * yytext, we can now set up
+					 * yy_c_buf_p so that if some total
+					 * hoser (like flex itself) wants to
+					 * call the scanner after we return the
+					 * YY_NULL, it'll still work - another
+					 * YY_NULL will get returned.
+					 */
+					yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+					yy_act = YY_STATE_EOF(YY_START);
+					goto do_action;
+					}
+
+				else
+					{
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+					}
+				break;
+				}
+
+			case EOB_ACT_CONTINUE_SCAN:
+				yyg->yy_c_buf_p =
+					yyg->yytext_ptr + yy_amount_of_matched_text;
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_match;
+
+			case EOB_ACT_LAST_MATCH:
+				yyg->yy_c_buf_p =
+				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_find_action;
+			}
+		break;
+		}
+
+	default:
+		YY_FATAL_ERROR(
+			"fatal flex scanner internal error--no action found" );
+	} /* end of action switch */
+		} /* end of scanning one token */
+} /* end of fts0tlex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - no more input; process the text matched before the EOB
+ *	EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ *	EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+	register char *source = yyg->yytext_ptr;
+	register int number_to_move, i;
+	int ret_val;
+
+	if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+		YY_FATAL_ERROR(
+		"fatal flex scanner internal error--end of buffer missed" );
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+		{ /* Don't try to fill the buffer, so this is an EOF. */
+		if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+			{
+			/* We matched a single character, the EOB, so
+			 * treat this as a final EOF.
+			 */
+			return EOB_ACT_END_OF_FILE;
+			}
+
+		else
+			{
+			/* We matched some text prior to the EOB, first
+			 * process it.
+			 */
+			return EOB_ACT_LAST_MATCH;
+			}
+		}
+
+	/* Try to read more data. */
+
+	/* First move last chars to start of buffer. */
+	number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+	for ( i = 0; i < number_to_move; ++i )
+		*(dest++) = *(source++);
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+		/* don't do the read, it's not guaranteed to return an EOF,
+		 * just force an EOF
+		 */
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+	else
+		{
+			int num_to_read =
+			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+		while ( num_to_read <= 0 )
+			{ /* Not enough room in the buffer - grow it. */
+
+			/* just a shorter name for the current buffer */
+			YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+			int yy_c_buf_p_offset =
+				(int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+			if ( b->yy_is_our_buffer )
+				{
+				int new_size = b->yy_buf_size * 2;
+
+				if ( new_size <= 0 ) /* presumably: the doubled size did not fit in int, so grow by 1/8 instead */
+					b->yy_buf_size += b->yy_buf_size / 8;
+				else
+					b->yy_buf_size *= 2;
+
+				b->yy_ch_buf = (char *) /* NOTE(review): a failed fts0trealloc() overwrites the only pointer to the old buffer */
+					/* Include room for 2 EOB chars. */
+					fts0trealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+				}
+			else
+				/* Can't grow it, we don't own it. */
+				b->yy_ch_buf = 0;
+
+			if ( ! b->yy_ch_buf )
+				YY_FATAL_ERROR(
+				"fatal error - scanner input buffer overflow" );
+
+			yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+			num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+						number_to_move - 1;
+
+			}
+
+		if ( num_to_read > YY_READ_BUF_SIZE )
+			num_to_read = YY_READ_BUF_SIZE;
+
+		/* Read in more data. */
+		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+			yyg->yy_n_chars, (size_t) num_to_read );
+
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	if ( yyg->yy_n_chars == 0 )
+		{
+		if ( number_to_move == YY_MORE_ADJ )
+			{
+			ret_val = EOB_ACT_END_OF_FILE;
+			fts0trestart(yyin  ,yyscanner);
+			}
+
+		else
+			{
+			ret_val = EOB_ACT_LAST_MATCH;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+				YY_BUFFER_EOF_PENDING;
+			}
+		}
+
+	else
+		ret_val = EOB_ACT_CONTINUE_SCAN;
+
+	if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need. */
+		yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0trealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
+	yyg->yy_n_chars += number_to_move;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+	yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+	return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached,
+ * by replaying [yytext_ptr + YY_MORE_ADJ, yy_c_buf_p) from yyg->yy_start. */
+    static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	yy_current_state = yyg->yy_start;
+
+	for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+		{ /* a NUL byte uses class 1; any other byte is mapped through yy_ec */
+		register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+		if ( yy_accept[yy_current_state] )
+			{ /* remember the latest accepting state and where it was seen */
+			yyg->yy_last_accepting_state = yy_current_state;
+			yyg->yy_last_accepting_cpos = yy_cp;
+			}
+		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+			{ /* no entry for this state here: fall back to its yy_def state */
+			yy_current_state = (int) yy_def[yy_current_state];
+			if ( yy_current_state >= 17 )
+				yy_c = yy_meta[(unsigned int) yy_c];
+			}
+		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+		}
+
+	return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ *	next_state = yy_try_NUL_trans( current_state );
+ */
+    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+	register int yy_is_jam;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+	register char *yy_cp = yyg->yy_c_buf_p;
+
+	register YY_CHAR yy_c = 1;	/* the class yy_get_previous_state() uses for a NUL byte */
+	if ( yy_accept[yy_current_state] )
+		{ /* remember the latest accepting state and where it was seen */
+		yyg->yy_last_accepting_state = yy_current_state;
+		yyg->yy_last_accepting_cpos = yy_cp;
+		}
+	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+		{ /* no entry for this state here: fall back to its yy_def state */
+		yy_current_state = (int) yy_def[yy_current_state];
+		if ( yy_current_state >= 17 )
+			yy_c = yy_meta[(unsigned int) yy_c];
+		}
+	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+	yy_is_jam = (yy_current_state == 16);	/* landing in state 16 counts as a jam */
+
+	return yy_is_jam ? 0 : yy_current_state;	/* 0 reports the jam to the caller */
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+    static int yyinput (yyscan_t yyscanner)
+#else
+    static int input  (yyscan_t yyscanner)
+#endif
+
+{
+	int c;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	*yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+	if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+		{
+		/* yy_c_buf_p now points to the character we want to return.
+		 * If this occurs *before* the EOB characters, then it's a
+		 * valid NUL; if not, then we've hit the end of the buffer.
+		 */
+		if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			/* This was really a NUL. */
+			*yyg->yy_c_buf_p = '\0';
+
+		else
+			{ /* need more input */
+			int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+			++yyg->yy_c_buf_p;
+
+			switch ( yy_get_next_buffer( yyscanner ) )
+				{
+				case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_g_n_b()
+					 * sees that we've accumulated a
+					 * token and flags that we need to
+					 * try matching the token before
+					 * proceeding.  But for input(),
+					 * there's no matching to consider.
+					 * So convert the EOB_ACT_LAST_MATCH
+					 * to EOB_ACT_END_OF_FILE.
+					 */
+
+					/* Reset buffer status. */
+					fts0trestart(yyin ,yyscanner);
+
+					/*FALLTHROUGH*/
+
+				case EOB_ACT_END_OF_FILE:
+					{
+					if ( fts0twrap(yyscanner ) )
+						return EOF;
+
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+#ifdef __cplusplus
+					return yyinput(yyscanner);
+#else
+					return input(yyscanner);
+#endif
+					}
+
+				case EOB_ACT_CONTINUE_SCAN:
+					yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+					break;
+				}
+			}
+		}
+
+	c = *(unsigned char *) yyg->yy_c_buf_p;	/* cast for 8-bit char's */
+	*yyg->yy_c_buf_p = '\0';	/* preserve yytext */
+	yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+	return c;
+}
+#endif	/* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+    void fts0trestart  (FILE * input_file , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if ( ! YY_CURRENT_BUFFER ){
+        fts0tensure_buffer_stack (yyscanner);
+		YY_CURRENT_BUFFER_LVALUE =
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+	}
+
+	fts0t_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+	fts0t_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_switch_to_buffer  (YY_BUFFER_STATE  new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	/* TODO. We should be able to replace this entire function body
+	 * with
+	 *		fts0tpop_buffer_state();
+	 *		fts0tpush_buffer_state(new_buffer);
+     */
+	fts0tensure_buffer_stack (yyscanner);
+	if ( YY_CURRENT_BUFFER == new_buffer )
+		return;
+
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+	fts0t_load_buffer_state(yyscanner );
+
+	/* We don't actually know whether we did this switch during
+	 * EOF (fts0twrap()) processing, but the only time this flag
+	 * is looked at is after fts0twrap() is called, so it's safe
+	 * to go ahead and always set it.
+	 */
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0t_load_buffer_state  (yyscan_t yyscanner)
+{	/* Copy the current buffer's saved scan state into the scanner (yyg). */
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+	yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+	yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+	yyg->yy_hold_char = *yyg->yy_c_buf_p;	/* byte at the restored scan position */
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+    YY_BUFFER_STATE fts0t_create_buffer  (FILE * file, int  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+    
+	b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+	b->yy_buf_size = size;
+
+	/* yy_ch_buf has to be 2 characters longer than the size given because
+	 * we need to put in 2 end-of-buffer characters.
+	 */
+	b->yy_ch_buf = (char *) fts0talloc(b->yy_buf_size + 2 ,yyscanner );
+	if ( ! b->yy_ch_buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+	b->yy_is_our_buffer = 1;
+
+	fts0t_init_buffer(b,file ,yyscanner);
+
+	return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with fts0t_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_delete_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if ( ! b )
+		return;
+
+	if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+	if ( b->yy_is_our_buffer )
+		fts0tfree((void *) b->yy_ch_buf ,yyscanner );
+
+	fts0tfree((void *) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0trestart() or at EOF.
+ */
+    static void fts0t_init_buffer  (YY_BUFFER_STATE  b, FILE * file , yyscan_t yyscanner)
+/* Flushes b and attaches file to it; b is dereferenced, so it must not be NULL. */
+{
+	int oerrno = errno;	/* saved so that this function leaves errno unchanged */
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	fts0t_flush_buffer(b ,yyscanner);
+
+	b->yy_input_file = file;
+	b->yy_fill_buffer = 1;
+
+    /* If b is the current buffer, then fts0t_init_buffer was _probably_
+     * called from fts0trestart() or through yy_get_next_buffer.
+     * In that case, we don't want to reset the lineno or column.
+     */
+    if (b != YY_CURRENT_BUFFER){
+        b->yy_bs_lineno = 1;
+        b->yy_bs_column = 0;
+    }
+
+        b->yy_is_interactive = 0;
+    /* Put back the errno value that was current on entry. */
+	errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_flush_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if ( ! b )
+		return;
+
+	b->yy_n_chars = 0;
+
+	/* We always need two end-of-buffer characters.  The first causes
+	 * a transition to the end-of-buffer state.  The second causes
+	 * a jam in that state.
+	 */
+	b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+	b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+	b->yy_buf_pos = &b->yy_ch_buf[0];
+
+	b->yy_at_bol = 1;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	if ( b == YY_CURRENT_BUFFER )
+		fts0t_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ *  the current state. This function will allocate the stack
+ *  if necessary.
+ *  @param new_buffer The new state.
+ *  @param yyscanner The scanner object.
+ */
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if (new_buffer == NULL)
+		return;
+
+	fts0tensure_buffer_stack(yyscanner);
+
+	/* This block is copied from fts0t_switch_to_buffer. */
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	/* Only push if top exists. Otherwise, replace top. */
+	if (YY_CURRENT_BUFFER)
+		yyg->yy_buffer_stack_top++;
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+	/* copied from fts0t_switch_to_buffer. */
+	fts0t_load_buffer_state(yyscanner );
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ *  The next element becomes the new top.
+ *  @param yyscanner The scanner object.
+ */
+void fts0tpop_buffer_state (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	if (!YY_CURRENT_BUFFER)
+		return;
+
+	fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+	YY_CURRENT_BUFFER_LVALUE = NULL;
+	if (yyg->yy_buffer_stack_top > 0)
+		--yyg->yy_buffer_stack_top;
+
+	if (YY_CURRENT_BUFFER) {
+		fts0t_load_buffer_state(yyscanner );
+		yyg->yy_did_buffer_switch_on_eof = 1;
+	}
+}
+
+/* Allocates the stack if it does not exist.
+ *  Guarantees space for at least one push.
+ */
+static void fts0tensure_buffer_stack (yyscan_t yyscanner)
+{
+	int num_to_alloc;
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if (!yyg->yy_buffer_stack) {
+
+		/* First allocation is for a single element, since we don't know if
+		 * this scanner will even need a stack. With only one slot, the next
+		 * call already takes the grow path below (top >= max - 1).
+         */
+		num_to_alloc = 1;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0talloc
+								(num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+								  
+		memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+				
+		yyg->yy_buffer_stack_max = num_to_alloc;
+		yyg->yy_buffer_stack_top = 0;
+		return;
+	}
+
+	if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+		/* Increase the buffer to prepare for a possible push. */
+		int grow_size = 8 /* arbitrary grow size */;
+
+		num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0trealloc
+								(yyg->yy_buffer_stack,
+								num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+
+		/* zero only the new slots.*/
+		memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+		yyg->yy_buffer_stack_max = num_to_alloc;
+	}
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object. 
+ */
+YY_BUFFER_STATE fts0t_scan_buffer  (char * base, yy_size_t  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+    
+	if ( size < 2 ||
+	     base[size-2] != YY_END_OF_BUFFER_CHAR ||
+	     base[size-1] != YY_END_OF_BUFFER_CHAR )
+		/* They forgot to leave room for the EOB's. */
+		return 0;
+
+	b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_buffer()" );
+
+	b->yy_buf_size = size - 2;	/* "- 2" to take care of EOB's */
+	b->yy_buf_pos = b->yy_ch_buf = base;
+	b->yy_is_our_buffer = 0;
+	b->yy_input_file = 0;
+	b->yy_n_chars = b->yy_buf_size;
+	b->yy_is_interactive = 0;
+	b->yy_at_bol = 1;
+	b->yy_fill_buffer = 0;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	fts0t_switch_to_buffer(b ,yyscanner );
+
+	return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0tlex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       fts0t_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0t_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+    
+	return fts0t_scan_bytes(yystr,strlen(yystr) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0tlex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_bytes  (yyconst char * yybytes, int  _yybytes_len , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+	char *buf;
+	yy_size_t n;
+	int i;
+    
+	/* Get memory for full buffer, including space for trailing EOB's. */
+	n = _yybytes_len + 2;
+	buf = (char *) fts0talloc(n ,yyscanner );
+	if ( ! buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_bytes()" );
+
+	for ( i = 0; i < _yybytes_len; ++i )
+		buf[i] = yybytes[i];
+
+	buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+	b = fts0t_scan_buffer(buf,n ,yyscanner);
+	if ( ! b )
+		YY_FATAL_ERROR( "bad buffer in fts0t_scan_bytes()" );
+
+	/* It's okay to grow etc. this buffer, and we should throw it
+	 * away when we're done.
+	 */
+	b->yy_is_our_buffer = 1;
+
+	return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)))
+{	/* Print msg on stderr and terminate the process with YY_EXIT_FAILURE. */
+    	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = yyg->yy_hold_char; \
+		yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+		yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+		*yyg->yy_c_buf_p = '\0'; \
+		yyleng = yyless_macro_arg; \
+		} \
+	while ( 0 )
+
+/* Accessor  methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0tget_extra  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_lineno  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+    
+    return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_column  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+    
+    return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_in  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_out  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_leng  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *fts0tget_text  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_extra (YY_EXTRA_TYPE  user_defined , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_lineno (int  line_number , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+        /* lineno is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner); 
+    
+    yylineno = line_number;
+}
+
+/** Set the current column. Calls yy_fatal_error() if there is no buffer.
+ * @param column_no The new column number.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_column (int  column_no , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+        /* column is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner); 
+    
+    yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0t_switch_to_buffer
+ */
+void fts0tset_in (FILE *  in_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyin = in_str ;
+}
+
+void fts0tset_out (FILE *  out_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yyout = out_str ;
+}
+
+int fts0tget_debug  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    return yy_flex_debug;
+}
+
+void fts0tset_debug (int  bdebug , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0tlex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0tlex_init(yyscan_t* ptr_yy_globals)
+
+{
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+
+    *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), NULL );
+
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* fts0tlex_init_extra has the same functionality as fts0tlex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0talloc in
+ * the yyextra field.
+ */
+
+int fts0tlex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+/* Returns 0 on success; returns 1 with errno set to EINVAL or ENOMEM on failure. */
+{
+    struct yyguts_t dummy_yyguts;
+    /* Stack scratch scanner: only its yyextra is set, for fts0talloc below. */
+    fts0tset_extra (yy_user_defined, &dummy_yyguts);
+
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+	
+    *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+	
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+    
+    /* By setting to 0xAA, we expose bugs in
+    yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+    /* Store the user value in the real scanner now that it has been zeroed. */
+    fts0tset_extra (yy_user_defined, *ptr_yy_globals);
+    
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from fts0tlex_destroy(), so don't allocate here.
+     */
+
+    yyg->yy_buffer_stack = 0;
+    yyg->yy_buffer_stack_top = 0;
+    yyg->yy_buffer_stack_max = 0;
+    yyg->yy_c_buf_p = (char *) 0;
+    yyg->yy_init = 0;
+    yyg->yy_start = 0;
+
+    yyg->yy_start_stack_ptr = 0;
+    yyg->yy_start_stack_depth = 0;
+    yyg->yy_start_stack =  NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE *) 0;
+    yyout = (FILE *) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * fts0tlex_init()
+     */
+    return 0;
+}
+
+/* fts0tlex_destroy frees every buffer, both stacks and the scanner; returns 0. */
+int fts0tlex_destroy  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+    /* Pop the buffer stack, destroying each element. fts0tpop_buffer_state()
+     * frees the top buffer itself and then steps down the stack; clearing the
+     * slot before calling it would make it return early, so buffers below the
+     * top would never be released. */
+	while (YY_CURRENT_BUFFER)
+		fts0tpop_buffer_state(yyscanner);
+
+	/* Destroy the stack itself. */
+	fts0tfree(yyg->yy_buffer_stack ,yyscanner);
+	yyg->yy_buffer_stack = NULL;
+
+    /* Destroy the start condition stack. */
+        fts0tfree(yyg->yy_start_stack ,yyscanner );
+        yyg->yy_start_stack = NULL;
+
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * fts0tlex() is called, initialization will occur. */
+    yy_init_globals( yyscanner);
+
+    /* Destroy the main struct (reentrant only). */
+    fts0tfree ( yyscanner , yyscanner );
+    yyscanner = NULL;	/* local copy only; the caller's handle is not cleared */
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{	/* Copy exactly n bytes from s2 to s1; no terminator is appended. */
+	register int i;
+	for ( i = 0; i < n; ++i )
+		s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{	/* Return the number of bytes in s before its first NUL. */
+	register int n;
+	for ( n = 0; s[n]; ++n )
+		;
+
+	return n;
+}
+#endif
+
+void *fts0talloc (yy_size_t  size , yyscan_t yyscanner __attribute__((unused)))
+{	/* Scanner allocation hook: forwards to malloc(); yyscanner is ignored. */
+	return (void *) malloc( size );
+}
+
+void *fts0trealloc  (void * ptr, yy_size_t  size , yyscan_t yyscanner __attribute__((unused)))
+{	/* Scanner reallocation hook: forwards to realloc(); yyscanner is ignored. */
+	/* The cast to (char *) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment.
+	 */
+	return (void *) realloc( (char *) ptr, size );
+}
+
+void fts0tfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{	/* Scanner release hook: forwards to free(); yyscanner is ignored. */
+	free( (char *) ptr );	/* see fts0trealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 68 "fts0tlex.l"
+
+
+
diff --git a/storage/xtradb/fts/fts0tlex.l b/storage/xtradb/fts/fts0tlex.l
new file mode 100644
index 00000000000..a18c2a55081
--- /dev/null
+++ b/storage/xtradb/fts/fts0tlex.l
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+
+%%
+
+[\t ]+	/* Ignore whitespace */ ;
+
+[*]	{	/* a lone '*': the character itself is returned as the token code */
+	val->oper = fts0tget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+
+\"[^\"\n]*\"	{	/* double-quoted text on one line; the quotes stay in the copy */
+	val->token = strdup(fts0tget_text(yyscanner));	/* NOTE(review): result is not checked for NULL */
+
+	return(FTS_TEXT);
+}
+
+[^" \n\%]*	{	/* bare term: a run without '"', space, newline or '%' */
+	val->token = strdup(fts0tget_text(yyscanner));	/* NOTE(review): result is not checked for NULL */
+
+	return(FTS_TERM);
+}
+.	;	/* discard any other single character */
+\n
+
+%%
diff --git a/storage/xtradb/fts/make_parser.sh b/storage/xtradb/fts/make_parser.sh
new file mode 100755
index 00000000000..2c072914c8b
--- /dev/null
+++ b/storage/xtradb/fts/make_parser.sh
@@ -0,0 +1,61 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+# Runs "make -f Makefile.query" and then post-processes the scanner sources
+# fts0blex.cc and fts0tlex.cc in place:
+#   - '#include "univ.i"' becomes the first line;
+#   - unused yyscanner parameters are marked __attribute__((unused)).
+# The post-processing is idempotent: running the script again on files that
+# were already processed does not add a second include or repeat the attribute.
+
+# Stop at the first failing command instead of patching stale output.
+set -e
+
+# fix_scanner FILE
+# Rewrite the scanner source FILE in place through a temporary file.
+fix_scanner()
+{
+	src=$1
+	tmpf=$src.tmp.$$
+
+	echo '#include "univ.i"' > "$tmpf"
+
+	# This is to avoid compiler warning about unused parameters.
+	# FIXME: gcc extension "__attribute__" causing compilation errors on windows
+	# platform. Quote them out for now.
+	# Line 1 is dropped when it is the include written by an earlier run, and
+	# lines that already carry the attribute are left alone.
+	sed -e '
+1{
+/^#include "univ\.i"$/d
+}
+/__attribute__((unused))/!{
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/
+}
+' < "$src" >> "$tmpf"
+
+	mv "$tmpf" "$src"
+}
+
+make -f Makefile.query
+
+fix_scanner fts0blex.cc
+fix_scanner fts0tlex.cc
diff --git a/storage/xtradb/fut/fut0fut.c b/storage/xtradb/fut/fut0fut.cc
index 35dc66b8914..9bb1c512182 100644
--- a/storage/xtradb/fut/fut0fut.c
+++ b/storage/xtradb/fut/fut0fut.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file fut/fut0fut.c
+@file fut/fut0fut.cc
 File-based utilities
 
 Created 12/13/1995 Heikki Tuuri
diff --git a/storage/xtradb/fut/fut0lst.c b/storage/xtradb/fut/fut0lst.cc
index a008b7453a1..8f96a6426d2 100644
--- a/storage/xtradb/fut/fut0lst.c
+++ b/storage/xtradb/fut/fut0lst.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file fut/fut0lst.c
+@file fut/fut0lst.cc
 File-based list utilities
 
 Created 11/28/1995 Heikki Tuuri
diff --git a/storage/xtradb/ha/ha0ha.c b/storage/xtradb/ha/ha0ha.cc
index 4e0b9a78841..b79ae922045 100644
--- a/storage/xtradb/ha/ha0ha.c
+++ b/storage/xtradb/ha/ha0ha.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file ha/ha0ha.c
+@file ha/ha0ha.cc
 The hash table with external chains
 
 Created 8/22/1994 Heikki Tuuri
@@ -32,7 +32,7 @@ Created 8/22/1994 Heikki Tuuri
 #ifdef UNIV_DEBUG
 # include "buf0buf.h"
 #endif /* UNIV_DEBUG */
-#include "btr0sea.h"
+# include "btr0sea.h"
 #include "page0page.h"
 
 /*************************************************************//**
@@ -45,44 +45,142 @@ ha_create_func(
 /*===========*/
 	ulint	n,		/*!< in: number of array cells */
 #ifdef UNIV_SYNC_DEBUG
-	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
-				order: this is used in the debug version */
+	ulint	sync_level,	/*!< in: level of the mutexes or rw_locks
+				in the latching order: this is used in the
+				 debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint	n_mutexes)	/*!< in: number of mutexes to protect the
-				hash table: must be a power of 2, or 0 */
+	ulint	n_sync_obj,	/*!< in: number of mutexes or rw_locks
+				to protect the hash table: must be a
+				power of 2, or 0 */
+	ulint	type)		/*!< in: type of datastructure for which
+				the memory heap is going to be used e.g.:
+				MEM_HEAP_FOR_BTR_SEARCH or
+				MEM_HEAP_FOR_PAGE_HASH */
 {
 	hash_table_t*	table;
 	ulint		i;
 
-	ut_ad(ut_is_2pow(n_mutexes));
+	ut_a(type == MEM_HEAP_FOR_BTR_SEARCH
+	     || type == MEM_HEAP_FOR_PAGE_HASH);
+
+	ut_ad(ut_is_2pow(n_sync_obj));
 	table = hash_create(n);
 
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	table->adaptive = TRUE;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 	/* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
 	but in practise it never should in this case, hence the asserts. */
 
-	if (n_mutexes == 0) {
-		table->heap = mem_heap_create_in_btr_search(
-			ut_min(4096, MEM_MAX_ALLOC_IN_BUF));
+	if (n_sync_obj == 0) {
+		table->heap = mem_heap_create_typed(
+			ut_min(4096, MEM_MAX_ALLOC_IN_BUF), type);
 		ut_a(table->heap);
 
 		return(table);
 	}
 
-	hash_create_mutexes(table, n_mutexes, mutex_level);
+	if (type == MEM_HEAP_FOR_PAGE_HASH) {
+		/* We create a hash table protected by rw_locks for
+		buf_pool->page_hash. */
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_RW_LOCK,
+				     n_sync_obj, sync_level);
+	} else {
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_MUTEX,
+				     n_sync_obj, sync_level);
+	}
 
-	table->heaps = mem_alloc(n_mutexes * sizeof(void*));
+	table->heaps = static_cast<mem_heap_t**>(
+		mem_alloc(n_sync_obj * sizeof(void*)));
 
-	for (i = 0; i < n_mutexes; i++) {
-		table->heaps[i] = mem_heap_create_in_btr_search(4096);
+	for (i = 0; i < n_sync_obj; i++) {
+		table->heaps[i] = mem_heap_create_typed(4096, type);
 		ut_a(table->heaps[i]);
 	}
 
 	return(table);
 }
 
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Verifies that the specified hash table is a part of adaptive hash index and
+that its corresponding latch is X-latched by the current thread.  */
+static
+bool
+ha_assert_btr_x_locked(
+/*===================*/
+	const hash_table_t* table)	/*!< in: hash table to check */
+{
+	ulint i;
+
+	ut_ad(table->adaptive);
+
+	for (i = 0; i < btr_search_index_num; i++) {
+		if (btr_search_sys->hash_tables[i] == table) {
+			break;
+		}
+	}
+
+	ut_ad(i < btr_search_index_num);
+	ut_ad(rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_EX));
+
+	return(true);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ulint	i;
+	ulint	n;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!table->adaptive || ha_assert_btr_x_locked(table));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Free the memory heaps. */
+	n = table->n_sync_obj;
+
+	for (i = 0; i < n; i++) {
+		mem_heap_free(table->heaps[i]);
+	}
+
+	if (table->heaps) {
+		mem_free(table->heaps);
+	}
+
+	switch (table->type) {
+	case HASH_TABLE_SYNC_MUTEX:
+		mem_free(table->sync_obj.mutexes);
+		table->sync_obj.mutexes = NULL;
+		break;
+
+	case HASH_TABLE_SYNC_RW_LOCK:
+		mem_free(table->sync_obj.rw_locks);
+		table->sync_obj.rw_locks = NULL;
+		break;
+
+	case HASH_TABLE_SYNC_NONE:
+		/* do nothing */
+		break;
+	}
+
+	table->n_sync_obj = 0;
+	table->type = HASH_TABLE_SYNC_NONE;
+
+
+	/* Clear the hash table. */
+	n = hash_get_n_cells(table);
+
+	for (i = 0; i < n; i++) {
+		hash_get_nth_cell(table, i)->node = NULL;
+	}
+}
+
 /*************************************************************//**
 Inserts an entry into a hash table. If an entry with the same fold number
 is found, its node is updated to point to the new data, and no new node
@@ -114,17 +212,14 @@ ha_insert_for_fold_func(
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	ut_a(block->frame == page_align(data));
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(block->btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	hash = hash_calc_hash(fold, table);
 
 	cell = hash_get_nth_cell(table, hash);
 
-	prev_node = cell->node;
+	prev_node = static_cast<ha_node_t*>(cell->node);
 
 	while (prev_node != NULL) {
 		if (prev_node->fold == fold) {
@@ -150,7 +245,8 @@ ha_insert_for_fold_func(
 
 	/* We have to allocate a new chain node */
 
-	node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t));
+	node = static_cast<ha_node_t*>(
+		mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)));
 
 	if (node == NULL) {
 		/* It was a btr search type memory heap and at the moment
@@ -173,7 +269,7 @@ ha_insert_for_fold_func(
 
 	node->next = NULL;
 
-	prev_node = cell->node;
+	prev_node = static_cast<ha_node_t*>(cell->node);
 
 	if (prev_node == NULL) {
 
@@ -204,7 +300,7 @@ ha_delete_hash_node(
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
 #ifdef UNIV_SYNC_DEBUG
-	//	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+	ut_ad(ha_assert_btr_x_locked(table));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(btr_search_enabled);
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
@@ -220,9 +316,10 @@ ha_delete_hash_node(
 
 /*********************************************************//**
 Looks for an element when we know the pointer to the data, and updates
-the pointer to data, if found. */
+the pointer to data, if found.
+@return TRUE if found */
 UNIV_INTERN
-void
+ibool
 ha_search_and_update_if_found_func(
 /*===============================*/
 	hash_table_t*	table,	/*!< in/out: hash table */
@@ -237,16 +334,16 @@ ha_search_and_update_if_found_func(
 
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_modify(table, fold);
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	ut_a(new_block->frame == page_align(new_data));
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 #ifdef UNIV_SYNC_DEBUG
-	//	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+	ut_ad(ha_assert_btr_x_locked(table));
 #endif /* UNIV_SYNC_DEBUG */
 
 	if (!btr_search_enabled) {
-		return;
+		return(FALSE);
 	}
 
 	node = ha_search_with_data(table, fold, data);
@@ -262,7 +359,11 @@ ha_search_and_update_if_found_func(
 		node->block = new_block;
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 		node->data = new_data;
+
+		return(TRUE);
 	}
+
+	return(FALSE);
 }
 
 /*****************************************************************//**
@@ -280,10 +381,7 @@ ha_remove_all_nodes_to_page(
 
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-	//	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_chain_get_first(table, fold);
@@ -329,8 +427,6 @@ ha_validate(
 	ulint		start_index,	/*!< in: start index */
 	ulint		end_index)	/*!< in: end index */
 {
-	hash_cell_t*	cell;
-	ha_node_t*	node;
 	ibool		ok	= TRUE;
 	ulint		i;
 
@@ -341,12 +437,15 @@ ha_validate(
 	ut_a(end_index < hash_get_n_cells(table));
 
 	for (i = start_index; i <= end_index; i++) {
+		ha_node_t*	node;
+		hash_cell_t*	cell;
 
 		cell = hash_get_nth_cell(table, i);
 
-		node = cell->node;
+		for (node = static_cast<ha_node_t*>(cell->node);
+		     node != 0;
+		     node = node->next) {
 
-		while (node) {
 			if (hash_calc_hash(node->fold, table) != i) {
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
@@ -357,8 +456,6 @@ ha_validate(
 
 				ok = FALSE;
 			}
-
-			node = node->next;
 		}
 	}
 
diff --git a/storage/xtradb/ha/ha0storage.c b/storage/xtradb/ha/ha0storage.cc
index 95973753906..6820591f316 100644
--- a/storage/xtradb/ha/ha0storage.c
+++ b/storage/xtradb/ha/ha0storage.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file ha/ha0storage.c
+@file ha/ha0storage.cc
 Hash storage.
 Provides a data structure that stores chunks of data in
 its own storage, avoiding duplicates.
@@ -51,7 +51,7 @@ ha_storage_get(
 
 	/* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
 	macro */
-	fold = ut_fold_binary(data, data_len);
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
 
 #define IS_FOUND	\
 	node->data_len == data_len && memcmp(node->data, data, data_len) == 0
@@ -128,7 +128,7 @@ ha_storage_put_memlim(
 
 	/* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
 	macro */
-	fold = ut_fold_binary(data, data_len);
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
 
 	HASH_INSERT(
 		ha_storage_node_t,	/* type used in the hash chain */
diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c
deleted file mode 100644
index e594b3f6bd2..00000000000
--- a/storage/xtradb/ha/hash0hash.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file ha/hash0hash.c
-The simple hash table utility
-
-Created 5/20/1997 Heikki Tuuri
-*******************************************************/
-
-#include "hash0hash.h"
-#ifdef UNIV_NONINL
-#include "hash0hash.ic"
-#endif
-
-#include "mem0mem.h"
-
-#ifndef UNIV_HOTBACKUP
-
-# ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t	hash_table_mutex_key;
-# endif /* UNIV_PFS_MUTEX */
-
-/************************************************************//**
-Reserves the mutex for a fold value in a hash table. */
-UNIV_INTERN
-void
-hash_mutex_enter(
-/*=============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	mutex_enter(hash_get_mutex(table, fold));
-}
-
-/************************************************************//**
-Releases the mutex for a fold value in a hash table. */
-UNIV_INTERN
-void
-hash_mutex_exit(
-/*============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	mutex_exit(hash_get_mutex(table, fold));
-}
-
-/************************************************************//**
-Reserves all the mutexes of a hash table, in an ascending order. */
-UNIV_INTERN
-void
-hash_mutex_enter_all(
-/*=================*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ulint	i;
-
-	for (i = 0; i < table->n_mutexes; i++) {
-
-		mutex_enter(table->mutexes + i);
-	}
-}
-
-/************************************************************//**
-Releases all the mutexes of a hash table. */
-UNIV_INTERN
-void
-hash_mutex_exit_all(
-/*================*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ulint	i;
-
-	for (i = 0; i < table->n_mutexes; i++) {
-
-		mutex_exit(table->mutexes + i);
-	}
-}
-#endif /* !UNIV_HOTBACKUP */
-
-/*************************************************************//**
-Creates a hash table with >= n array cells. The actual number of cells is
-chosen to be a prime number slightly bigger than n.
-@return	own: created table */
-UNIV_INTERN
-hash_table_t*
-hash_create(
-/*========*/
-	ulint	n)	/*!< in: number of array cells */
-{
-	hash_cell_t*	array;
-	ulint		prime;
-	hash_table_t*	table;
-
-	prime = ut_find_prime(n);
-
-	table = mem_alloc(sizeof(hash_table_t));
-
-	array = ut_malloc(sizeof(hash_cell_t) * prime);
-
-	table->array = array;
-	table->n_cells = prime;
-#ifndef UNIV_HOTBACKUP
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	table->adaptive = FALSE;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	table->n_mutexes = 0;
-	table->mutexes = NULL;
-	table->heaps = NULL;
-#endif /* !UNIV_HOTBACKUP */
-	table->heap = NULL;
-	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
-
-	/* Initialize the cell array */
-	hash_table_clear(table);
-
-	return(table);
-}
-
-/*************************************************************//**
-Frees a hash table. */
-UNIV_INTERN
-void
-hash_table_free(
-/*============*/
-	hash_table_t*	table)	/*!< in, own: hash table */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-#ifndef UNIV_HOTBACKUP
-	ut_a(table->mutexes == NULL);
-#endif /* !UNIV_HOTBACKUP */
-
-	ut_free(table->array);
-	mem_free(table);
-}
-
-#ifndef UNIV_HOTBACKUP
-/*************************************************************//**
-Creates a mutex array to protect a hash table. */
-UNIV_INTERN
-void
-hash_create_mutexes_func(
-/*=====================*/
-	hash_table_t*	table,		/*!< in: hash table */
-#ifdef UNIV_SYNC_DEBUG
-	ulint		sync_level,	/*!< in: latching order level of the
-					mutexes: used in the debug version */
-#endif /* UNIV_SYNC_DEBUG */
-	ulint		n_mutexes)	/*!< in: number of mutexes, must be a
-					power of 2 */
-{
-	ulint	i;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_a(n_mutexes > 0);
-	ut_a(ut_is_2pow(n_mutexes));
-
-	table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t));
-
-	for (i = 0; i < n_mutexes; i++) {
-		mutex_create(hash_table_mutex_key,
-			     table->mutexes + i, sync_level);
-	}
-
-	table->n_mutexes = n_mutexes;
-}
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ha/hash0hash.cc b/storage/xtradb/ha/hash0hash.cc
new file mode 100644
index 00000000000..6f5b98e5e98
--- /dev/null
+++ b/storage/xtradb/ha/hash0hash.cc
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/hash0hash.cc
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "hash0hash.h"
+#ifdef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#include "mem0mem.h"
+
+#ifndef UNIV_HOTBACKUP
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	hash_table_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t	hash_table_rw_lock_key;
+# endif /* UNIV_PFS_RWLOCK */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	mutex_enter(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	mutex_exit(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		mutex_enter(table->sync_obj.mutexes + i);
+	}
+}
+
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		mutex_exit(table->sync_obj.mutexes + i);
+	}
+}
+
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+	hash_table_t*		table,		/*!< in: hash table */
+	ib_prio_mutex_t*	keep_mutex)	/*!< in: mutex to keep */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		ib_prio_mutex_t* mutex = table->sync_obj.mutexes + i;
+		if (UNIV_LIKELY(keep_mutex != mutex)) {
+			mutex_exit(mutex);
+		}
+	}
+
+	ut_ad(mutex_own(keep_mutex));
+}
+
+/************************************************************//**
+Acquires an s-lock on the lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	prio_rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_s_lock(lock);
+}
+
+/************************************************************//**
+Acquires an x-lock on the lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	prio_rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_x_lock(lock);
+}
+
+/************************************************************//**
+Releases the s-lock on the lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	prio_rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_s_unlock(lock);
+}
+
+/************************************************************//**
+Releases the x-lock on the lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	prio_rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_x_unlock(lock);
+}
+
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		prio_rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+		ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		rw_lock_x_lock(lock);
+	}
+}
+
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		prio_rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		rw_lock_x_unlock(lock);
+	}
+}
+
+/************************************************************//**
+Releases all but the passed in lock of a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	prio_rw_lock_t*	keep_lock)	/*!< in: lock to keep */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		prio_rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		if (UNIV_LIKELY(keep_lock != lock)) {
+			rw_lock_x_unlock(lock);
+		}
+	}
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n.
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+	ulint	n)	/*!< in: number of array cells */
+{
+	hash_cell_t*	array;
+	ulint		prime;
+	hash_table_t*	table;
+
+	prime = ut_find_prime(n);
+
+	table = static_cast<hash_table_t*>(mem_alloc(sizeof(hash_table_t)));
+
+	array = static_cast<hash_cell_t*>(
+		ut_malloc(sizeof(hash_cell_t) * prime));
+
+	/* The default type of hash_table is HASH_TABLE_SYNC_NONE i.e.:
+	the caller is responsible for access control to the table. */
+	table->type = HASH_TABLE_SYNC_NONE;
+	table->array = array;
+	table->n_cells = prime;
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	table->n_sync_obj = 0;
+	table->sync_obj.mutexes = NULL;
+	table->heaps = NULL;
+#endif /* !UNIV_HOTBACKUP */
+	table->heap = NULL;
+	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+	/* Initialize the cell array */
+	hash_table_clear(table);
+
+	return(table);
+}
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	ut_free(table->array);
+	mem_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depending on the type of
+hash table. */
+UNIV_INTERN
+void
+hash_create_sync_obj_func(
+/*======================*/
+	hash_table_t*		table,	/*!< in: hash table */
+	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
+					or HASH_TABLE_SYNC_RW_LOCK */
+#ifdef UNIV_SYNC_DEBUG
+	ulint			sync_level,/*!< in: latching order level
+					of the mutexes or rw_locks:
+					used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint			n_sync_obj)/*!< in: number of sync objects,
+					must be a power of 2 */
+{
+	ulint	i;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_a(n_sync_obj > 0);
+	ut_a(ut_is_2pow(n_sync_obj));
+
+	table->type = type;
+
+	switch (type) {
+	case HASH_TABLE_SYNC_MUTEX:
+		table->sync_obj.mutexes = static_cast<ib_prio_mutex_t*>(
+			mem_alloc(n_sync_obj * sizeof(ib_prio_mutex_t)));
+
+		for (i = 0; i < n_sync_obj; i++) {
+			mutex_create(hash_table_mutex_key,
+			     table->sync_obj.mutexes + i, sync_level);
+		}
+
+		break;
+
+	case HASH_TABLE_SYNC_RW_LOCK:
+		table->sync_obj.rw_locks = static_cast<prio_rw_lock_t*>(
+			mem_alloc(n_sync_obj * sizeof(prio_rw_lock_t)));
+
+		for (i = 0; i < n_sync_obj; i++) {
+			rw_lock_create(hash_table_rw_lock_key,
+			     table->sync_obj.rw_locks + i, sync_level);
+		}
+
+		break;
+
+	case HASH_TABLE_SYNC_NONE:
+		ut_error;
+	}
+
+	table->n_sync_obj = n_sync_obj;
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 155a913a206..43cfa23a99f 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -1,8 +1,9 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2008, 2009 Google Inc.
 Copyright (c) 2009, Percona Inc.
+Copyright (c) 2012, Facebook Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -30,37 +31,18 @@ this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
-
-/* TODO list for the InnoDB handler in 5.0:
-  - fix savepoint functions to use savepoint storage area
-  - Find out what kind of problems the OS X case-insensitivity causes to
-    table and database names; should we 'normalize' the names like we do
-    in Windows?
-*/
-
-#ifdef USE_PRAGMA_IMPLEMENTATION
-#pragma implementation				// gcc: Class implementation
-#endif
-
-#define MYSQL_SERVER 1
+ 
+#define MYSQL_SERVER
 
 #include <sql_table.h>	// explain_filename, nz2, EXPLAIN_PARTITIONS_AS_COMMENT,
 			// EXPLAIN_FILENAME_MAX_EXTRA_LENGTH
 
 #include <sql_acl.h>	// PROCESS_ACL
-#include <m_ctype.h>
 #include <debug_sync.h> // DEBUG_SYNC
+#include <my_base.h>	// HA_OPTION_*
 #include <mysys_err.h>
-#include <mysql/plugin.h>
 #include <innodb_priv.h>
-#include <mysql/psi/psi.h>
-#include <my_sys.h>
-
-#ifdef MYSQL_SERVER
-#include <rpl_mi.h>
-#include <slave.h>
-#include <log_event.h> // rpl_get_position_info
-#endif /* MYSQL_SERVER */
+#include <table_cache.h>
 
 #ifdef _WIN32
 #include <io.h>
@@ -68,9 +50,11 @@ this program; if not, write to the Free Software Foundation, Inc.,
 /** @file ha_innodb.cc */
 
 /* Include necessary InnoDB headers */
-extern "C" {
 #include "univ.i"
+#include "buf0dump.h"
 #include "buf0lru.h"
+#include "buf0flu.h"
+#include "buf0dblwr.h"
 #include "btr0sea.h"
 #include "os0file.h"
 #include "os0thread.h"
@@ -78,8 +62,10 @@ extern "C" {
 #include "srv0srv.h"
 #include "trx0roll.h"
 #include "trx0trx.h"
+
 #include "trx0sys.h"
 #include "mtr0mtr.h"
+#include "rem0types.h"
 #include "row0ins.h"
 #include "row0mysql.h"
 #include "row0sel.h"
@@ -96,24 +82,36 @@ extern "C" {
 #include "trx0xa.h"
 #include "row0merge.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
 #include "ha_prototypes.h"
 #include "ut0mem.h"
 #include "ibuf0ibuf.h"
+#include "dict0dict.h"
+#include "srv0mon.h"
+#include "api0api.h"
+#include "api0misc.h"
+#include "pars0pars.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "row0import.h"
+#include "row0quiesce.h"
+#ifdef UNIV_DEBUG
+#include "trx0purge.h"
+#endif /* UNIV_DEBUG */
+#include "fts0priv.h"
+#include "page0zip.h"
 
-enum_tx_isolation thd_get_trx_isolation(const THD* thd);
+#define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
 
-}
+#ifdef MYSQL_DYNAMIC_PLUGIN
+#define tc_size 400
+#define tdc_size 400
+#endif
 
 #include "ha_innodb.h"
 #include "i_s.h"
-
-#ifdef MYSQL_SERVER
-// Defined in trx0sys.c
-extern char		trx_sys_mysql_master_log_name[];
-extern ib_int64_t	trx_sys_mysql_master_log_pos;
-extern char		trx_sys_mysql_relay_log_name[];
-extern ib_int64_t	trx_sys_mysql_relay_log_pos;
-#endif /* MYSQL_SERVER */
+#include "xtradb_i_s.h"
 
 # ifndef MYSQL_PLUGIN_IMPORT
 #  define MYSQL_PLUGIN_IMPORT /* nothing */
@@ -121,6 +119,7 @@ extern ib_int64_t	trx_sys_mysql_relay_log_pos;
 
 /** to protect innobase_open_files */
 static mysql_mutex_t innobase_share_mutex;
+/** to force correct commit order in binlog */
 static ulong commit_threads = 0;
 static mysql_cond_t commit_cond;
 static mysql_mutex_t commit_cond_m;
@@ -129,47 +128,48 @@ static bool innodb_inited = 0;
 
 #define INSIDE_HA_INNOBASE_CC
 
-/* In the Windows plugin, the return value of current_thd is
-undefined.  Map it to NULL. */
-
 #define EQ_CURRENT_THD(thd) ((thd) == current_thd)
 
-
 static struct handlerton* innodb_hton_ptr;
 
 static const long AUTOINC_OLD_STYLE_LOCKING = 0;
 static const long AUTOINC_NEW_STYLE_LOCKING = 1;
 static const long AUTOINC_NO_LOCKING = 2;
 
-static long innobase_mirrored_log_groups, innobase_log_files_in_group,
-	innobase_log_buffer_size,
-	innobase_additional_mem_pool_size, innobase_file_io_threads,
-	innobase_force_recovery, innobase_open_files,
-	innobase_autoinc_lock_mode;
+static long innobase_mirrored_log_groups;
+static long innobase_log_buffer_size;
+static long innobase_additional_mem_pool_size;
+static long innobase_file_io_threads;
+static long innobase_open_files;
+static long innobase_autoinc_lock_mode;
 static ulong innobase_commit_concurrency = 0;
 static ulong innobase_read_io_threads;
 static ulong innobase_write_io_threads;
 static long innobase_buffer_pool_instances = 1;
 
-static ulong innobase_page_size;
 static ulong innobase_log_block_size;
 
-static my_bool innobase_thread_concurrency_timer_based;
 static long long innobase_buffer_pool_size, innobase_log_file_size;
 
 /** Percentage of the buffer pool to reserve for 'old' blocks.
 Connected to buf_LRU_old_ratio. */
 static uint innobase_old_blocks_pct;
 
+/** Maximum on-disk size of change buffer in terms of percentage
+of the buffer pool. */
+static uint innobase_change_buffer_max_size = CHANGE_BUFFER_DEFAULT_SIZE;
+
 /* The default values for the following char* start-up parameters
 are determined in innobase_init below: */
 
 static char*	innobase_data_home_dir			= NULL;
 static char*	innobase_data_file_path			= NULL;
-static char*	innobase_log_group_home_dir		= NULL;
 static char*	innobase_file_format_name		= NULL;
 static char*	innobase_change_buffering		= NULL;
-static char*	innobase_doublewrite_file		= NULL;
+static char*	innobase_enable_monitor_counter		= NULL;
+static char*	innobase_disable_monitor_counter	= NULL;
+static char*	innobase_reset_monitor_counter		= NULL;
+static char*	innobase_reset_all_monitor_counter	= NULL;
 
 /* The highest file format being used in the database. The value can be
 set by user, however, it will be adjusted to the newer file format if
@@ -178,6 +178,10 @@ static char*	innobase_file_format_max		= NULL;
 
 static char*	innobase_file_flush_method		= NULL;
 
+/* This variable can be set in the server configuration file, specifying
+the stopword table to be used */
+static char*	innobase_server_stopword_table		= NULL;
+
 /* Below we have boolean-valued start-up parameters, and their default
 values */
 
@@ -191,28 +195,18 @@ static my_bool	innobase_use_atomic_writes		= FALSE;
 static my_bool	innobase_use_fallocate			= TRUE;
 static my_bool	innobase_use_doublewrite		= TRUE;
 static my_bool	innobase_use_checksums			= TRUE;
-static my_bool	innobase_fast_checksum			= FALSE;
-static my_bool	innobase_recovery_stats			= TRUE;
 static my_bool	innobase_locks_unsafe_for_binlog	= FALSE;
-static my_bool	innobase_overwrite_relay_log_info	= FALSE;
 static my_bool	innobase_rollback_on_timeout		= FALSE;
 static my_bool	innobase_create_status_file		= FALSE;
 static my_bool	innobase_stats_on_metadata		= TRUE;
 static my_bool	innobase_large_prefix			= FALSE;
-static my_bool	innobase_use_sys_stats_table		= FALSE;
-#ifdef UNIV_DEBUG
-static ulong    innobase_sys_stats_root_page		= 0;
-#endif
-static my_bool	innobase_buffer_pool_shm_checksum	= TRUE;
-static uint	innobase_buffer_pool_shm_key		= 0;
-static ulint	srv_lazy_drop_table			= 0;
-
+static my_bool	innodb_optimize_fulltext_only		= FALSE;
 
 static char*	internal_innobase_data_file_path	= NULL;
 
 static char*	innodb_version_str = (char*) INNODB_VERSION_STR;
 
-static my_bool	innobase_blocking_lru_restore		= FALSE;
+static char*	fts_server_stopword_table		= NULL;
 
 /** Possible values for system variable "innodb_stats_method". The values
 are defined the same as its corresponding MyISAM system variable
@@ -233,6 +227,72 @@ static TYPELIB innodb_stats_method_typelib = {
 	NULL
 };
 
+/** Possible values for system variables "innodb_checksum_algorithm" and
+"innodb_log_checksum_algorithm". */
+static const char* innodb_checksum_algorithm_names[] = {
+	"crc32",
+	"strict_crc32",
+	"innodb",
+	"strict_innodb",
+	"none",
+	"strict_none",
+	NullS
+};
+
+/** Used to define an enumerated type for the system variables
+innodb_checksum_algorithm and innodb_log_checksum_algorithm. */
+static TYPELIB innodb_checksum_algorithm_typelib = {
+	array_elements(innodb_checksum_algorithm_names) - 1,
+	"innodb_checksum_algorithm_typelib",
+	innodb_checksum_algorithm_names,
+	NULL
+};
+
+/** Possible values for system variable "innodb_cleaner_lsn_age_factor".  */
+static const char* innodb_cleaner_lsn_age_factor_names[] = {
+	"legacy",
+	"high_checkpoint",
+	NullS
+};
+
+/** Enumeration for innodb_cleaner_lsn_age_factor.  */
+static TYPELIB innodb_cleaner_lsn_age_factor_typelib = {
+	array_elements(innodb_cleaner_lsn_age_factor_names) - 1,
+	"innodb_cleaner_lsn_age_factor_typelib",
+	innodb_cleaner_lsn_age_factor_names,
+	NULL
+};
+
+/** Possible values for system variable "innodb_foreground_preflush".  */
+static const char* innodb_foreground_preflush_names[] = {
+	"sync_preflush",
+	"exponential_backoff",
+	NullS
+};
+
+/** Enumeration for innodb_foreground_preflush.  */
+static TYPELIB innodb_foreground_preflush_typelib = {
+	array_elements(innodb_foreground_preflush_names) - 1,
+	"innodb_foreground_preflush_typelib",
+	innodb_foreground_preflush_names,
+	NULL
+};
+
+/** Possible values for system variable "innodb_empty_free_list_algorithm".  */
+static const char* innodb_empty_free_list_algorithm_names[] = {
+	"legacy",
+	"backoff",
+	NullS
+};
+
+/** Enumeration for innodb_empty_free_list_algorithm.  */
+static TYPELIB innodb_empty_free_list_algorithm_typelib = {
+	array_elements(innodb_empty_free_list_algorithm_names) - 1,
+	"innodb_empty_free_list_algorithm_typelib",
+	innodb_empty_free_list_algorithm_names,
+	NULL
+};
+
 /* The following counter is used to convey information to InnoDB
 about server activity: in selects it is not sensible to call
 srv_active_wake_master_thread after each fetch or search, we only do
@@ -253,6 +313,19 @@ static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
 	"all"		/* IBUF_USE_ALL */
 };
 
+/* Callback function array defined by MySQL and used to
+retrieve FTS results. */
+const struct _ft_vft ft_vft_result = {NULL,
+				      innobase_fts_find_ranking,
+				      innobase_fts_close_ranking,
+				      innobase_fts_retrieve_ranking,
+				      NULL};
+
+const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version,
+					      innobase_fts_flags,
+					      innobase_fts_retrieve_docid,
+					      innobase_fts_count_matches};
+
 #ifdef HAVE_PSI_INTERFACE
 /* Keys to register pthread mutexes/cond in the current file with
 performance schema */
@@ -277,7 +350,6 @@ performance schema instrumented if "UNIV_PFS_MUTEX"
 is defined */
 static PSI_mutex_info all_innodb_mutexes[] = {
 	{&autoinc_mutex_key, "autoinc_mutex", 0},
-	{&btr_search_enabled_mutex_key, "btr_search_enabled_mutex", 0},
 #  ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
 	{&buffer_block_mutex_key, "buffer_block_mutex", 0},
 #  endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
@@ -287,19 +359,26 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 	{&buf_pool_free_list_mutex_key, "buf_pool_free_list_mutex", 0},
 	{&buf_pool_zip_free_mutex_key, "buf_pool_zip_free_mutex", 0},
 	{&buf_pool_zip_hash_mutex_key, "buf_pool_zip_hash_mutex", 0},
+	{&buf_pool_flush_state_mutex_key, "buf_pool_flush_state_mutex", 0},
 	{&cache_last_read_mutex_key, "cache_last_read_mutex", 0},
 	{&dict_foreign_err_mutex_key, "dict_foreign_err_mutex", 0},
 	{&dict_sys_mutex_key, "dict_sys_mutex", 0},
 	{&file_format_max_mutex_key, "file_format_max_mutex", 0},
 	{&fil_system_mutex_key, "fil_system_mutex", 0},
 	{&flush_list_mutex_key, "flush_list_mutex", 0},
+	{&fts_bg_threads_mutex_key, "fts_bg_threads_mutex", 0},
+	{&fts_delete_mutex_key, "fts_delete_mutex", 0},
+	{&fts_optimize_mutex_key, "fts_optimize_mutex", 0},
+	{&fts_doc_id_mutex_key, "fts_doc_id_mutex", 0},
 	{&log_flush_order_mutex_key, "log_flush_order_mutex", 0},
 	{&hash_table_mutex_key, "hash_table_mutex", 0},
 	{&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0},
 	{&ibuf_mutex_key, "ibuf_mutex", 0},
 	{&ibuf_pessimistic_insert_mutex_key,
 		 "ibuf_pessimistic_insert_mutex", 0},
-	{&kernel_mutex_key, "kernel_mutex", 0},
+#  ifndef HAVE_ATOMIC_BUILTINS
+	{&server_mutex_key, "server_mutex", 0},
+#  endif /* !HAVE_ATOMIC_BUILTINS */
 	{&log_bmp_sys_mutex_key, "log_bmp_sys_mutex", 0},
 	{&log_sys_mutex_key, "log_sys_mutex", 0},
 #  ifdef UNIV_MEM_DEBUG
@@ -307,8 +386,10 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 #  endif /* UNIV_MEM_DEBUG */
 	{&mem_pool_mutex_key, "mem_pool_mutex", 0},
 	{&mutex_list_mutex_key, "mutex_list_mutex", 0},
+	{&page_zip_stat_per_index_mutex_key, "page_zip_stat_per_index_mutex", 0},
 	{&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0},
 	{&recv_sys_mutex_key, "recv_sys_mutex", 0},
+	{&recv_writer_mutex_key, "recv_writer_mutex", 0},
 	{&rseg_mutex_key, "rseg_mutex", 0},
 #  ifdef UNIV_SYNC_DEBUG
 	{&rw_lock_debug_mutex_key, "rw_lock_debug_mutex", 0},
@@ -319,12 +400,30 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 	{&srv_innodb_monitor_mutex_key, "srv_innodb_monitor_mutex", 0},
 	{&srv_misc_tmpfile_mutex_key, "srv_misc_tmpfile_mutex", 0},
 	{&srv_monitor_file_mutex_key, "srv_monitor_file_mutex", 0},
-	{&syn_arr_mutex_key, "syn_arr_mutex", 0},
 #  ifdef UNIV_SYNC_DEBUG
 	{&sync_thread_mutex_key, "sync_thread_mutex", 0},
 #  endif /* UNIV_SYNC_DEBUG */
-	{&trx_doublewrite_mutex_key, "trx_doublewrite_mutex", 0},
-	{&trx_undo_mutex_key, "trx_undo_mutex", 0}
+	{&buf_dblwr_mutex_key, "buf_dblwr_mutex", 0},
+	{&trx_undo_mutex_key, "trx_undo_mutex", 0},
+	{&srv_sys_mutex_key, "srv_sys_mutex", 0},
+	{&lock_sys_mutex_key, "lock_mutex", 0},
+	{&lock_sys_wait_mutex_key, "lock_wait_mutex", 0},
+	{&trx_mutex_key, "trx_mutex", 0},
+	{&srv_sys_tasks_mutex_key, "srv_threads_mutex", 0},
+	/* mutex with os_fast_mutex_ interfaces */
+#  ifndef PFS_SKIP_EVENT_MUTEX
+	{&event_os_mutex_key, "event_os_mutex", 0},
+#  endif /* !PFS_SKIP_EVENT_MUTEX */
+	{&os_mutex_key, "os_mutex", 0},
+#ifndef HAVE_ATOMIC_BUILTINS
+	{&srv_conc_mutex_key, "srv_conc_mutex", 0},
+#endif /* !HAVE_ATOMIC_BUILTINS */
+#ifndef HAVE_ATOMIC_BUILTINS_64
+	{&monitor_mutex_key, "monitor_mutex", 0},
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+	{&ut_list_mutex_key, "ut_list_mutex", 0},
+	{&trx_sys_mutex_key, "trx_sys_mutex", 0},
+	{&zip_pad_mutex_key, "zip_pad_mutex", 0},
 };
 # endif /* UNIV_PFS_MUTEX */
 
@@ -337,7 +436,6 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
 	{&archive_lock_key, "archive_lock", 0},
 #  endif /* UNIV_LOG_ARCHIVE */
 	{&btr_search_latch_key, "btr_search_latch", 0},
-	{&buf_pool_page_hash_key, "buf_pool_page_hash_latch", 0},
 #  ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
 	{&buf_block_lock_key, "buf_block_lock", 0},
 #  endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
@@ -347,10 +445,14 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
 	{&dict_operation_lock_key, "dict_operation_lock", 0},
 	{&fil_space_latch_key, "fil_space_latch", 0},
 	{&checkpoint_lock_key, "checkpoint_lock", 0},
+	{&fts_cache_rw_lock_key, "fts_cache_rw_lock", 0},
+	{&fts_cache_init_rw_lock_key, "fts_cache_init_rw_lock", 0},
 	{&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0},
 	{&trx_purge_latch_key, "trx_purge_latch", 0},
 	{&index_tree_rw_lock_key, "index_tree_rw_lock", 0},
-	{&dict_table_stats_latch_key, "dict_table_stats", 0}
+	{&index_online_log_key, "index_online_log", 0},
+	{&dict_table_stats_latch_key, "dict_table_stats", 0},
+	{&hash_table_rw_lock_key, "hash_table_locks", 0}
 };
 # endif /* UNIV_PFS_RWLOCK */
 
@@ -366,6 +468,8 @@ static PSI_thread_info	all_innodb_threads[] = {
 	{&srv_monitor_thread_key, "srv_monitor_thread", 0},
 	{&srv_master_thread_key, "srv_master_thread", 0},
 	{&srv_purge_thread_key, "srv_purge_thread", 0},
+	{&buf_page_cleaner_thread_key, "page_cleaner_thread", 0},
+	{&recv_writer_thread_key, "recv_writer_thread", 0},
 	{&srv_log_tracking_thread_key, "srv_redo_log_follow_thread", 0}
 };
 # endif /* UNIV_PFS_THREAD */
@@ -382,65 +486,96 @@ static PSI_file_info	all_innodb_files[] = {
 # endif /* UNIV_PFS_IO */
 #endif /* HAVE_PSI_INTERFACE */
 
-static INNOBASE_SHARE *get_share(const char *table_name);
-static void free_share(INNOBASE_SHARE *share);
-static int innobase_close_connection(handlerton *hton, THD* thd);
-static void innobase_kill_query(handlerton *hton, THD* thd, enum thd_kill_levels level);
-static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
-static int innobase_commit(handlerton *hton, THD* thd, bool all);
-static int innobase_rollback(handlerton *hton, THD* thd, bool all);
-static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
-           void *savepoint);
-static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
-static int innobase_release_savepoint(handlerton *hton, THD* thd,
-           void *savepoint);
-static void innobase_checkpoint_request(handlerton *hton, void *cookie);
-static handler *innobase_create_handler(handlerton *hton,
-                                        TABLE_SHARE *table,
-                                        MEM_ROOT *mem_root);
-/* "GEN_CLUST_INDEX" is the name reserved for Innodb default
-system primary index. */
-static const char innobase_index_reserve_name[]= "GEN_CLUST_INDEX";
-
-/** @brief Initialize the default value of innodb_commit_concurrency.
-
-Once InnoDB is running, the innodb_commit_concurrency must not change
-from zero to nonzero. (Bug #42101)
+/** Always normalize table name to lower case on Windows */
+#ifdef __WIN__
+#define normalize_table_name(norm_name, name)           \
+	normalize_table_name_low(norm_name, name, TRUE)
+#else
+#define normalize_table_name(norm_name, name)           \
+	normalize_table_name_low(norm_name, name, FALSE)
+#endif /* __WIN__ */
 
-The initial default value is 0, and without this extra initialization,
-SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
-to 0, even if it was initially set to nonzero at the command line
-or configuration file. */
-static
-void
-innobase_commit_concurrency_init_default(void);
-/*==========================================*/
+/** Set up InnoDB API callback function array */
+ib_cb_t innodb_api_cb[] = {
+	(ib_cb_t) ib_cursor_open_table,
+	(ib_cb_t) ib_cursor_read_row,
+	(ib_cb_t) ib_cursor_insert_row,
+	(ib_cb_t) ib_cursor_delete_row,
+	(ib_cb_t) ib_cursor_update_row,
+	(ib_cb_t) ib_cursor_moveto,
+	(ib_cb_t) ib_cursor_first,
+	(ib_cb_t) ib_cursor_next,
+	(ib_cb_t) ib_cursor_last,
+	(ib_cb_t) ib_cursor_set_match_mode,
+	(ib_cb_t) ib_sec_search_tuple_create,
+	(ib_cb_t) ib_clust_read_tuple_create,
+	(ib_cb_t) ib_tuple_delete,
+	(ib_cb_t) ib_tuple_copy,
+	(ib_cb_t) ib_tuple_read_u8,
+	(ib_cb_t) ib_tuple_write_u8,
+	(ib_cb_t) ib_tuple_read_u16,
+	(ib_cb_t) ib_tuple_write_u16,
+	(ib_cb_t) ib_tuple_read_u32,
+	(ib_cb_t) ib_tuple_write_u32,
+	(ib_cb_t) ib_tuple_read_u64,
+	(ib_cb_t) ib_tuple_write_u64,
+	(ib_cb_t) ib_tuple_read_i8,
+	(ib_cb_t) ib_tuple_write_i8,
+	(ib_cb_t) ib_tuple_read_i16,
+	(ib_cb_t) ib_tuple_write_i16,
+	(ib_cb_t) ib_tuple_read_i32,
+	(ib_cb_t) ib_tuple_write_i32,
+	(ib_cb_t) ib_tuple_read_i64,
+	(ib_cb_t) ib_tuple_write_i64,
+	(ib_cb_t) ib_tuple_get_n_cols,
+	(ib_cb_t) ib_col_set_value,
+	(ib_cb_t) ib_col_get_value,
+	(ib_cb_t) ib_col_get_meta,
+	(ib_cb_t) ib_trx_begin,
+	(ib_cb_t) ib_trx_commit,
+	(ib_cb_t) ib_trx_rollback,
+	(ib_cb_t) ib_trx_start,
+	(ib_cb_t) ib_trx_release,
+	(ib_cb_t) ib_trx_state,
+	(ib_cb_t) ib_cursor_lock,
+	(ib_cb_t) ib_cursor_close,
+	(ib_cb_t) ib_cursor_new_trx,
+	(ib_cb_t) ib_cursor_reset,
+	(ib_cb_t) ib_open_table_by_name,
+	(ib_cb_t) ib_col_get_name,
+	(ib_cb_t) ib_table_truncate,
+	(ib_cb_t) ib_cursor_open_index_using_name,
+	(ib_cb_t) ib_close_thd,
+	(ib_cb_t) ib_cfg_get_cfg,
+	(ib_cb_t) ib_cursor_set_cluster_access,
+	(ib_cb_t) ib_cursor_commit_trx,
+	(ib_cb_t) ib_cfg_trx_level,
+	(ib_cb_t) ib_tuple_get_n_user_cols,
+	(ib_cb_t) ib_cursor_set_lock_mode,
+	(ib_cb_t) ib_cursor_clear_trx,
+	(ib_cb_t) ib_get_idx_field_name,
+	(ib_cb_t) ib_trx_get_start_time,
+	(ib_cb_t) ib_cfg_bk_commit_interval
+};
 
-/************************************************************//**
-Validate the file format name and return its corresponding id.
-@return	valid file format id */
-static
-uint
-innobase_file_format_name_lookup(
-/*=============================*/
-	const char*	format_name);		/*!< in: pointer to file format
-						name */
-/************************************************************//**
-Validate the file format check config parameters, as a side effect it
-sets the srv_max_file_format_at_startup variable.
-@return	the format_id if valid config value, otherwise, return -1 */
+/*************************************************************//**
+Check whether valid argument given to innodb_ft_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
 static
 int
-innobase_file_format_validate_and_set(
-/*==================================*/
-	const char*	format_max);		/*!< in: parameter value */
-/****************************************************************//**
-Return alter table flags supported in an InnoDB database. */
-static
-uint
-innobase_alter_table_flags(
-/*=======================*/
-	uint	flags);
+innodb_stopword_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value);	/*!< in: incoming string */
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
 /************************************************************//**
 Synchronously read and parse the redo log up to the last
 checkpoint to write the changed page bitmap.
@@ -483,40 +618,18 @@ innobase_map_isolation_level(
 /*=========================*/
 	enum_tx_isolation	iso);	/*!< in: MySQL isolation level code */
 
-static const char innobase_hton_name[]= "InnoDB";
-
-/*************************************************************//**
-Check for a valid value of innobase_commit_concurrency.
-@return	0 for valid innodb_commit_concurrency */
-static
-int
-innobase_commit_concurrency_validate(
-/*=================================*/
-	THD*				thd,	/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
-						variable */
-	void*				save,	/*!< out: immediate result
-						for update function */
-	struct st_mysql_value*		value)	/*!< in: incoming string */
+/* Enable / disable checkpoints */
+static int innobase_checkpoint_state(handlerton *hton, bool disable)
 {
-	long long	intbuf;
-	ulong		commit_concurrency;
-
-	DBUG_ENTER("innobase_commit_concurrency_validate");
-
-	if (value->val_int(value, &intbuf)) {
-		/* The value is NULL. That is invalid. */
-		DBUG_RETURN(1);
-	}
-
-	*reinterpret_cast<ulong*>(save) = commit_concurrency
-		= static_cast<ulong>(intbuf);
-
-	/* Allow the value to be updated, as long as it remains zero
-	or nonzero. */
-	DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+  if (disable)
+    (void) log_disable_checkpoint();
+  else
+     log_enable_checkpoint();
+  return 0;
 }
 
+static const char innobase_hton_name[]= "InnoDB";
+
 static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
   "Enable InnoDB support for the XA two-phase commit",
   /* check_func */ NULL, /* update_func */ NULL,
@@ -531,10 +644,20 @@ static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
   "Use strict mode when evaluating create options.",
   NULL, NULL, FALSE);
 
+static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG,
+  "Create FTS index with stopword.",
+  NULL, NULL,
+  /* default */ TRUE);
+
 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
   "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
   NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
 
+static MYSQL_THDVAR_STR(ft_user_stopword_table,
+  PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC,
+  "User supplied stopword table name, effective in the session level.",
+  innodb_stopword_table_validate, NULL, NULL);
+
 static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
   "Controls the durability/speed trade-off for commits."
   " Set to 0 (write and flush redo log to disk only once per second),"
@@ -554,187 +677,20 @@ static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG,
   "This is to cause replication prefetch IO. ATTENTION: the transaction started after enabled is affected.",
   NULL, NULL, FALSE);
 
-static MYSQL_THDVAR_ULONG(merge_sort_block_size, PLUGIN_VAR_RQCMDARG,
-  "The block size used doing external merge-sort for secondary index creation.",
-  NULL, NULL, 1UL << 20, 1UL << 20, 1UL << 30, 0);
-
-static handler *innobase_create_handler(handlerton *hton,
-                                        TABLE_SHARE *table,
-                                        MEM_ROOT *mem_root)
-{
-  return new (mem_root) ha_innobase(hton, table);
-}
-
-/*******************************************************************//**
-This function is used to prepare an X/Open XA distributed transaction.
-@return	0 or error number */
-static
-int
-innobase_xa_prepare(
-/*================*/
-        handlerton*	hton,	/*!< in: InnoDB handlerton */
-	THD*		thd,	/*!< in: handle to the MySQL thread of
-				the user whose XA transaction should
-				be prepared */
-	bool		all);	/*!< in: TRUE - commit transaction
-				FALSE - the current SQL statement
-				ended */
-/*******************************************************************//**
-This function is used to recover X/Open XA distributed transactions.
-@return	number of prepared transactions stored in xid_list */
-static
-int
-innobase_xa_recover(
-/*================*/
-	handlerton*	hton,	/*!< in: InnoDB handlerton */
-	XID*		xid_list,/*!< in/out: prepared transactions */
-	uint		len);	/*!< in: number of slots in xid_list */
-/*******************************************************************//**
-This function is used to commit one X/Open XA distributed transaction
-which is in the prepared state
-@return	0 or error number */
-static
-int
-innobase_commit_by_xid(
-/*===================*/
-	handlerton* hton,
-	XID*	xid);	/*!< in: X/Open XA transaction identification */
-/*******************************************************************//**
-This function is used to rollback one X/Open XA distributed transaction
-which is in the prepared state
-@return	0 or error number */
-static
-int
-innobase_rollback_by_xid(
-/*=====================*/
-	handlerton*	hton,	/*!< in: InnoDB handlerton */
-	XID*		xid);	/*!< in: X/Open XA transaction
-				identification */
-/*******************************************************************//**
-Create a consistent view for a cursor based on current transaction
-which is created if the corresponding MySQL thread still lacks one.
-This consistent view is then used inside of MySQL when accessing records
-using a cursor.
-@return	pointer to cursor view or NULL */
-static
-void*
-innobase_create_cursor_view(
-/*========================*/
-	handlerton*	hton,	/*!< in: innobase hton */
-	THD*		thd);	/*!< in: user thread handle */
-/*******************************************************************//**
-Set the given consistent cursor view to a transaction which is created
-if the corresponding MySQL thread still lacks one. If the given
-consistent cursor view is NULL global read view of a transaction is
-restored to a transaction read view. */
-static
-void
-innobase_set_cursor_view(
-/*=====================*/
-	handlerton* hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview);/*!< in: Consistent cursor view to be set */
-/*******************************************************************//**
-Close the given consistent cursor view of a transaction and restore
-global read view to a transaction read view. Transaction is created if the
-corresponding MySQL thread still lacks one. */
-static
-void
-innobase_close_cursor_view(
-/*=======================*/
-	handlerton* hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview);/*!< in: Consistent read view to be closed */
-/*****************************************************************//**
-Removes all tables in the named database inside InnoDB. */
-static
-void
-innobase_drop_database(
-/*===================*/
-	handlerton* hton, /*!< in: handlerton of Innodb */
-	char*	path);	/*!< in: database path; inside InnoDB the name
-			of the last directory in the path is used as
-			the database name: for example, in 'mysql/data/test'
-			the database name is 'test' */
-/*******************************************************************//**
-Closes an InnoDB database. */
-static
-int
-innobase_end(handlerton *hton, ha_panic_function type);
-
-/*****************************************************************//**
-Creates an InnoDB transaction struct for the thd if it does not yet have one.
-Starts a new InnoDB transaction if a transaction is not yet started. And
-assigns a new snapshot for a consistent read if the transaction does not yet
-have one.
-@return	0 */
-static
-int
-innobase_start_trx_and_assign_read_view(
-/*====================================*/
-			/* out: 0 */
-	handlerton* hton, /* in: Innodb handlerton */
-	THD*	thd);	/* in: MySQL thread handle of the user for whom
-			the transaction should be committed */
-/****************************************************************//**
-Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
-the logs, and the name of this function should be innobase_checkpoint.
-@return	TRUE if error */
-static
-bool
-innobase_flush_logs(
-/*================*/
-	handlerton*	hton);	/*!< in: InnoDB handlerton */
-
-/************************************************************************//**
-Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
-Monitor to the client. */
-static
-bool
-innodb_show_status(
-/*===============*/
-	handlerton*	hton,	/*!< in: the innodb handlerton */
-	THD*	thd,	/*!< in: the MySQL query thread of the caller */
-	stat_print_fn *stat_print);
-static
-bool innobase_show_status(handlerton *hton, THD* thd,
-                          stat_print_fn* stat_print,
-                          enum ha_stat_type stat_type);
-
-/* Enable / disable checkpoints */
-static int innobase_checkpoint_state(handlerton *hton, bool disable)
-{
-  if (disable)
-    (void) log_disable_checkpoint();
-  else
-     log_enable_checkpoint();
-  return 0;
-}
-
-
-/*****************************************************************//**
-Commits a transaction in an InnoDB database. */
-static
-void
-innobase_commit_low(
-/*================*/
-	trx_t*	trx);	/*!< in: transaction handle */
 
 static SHOW_VAR innodb_status_variables[]= {
-  {"adaptive_hash_cells",
-  (char*) &export_vars.innodb_adaptive_hash_cells,	  SHOW_LONG},
-  {"adaptive_hash_hash_searches",
-  (char*) &export_vars.innodb_adaptive_hash_hash_searches, SHOW_LONG},
-  {"adaptive_hash_heap_buffers",
-  (char*) &export_vars.innodb_adaptive_hash_heap_buffers, SHOW_LONG},
-  {"adaptive_hash_non_hash_searches",
-  (char*) &export_vars.innodb_adaptive_hash_non_hash_searches, SHOW_LONG},
+  {"available_undo_logs",
+  (char*) &export_vars.innodb_available_undo_logs,        SHOW_LONG},
   {"background_log_sync",
   (char*) &export_vars.innodb_background_log_sync,	  SHOW_LONG},
   {"buffer_pool_bytes_data",
   (char*) &export_vars.innodb_buffer_pool_bytes_data,	  SHOW_LONG},
   {"buffer_pool_bytes_dirty",
   (char*) &export_vars.innodb_buffer_pool_bytes_dirty,	  SHOW_LONG},
+  {"buffer_pool_dump_status",
+  (char*) &export_vars.innodb_buffer_pool_dump_status,	  SHOW_CHAR},
+  {"buffer_pool_load_status",
+  (char*) &export_vars.innodb_buffer_pool_load_status,	  SHOW_CHAR},
   {"buffer_pool_pages_data",
   (char*) &export_vars.innodb_buffer_pool_pages_data,	  SHOW_LONG},
   {"buffer_pool_pages_dirty",
@@ -777,10 +733,6 @@ static SHOW_VAR innodb_status_variables[]= {
   (char*) &export_vars.innodb_checkpoint_age,		  SHOW_LONG},
   {"checkpoint_max_age",
   (char*) &export_vars.innodb_checkpoint_max_age,	  SHOW_LONG},
-  {"checkpoint_target_age",
-  (char*) &export_vars.innodb_checkpoint_target_age,	  SHOW_LONG},
-  {"current_row_locks",
-  (char*) &export_vars.innodb_current_row_locks,		  SHOW_LONG},
   {"data_fsyncs",
   (char*) &export_vars.innodb_data_fsyncs,		  SHOW_LONG},
   {"data_pending_fsyncs",
@@ -803,10 +755,6 @@ static SHOW_VAR innodb_status_variables[]= {
   (char*) &export_vars.innodb_dblwr_writes,		  SHOW_LONG},
   {"deadlocks",
   (char*) &export_vars.innodb_deadlocks,		  SHOW_LONG},
-  {"descriptors_memory",
-  (char*) &export_vars.innodb_descriptors_memory,	  SHOW_LONG},
-  {"dict_tables",
-  (char*) &export_vars.innodb_dict_tables,		  SHOW_LONG},
   {"have_atomic_builtins",
   (char*) &export_vars.innodb_have_atomic_builtins,	  SHOW_BOOL},
   {"history_list_length",
@@ -843,16 +791,10 @@ static SHOW_VAR innodb_status_variables[]= {
   (char*) &export_vars.innodb_lsn_flushed,		  SHOW_LONGLONG},
   {"lsn_last_checkpoint",
   (char*) &export_vars.innodb_lsn_last_checkpoint,	  SHOW_LONGLONG},
-  {"master_thread_1_second_loops",
-  (char*) &export_vars.innodb_master_thread_1_second_loops, SHOW_LONG},
-  {"master_thread_10_second_loops",
-  (char*) &export_vars.innodb_master_thread_10_second_loops, SHOW_LONG},
-  {"master_thread_background_loops",
-  (char*) &export_vars.innodb_master_thread_background_loops, SHOW_LONG},
-  {"master_thread_main_flush_loops",
-  (char*) &export_vars.innodb_master_thread_main_flush_loops, SHOW_LONG},
-  {"master_thread_sleeps",
-  (char*) &export_vars.innodb_master_thread_sleeps,	  SHOW_LONG},
+  {"master_thread_active_loops",
+  (char*) &export_vars.innodb_master_thread_active_loops, SHOW_LONG},
+  {"master_thread_idle_loops",
+  (char*) &export_vars.innodb_master_thread_idle_loops,   SHOW_LONG},
   {"max_trx_id",
   (char*) &export_vars.innodb_max_trx_id,		  SHOW_LONGLONG},
   {"mem_adaptive_hash",
@@ -876,7 +818,7 @@ static SHOW_VAR innodb_status_variables[]= {
   {"os_log_pending_writes",
   (char*) &export_vars.innodb_os_log_pending_writes,	  SHOW_LONG},
   {"os_log_written",
-  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONG},
+  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONGLONG},
   {"page_size",
   (char*) &export_vars.innodb_page_size,		  SHOW_LONG},
   {"pages_created",
@@ -924,7 +866,7 @@ static SHOW_VAR innodb_status_variables[]= {
   {"s_lock_spin_waits",
   (char*) &export_vars.innodb_s_lock_spin_waits,	  SHOW_LONGLONG},
   {"truncated_status_writes",
-  (char*) &export_vars.innodb_truncated_status_writes,	SHOW_LONG},
+  (char*) &export_vars.innodb_truncated_status_writes,	  SHOW_LONG},
   {"x_lock_os_waits",
   (char*) &export_vars.innodb_x_lock_os_waits,		  SHOW_LONGLONG},
   {"x_lock_spin_rounds",
@@ -934,8 +876,458 @@ static SHOW_VAR innodb_status_variables[]= {
   {NullS, NullS, SHOW_LONG}
 };
 
+/************************************************************************//**
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+	const char*	table_name);	/*!< in: table to lookup */
+
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+	INNOBASE_SHARE*	share);		/*!< in/own: share to free */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return	0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd);		/*!< in: MySQL thread handle for
+					which to close the connection */
+
+static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
+static void innobase_checkpoint_request(handlerton *hton, void *cookie);
+
+/*****************************************************************//**
+Cancel any pending lock request associated with the current THD. */
+static
+void
+innobase_kill_connection(
+/*======================*/
+        handlerton*	hton,	/*!< in:  innobase handlerton */
+	THD*	thd,	/*!< in: handle to the MySQL thread being killed */
+        thd_kill_levels);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return	0 */
+static
+int
+innobase_commit(
+/*============*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd,		/*!< in: MySQL thread handle of the
+					user for whom the transaction should
+					be committed */
+	bool		commit_trx);	/*!< in: true - commit transaction
+					false - the current SQL statement
+					ended */
+
+/*****************************************************************//**
+Rolls back a transaction or the current SQL statement, depending on
+the rollback_trx argument.
+@return 0 on success, otherwise an error number */
+static
+int
+innobase_rollback(
+/*==============*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back */
+	bool		rollback_trx);	/*!< in: TRUE - rollback entire
+					transaction FALSE - rollback the current
+					statement only */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be rolled back to savepoint */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return	always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user's XA transaction for which
+					we need to take a savepoint */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction's
+					savepoint should be released */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/************************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
+	TABLE_SHARE*	table,
+	MEM_ROOT*	mem_root);
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default();
+/*=======================================*/
+
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max();
+/*==================================*/
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return	valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+	const char*	format_name);	/*!< in: pointer to file format
+					name */
+/************************************************************//**
+Validate the file format check config parameters, as a side effect it
+sets the srv_max_file_format_at_startup variable.
+@return	the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*==================================*/
+	const char*	format_max);	/*!< in: parameter value */
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return	0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be prepared */
+	bool		all);		/*!< in: true - prepare transaction
+					false - the current SQL statement
+					ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return	number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid_list,	/*!< in/out: prepared transactions */
+	uint		len);		/*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return	0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid);		/*!< in: X/Open XA transaction
+					identification */
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+@return	0 or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid);		/*!< in: X/Open XA transaction
+					identification */
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return	pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+	handlerton*	hton,		/*!< in: innobase hton */
+	THD*		thd);		/*!< in: user thread handle */
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL global read view of a transaction is
+restored to a transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	THD*		thd,		/*!< in: user thread handle */
+	void*		curview);	/*!< in: Consistent cursor view to
+					be set */
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	THD*		thd,		/*!< in: user thread handle */
+	void*		curview);	/*!< in: Consistent read view to be
+					closed */
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	char*		path);		/*!< in: database path; inside InnoDB
+					the name of the last directory in
+					the path is used as the database name:
+					for example, in 'mysql/data/test' the
+					database name is 'test' */
+/*******************************************************************//**
+Closes an InnoDB database. */
+static
+int
+innobase_end(
+/*=========*/
+	handlerton*		hton,	/* in: Innodb handlerton */
+	ha_panic_function	type);
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return	0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+	handlerton*	hton,		/* in: Innodb handlerton */
+	THD*		thd);		/* in: MySQL thread handle of the
+					user for whom the transaction should
+					be started */
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return	TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+	handlerton*	hton);		/*!< in: InnoDB handlerton */
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+	handlerton*	hton,		/*!< in: the innodb handlerton */
+	THD*		thd,		/*!< in: the MySQL query thread of
+					the caller */
+	stat_print_fn*	stat_print);
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+	handlerton*		hton,	/*!< in: the innodb handlerton */
+	THD*			thd,	/*!< in: the MySQL query thread of
+					the caller */
+	stat_print_fn*		stat_print,
+	enum ha_stat_type	stat_type);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+	trx_t*	trx);	/*!< in: transaction handle */
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can enable monitor counters/groups by specifying
+"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
+in server configuration file or at the command line. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+	char*	str);	/*!< in: monitor counter enable list */
+
+/*********************************************************************
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case if "set_lower_case" is set to TRUE. */
+static
+void
+normalize_table_name_low(
+/*=====================*/
+	char*           norm_name,      /* out: normalized name as a
+					null-terminated string */
+	const char*     name,           /* in: table name string */
+	ibool           set_lower_case); /* in: TRUE if we want to set
+					 name to lower case */
+
+/*************************************************************//**
+Removes old archived transaction log files, chosen by timestamp or by name.
+@return	true on error */
+static bool innobase_purge_archive_logs(
+	handlerton *hton,		/*!< in: InnoDB handlerton */
+	time_t before_date,		/*!< in: all files modified
+					before timestamp should be removed */
+	const char* to_filename)	/*!< in: this and earlier files
+					should be removed */
+{
+	ulint err= DB_ERROR;
+	if (before_date > 0) {
+		err= purge_archived_logs(before_date, 0);
+	} else if (to_filename) {
+		if (is_prefix(to_filename, IB_ARCHIVED_LOGS_PREFIX)) {
+			unsigned long long log_file_lsn = strtoull(to_filename
+					+ IB_ARCHIVED_LOGS_PREFIX_LEN,
+					NULL, 10);
+			if (log_file_lsn > 0 && log_file_lsn < ULLONG_MAX) {
+				err= purge_archived_logs(0, log_file_lsn);
+			}
+		}
+	}
+	return (err != DB_SUCCESS);
+}
+
+/*************************************************************//**
+Check for a valid value of innobase_commit_concurrency.
+@return	0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	long long	intbuf;
+	ulong		commit_concurrency;
+
+	DBUG_ENTER("innobase_commit_concurrency_validate");
+
+	if (value->val_int(value, &intbuf)) {
+		/* The value is NULL. That is invalid. */
+		DBUG_RETURN(1);
+	}
+
+	*reinterpret_cast<ulong*>(save) = commit_concurrency
+		= static_cast<ulong>(intbuf);
+
+	/* Allow the value to be updated, as long as it remains zero
+	or nonzero. */
+	DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+}
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	TABLE_SHARE*	table,
+	MEM_ROOT*	mem_root)
+{
+	return(new (mem_root) ha_innobase(hton, table));
+}
+
 /* General functions */
 
+/*************************************************************//**
+Check that a page_size is correct for InnoDB.  If correct, return the
+associated page_size_shift which is the power of 2 for this page size.
+@return	an associated page_size_shift if valid, 0 if invalid. */
+inline
+int
+innodb_page_size_validate(
+/*======================*/
+	ulong	page_size)		/*!< in: Page Size to evaluate */
+{
+	ulong		n;
+
+	DBUG_ENTER("innodb_page_size_validate");
+
+	for (n = UNIV_PAGE_SIZE_SHIFT_MIN;
+	     n <= UNIV_PAGE_SIZE_SHIFT_MAX;
+	     n++) {
+		if (page_size == (ulong) (1 << n)) {
+			DBUG_RETURN(n);
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
 /******************************************************************//**
 Returns true if the thread is the replication thread on the slave
 server. Used in srv_conc_enter_innodb() to determine if the thread
@@ -943,13 +1335,55 @@ should be allowed to enter InnoDB - the replication thread is treated
 differently than other threads. Also used in
 srv_conc_force_exit_innodb().
 @return	true if thd is the replication thread */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_is_replication_slave_thread(
 /*============================*/
-	const void*	thd)	/*!< in: thread handle (THD*) */
+	THD*	thd)	/*!< in: thread handle */
 {
-	return((ibool) thd_slave_thread((THD*) thd));
+	return((ibool) thd_slave_thread(thd));
+}
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing either a prepare or commit record to the log
+buffer. @return the durability property. */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+	const THD* thd)	/*!< in: thread handle */
+{
+	return(thd_get_durability_property(thd));
+}
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return	true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	THD*	thd)	/*!< in: thread handle */
+{
+	return(thd != 0 && thd_tx_is_read_only(thd));
+}
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return	true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	THD*	thd)	/*!< in: thread handle, can be NULL */
+{
+	return(thd != NULL
+	       && !thd_test_options(
+		       thd,
+		       OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+	       && thd_is_select(thd));
 }
 
 /******************************************************************//**
@@ -957,126 +1391,155 @@ Save some CPU by testing the value of srv_thread_concurrency in inline
 functions. */
 static inline
 void
-innodb_srv_conc_enter_innodb(
-/*=========================*/
+innobase_srv_conc_enter_innodb(
+/*===========================*/
 	trx_t*	trx)	/*!< in: transaction handle */
 {
-	if (UNIV_LIKELY(!srv_thread_concurrency)) {
+	if (srv_thread_concurrency) {
+		if (trx->n_tickets_to_enter_innodb > 0) {
 
-		return;
-	}
+			/* If trx has 'free tickets' to enter the engine left,
+			then use one such ticket */
+
+			--trx->n_tickets_to_enter_innodb;
+
+		} else if (trx->mysql_thd != NULL
+			   && thd_is_replication_slave_thread(trx->mysql_thd)) {
 
-	srv_conc_enter_innodb(trx);
+			UT_WAIT_FOR(
+				srv_conc_get_active_threads()
+				< srv_thread_concurrency,
+				srv_replication_delay * 1000);
+
+		}  else {
+			srv_conc_enter_innodb(trx);
+		}
+	}
 }
 
 /******************************************************************//**
-Save some CPU by testing the value of srv_thread_concurrency in inline
-functions. */
+Note that the thread wants to leave InnoDB only if it doesn't have
+any spare tickets. */
 static inline
 void
-innodb_srv_conc_exit_innodb(
-/*========================*/
+innobase_srv_conc_exit_innodb(
+/*==========================*/
 	trx_t*	trx)	/*!< in: transaction handle */
 {
-	if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
 
-		return;
-	}
+	/* This is to avoid making an unnecessary function call. */
+	if (trx->declared_to_be_inside_innodb
+	    && trx->n_tickets_to_enter_innodb == 0) {
 
-	srv_conc_exit_innodb(trx);
+		srv_conc_force_exit_innodb(trx);
+	}
 }
 
 /******************************************************************//**
 Force a thread to leave InnoDB even if it has spare tickets. */
 static inline
 void
-innodb_srv_conc_force_exit_innodb(
-/*==============================*/
+innobase_srv_conc_force_exit_innodb(
+/*================================*/
 	trx_t*	trx)	/*!< in: transaction handle */
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 #endif /* UNIV_SYNC_DEBUG */
 
+	/* This is to avoid making an unnecessary function call. */
 	if (trx->declared_to_be_inside_innodb) {
-
 		srv_conc_force_exit_innodb(trx);
 	}
 }
 
 /******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return	pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname()
+/*=================*/
+{
+	return(glob_hostname);
+}
+
+/******************************************************************//**
 Returns true if the transaction this thread is processing has edited
 non-transactional tables. Used by the deadlock detector when deciding
 which transaction to rollback in case of a deadlock - we try to avoid
 rolling back transactions that have edited non-transactional tables.
 @return	true if non-transactional tables have been edited */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_has_edited_nontrans_tables(
 /*===========================*/
-	void*	thd)	/*!< in: thread handle (THD*) */
+	THD*	thd)	/*!< in: thread handle */
 {
-	return((ibool) thd_non_transactional_update((THD*) thd));
+	return((ibool) thd_non_transactional_update(thd));
 }
 
 /******************************************************************//**
 Returns true if the thread is executing a SELECT statement.
 @return	true if thd is executing SELECT */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_is_select(
 /*==========*/
-	const void*	thd)	/*!< in: thread handle (THD*) */
+	const THD*	thd)	/*!< in: thread handle */
 {
-	return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT);
+	return(thd_sql_command(thd) == SQLCOM_SELECT);
 }
 
 /******************************************************************//**
 Returns true if the thread supports XA,
 global value of innodb_supports_xa if thd is NULL.
 @return	true if thd has XA support */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_supports_xa(
 /*============*/
-	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
+	THD*	thd)	/*!< in: thread handle, or NULL to query
 			the global innodb_supports_xa */
 {
-	return(THDVAR((THD*) thd, support_xa));
+	return(THDVAR(thd, support_xa));
 }
 
 /******************************************************************//**
 Returns the lock wait timeout for the current connection.
 @return	the lock wait timeout, in seconds */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulong
 thd_lock_wait_timeout(
 /*==================*/
-	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
+	THD*	thd)	/*!< in: thread handle, or NULL to query
 			the global innodb_lock_wait_timeout */
 {
 	/* According to <mysql/plugin.h>, passing thd == NULL
 	returns the global value of the session variable. */
-	return(THDVAR((THD*) thd, lock_wait_timeout));
+	return(THDVAR(thd, lock_wait_timeout));
 }
 
 /******************************************************************//**
 Set the time waited for the lock for the current query. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 thd_set_lock_wait_time(
 /*===================*/
-	void*	thd,	/*!< in: thread handle (THD*) */
+	THD*	thd,	/*!< in/out: thread handle */
 	ulint	value)	/*!< in: time waited for the lock */
 {
 	if (thd) {
-		thd_storage_lock_wait((THD*)thd, value);
+		thd_storage_lock_wait(thd, value);
 	}
 }
 
 /******************************************************************//**
 */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulong
 thd_flush_log_at_trx_commit(
 /*================================*/
@@ -1085,23 +1548,10 @@ thd_flush_log_at_trx_commit(
 	return(THDVAR((THD*) thd, flush_log_at_trx_commit));
 }
 
-/******************************************************************//**
-Returns the merge-sort block size used for the secondary index creation
-for the current connection.
-@return	the merge-sort block size, in bytes */
-extern "C" UNIV_INTERN
-ulong
-thd_merge_sort_block_size(
-/*================================*/
-	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
-+			the global merge_sort_block_size */
-{
-	return(THDVAR((THD*) thd, merge_sort_block_size));
-}
-
 /********************************************************************//**
 Obtain the InnoDB transaction of a MySQL thread.
 @return	reference to transaction pointer */
+__attribute__((warn_unused_result, nonnull))
 static inline
 trx_t*&
 thd_to_trx(
@@ -1119,33 +1569,34 @@ ha_innobase::is_fake_change_enabled(THD* thd)
 }
 
 /********************************************************************//**
-Call this function when mysqld passes control to the client. That is to
-avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
-documentation, see handler.cc.
-@return	0 */
+In XtraDB it is impossible for a transaction to own a search latch outside of
+InnoDB code, so there is nothing to release on demand.  We keep this function to
+simplify maintenance.
+@return 0 */
 static
 int
 innobase_release_temporary_latches(
 /*===============================*/
-	handlerton*	hton,	/*!< in: handlerton */
-	THD*		thd)	/*!< in: MySQL thread */
+	handlerton*	hton __attribute__((unused)),	/*!< in: handlerton */
+	THD*		thd __attribute__((unused)))	/*!< in: MySQL thread */
 {
-	trx_t*	trx;
-
+#ifdef UNIV_DEBUG
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
-	if (!innodb_inited) {
+	if (!innodb_inited || thd == NULL) {
 
 		return(0);
 	}
 
-	trx = thd_to_trx(thd);
+	trx_t*	trx = thd_to_trx(thd);
 
 	if (trx != NULL) {
-
-		/* No-op in XtraDB */
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!btr_search_own_any());
+#endif
 		trx_search_latch_release_if_reserved(trx);
 	}
+#endif
 
 	return(0);
 }
@@ -1172,11 +1623,11 @@ Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 about a possible transaction rollback inside InnoDB caused by a lock wait
 timeout or a deadlock.
 @return	MySQL error code */
-extern "C" UNIV_INTERN
+static
 int
 convert_error_code_to_mysql(
 /*========================*/
-	int	error,	/*!< in: InnoDB error code */
+	dberr_t	error,	/*!< in: InnoDB error code */
 	ulint	flags,  /*!< in: InnoDB table flags, or 0 */
 	THD*	thd)	/*!< in: user thread handle or NULL */
 {
@@ -1188,7 +1639,8 @@ convert_error_code_to_mysql(
                 return(HA_ERR_ABORTED_BY_USER);
 
 	case DB_FOREIGN_EXCEED_MAX_CASCADE:
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		ut_ad(thd);
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    HA_ERR_ROW_IS_REFERENCED,
 				    "InnoDB: Cannot delete/update "
 				    "rows with cascading foreign key "
@@ -1212,6 +1664,9 @@ convert_error_code_to_mysql(
 		handling stage. */
 		return(HA_ERR_FOUND_DUPP_KEY);
 
+	case DB_READ_ONLY:
+		return(HA_ERR_TABLE_READONLY);
+
 	case DB_FOREIGN_DUPLICATE_KEY:
 		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
 
@@ -1242,7 +1697,7 @@ convert_error_code_to_mysql(
 
 		if (thd) {
 			thd_mark_transaction_to_rollback(
-				thd, (bool)row_rollback_on_timeout);
+				thd, (bool) row_rollback_on_timeout);
 		}
 
 		return(HA_ERR_LOCK_WAIT_TIMEOUT);
@@ -1276,14 +1731,17 @@ convert_error_code_to_mysql(
 	case DB_TABLE_IS_BEING_USED:
 		return(HA_ERR_WRONG_COMMAND);
 
+	case DB_TABLESPACE_DELETED:
 	case DB_TABLE_NOT_FOUND:
 		return(HA_ERR_NO_SUCH_TABLE);
 
+	case DB_TABLESPACE_NOT_FOUND:
+		return(HA_ERR_NO_SUCH_TABLE);
+
 	case DB_TOO_BIG_RECORD: {
 		/* If prefix is true then a 768-byte prefix is stored
 		locally for BLOB fields. Refer to dict_table_get_format() */
-		bool prefix = ((flags & DICT_TF_FORMAT_MASK)
-		 	       >> DICT_TF_FORMAT_SHIFT) < UNIV_FORMAT_B;
+		bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
 		my_printf_error(ER_TOO_BIG_ROWSIZE,
 			"Row size too large (> %lu). Changing some columns "
 			"to TEXT or BLOB %smay help. In current row "
@@ -1316,18 +1774,12 @@ convert_error_code_to_mysql(
 
 		return(HA_ERR_LOCK_TABLE_FULL);
 
-	case DB_PRIMARY_KEY_IS_NULL:
-		return(ER_PRIMARY_CANT_HAVE_NULL);
-
+	case DB_FTS_INVALID_DOCID:
+		return(HA_FTS_INVALID_DOCID);
+	case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+		return(HA_ERR_OUT_OF_MEM);
 	case DB_TOO_MANY_CONCURRENT_TRXS:
-		/* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
-		available in 5.1.38 and later, but the plugin should still
-		work with previous versions of MySQL. */
-#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS
 		return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
-#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
-		return(HA_ERR_RECORD_FILE_FULL);
-#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
 	case DB_UNSUPPORTED:
 		return(HA_ERR_UNSUPPORTED);
 	case DB_INDEX_CORRUPT:
@@ -1336,6 +1788,8 @@ convert_error_code_to_mysql(
 		return(HA_ERR_UNDO_REC_TOO_BIG);
 	case DB_OUT_OF_MEMORY:
 		return(HA_ERR_OUT_OF_MEM);
+	case DB_TABLESPACE_EXISTS:
+		return(HA_ERR_TABLESPACE_EXISTS);
 	case DB_IDENTIFIER_TOO_LONG:
 		return(HA_ERR_INTERNAL_ERROR);
 	}
@@ -1343,14 +1797,14 @@ convert_error_code_to_mysql(
 
 /*************************************************************//**
 Prints info of a THD object (== user session thread) to the given file. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_mysql_print_thd(
 /*=====================*/
 	FILE*	f,		/*!< in: output stream */
-	void*	thd,		/*!< in: pointer to a MySQL THD object */
+	THD*	thd,		/*!< in: MySQL THD object */
 	uint	max_query_len)	/*!< in: max query length to print, or 0 to
-				   use the default max length */
+				use the default max length */
 {
 	char	buffer[1024];
 
@@ -1361,8 +1815,20 @@ innobase_mysql_print_thd(
 }
 
 /******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+	int	error_code)	/*!< in: MySQL error code */
+{
+	return(my_get_err_msg(error_code));
+}
+
+/******************************************************************//**
 Get the variable length bounds of the given character set. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_get_cset_width(
 /*====================*/
@@ -1371,7 +1837,7 @@ innobase_get_cset_width(
 	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 {
 	CHARSET_INFO*	cs;
-	ut_ad(cset < 256);
+	ut_ad(cset <= MAX_CHAR_COLL_NUM);
 	ut_ad(mbminlen);
 	ut_ad(mbmaxlen);
 
@@ -1405,7 +1871,7 @@ innobase_get_cset_width(
 
 /******************************************************************//**
 Converts an identifier to a table name. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_convert_from_table_id(
 /*===========================*/
@@ -1416,13 +1882,13 @@ innobase_convert_from_table_id(
 {
 	uint	errors;
 
-	strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
+	strconvert(cs, from, strlen(from), &my_charset_filename, to, (uint) len, &errors);
 }
 
 /**********************************************************************
 Check if the length of the identifier exceeds the maximum allowed.
 return true when length of identifier is too long. */
-extern "C"
+UNIV_INTERN
 my_bool
 innobase_check_identifier_length(
 /*=============================*/
@@ -1446,7 +1912,7 @@ innobase_check_identifier_length(
 
 /******************************************************************//**
 Converts an identifier to UTF-8. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_convert_from_id(
 /*=====================*/
@@ -1457,44 +1923,50 @@ innobase_convert_from_id(
 {
 	uint	errors;
 
-	strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
-}
-
-/**********************************************************************
-Converts an identifier from my_charset_filename to UTF-8 charset.
-@return result string length, as returned by strconvert() */
-extern "C"
-uint
-innobase_convert_to_system_charset(
-/*===============================*/
-	char*		to,	/* out: converted identifier */
-	const char*	from,	/* in: identifier to convert */
-	ulint		len,	/* in: length of 'to', in bytes */
-	uint*		errors)	/* out: error return */
-{
-	CHARSET_INFO*	cs1 = &my_charset_filename;
-	CHARSET_INFO*	cs2 = system_charset_info;
-
-	return(strconvert(cs1, from, cs2, to, len, errors));
+	strconvert(cs, from, strlen(from), system_charset_info, to, (uint) len, &errors);
 }
 
 /******************************************************************//**
 Compares NUL-terminated UTF-8 strings case insensitively.
 @return	0 if a=b, <0 if a<b, >1 if a>b */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_strcasecmp(
 /*================*/
 	const char*	a,	/*!< in: first string to compare */
 	const char*	b)	/*!< in: second string to compare */
 {
+	if (!a) {
+		if (!b) {
+			return(0);
+		} else {
+			return(-1);
+		}
+	} else if (!b) {
+		return(1);
+	}
+
 	return(my_strcasecmp(system_charset_info, a, b));
 }
 
 /******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b)	/*!< in: wildcard string to compare */
+{
+	return(wild_case_compare(system_charset_info, a, b));
+}
+
+/******************************************************************//**
 Strip dir name from a full path name and return only the file name
 @return file name or "null" if no file name */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 const char*
 innobase_basename(
 /*==============*/
@@ -1507,7 +1979,7 @@ innobase_basename(
 
 /******************************************************************//**
 Makes all characters in a NUL-terminated UTF-8 string lower case. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_casedn_str(
 /*================*/
@@ -1519,39 +1991,52 @@ innobase_casedn_str(
 /**********************************************************************//**
 Determines the connection character set.
 @return	connection character set */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 struct charset_info_st*
 innobase_get_charset(
 /*=================*/
-	void*	mysql_thd)	/*!< in: MySQL thread handle */
+	THD*	mysql_thd)	/*!< in: MySQL thread handle */
 {
-	return(thd_charset((THD*) mysql_thd));
+	return(thd_charset(mysql_thd));
 }
 
 /**********************************************************************//**
 Determines the current SQL statement.
 @return	SQL statement string */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 const char*
 innobase_get_stmt(
 /*==============*/
-	void*	mysql_thd,	/*!< in: MySQL thread handle */
+	THD*	thd,		/*!< in: MySQL thread handle */
 	size_t*	length)		/*!< out: length of the SQL statement */
 {
 	LEX_STRING* stmt;
 
-	stmt = thd_query_string((THD*) mysql_thd);
+	stmt = thd_query_string(thd);
 	*length = stmt->length;
 	return(stmt->str);
 }
 
 /**********************************************************************//**
+Get the current setting of the table definition cache size (tdc_size)
+global parameter. We do a dirty read because there is no synchronization
+object and there is little harm in doing so even if we get a torn read.
+@return	value of tdc_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+	return(tdc_size);
+}
+
+/**********************************************************************//**
 Get the current setting of the lower_case_table_names global parameter from
 mysqld.cc. We do a dirty read because for one there is no synchronization
 object and secondly there is little harm in doing so even if we get a torn
 read.
 @return	value of lower_case_table_names */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_get_lower_case_table_names(void)
 /*=====================================*/
@@ -1562,7 +2047,7 @@ innobase_get_lower_case_table_names(void)
 /*********************************************************************//**
 Creates a temporary file.
 @return	temporary file descriptor, or < 0 on error */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_mysql_tmpfile(void)
 /*========================*/
@@ -1590,21 +2075,21 @@ innobase_mysql_tmpfile(void)
 #ifdef _WIN32
 		/* Note that on Windows, the integer returned by mysql_tmpfile
 		has no relation to C runtime file descriptor. Here, we need
-		to call my_get_osfhandle to get the HANDLE and then convert it 
+		to call my_get_osfhandle to get the HANDLE and then convert it
 		to C runtime filedescriptor. */
 		{
 			HANDLE hFile = my_get_osfhandle(fd);
 			HANDLE hDup;
-			BOOL bOK = 
-				DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
-								&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
-			if(bOK) {
-				fd2 = _open_osfhandle((intptr_t)hDup,0);
-			}
-			else {
+			BOOL bOK = DuplicateHandle(
+					GetCurrentProcess(),
+					hFile, GetCurrentProcess(),
+					&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+			if (bOK) {
+				fd2 = _open_osfhandle((intptr_t) hDup, 0);
+			} else {
 				my_osmaperr(GetLastError());
 				fd2 = -1;
-			}	
+			}
 		}
 #else
 		fd2 = dup(fd);
@@ -1624,7 +2109,7 @@ innobase_mysql_tmpfile(void)
 /*********************************************************************//**
 Wrapper around MySQL's copy_and_convert function.
 @return	number of bytes copied to 'to' */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_convert_string(
 /*====================*/
@@ -1634,13 +2119,15 @@ innobase_convert_string(
 	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 	const void*	from,		/*!< in: string to convert */
 	ulint		from_length,	/*!< in: number of bytes to convert */
-	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
+					from */
 	uint*		errors)		/*!< out: number of errors encountered
 					during the conversion */
 {
-  return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
-                          (const char*)from, (uint32) from_length, from_cs,
-                          errors));
+	return(copy_and_convert(
+			(char*) to, (uint32) to_length, to_cs,
+			(const char*) from, (uint32) from_length, from_cs,
+			errors));
 }
 
 /*******************************************************************//**
@@ -1652,7 +2139,7 @@ The result is always NUL-terminated (provided buf_size > 0) and the
 number of bytes that were written to "buf" is returned (including the
 terminating NUL).
 @return	number of bytes that were written */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_raw_format(
 /*================*/
@@ -1699,7 +2186,7 @@ innobase_next_autoinc() will be called with increment set to 3 where
 autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
 the multi-value INSERT above.
 @return	the next value */
-static
+UNIV_INTERN
 ulonglong
 innobase_next_autoinc(
 /*==================*/
@@ -1726,9 +2213,6 @@ innobase_next_autoinc(
         */
         max_value= (~(ulonglong) 0);
 
-	/* Current value should never be greater than the maximum. */
-	ut_a(current <= max_value);
-
 	/* According to MySQL documentation, if the offset is greater than
 	the step then the offset is ignored. */
 	if (offset > block) {
@@ -1739,6 +2223,7 @@ innobase_next_autoinc(
 	in reality a negative value.The visual studio compilers converts
 	large double values automatically into unsigned long long datatype
 	maximum value */
+
 	if (block >= max_value
 	    || offset > max_value
 	    || current >= max_value
@@ -1827,9 +2312,9 @@ innobase_trx_init(
 }
 
 /*********************************************************************//**
-Allocates an InnoDB transaction for a MySQL handler object.
+Allocates an InnoDB transaction for a MySQL handler object for DML.
 @return	InnoDB transaction handle */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 trx_t*
 innobase_trx_allocate(
 /*==================*/
@@ -1877,7 +2362,6 @@ check_trx_exists(
 
 /*************************************************************************
 Gets current trx. */
-extern "C"
 trx_t*
 innobase_get_trx()
 {
@@ -1890,7 +2374,6 @@ innobase_get_trx()
 	}
 }
 
-extern "C"
 ibool
 innobase_get_slow_log()
 {
@@ -1969,15 +2452,90 @@ trx_is_started(
 /*===========*/
 	trx_t*	trx)	/* in: transaction */
 {
-	return(trx->state != TRX_NOT_STARTED);
+	return(trx->state != TRX_STATE_NOT_STARTED);
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const HA_CREATE_INFO*	create_info)	/*!< in: create info */
+{
+	ibool	ps_on;
+	ibool	ps_off;
+
+	if (dict_table_is_temporary(innodb_table)) {
+		/* Temp tables do not use persistent stats. */
+		ps_on = FALSE;
+		ps_off = TRUE;
+	} else {
+		ps_on = create_info->table_options
+			& HA_OPTION_STATS_PERSISTENT;
+		ps_off = create_info->table_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+	dict_stats_auto_recalc_set(
+		innodb_table,
+		create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+		create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+	innodb_table->stats_sample_pages = create_info->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const TABLE_SHARE*	table_share)	/*!< in: table share */
+{
+	ibool	ps_on;
+	ibool	ps_off;
+
+	if (dict_table_is_temporary(innodb_table)) {
+		/* Temp tables do not use persistent stats */
+		ps_on = FALSE;
+		ps_off = TRUE;
+	} else {
+		ps_on = table_share->db_create_options
+			& HA_OPTION_STATS_PERSISTENT;
+		ps_off = table_share->db_create_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+	dict_stats_auto_recalc_set(
+		innodb_table,
+		table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+		table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+	innodb_table->stats_sample_pages = table_share->stats_sample_pages;
 }
 
 /*********************************************************************//**
 Construct ha_innobase handler. */
 UNIV_INTERN
-ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
-  :handler(hton, table_arg),
-  int_table_flags(HA_REC_NOT_IN_SEQ |
+ha_innobase::ha_innobase(
+/*=====================*/
+	handlerton*	hton,
+	TABLE_SHARE*	table_arg)
+	:handler(hton, table_arg),
+	int_table_flags(HA_REC_NOT_IN_SEQ |
 		  HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS |
 		  HA_CAN_INDEX_BLOBS |
 		  HA_CAN_SQL_HANDLER |
@@ -1985,15 +2543,17 @@ ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
 		  HA_PRIMARY_KEY_IN_READ_INDEX |
 		  HA_BINLOG_ROW_CAPABLE |
 		  HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
-		  HA_TABLE_SCAN_ON_INDEX),
-  start_of_scan(0),
-  num_write_row(0)
+		  HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT |
+		  HA_CAN_FULLTEXT_EXT),
+	start_of_scan(0),
+	num_write_row(0)
 {}
 
 /*********************************************************************//**
 Destruct ha_innobase handler. */
 UNIV_INTERN
 ha_innobase::~ha_innobase()
+/*======================*/
 {
 }
 
@@ -2009,6 +2569,13 @@ ha_innobase::update_thd(
 {
 	trx_t*		trx;
 
+	DBUG_ENTER("ha_innobase::update_thd");
+	DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
+		   user_thd, thd));
+
+	/* The table should have been opened in ha_innobase::open(). */
+	DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
+
 	trx = check_trx_exists(thd);
 
 	if (prebuilt->trx != trx) {
@@ -2017,6 +2584,7 @@ ha_innobase::update_thd(
 	}
 
 	user_thd = thd;
+	DBUG_VOID_RETURN;
 }
 
 /*********************************************************************//**
@@ -2029,6 +2597,7 @@ ha_innobase::update_thd()
 /*=====================*/
 {
 	THD*	thd = ha_thd();
+
 	ut_ad(EQ_CURRENT_THD(thd));
 	update_thd(thd);
 }
@@ -2058,9 +2627,9 @@ innobase_register_trx(
 
 	trx_register_for_2pc(trx);
 }
-  
-/*   BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
-     ------------------------------------------------------------
+
+/*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+	------------------------------------------------------------
 
 1) The use of the query cache for TBL is disabled when there is an
 uncommitted change to TBL.
@@ -2095,7 +2664,9 @@ invalidation to the transaction commit.
 2) To store or retrieve a value from the query cache of an InnoDB table TBL,
 any query must first ask InnoDB's permission. We must pass the thd as a
 parameter because InnoDB will look at the trx id, if any, associated with
-that thd.
+that thd. We must also pass the full_name, which is used as the key to
+search for the table object. The full_name is a string containing the
+normalized path to the table in the canonical format.
 
 3) Use of the query cache for InnoDB tables is now allowed also when
 AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
@@ -2117,9 +2688,9 @@ read view to it if there is no read view yet.
 Why a deadlock of threads is not possible: the query cache calls this function
 at the start of a SELECT processing. Then the calling thread cannot be
 holding any InnoDB semaphores. The calling thread is holding the
-query cache mutex, and this function will reserver the InnoDB kernel mutex.
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
 Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
-the InnoDB kernel mutex.
+the InnoDB trx_sys->mutex.
 @return TRUE if permitted, FALSE if not; note that the value FALSE
 does not mean we should invalidate the query cache: invalidation is
 called explicitly */
@@ -2130,11 +2701,9 @@ innobase_query_caching_of_table_permitted(
 	THD*	thd,		/*!< in: thd of the user who is trying to
 				store a result to the query cache or
 				retrieve it */
-	char*	full_name,	/*!< in: concatenation of database name,
-				the null character NUL, and the table
-				name */
-	uint	full_name_len,	/*!< in: length of the full name, i.e.
-				len(dbname) + len(tablename) + 1 */
+	char*	full_name,	/*!< in: normalized path to the table */
+	uint	full_name_len,	/*!< in: length of the normalized path 
+                                to the table */
 	ulonglong *unused)	/*!< unused for this engine */
 {
 	ibool	is_autocommit;
@@ -2152,18 +2721,16 @@ innobase_query_caching_of_table_permitted(
 		return((my_bool)FALSE);
 	}
 
-	if (trx->has_search_latch) {
+	if (UNIV_UNLIKELY(trx->has_search_latch)) {
 		sql_print_error("The calling thread is holding the adaptive "
 				"search, latch though calling "
 				"innobase_query_caching_of_table_permitted.");
-
-		mutex_enter(&kernel_mutex);
 		trx_print(stderr, trx, 1024);
-		mutex_exit(&kernel_mutex);
 	}
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 
@@ -2196,15 +2763,7 @@ innobase_query_caching_of_table_permitted(
 	}
 
 	/* Normalize the table name to InnoDB format */
-
-	memcpy(norm_name, full_name, full_name_len);
-
-	norm_name[strlen(norm_name)] = '/'; /* InnoDB uses '/' as the
-					    separator between db and table */
-	norm_name[full_name_len] = '\0';
-#ifdef __WIN__
-	innobase_casedn_str(norm_name);
-#endif
+	normalize_table_name(norm_name, full_name);
 
 	innobase_register_trx(innodb_hton_ptr, thd, trx);
 
@@ -2222,7 +2781,7 @@ innobase_query_caching_of_table_permitted(
 
 /*****************************************************************//**
 Invalidates the MySQL query cache for the table. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_invalidate_query_cache(
 /*============================*/
@@ -2237,12 +2796,12 @@ innobase_invalidate_query_cache(
 					also the null chars count */
 {
 	/* Note that the sync0sync.h rank of the query cache mutex is just
-	above the InnoDB kernel mutex. The caller of this function must not
-	have latches of a lower rank. */
+	above the InnoDB trx_sys->mutex. The caller of this function must
+	not have latches of a lower rank. */
 
 	/* Argument TRUE below means we are using transactions */
 #ifdef HAVE_QUERY_CACHE
-	mysql_query_cache_invalidate4((THD*) trx->mysql_thd,
+	mysql_query_cache_invalidate4(trx->mysql_thd,
 				      full_name,
 				      (uint32) full_name_len,
 				      TRUE);
@@ -2261,30 +2820,27 @@ innobase_convert_identifier(
 	ulint		buflen,	/*!< in: length of buf, in bytes */
 	const char*	id,	/*!< in: identifier to convert */
 	ulint		idlen,	/*!< in: length of id, in bytes */
-	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 	ibool		file_id)/*!< in: TRUE=id is a table or database name;
 				FALSE=id is an UTF-8 string */
 {
-	char nz[NAME_LEN + 1];
-	char nz2[NAME_LEN + 1 + EXPLAIN_FILENAME_MAX_EXTRA_LENGTH];
-
 	const char*	s	= id;
 	int		q;
 
 	if (file_id) {
+
+		char nz[MAX_TABLE_NAME_LEN + 1];
+		char nz2[MAX_TABLE_NAME_LEN + 1];
+
 		/* Decode the table name.  The MySQL function expects
 		a NUL-terminated string.  The input and output strings
 		buffers must not be shared. */
-
-		if (UNIV_UNLIKELY(idlen > (sizeof nz) - 1)) {
-			idlen = (sizeof nz) - 1;
-		}
-
+		ut_a(idlen <= MAX_TABLE_NAME_LEN);
 		memcpy(nz, id, idlen);
 		nz[idlen] = 0;
 
 		s = nz2;
-		idlen = explain_filename((THD*) thd, nz, nz2, sizeof nz2,
+		idlen = explain_filename(thd, nz, nz2, sizeof nz2,
 					 EXPLAIN_PARTITIONS_AS_COMMENT);
 		goto no_quote;
 	}
@@ -2293,7 +2849,7 @@ innobase_convert_identifier(
 	if (UNIV_UNLIKELY(!thd)) {
 		q = '"';
 	} else {
-		q = get_quote_char_for_identifier((THD*) thd, s, (int) idlen);
+		q = get_quote_char_for_identifier(thd, s, (int) idlen);
 	}
 
 	if (q == EOF) {
@@ -2341,7 +2897,7 @@ no_quote:
 Convert a table or index name to the MySQL system_charset_info (UTF-8)
 and quote it if needed.
 @return	pointer to the end of buf */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 char*
 innobase_convert_name(
 /*==================*/
@@ -2349,7 +2905,7 @@ innobase_convert_name(
 	ulint		buflen,	/*!< in: length of buf, in bytes */
 	const char*	id,	/*!< in: identifier to convert */
 	ulint		idlen,	/*!< in: length of id, in bytes */
-	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 	ibool		table_id)/*!< in: TRUE=id is a table or database name;
 				FALSE=id is an index name */
 {
@@ -2391,14 +2947,13 @@ no_db_name:
 	}
 
 	return(s);
-
 }
 
 /*****************************************************************//**
 A wrapper function of innobase_convert_name(), convert a table or
 index name to the MySQL system_charset_info (UTF-8) and quote it if needed.
 @return	pointer to the end of buf */
-static inline
+UNIV_INTERN
 void
 innobase_format_name(
 /*==================*/
@@ -2420,11 +2975,11 @@ innobase_format_name(
 /**********************************************************************//**
 Determines if the currently running transaction has been interrupted.
 @return	TRUE if interrupted */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 trx_is_interrupted(
 /*===============*/
-	trx_t*	trx)	/*!< in: transaction */
+	const trx_t*	trx)	/*!< in: transaction */
 {
 	return(trx && trx->mysql_thd && thd_kill_level((THD*) trx->mysql_thd));
 }
@@ -2432,21 +2987,33 @@ trx_is_interrupted(
 /**********************************************************************//**
 Determines if the currently running transaction is in strict mode.
 @return	TRUE if strict */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 trx_is_strict(
 /*==========*/
 	trx_t*	trx)	/*!< in: transaction */
 {
-	return(trx && trx->mysql_thd
-	       && THDVAR((THD*) trx->mysql_thd, strict_mode));
+	return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode));
+}
+
+/**********************************************************************//**
+Determines if the current MySQL thread is running in strict mode.
+If thd==NULL, THDVAR returns the global value of innodb-strict-mode.
+@return	TRUE if strict */
+UNIV_INLINE
+ibool
+thd_is_strict(
+/*==========*/
+	THD*	thd)	/*!< in: MySQL thread descriptor */
+{
+	return(THDVAR(thd, strict_mode));
 }
 
 /**************************************************************//**
 Resets some fields of a prebuilt struct. The template is used in fast
 retrieval of just those column values MySQL needs in its processing. */
-void
 inline
+void
 ha_innobase::reset_template(void)
 /*=============================*/
 {
@@ -2455,14 +3022,15 @@ ha_innobase::reset_template(void)
 
 	prebuilt->keep_other_fields_on_keyread = 0;
 	prebuilt->read_just_key = 0;
-        /* Reset index condition pushdown state. */
-        if (prebuilt->idx_cond) {
-                prebuilt->idx_cond = NULL;
-                prebuilt->idx_cond_n_cols = 0;
-                /* Invalidate prebuilt->mysql_template
-                in ha_innobase::write_row(). */
-                prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
-        }
+	prebuilt->in_fts_query = 0;
+	/* Reset index condition pushdown state. */
+	if (prebuilt->idx_cond) {
+		prebuilt->idx_cond = NULL;
+		prebuilt->idx_cond_n_cols = 0;
+		/* Invalidate prebuilt->mysql_template
+		in ha_innobase::write_row(). */
+		prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+	}
 }
 
 /*****************************************************************//**
@@ -2487,11 +3055,12 @@ ha_innobase::init_table_handle_for_HANDLER(void)
 	external_lock */
 
 	trx_search_latch_release_if_reserved(prebuilt->trx);
-	innodb_srv_conc_force_exit_innodb(prebuilt->trx);
+
+	innobase_srv_conc_force_exit_innodb(prebuilt->trx);
 
 	/* If the transaction is not started yet, start it */
 
-	trx_start_if_not_started(prebuilt->trx);
+	trx_start_if_not_started_xa(prebuilt->trx);
 
 	/* Assign a read view if the transaction does not have it yet */
 
@@ -2521,126 +3090,11 @@ ha_innobase::init_table_handle_for_HANDLER(void)
 	reset_template();
 }
 
-#ifdef HAVE_REPLICATION
-/* The last read master log coordinates in the slave info file */
-static char	master_log_fname[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN] = "";
-static int	master_log_pos;
-/* The slave relay log coordinates in the slave info file after startup */
-static char	original_relay_log_fname[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN] = "";
-static int	original_relay_log_pos;
-/* The master log coordinates in the slave info file after startup */
-static char	original_master_log_fname[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN] = "";
-static int	original_master_log_pos;
-#endif
-
-/*****************************************************************//**
-Overwrites the MySQL relay log info file with the current master and relay log
-coordinates from InnoDB.  Skips overwrite if the master log position did not
-change from the last overwrite.  If the InnoDB master log position is equal
-to position that was read from the info file on startup before any overwrites,
-restores the original positions. */
-static
-void
-innobase_do_overwrite_relay_log_info(void)
-/*======================================*/
-{
-#ifdef HAVE_REPLICATION
-	char	info_fname[FN_REFLEN];
-	File	info_fd = -1;
-	int	error	= 0;
-	char	buff[FN_REFLEN*2+22*2+4];
-	char	*relay_info_log_pos;
-	size_t	buf_len;
-
-	if (master_log_fname[0] == '\0') {
-		fprintf(stderr,
-			"InnoDB: something wrong with relay-log.info. "
-			"InnoDB will not overwrite it.\n");
-		return;
-	}
-
-	if (strcmp(master_log_fname, trx_sys_mysql_master_log_name) == 0
-	    && master_log_pos == trx_sys_mysql_master_log_pos) {
-		fprintf(stderr,
-			"InnoDB: InnoDB and relay-log.info are synchronized. "
-			"InnoDB will not overwrite it.\n");
-		return;
-	}
-
-	/* If we overwrite the file back to the original master log position,
-	restore the original relay log position too.  This is required because
-	we might have rolled back a prepared transaction and restored the
-	original master log position from the InnoDB trx sys header, but the
-	corresponding relay log position points to an already-purged file. */
-	if (strcmp(original_master_log_fname, trx_sys_mysql_master_log_name)
-	    == 0
-	    && (original_master_log_pos	== trx_sys_mysql_master_log_pos)) {
-
-		strncpy(trx_sys_mysql_relay_log_name, original_relay_log_fname,
-			TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-		trx_sys_mysql_relay_log_pos = original_relay_log_pos;
-	}
-
-	fn_format(info_fname, relay_log_info_file, mysql_data_home, "",
-		  MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH);
-
-	if (access(info_fname, F_OK)) {
-		/* File does not exist */
-		error = 1;
-		goto skip_overwrite;
-	}
-
-	/* File exists */
-	info_fd = my_open(info_fname, O_RDWR|O_BINARY, MYF(MY_WME));
-	if (info_fd < 0) {
-		error = 1;
-		goto skip_overwrite;
-	}
-
-	relay_info_log_pos = strmov(buff, trx_sys_mysql_relay_log_name);
-	*relay_info_log_pos ++= '\n';
-	relay_info_log_pos = longlong2str(trx_sys_mysql_relay_log_pos,
-					  relay_info_log_pos, 10);
-	*relay_info_log_pos ++= '\n';
-	relay_info_log_pos = strmov(relay_info_log_pos,
-				    trx_sys_mysql_master_log_name);
-	*relay_info_log_pos ++= '\n';
-	relay_info_log_pos = longlong2str(trx_sys_mysql_master_log_pos,
-					  relay_info_log_pos, 10);
-	*relay_info_log_pos = '\n';
-
-	buf_len = (relay_info_log_pos - buff) + 1;
-	if (my_write(info_fd, (uchar *)buff, buf_len, MY_WME) != buf_len) {
-		error = 1;
-	} else if (my_sync(info_fd, MY_WME)) {
-		error = 1;
-	}
-
-	if (info_fd >= 0) {
-		my_close(info_fd, MYF(0));
-	}
-
-	strncpy(master_log_fname, trx_sys_mysql_relay_log_name,
-		TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-	master_log_pos = trx_sys_mysql_master_log_pos;
-
-skip_overwrite:
-	if (error) {
-		fprintf(stderr,
-			"InnoDB: ERROR: error occured during overwriting "
-			"relay-log.info.\n");
-	} else {
-		fprintf(stderr,
-			"InnoDB: relay-log.info was overwritten.\n");
-	}
-#endif
-}
-
-
 /****************************************************************//**
 Gives the file extension of an InnoDB single-table tablespace. */
 static const char* ha_innobase_exts[] = {
   ".ibd",
+  ".isl",
   NullS
 };
 
@@ -2658,40 +3112,46 @@ innobase_init(
 	bool		ret;
 	char		*default_path;
 	uint		format_id;
+	ulong		num_pll_degree;
 
 	DBUG_ENTER("innobase_init");
-        handlerton *innobase_hton= (handlerton *)p;
-        innodb_hton_ptr = innobase_hton;
-
-        innobase_hton->state = SHOW_OPTION_YES;
-        innobase_hton->db_type= DB_TYPE_INNODB;
-        innobase_hton->savepoint_offset=sizeof(trx_named_savept_t);
-        innobase_hton->close_connection=innobase_close_connection;
-        innobase_hton->savepoint_set=innobase_savepoint;
-        innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
-        innobase_hton->savepoint_release=innobase_release_savepoint;
-        innobase_hton->commit_ordered=innobase_commit_ordered;
-        innobase_hton->commit=innobase_commit;
-        innobase_hton->rollback=innobase_rollback;
-        innobase_hton->prepare=innobase_xa_prepare;
-        innobase_hton->recover=innobase_xa_recover;
-        innobase_hton->commit_by_xid=innobase_commit_by_xid;
-        innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+	handlerton *innobase_hton= (handlerton*) p;
+	innodb_hton_ptr = innobase_hton;
+
+	innobase_hton->state = SHOW_OPTION_YES;
+	innobase_hton->db_type= DB_TYPE_INNODB;
+	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+	innobase_hton->close_connection = innobase_close_connection;
+	innobase_hton->savepoint_set = innobase_savepoint;
+	innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+	innobase_hton->savepoint_release = innobase_release_savepoint;
+	innobase_hton->commit_ordered=innobase_commit_ordered;
+	innobase_hton->commit = innobase_commit;
+	innobase_hton->rollback = innobase_rollback;
+	innobase_hton->prepare = innobase_xa_prepare;
+	innobase_hton->recover = innobase_xa_recover;
+	innobase_hton->commit_by_xid = innobase_commit_by_xid;
+	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
         innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
         innobase_hton->checkpoint_state= innobase_checkpoint_state;
-        innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
-        innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
-        innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
-        innobase_hton->create=innobase_create_handler;
-        innobase_hton->drop_database=innobase_drop_database;
-        innobase_hton->panic=innobase_end;
-        innobase_hton->start_consistent_snapshot=innobase_start_trx_and_assign_read_view;
-        innobase_hton->flush_logs=innobase_flush_logs;
-        innobase_hton->show_status=innobase_show_status;
-        innobase_hton->flags=HTON_EXTENDED_KEYS;
-        innobase_hton->release_temporary_latches=innobase_release_temporary_latches;
-	innobase_hton->alter_table_flags = innobase_alter_table_flags;
-        innobase_hton->kill_query = innobase_kill_query;
+	innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
+	innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
+	innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
+	innobase_hton->create = innobase_create_handler;
+	innobase_hton->drop_database = innobase_drop_database;
+	innobase_hton->panic = innobase_end;
+
+	innobase_hton->start_consistent_snapshot =
+		innobase_start_trx_and_assign_read_view;
+
+	innobase_hton->flush_logs = innobase_flush_logs;
+	innobase_hton->show_status = innobase_show_status;
+	innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS;
+
+	innobase_hton->release_temporary_latches =
+		innobase_release_temporary_latches;
+
+	innobase_hton->kill_query = innobase_kill_connection;
 
         if (srv_file_per_table)
           innobase_hton->tablefile_extensions = ha_innobase_exts;
@@ -2701,64 +3161,39 @@ innobase_init(
 #ifndef DBUG_OFF
 	static const char	test_filename[] = "-@";
 	char			test_tablename[sizeof test_filename
-				+ sizeof srv_mysql50_table_name_prefix];
-	if ((sizeof test_tablename) - 1
-			!= filename_to_tablename(test_filename, test_tablename,
-			sizeof test_tablename, true)
+				+ sizeof(srv_mysql50_table_name_prefix) - 1];
+	if ((sizeof(test_tablename)) - 1
+			!= filename_to_tablename(test_filename,
+						 test_tablename,
+						 sizeof(test_tablename), true)
 			|| strncmp(test_tablename,
-			srv_mysql50_table_name_prefix,
-			sizeof srv_mysql50_table_name_prefix)
+				   srv_mysql50_table_name_prefix,
+				   sizeof(srv_mysql50_table_name_prefix) - 1)
 			|| strcmp(test_tablename
-			+ sizeof srv_mysql50_table_name_prefix,
-			test_filename)) {
-		sql_print_error("tablename encoding has been changed");
-		goto error;
-	}
-#endif /* DBUG_OFF */
-
-	srv_page_size = 0;
-	srv_page_size_shift = 0;
+				  + sizeof(srv_mysql50_table_name_prefix) - 1,
+				  test_filename)) {
 
-	if (innobase_page_size != (1 << 14)) {
-		uint n_shift;
-
-		fprintf(stderr,
-			"InnoDB: Warning: innodb_page_size has been changed from default value 16384. (###EXPERIMENTAL### operation)\n");
-		for (n_shift = 12; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; n_shift++) {
-			if (innobase_page_size == ((ulong)1 << n_shift)) {
-				srv_page_size_shift = n_shift;
-				srv_page_size = (1 << srv_page_size_shift);
-				fprintf(stderr,
-					"InnoDB: The universal page size of the database is set to %lu.\n",
-					srv_page_size);
-				break;
-			}
-		}
-	} else {
-		srv_page_size_shift = 14;
-		srv_page_size = (1 << srv_page_size_shift);
-	}
+		sql_print_error("tablename encoding has been changed");
 
-	if (!srv_page_size_shift) {
-		fprintf(stderr,
-			"InnoDB: Error: %lu is not a valid value for innodb_page_size.\n"
-			"InnoDB: Error: Valid values are 4096, 8192, and 16384 (default=16384).\n",
-			innobase_page_size);
 		goto error;
 	}
+#endif /* DBUG_OFF */
 
 	srv_log_block_size = 0;
 	if (innobase_log_block_size != (1 << 9)) { /*!=512*/
 		uint	n_shift;
 
 		fprintf(stderr,
-			"InnoDB: Warning: innodb_log_block_size has been changed from default value 512. (###EXPERIMENTAL### operation)\n");
-		for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; n_shift++) {
+			"InnoDB: Warning: innodb_log_block_size has been "
+			"changed from default value 512. (###EXPERIMENTAL### "
+			"operation)\n");
+		for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX;
+		     n_shift++) {
 			if (innobase_log_block_size == ((ulong)1 << n_shift)) {
 				srv_log_block_size = (1 << n_shift);
 				fprintf(stderr,
-					"InnoDB: The log block size is set to %lu.\n",
-					srv_log_block_size);
+					"InnoDB: The log block size is set to "
+					ULINTPF ".\n",srv_log_block_size);
 				break;
 			}
 		}
@@ -2769,111 +3204,15 @@ innobase_init(
 
 	if (!srv_log_block_size) {
 		fprintf(stderr,
-			"InnoDB: Error: %lu is not a valid value for innodb_log_block_size.\n"
-			"InnoDB: Error: A valid value for innodb_log_block_size is\n"
+			"InnoDB: Error: %lu is not a valid value for "
+			"innodb_log_block_size.\n"
+			"InnoDB: Error: A valid value for "
+			"innodb_log_block_size is\n"
 			"InnoDB: Error: a power of 2 from 512 to 16384.\n",
 			innobase_log_block_size);
 		goto error;
 	}
 
-#ifndef MYSQL_SERVER
-	innodb_overwrite_relay_log_info = FALSE;
-#endif
-
-#ifdef HAVE_REPLICATION
-#ifdef MYSQL_SERVER
-	/* read master log position from relay-log.info if exists */
-	char info_fname[FN_REFLEN];
-	char relay_log_fname[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
-	int relay_log_pos;
-	int info_fd;
-	IO_CACHE info_file;
-
-	info_fname[0] = '\0';
-
-	if(innobase_overwrite_relay_log_info) {
-
-	fprintf(stderr,
-		"InnoDB: Warning: innodb_overwrite_relay_log_info is enabled."
-		" Updates by other storage engines may not be synchronized.\n");
-
-	bzero((char*) &info_file, sizeof(info_file));
-	fn_format(info_fname, relay_log_info_file, mysql_data_home, "", 4+32);
-
-	int error=0;
-
-	if (!access(info_fname,F_OK)) {
-		/* exist */
-		if ((info_fd = my_open(info_fname, O_RDWR | O_BINARY,
-				       MYF(MY_WME))) < 0) {
-			error=1;
-		} else if (init_io_cache(&info_file, info_fd, IO_SIZE*2,
-					READ_CACHE, 0L, 0, MYF(MY_WME))) {
-			error=1;
-		}
-
-		if (error) {
-relay_info_error:
-			if (info_fd >= 0)
-				my_close(info_fd, MYF(0));
-			master_log_fname[0] = '\0';
-			goto skip_relay;
-		}
-	} else {
-		master_log_fname[0] = '\0';
-		goto skip_relay;
-	}
-
-	if (init_strvar_from_file(relay_log_fname, sizeof(relay_log_fname),
-				  &info_file, "")
-	    || /* dummy (it is relay-log) */ init_intvar_from_file(
-		    &relay_log_pos, &info_file, BIN_LOG_HEADER_SIZE)) {
-		end_io_cache(&info_file);
-		error=1;
-		goto relay_info_error;
-	}
-
-	fprintf(stderr,
-		"InnoDB: relay-log.info is detected.\n"
-		"InnoDB: relay log: position %u, file name %s\n",
-		relay_log_pos, relay_log_fname);
-
-	strncpy(trx_sys_mysql_relay_log_name, relay_log_fname,
-		TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-	trx_sys_mysql_relay_log_pos = (ib_int64_t) relay_log_pos;
-
-	strncpy(original_relay_log_fname, relay_log_fname,
-		TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-	original_relay_log_pos = relay_log_pos;
-
-	if (init_strvar_from_file(master_log_fname, sizeof(master_log_fname),
-				  &info_file, "")
-	    || init_intvar_from_file(&master_log_pos, &info_file, 0)) {
-		end_io_cache(&info_file);
-		error=1;
-		goto relay_info_error;
-	}
-
-	fprintf(stderr,
-		"InnoDB: master log: position %u, file name %s\n",
-		master_log_pos, master_log_fname);
-
-	strncpy(trx_sys_mysql_master_log_name, master_log_fname,
-		TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-	trx_sys_mysql_master_log_pos = (ib_int64_t) master_log_pos;
-
-	strncpy(original_master_log_fname, master_log_fname,
-		TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-	original_master_log_pos = master_log_pos;
-
-	end_io_cache(&info_file);
-	if (info_fd >= 0)
-		my_close(info_fd, MYF(0));
-	}
-skip_relay:
-#endif /* MYSQL_SERVER */
-#endif /* HAVE_REPLICATION */
-
 	/* Check that values don't overflow on 32-bit systems. */
 	if (sizeof(ulint) == 4) {
 		if (innobase_buffer_pool_size > UINT_MAX32) {
@@ -2883,17 +3222,9 @@ skip_relay:
 
 			goto error;
 		}
-
-		if (innobase_log_file_size > UINT_MAX32) {
-			sql_print_error(
-				"innobase_log_file_size can't be over 4GB"
-				" on 32-bit systems");
-
-			goto error;
-		}
 	}
 
-	os_innodb_umask = (ulint)my_umask;
+	os_innodb_umask = (ulint) my_umask;
 
 	/* First calculate the default path for innodb_data_home_dir etc.,
 	in case the user has not given any value.
@@ -2924,12 +3255,12 @@ skip_relay:
 	srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
 			 default_path);
 
-	/* Set default InnoDB data file size to 10 MB and let it be
+	/* Set default InnoDB data file size to 12 MB and let it be
 	auto-extending. Thus users can use InnoDB in >= 4.0 without having
 	to specify any startup options. */
 
 	if (!innobase_data_file_path) {
-		innobase_data_file_path = (char*) "ibdata1:10M:autoextend";
+		innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
 	}
 
 	/* Since InnoDB edits the argument in the next call, we make another
@@ -2949,49 +3280,57 @@ mem_free_and_error:
 		goto error;
 	}
 
-	srv_doublewrite_file = innobase_doublewrite_file;
-
-	srv_use_sys_stats_table = (ibool) innobase_use_sys_stats_table;
-
-#ifdef UNIV_DEBUG
-	srv_sys_stats_root_page = innobase_sys_stats_root_page;
-#endif
-
-	/* -------------- Log files ---------------------------*/
+	/* -------------- All log files ---------------------------*/
 
 	/* The default dir for log files is the datadir of MySQL */
 
-	if (!innobase_log_group_home_dir) {
-		innobase_log_group_home_dir = default_path;
+	if (!srv_log_group_home_dir) {
+		srv_log_group_home_dir = default_path;
 	}
 
 #ifdef UNIV_LOG_ARCHIVE
-	/* Since innodb_log_arch_dir has no relevance under MySQL,
-	starting from 4.0.6 we always set it the same as
-	innodb_log_group_home_dir: */
-
-	innobase_log_arch_dir = innobase_log_group_home_dir;
-
+	if (!innobase_log_arch_dir) {
+		innobase_log_arch_dir = srv_log_group_home_dir;
+	}
 	srv_arch_dir = innobase_log_arch_dir;
 #endif /* UNIG_LOG_ARCHIVE */
 
-	ret = (bool)
-		srv_parse_log_group_home_dirs(innobase_log_group_home_dir);
+	srv_normalize_path_for_win(srv_log_group_home_dir);
 
-	if (ret == FALSE || innobase_mirrored_log_groups != 1) {
-	  sql_print_error("syntax error in innodb_log_group_home_dir, or a "
-			  "wrong number of mirrored log groups");
+	if (strchr(srv_log_group_home_dir, ';')) {
+		sql_print_error("syntax error in innodb_log_group_home_dir");
+		goto mem_free_and_error;
+	}
 
+	if (innobase_mirrored_log_groups == 1) {
+		sql_print_warning(
+			"innodb_mirrored_log_groups is an unimplemented "
+			"feature and the variable will be completely "
+			"removed in a future version.");
+	}
+
+	if (innobase_mirrored_log_groups > 1) {
+		sql_print_error(
+		"innodb_mirrored_log_groups is an unimplemented feature and "
+		"the variable will be completely removed in a future version. "
+		"Using values other than 1 is not supported.");
 		goto mem_free_and_error;
 	}
 
+	if (innobase_mirrored_log_groups == 0) {
+		/* To throw a deprecation warning message when the option is
+		passed, the default was changed to '0' (as a workaround). Since
+		the only value accepted for this option is '1', reset it to 1 */
+		innobase_mirrored_log_groups = 1;
+	}
+
 	/* Validate the file format by animal name */
 	if (innobase_file_format_name != NULL) {
 
 		format_id = innobase_file_format_name_lookup(
 			innobase_file_format_name);
 
-		if (format_id > DICT_TF_FORMAT_MAX) {
+		if (format_id > UNIV_FORMAT_MAX) {
 
 			sql_print_error("InnoDB: wrong innodb_file_format.");
 
@@ -3017,12 +3356,12 @@ mem_free_and_error:
 	if (!innobase_file_format_check) {
 
 		/* Set the value to disable checking. */
-		srv_max_file_format_at_startup = DICT_TF_FORMAT_MAX + 1;
+		srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
 
 	} else {
 
 		/* Set the value to the lowest supported format. */
-		srv_max_file_format_at_startup = DICT_TF_FORMAT_MIN;
+		srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
 	}
 
 	/* Did the user specify a format name that we support?
@@ -3036,11 +3375,17 @@ mem_free_and_error:
 				"should be any value up to %s or its "
 				"equivalent numeric id",
 				trx_sys_file_format_id_to_name(
-					DICT_TF_FORMAT_MAX));
+					UNIV_FORMAT_MAX));
 
 		goto mem_free_and_error;
 	}
 
+	/* Remember stopword table name supplied at startup */
+	if (innobase_server_stopword_table) {
+		fts_server_stopword_table =
+			my_strdup(innobase_server_stopword_table,  MYF(0));
+	}
+
 	if (innobase_change_buffering) {
 		ulint	use;
 
@@ -3066,83 +3411,178 @@ innobase_change_buffering_inited_ok:
 	innobase_change_buffering = (char*)
 		innobase_change_buffering_values[ibuf_use];
 
+	/* Check that interdependent parameters have sane values. */
+	if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+		sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+				  " cannot be set higher than"
+				  " innodb_max_dirty_pages_pct.\n"
+				  "InnoDB: Setting"
+				  " innodb_max_dirty_pages_pct_lwm to %lu\n",
+				  srv_max_buf_pool_modified_pct);
+
+		srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+	}
+
+	if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+		if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+			/* Avoid overflow. */
+			srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+		} else {
+			/* The user has not set the value. We should
+			set it based on innodb_io_capacity. */
+			srv_max_io_capacity =
+				ut_max(2 * srv_io_capacity, 2000);
+		}
+
+	} else if (srv_max_io_capacity < srv_io_capacity) {
+		sql_print_warning("InnoDB: innodb_io_capacity"
+				  " cannot be set higher than"
+				  " innodb_io_capacity_max.\n"
+				  "InnoDB: Setting"
+				  " innodb_io_capacity to %lu\n",
+				  srv_max_io_capacity);
+
+		srv_io_capacity = srv_max_io_capacity;
+	}
+
+	if (!is_filename_allowed(srv_buf_dump_filename,
+				 strlen(srv_buf_dump_filename), FALSE)) {
+		sql_print_error("InnoDB: innodb_buffer_pool_filename"
+			" cannot have colon (:) in the file name.");
+		goto mem_free_and_error;
+	}
+
 	/* --------------------------------------------------*/
 
 	srv_file_flush_method_str = innobase_file_flush_method;
 
-	srv_n_log_groups = (ulint) innobase_mirrored_log_groups;
-	srv_n_log_files = (ulint) innobase_log_files_in_group;
-	srv_log_file_size = (ulint) innobase_log_file_size;
-
-	srv_thread_concurrency_timer_based =
-		(ibool) innobase_thread_concurrency_timer_based;
+	srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
 
 #ifdef UNIV_LOG_ARCHIVE
 	srv_log_archive_on = (ulint) innobase_log_archive;
 #endif /* UNIV_LOG_ARCHIVE */
+
+	/* Check that the value of system variable innodb_page_size was
+	set correctly.  Its value was put into srv_page_size. If valid,
+	return the associated srv_page_size_shift.*/
+	srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+	if (!srv_page_size_shift) {
+		sql_print_error("InnoDB: Invalid page size=%lu.\n",
+				srv_page_size);
+		goto mem_free_and_error;
+	}
+	if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: innodb-page-size has been changed"
+			" from the default value %d to %lu.\n",
+			UNIV_PAGE_SIZE_DEF, srv_page_size);
+	}
+
 	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
 
+	if (innobase_buffer_pool_instances == 0) {
+		innobase_buffer_pool_instances = 8;
+
+#if defined(__WIN__) && !defined(_WIN64)
+		if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
+			innobase_buffer_pool_instances
+				= ut_min(MAX_BUFFER_POOLS,
+					(long) (innobase_buffer_pool_size
+					/ (128 * 1024 * 1024)));
+		}
+#endif /* defined(__WIN__) && !defined(_WIN64) */
+	}
 	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
 	srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
 
-	if (innobase_buffer_pool_shm_key) {
+	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+	if (innobase_additional_mem_pool_size
+	    != 8*1024*1024L /* the default */ ) {
+
+		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"InnoDB: Warning: innodb_buffer_pool_shm_key is deprecated function.\n"
-			"InnoDB:          innodb_buffer_pool_shm_key was ignored.\n");
+			" InnoDB: Warning: Using "
+			"innodb_additional_mem_pool_size is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the option innodb_use_sys_malloc "
+			"and with the InnoDB's internal memory "
+			"allocator.\n");
 	}
 
-	if (srv_lazy_drop_table) {
+	if (!srv_use_sys_malloc ) {
+		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"InnoDB: Warning: "
-			"innodb_lazy_drop_table is deprecated and ignored.\n");
+			" InnoDB: Warning: Setting "
+			"innodb_use_sys_malloc to FALSE is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the InnoDB's internal memory "
+			"allocator.\n");
 	}
 
-	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
-
 	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
 	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
 	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
 
-	srv_read_ahead &= 3;
-	srv_adaptive_flushing_method %= 3;
-	srv_flush_neighbor_pages %= 3;
-
-	srv_force_recovery = (ulint) innobase_force_recovery;
-
-	srv_recovery_stats = (ibool) innobase_recovery_stats;
-
 	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
-	srv_use_checksums = (ibool) innobase_use_checksums;
-	srv_fast_checksum = (ibool) innobase_fast_checksum;
 
-	if (innobase_fast_checksum) {
+	if (!innobase_use_checksums) {
+		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"InnoDB: Warning: innodb_fast_checksum is DEPRECATED "
-			"and *WILL* be removed in Percona Server 5.6. Please "
-			"consult the Percona Server 5.6 documentation for "
-			"help in upgrading.\n");
+			" InnoDB: Warning: Setting "
+			"innodb_checksums to OFF is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"You should set innodb_checksum_algorithm=NONE "
+			"instead.\n");
+		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
 	}
 
-	srv_blocking_lru_restore = (ibool) innobase_blocking_lru_restore;
-
 #ifdef HAVE_LARGE_PAGES
-        if ((os_use_large_pages = (ibool) my_use_large_pages))
+	if ((os_use_large_pages = (ibool) my_use_large_pages)) {
 		os_large_page_size = (ulint) opt_large_page_size;
+	}
 #endif
 
 	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
 
 	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+	if (innobase_locks_unsafe_for_binlog) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Using "
+			"innodb_locks_unsafe_for_binlog is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"Please use READ COMMITTED transaction isolation "
+			"level instead, see " REFMAN "set-transaction.html.\n");
+	}
 
+	if (innobase_open_files < 10) {
+		innobase_open_files = 300;
+		if (srv_file_per_table && tc_size > 300) {
+			innobase_open_files = tc_size;
+		}
+	}
 	srv_max_n_open_files = (ulint) innobase_open_files;
 	srv_innodb_status = (ibool) innobase_create_status_file;
 
 	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
 
+	/* Round up fts_sort_pll_degree to nearest power of 2 number */
+	for (num_pll_degree = 1;
+	     num_pll_degree < fts_sort_pll_degree;
+	     num_pll_degree <<= 1) {
+
+		/* No op */
+	}
+
+	fts_sort_pll_degree = num_pll_degree;
+
 	/* Store the default charset-collation number of this MySQL
 	installation */
 
-	data_mysql_default_charset_coll = (ulint)default_charset_info->number;
+	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
 
 	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
 					my_charset_latin1.number);
@@ -3167,13 +3607,12 @@ innobase_change_buffering_inited_ok:
 #endif
 	srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
 	if (innobase_use_atomic_writes) {
-		fprintf(stderr, "InnoDB: using atomic writes.\n");
+		ib_logf(IB_LOG_LEVEL_INFO, "using atomic writes.");
 
 		/* Force doublewrite buffer off, atomic writes replace it. */
 		if (srv_use_doublewrite_buf) {
-			fprintf(stderr,
-				"InnoDB: Switching off doublewrite buffer "
-				"because of atomic writes.\n");
+			ib_logf(IB_LOG_LEVEL_INFO, "switching off doublewrite "
+				"buffer because of atomic writes.");
 			innobase_use_doublewrite = FALSE;
 			srv_use_doublewrite_buf	= FALSE;
 		}
@@ -3185,9 +3624,8 @@ innobase_change_buffering_inited_ok:
 		   !strstr(innobase_file_flush_method, "O_DIRECT")) {
 			innobase_file_flush_method =
 				srv_file_flush_method_str = (char*)"O_DIRECT";
-			fprintf(stderr,
-				"InnoDB: using O_DIRECT due to atomic "
-				"writes.\n");
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"using O_DIRECT due to atomic writes.");
 		}
 #endif
 #ifdef HAVE_POSIX_FALLOCATE
@@ -3200,46 +3638,38 @@ innobase_change_buffering_inited_ok:
 
 #ifdef HAVE_PSI_INTERFACE
 	/* Register keys with MySQL performance schema */
-	if (PSI_server) {
-		int	count;
+	int	count;
 
-                count = array_elements(all_pthread_mutexes);
-                PSI_server->register_mutex("innodb",
-                                           all_pthread_mutexes, count);
+	count = array_elements(all_pthread_mutexes);
+ 	mysql_mutex_register("innodb", all_pthread_mutexes, count);
 
 # ifdef UNIV_PFS_MUTEX
-		count = array_elements(all_innodb_mutexes);
-		PSI_server->register_mutex("innodb",
-					   all_innodb_mutexes, count);
+	count = array_elements(all_innodb_mutexes);
+	mysql_mutex_register("innodb", all_innodb_mutexes, count);
 # endif /* UNIV_PFS_MUTEX */
 
 # ifdef UNIV_PFS_RWLOCK
-		count = array_elements(all_innodb_rwlocks);
-		PSI_server->register_rwlock("innodb",
-					    all_innodb_rwlocks, count);
+	count = array_elements(all_innodb_rwlocks);
+	mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
 # endif /* UNIV_PFS_MUTEX */
 
 # ifdef UNIV_PFS_THREAD
-		count = array_elements(all_innodb_threads);
-		PSI_server->register_thread("innodb",
-					    all_innodb_threads, count);
+	count = array_elements(all_innodb_threads);
+	mysql_thread_register("innodb", all_innodb_threads, count);
 # endif /* UNIV_PFS_THREAD */
 
 # ifdef UNIV_PFS_IO
-		count = array_elements(all_innodb_files);
-		PSI_server->register_file("innodb",
-					  all_innodb_files, count);
+	count = array_elements(all_innodb_files);
+	mysql_file_register("innodb", all_innodb_files, count);
 # endif /* UNIV_PFS_IO */
 
-		count = array_elements(all_innodb_conds);
-		PSI_server->register_cond("innodb",
-					  all_innodb_conds, count);
-	}
+	count = array_elements(all_innodb_conds);
+	mysql_cond_register("innodb", all_innodb_conds, count);
 #endif /* HAVE_PSI_INTERFACE */
 
 	/* Since we in this module access directly the fields of a trx
 	struct, and due to different headers and flags it might happen that
-	mutex_t has a different size in this module and in InnoDB
+	ib_mutex_t has a different size in this module and in InnoDB
 	modules, we check at run time that the size is the same in
 	these compilation modules. */
 
@@ -3249,13 +3679,14 @@ innobase_change_buffering_inited_ok:
 		goto mem_free_and_error;
 	}
 
-	if(innobase_overwrite_relay_log_info) {
-		innobase_do_overwrite_relay_log_info();
-	}
+	/* Adjust the innodb_undo_logs config object */
+	innobase_undo_logs_init_default_max();
 
 	innobase_old_blocks_pct = buf_LRU_old_ratio_update(
 		innobase_old_blocks_pct, TRUE);
 
+	ibuf_max_size_update(innobase_change_buffer_max_size);
+
 	innobase_open_tables = hash_create(200);
 	mysql_mutex_init(innobase_share_mutex_key,
 			 &innobase_share_mutex,
@@ -3277,6 +3708,22 @@ innobase_change_buffering_inited_ok:
 	/* Get the current high water mark format. */
 	innobase_file_format_max = (char*) trx_sys_file_format_max_get();
 
+	/* Currently, monitor counter information is not persistent. */
+	memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
+
+	memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+	/* Do this as late as possible so that the server has fully
+	started up, since we might get some initial stats if the user
+	chooses to turn on some counters from start up */
+	if (innobase_enable_monitor_counter) {
+		innodb_enable_monitor_at_startup(
+			innobase_enable_monitor_counter);
+	}
+
+	/* Turn on monitor counters that are default on */
+	srv_mon_default_on();
+
 	DBUG_RETURN(FALSE);
 error:
 	DBUG_RETURN(TRUE);
@@ -3301,6 +3748,7 @@ innobase_end(
 	if (innodb_inited) {
 
 		srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+
 		innodb_inited = 0;
 		hash_table_free(innobase_open_tables);
 		innobase_open_tables = NULL;
@@ -3333,28 +3781,13 @@ innobase_flush_logs(
 	DBUG_ENTER("innobase_flush_logs");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
-	log_buffer_flush_to_disk();
+	if (!srv_read_only_mode) {
+		log_buffer_flush_to_disk();
+	}
 
 	DBUG_RETURN(result);
 }
 
-/****************************************************************//**
-Return alter table flags supported in an InnoDB database. */
-static
-uint
-innobase_alter_table_flags(
-/*=======================*/
-	uint	flags)
-{
-	return(HA_INPLACE_ADD_INDEX_NO_READ_WRITE
-		| HA_INPLACE_ADD_INDEX_NO_WRITE
-		| HA_INPLACE_DROP_INDEX_NO_READ_WRITE
-		| HA_INPLACE_ADD_UNIQUE_INDEX_NO_READ_WRITE
-		| HA_INPLACE_ADD_UNIQUE_INDEX_NO_WRITE
-		| HA_INPLACE_DROP_UNIQUE_INDEX_NO_READ_WRITE
-		| HA_INPLACE_ADD_PK_INDEX_NO_READ_WRITE);
-}
-
 /************************************************************//**
 Synchronously read and parse the redo log up to the last
 checkpoint to write the changed page bitmap.
@@ -3401,33 +3834,6 @@ innobase_is_fake_change(
 	return UNIV_UNLIKELY(trx->fake_changes);
 }
 
-
-/****************************************************************//**
-Copy the current replication position from MySQL to a transaction. */
-static
-void
-innobase_copy_repl_coords_to_trx(
-/*=============================*/
-	const THD*	thd,	/*!< in: thread handle */
-	trx_t*		trx)	/*!< in/out: transaction */
-{
-        if (thd && thd_is_replication_slave_thread(thd)) {
-        /* Update the replication position info inside InnoDB.
-           In embedded server, does nothing. */
-                const char *log_file_name, *group_relay_log_name;
-                ulonglong log_pos, relay_log_pos;
-                bool res = rpl_get_position_info(&log_file_name, &log_pos,
-                                                 &group_relay_log_name,
-                                                 &relay_log_pos);
-                if (res) {
-                        trx->mysql_master_log_file_name = log_file_name;
-                        trx->mysql_master_log_pos = (ib_int64_t)log_pos;
-                        trx->mysql_relay_log_file_name = group_relay_log_name;
-                        trx->mysql_relay_log_pos = (ib_int64_t)relay_log_pos;
-                }
-        }
-}
-
 /*****************************************************************//**
 Commits a transaction in an InnoDB database. */
 static
@@ -3437,35 +3843,6 @@ innobase_commit_low(
 	trx_t*	trx)	/*!< in: transaction handle */
 {
 	if (trx_is_started(trx)) {
-#ifdef HAVE_REPLICATION
-#ifdef MYSQL_SERVER
-		THD *thd=current_thd;
-
-		if (innobase_overwrite_relay_log_info &&
-                    thd && thd_is_replication_slave_thread(thd) &&
-                    thd->connection_name.length) {
-		/* Update the replication position info inside InnoDB.
-		   In embedded server, does nothing. */
-			const char *log_file_name, *group_relay_log_name;
-			ulonglong log_pos, relay_log_pos;
-			bool res = rpl_get_position_info(&log_file_name, &log_pos,
-							 &group_relay_log_name,
-							 &relay_log_pos);
-			if (res) {
-				trx->mysql_master_log_file_name = log_file_name;
-				trx->mysql_master_log_pos = (ib_int64_t)log_pos;
-				trx->mysql_relay_log_file_name = group_relay_log_name;
-				trx->mysql_relay_log_pos = (ib_int64_t)relay_log_pos;
-			}
-		}
-#endif /* MYSQL_SERVER */
-#endif /* HAVE_REPLICATION */
-
-		/* Save the current replication position for write to trx sys
-		header for undo purposes, see the comment at corresponding call
-		at innobase_xa_prepare(). */
-
-		innobase_copy_repl_coords_to_trx((THD *) trx->mysql_thd, trx);
 
 		trx_commit_for_mysql(trx);
 	}
@@ -3481,9 +3858,9 @@ static
 int
 innobase_start_trx_and_assign_read_view(
 /*====================================*/
-        handlerton *hton, /*!< in: Innodb handlerton */
-	THD*	thd)	/*!< in: MySQL thread handle of the user for whom
-			the transaction should be committed */
+	handlerton*	hton,	/*!< in: Innodb handlerton */
+	THD*		thd)	/*!< in: MySQL thread handle of the user for
+				whom the transaction should be committed */
 {
 	trx_t*	trx;
 
@@ -3495,15 +3872,17 @@ innobase_start_trx_and_assign_read_view(
 	trx = check_trx_exists(thd);
 
 	/* This is just to play safe: release a possible FIFO ticket and
-	search latch. Since we will reserve the kernel mutex, we have to
-	release the search system latch first to obey the latching order. */
+	search latch. Since we can potentially reserve the trx_sys->mutex,
+	we have to release the search system latch first to obey the latching
+	order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If the transaction is not started yet, start it */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Assign a read view if the transaction does not have it yet.
 	Do this only if transaction is using REPEATABLE READ isolation
@@ -3514,7 +3893,7 @@ innobase_start_trx_and_assign_read_view(
 	if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
 		trx_assign_read_view(trx);
 	} else {
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    HA_ERR_UNSUPPORTED,
 				    "InnoDB: WITH CONSISTENT SNAPSHOT "
 				    "was ignored because this phrase "
@@ -3536,13 +3915,9 @@ innobase_commit_ordered_2(
 	trx_t*	trx, 	/*!< in: Innodb transaction */
 	THD*	thd)	/*!< in: MySQL thread handle */
 {
-	ulonglong tmp_pos;
-	DBUG_ENTER("innobase_commit_ordered");
-
-	/* We need current binlog position for ibbackup to work.
-	Note, the position is current because commit_ordered is guaranteed
-	to be called in same sequenece as writing to binlog. */
+	DBUG_ENTER("innobase_commit_ordered_2");
 
+	/* We need current binlog position for ibbackup to work. */
 retry:
 	if (innobase_commit_concurrency > 0) {
 		mysql_mutex_lock(&commit_cond_m);
@@ -3551,7 +3926,7 @@ retry:
 		if (commit_threads > innobase_commit_concurrency) {
 			commit_threads--;
 			mysql_cond_wait(&commit_cond,
-					  &commit_cond_m);
+				&commit_cond_m);
 			mysql_mutex_unlock(&commit_cond_m);
 			goto retry;
 		}
@@ -3560,12 +3935,23 @@ retry:
 		}
 	}
 
-	mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name));
-	trx->mysql_log_offset = (ib_int64_t) tmp_pos;
+	/* The following call reads the binary log position of
+	the transaction being committed.
 
+        Binary logging of other engines is not relevant to
+	InnoDB as all InnoDB requires is that committing
+	InnoDB transactions appear in the same order in the
+	MySQL binary log as they appear in InnoDB logs, which
+	is guaranteed by the server.
+
+        If the binary log is not enabled, or the transaction
+        is not written to the binary log, the file name will
+        be a NULL pointer. */
+        unsigned long long pos;
+        thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos);
+        trx->mysql_log_offset= static_cast<ib_int64_t>(pos);
 	/* Don't do write + flush right now. For group commit
-	   to work we want to do the flush in the innobase_commit()
-	   method, which runs without holding any locks. */
+	to work we want to do the flush later. */
 	trx->flush_log_later = TRUE;
 	innobase_commit_low(trx);
 	trx->flush_log_later = FALSE;
@@ -3577,7 +3963,8 @@ retry:
 		mysql_mutex_unlock(&commit_cond_m);
 	}
 
-	DBUG_VOID_RETURN;
+	/* Write + flush of logs is not done here (flush_log_later). */
+        DBUG_VOID_RETURN;
 }
 
 /*****************************************************************//**
@@ -3628,6 +4015,7 @@ innobase_commit_ordered(
 	innobase_commit_ordered_2(trx, thd);
 
 	trx_set_active_commit_ordered(trx);
+
 	DBUG_VOID_RETURN;
 }
 
@@ -3639,11 +4027,13 @@ static
 int
 innobase_commit(
 /*============*/
-        handlerton *hton, /*!< in: Innodb handlerton */
-	THD* 	thd,	/*!< in: MySQL thread handle of the user for whom
-			the transaction should be committed */
-	bool	all)	/*!< in:	TRUE - commit transaction
-				FALSE - the current SQL statement ended */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: MySQL thread handle of the
+					user for whom the transaction should
+					be committed */
+	bool		commit_trx)	/*!< in: true - commit transaction
+					false - the current SQL statement
+					ended */
 {
 	trx_t*		trx;
 
@@ -3653,18 +4043,21 @@ innobase_commit(
 
 	trx = check_trx_exists(thd);
 
-	/* Since we will reserve the kernel mutex, we have to release
+	/* Since we will reserve the trx_sys->mutex, we have to release
 	the search system latch first to obey the latching order. */
 
 	/* No-op in XtraDB */
 	trx_search_latch_release_if_reserved(trx);
 
-	if (UNIV_UNLIKELY(trx->fake_changes
-			  && (all || (!thd_test_options(thd,
+	if (UNIV_UNLIKELY(trx->fake_changes &&
+	    (commit_trx ||
+	     (!thd_test_options(thd,
 				OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))))) {
 
-		innobase_rollback(hton, thd, all); /* rollback implicitly */
-		thd->stmt_da->reset_diagnostics_area(); /* because debug assertion code complains, if something left */
+		/* rollback implicitly */
+		innobase_rollback(hton, thd, commit_trx);
+		/* because debug assertion code complains, if something left */
+		thd->get_stmt_da()->reset_diagnostics_area();
 		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 	}
 	/* Transaction is deregistered only in a commit or a rollback. If
@@ -3678,7 +4071,7 @@ innobase_commit(
 				"but transaction is active");
 	}
 
-	if (all
+	if (commit_trx
 	    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
 
 		DBUG_EXECUTE_IF("crash_innodb_before_commit",
@@ -3697,8 +4090,6 @@ innobase_commit(
 		this one, to allow then to group commit with us. */
 		thd_wakeup_subsequent_commits(thd, 0);
 
-		/* We did the first part already in innobase_commit_ordered(),
-		Now finish by doing a write + flush of logs. */
 		trx_commit_complete_for_mysql(trx);
                 trx_deregister_from_2pc(trx);
 	} else {
@@ -3708,7 +4099,7 @@ innobase_commit(
 		/* If we had reserved the auto-inc lock for some
 		table in this SQL statement we release it now */
 
-		row_unlock_table_autoinc_for_mysql(trx);
+		lock_unlock_table_autoinc(trx);
 
 		/* Store the current undo_no of the transaction so that we
 		know where to roll back if we have to roll back the next
@@ -3719,11 +4110,10 @@ innobase_commit(
 
 	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
 
-	if (trx->declared_to_be_inside_innodb) {
-		/* Release our possible ticket in the FIFO */
+	/* This is a statement level variable. */
+	trx->fts_next_doc_id = 0;
 
-		srv_conc_force_exit_innodb(trx);
-	}
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Tell the InnoDB server that there might be work for utility
 	threads: */
@@ -3739,13 +4129,15 @@ static
 int
 innobase_rollback(
 /*==============*/
-        handlerton *hton, /*!< in: Innodb handlerton */ 
-	THD*	thd,	/*!< in: handle to the MySQL thread of the user
-			whose transaction should be rolled back */
-	bool	all)	/*!< in:	TRUE - commit transaction
-				FALSE - the current SQL statement ended */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back */
+	bool		rollback_trx)	/*!< in: TRUE - roll back entire
+					transaction; FALSE - roll back the
+					current statement only */
 {
-	int	error = 0;
+	dberr_t	error;
 	trx_t*	trx;
 
 	DBUG_ENTER("innobase_rollback");
@@ -3755,11 +4147,12 @@ innobase_rollback(
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
 
@@ -3767,9 +4160,12 @@ innobase_rollback(
 	we come here to roll back the latest SQL statement) we
 	release it now before a possibly lengthy rollback */
 
-	row_unlock_table_autoinc_for_mysql(trx);
+	lock_unlock_table_autoinc(trx);
+
+	/* This is a statement level variable. */
+	trx->fts_next_doc_id = 0;
 
-	if (all
+	if (rollback_trx
 	    || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 
 		error = trx_rollback_for_mysql(trx);
@@ -3790,25 +4186,28 @@ innobase_rollback_trx(
 /*==================*/
 	trx_t*	trx)	/*!< in: transaction */
 {
-	int	error = 0;
+	dberr_t	error = DB_SUCCESS;
 
 	DBUG_ENTER("innobase_rollback_trx");
 	DBUG_PRINT("trans", ("aborting transaction"));
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If we had reserved the auto-inc lock for some table (if
 	we come here to roll back the latest SQL statement) we
 	release it now before a possibly lengthy rollback */
 
-	row_unlock_table_autoinc_for_mysql(trx);
+	lock_unlock_table_autoinc(trx);
 
-	error = trx_rollback_for_mysql(trx);
+	if (!trx->read_only) {
+		error = trx_rollback_for_mysql(trx);
+	}
 
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
@@ -3886,7 +4285,7 @@ innobase_checkpoint_request(
 Log code calls this whenever log has been written and/or flushed up
 to a new position. We use this to notify upper layer of a new commit
 checkpoint when necessary.*/
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_mysql_log_notify(
 /*===============*/
@@ -3962,13 +4361,14 @@ static
 int
 innobase_rollback_to_savepoint(
 /*===========================*/
-        handlerton *hton,       /*!< in: Innodb handlerton */ 
-	THD*	thd,		/*!< in: handle to the MySQL thread of the user
-				whose transaction should be rolled back */
-	void*	savepoint)	/*!< in: savepoint data */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back to savepoint */
+	void*		savepoint)	/*!< in: savepoint data */
 {
 	ib_int64_t	mysql_binlog_cache_pos;
-	int		error = 0;
+	dberr_t		error;
 	trx_t*		trx;
 	char		name[64];
 
@@ -3978,18 +4378,24 @@ innobase_rollback_to_savepoint(
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint) savepoint, name, 36);
+
+	error = trx_rollback_to_savepoint_for_mysql(
+		trx, name, &mysql_binlog_cache_pos);
+
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_rollback(trx, name);
+	}
 
-	error = (int) trx_rollback_to_savepoint_for_mysql(trx, name,
-						&mysql_binlog_cache_pos);
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
@@ -4001,12 +4407,13 @@ static
 int
 innobase_release_savepoint(
 /*=======================*/
-        handlerton*	hton,	/*!< in: handlerton for Innodb */
-	THD*	thd,		/*!< in: handle to the MySQL thread of the user
-				whose transaction should be rolled back */
-	void*	savepoint)	/*!< in: savepoint data */
+	handlerton*	hton,		/*!< in: handlerton for Innodb */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction's
+					savepoint should be released */
+	void*		savepoint)	/*!< in: savepoint data */
 {
-	int		error = 0;
+	dberr_t		error;
 	trx_t*		trx;
 	char		name[64];
 
@@ -4017,9 +4424,13 @@ innobase_release_savepoint(
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint) savepoint, name, 36);
+
+	error = trx_release_savepoint_for_mysql(trx, name);
 
-	error = (int) trx_release_savepoint_for_mysql(trx, name);
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_release(trx, name);
+	}
 
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
@@ -4031,43 +4442,42 @@ static
 int
 innobase_savepoint(
 /*===============*/
-	handlerton*	hton,   /*!< in: handle to the Innodb handlerton */
+	handlerton*	hton,	/*!< in: handle to the Innodb handlerton */
 	THD*	thd,		/*!< in: handle to the MySQL thread */
 	void*	savepoint)	/*!< in: savepoint data */
 {
-	int	error = 0;
+	dberr_t	error;
 	trx_t*	trx;
 
 	DBUG_ENTER("innobase_savepoint");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
-	/*
-	  In the autocommit mode there is no sense to set a savepoint
-	  (unless we are in sub-statement), so SQL layer ensures that
-	  this method is never called in such situation.
-	*/
-#ifdef MYSQL_SERVER /* plugins cannot access thd->in_sub_stmt */
-	DBUG_ASSERT(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) ||
-		thd->in_sub_stmt);
-#endif /* MYSQL_SERVER */
+	/* In autocommit mode it makes no sense to set a savepoint
+	(unless we are in a sub-statement), so the SQL layer ensures
+	that this method is never called in such a situation. */
 
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Cannot happen outside of transaction */
 	DBUG_ASSERT(trx_is_registered_for_2pc(trx));
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 	char name[64];
-	longlong2str((ulint)savepoint,name,36);
+	longlong2str((ulint) savepoint,name,36);
 
-	error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+	error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_take(trx, name);
+	}
 
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
@@ -4079,9 +4489,9 @@ static
 int
 innobase_close_connection(
 /*======================*/
-        handlerton*	hton,	/*!< in:  innobase handlerton */
-	THD*	thd)	/*!< in: handle to the MySQL thread of the user
-			whose resources should be free'd */
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd)	/*!< in: handle to the MySQL thread of the user
+				whose resources should be free'd */
 {
 	trx_t*	trx;
 
@@ -4097,14 +4507,13 @@ innobase_close_connection(
 				"but transaction is active");
 	}
 
-
 	if (trx_is_started(trx) && global_system_variables.log_warnings) {
 
 		sql_print_warning(
 			"MySQL is closing a connection that has an active "
-			"InnoDB transaction.  %llu row modifications will "
-			"roll back.",
-			(ullint) trx->undo_no);
+			"InnoDB transaction.  "TRX_ID_FMT" row modifications "
+			"will roll back.",
+			trx->undo_no);
 	}
 
 	innobase_rollback_trx(trx);
@@ -4115,31 +4524,22 @@ innobase_close_connection(
 }
 
 /*****************************************************************//**
-Cancel any pending lock request associated with the current THD. */
-static
-void
-innobase_kill_query(
-/*======================*/
-        handlerton*	hton,	    /*!< in: innobase handlerton */
-	THD*	thd,	            /*!< in: MySQL thread being killed */
-        enum thd_kill_levels level) /*!< in: kill level */
+Frees a possible InnoDB trx object associated with the current THD.
+@return	0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+	THD*		thd)	/*!< in: handle to the MySQL thread of the user
+				whose resources should be free'd */
 {
-	trx_t*	trx;
-	DBUG_ENTER("innobase_kill_query");
-	DBUG_ASSERT(hton == innodb_hton_ptr);
-
-	mutex_enter(&kernel_mutex);
-
-	trx = thd_to_trx(thd);
+	trx_t*	trx = thd_to_trx(thd);
 
-	/* Cancel a pending lock request. */
-	if (trx && trx->wait_lock) {
-		lock_cancel_waiting_and_release(trx->wait_lock);
+	if (!trx) {
+		return(0);
 	}
 
-	mutex_exit(&kernel_mutex);
-
-	DBUG_VOID_RETURN;
+	return(innobase_close_connection(innodb_hton_ptr, thd));
 }
 
 /*************************************************************************//**
@@ -4158,30 +4558,56 @@ ha_innobase::get_row_type() const
 	if (prebuilt && prebuilt->table) {
 		const ulint	flags = prebuilt->table->flags;
 
-		if (UNIV_UNLIKELY(!flags)) {
+		switch (dict_tf_get_rec_format(flags)) {
+		case REC_FORMAT_REDUNDANT:
 			return(ROW_TYPE_REDUNDANT);
-		}
-
-		ut_ad(flags & DICT_TF_COMPACT);
-
-		switch (flags & DICT_TF_FORMAT_MASK) {
-		case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+		case REC_FORMAT_COMPACT:
 			return(ROW_TYPE_COMPACT);
-		case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT:
-			if (flags & DICT_TF_ZSSIZE_MASK) {
-				return(ROW_TYPE_COMPRESSED);
-			} else {
-				return(ROW_TYPE_DYNAMIC);
-			}
-#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
-# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
-#endif
+		case REC_FORMAT_COMPRESSED:
+			return(ROW_TYPE_COMPRESSED);
+		case REC_FORMAT_DYNAMIC:
+			return(ROW_TYPE_DYNAMIC);
 		}
 	}
 	ut_ad(0);
 	return(ROW_TYPE_NOT_USED);
 }
 
+/*****************************************************************//**
+Cancel the pending lock wait, if any, of the trx attached to the given THD. */
+static
+void
+innobase_kill_connection(
+/*=====================*/
+        handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*	thd,	/*!< in: handle to the MySQL thread being killed */
+        thd_kill_levels)	/*!< in: kill level (unnamed, not used) */
+{
+	trx_t*	trx;
+
+	DBUG_ENTER("innobase_kill_connection");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	lock_mutex_enter();
+
+	trx = thd_to_trx(thd);
+
+	if (trx)
+	{
+		trx_mutex_enter(trx);
+
+		/* Cancel the lock request the trx is waiting for, if any. */
+		if (trx->lock.wait_lock)
+			lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+		trx_mutex_exit(trx);
+	}
+
+	lock_mutex_exit();
+
+	DBUG_VOID_RETURN;
+}
+
 
 
 /****************************************************************//**
@@ -4192,12 +4618,26 @@ handler::Table_flags
 ha_innobase::table_flags() const
 /*============================*/
 {
-       /* Need to use tx_isolation here since table flags is (also)
-          called before prebuilt is inited. */
-        ulong const tx_isolation = thd_tx_isolation(ha_thd());
-        if (tx_isolation <= ISO_READ_COMMITTED)
-                return int_table_flags;
-        return int_table_flags | HA_BINLOG_STMT_CAPABLE;
+	/* Need to use tx_isolation here since table flags is (also)
+	called before prebuilt is inited. */
+	ulong const tx_isolation = thd_tx_isolation(ha_thd());
+
+	if (tx_isolation <= ISO_READ_COMMITTED) {
+		return(int_table_flags);
+	}
+
+	return(int_table_flags | HA_BINLOG_STMT_CAPABLE);
+}
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return	table type, i.e. innobase_hton_name */
+UNIV_INTERN
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+	return(innobase_hton_name);
+}
 
 /****************************************************************//**
@@ -4206,10 +4646,15 @@ UNIV_INTERN
 const char*
 ha_innobase::index_type(
 /*====================*/
-	uint)
-				/*!< out: index type */
+	uint	keynr)		/*!< in: index number */
 {
-	return("BTREE");
+	dict_index_t*	index = innobase_get_index(keynr);
+
+	if (index && index->type & DICT_FTS) {
+		return("FULLTEXT");
+	} else {
+		return("BTREE");
+	}
 }
 
 /****************************************************************//**
@@ -4219,16 +4664,16 @@ UNIV_INTERN
 ulong
 ha_innobase::index_flags(
 /*=====================*/
-	uint index,
-	uint part,
-	bool all_parts)
-const
+	uint	key,
+	uint,
+	bool) const
 {
-       ulong extra_flag= 0;
-       if (table && index == table->s->primary_key)
-             extra_flag= HA_CLUSTERED_INDEX;
-	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | extra_flag 
-	       | HA_READ_RANGE | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
+	return((table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT)
+		 ? 0
+		 : (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+		  | HA_READ_RANGE | HA_KEYREAD_ONLY
+                  | (key == table_share->primary_key ? HA_CLUSTERED_INDEX : 0)
+		  | HA_DO_INDEX_COND_PUSHDOWN));
 }
 
 /****************************************************************//**
@@ -4251,11 +4696,25 @@ ha_innobase::max_supported_key_length() const
 /*=========================================*/
 {
 	/* An InnoDB page must store >= 2 keys; a secondary key record
-	must also contain the primary key value: max key length is
-	therefore set to slightly less than 1 / 4 of page size which
-	is 16 kB; but currently MySQL does not work with keys whose
-	size is > MAX_KEY_LENGTH */
-	return(3500);
+	must also contain the primary key value.  Therefore, if both
+	the primary key and the secondary key are at this maximum length,
+	it must be less than 1/4th of the free space on a page including
+	record overhead.
+
+	MySQL imposes its own limit to this number; MAX_KEY_LENGTH = 3072.
+
+	For page sizes = 16k, InnoDB historically reported 3500 bytes here,
+	but the MySQL limit of 3072 was always used through the handler
+	interface. */
+
+	switch (UNIV_PAGE_SIZE) {
+	case 4096:
+		return(768);
+	case 8192:
+		return(1536);
+	default:
+		return(3500);
+	}
 }
 
 /****************************************************************//**
@@ -4264,6 +4723,7 @@ Returns the key map of keys that are usable for scanning.
 UNIV_INTERN
 const key_map*
 ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
 {
 	return(&key_map_full);
 }
@@ -4274,6 +4734,7 @@ Determines if table caching is supported.
 UNIV_INTERN
 uint8
 ha_innobase::table_cache_type()
+/*===========================*/
 {
 	return(HA_CACHE_TBL_ASKTRANSACT);
 }
@@ -4284,23 +4745,15 @@ Determines if the primary key is clustered index.
 UNIV_INTERN
 bool
 ha_innobase::primary_key_is_clustered()
+/*===================================*/
 {
 	return(true);
 }
 
-/** Always normalize table name to lower case on Windows */
-#ifdef __WIN__
-#define normalize_table_name(norm_name, name)		\
-	normalize_table_name_low(norm_name, name, TRUE)
-#else
-#define normalize_table_name(norm_name, name)           \
-	normalize_table_name_low(norm_name, name, FALSE)
-#endif /* __WIN__ */
-
 /*****************************************************************//**
 Normalizes a table name string. A normalized name consists of the
-database name catenated to '/' and table name. An example:
-test/mytable. On Windows normalization puts both the database name and the
+database name catenated to '/' and table name. Example: test/mytable.
+On Windows normalization puts both the database name and the
 table name always to lower case if "set_lower_case" is set to TRUE. */
 static
 void
@@ -4309,13 +4762,15 @@ normalize_table_name_low(
 	char*		norm_name,	/*!< out: normalized name as a
 					null-terminated string */
 	const char*	name,		/*!< in: table name string */
-	ibool		set_lower_case) /*!< in: TRUE if we want to set
-					name to lower case */
+	ibool		set_lower_case)	/*!< in: TRUE if we want to set name
+					to lower case */
 {
 	char*	name_ptr;
+	ulint	name_len;
 	char*	db_ptr;
 	ulint	db_len;
 	char*	ptr;
+	ulint	norm_len;
 
 	/* Scan name from the end */
 
@@ -4327,6 +4782,7 @@ normalize_table_name_low(
 	}
 
 	name_ptr = ptr + 1;
+	name_len = strlen(name_ptr);
 
 	/* skip any number of path separators */
 	while (ptr >= name && (*ptr == '\\' || *ptr == '/')) {
@@ -4345,11 +4801,15 @@ normalize_table_name_low(
 
 	db_ptr = ptr + 1;
 
+	norm_len = db_len + name_len + sizeof "/";
+	ut_a(norm_len < FN_REFLEN - 1);
+
 	memcpy(norm_name, db_ptr, db_len);
 
 	norm_name[db_len] = '/';
 
-	memcpy(norm_name + db_len + 1, name_ptr, strlen(name_ptr) + 1);
+	/* Copy the name and null-byte. */
+	memcpy(norm_name + db_len + 1, name_ptr, name_len + 1);
 
 	if (set_lower_case) {
 		innobase_casedn_str(norm_name);
@@ -4364,7 +4824,7 @@ void
 test_normalize_table_name_low()
 /*===========================*/
 {
-	char		norm_name[128];
+	char		norm_name[FN_REFLEN];
 	const char*	test_data[][2] = {
 		/* input, expected result */
 		{"./mysqltest/t1", "mysqltest/t1"},
@@ -4420,12 +4880,84 @@ test_normalize_table_name_low()
 		}
 	}
 }
+
+/*********************************************************************
+Test ut_format_name() on a table of inputs; ut_error on any mismatch. */
+static
+void
+test_ut_format_name()
+/*=================*/
+{
+	char		buf[NAME_LEN * 3];
+
+	struct {
+		const char*	name;		/*!< name to format */
+		ibool		is_table;	/*!< is_table argument */
+		ulint		buf_size;	/*!< buffer size argument */
+		const char*	expected;	/*!< expected buf contents */
+	} test_data[] = {
+		{"test/t1",	TRUE,	sizeof(buf),	"\"test\".\"t1\""},
+		{"test/t1",	TRUE,	12,		"\"test\".\"t1\""},
+		{"test/t1",	TRUE,	11,		"\"test\".\"t1"},
+		{"test/t1",	TRUE,	10,		"\"test\".\"t"},
+		{"test/t1",	TRUE,	9,		"\"test\".\""},
+		{"test/t1",	TRUE,	8,		"\"test\"."},
+		{"test/t1",	TRUE,	7,		"\"test\""},
+		{"test/t1",	TRUE,	6,		"\"test"},
+		{"test/t1",	TRUE,	5,		"\"tes"},
+		{"test/t1",	TRUE,	4,		"\"te"},
+		{"test/t1",	TRUE,	3,		"\"t"},
+		{"test/t1",	TRUE,	2,		"\""},
+		{"test/t1",	TRUE,	1,		""},
+		{"test/t1",	TRUE,	0,		"BUF_NOT_CHANGED"},
+		{"table",	TRUE,	sizeof(buf),	"\"table\""},
+		{"ta'le",	TRUE,	sizeof(buf),	"\"ta'le\""},
+		{"ta\"le",	TRUE,	sizeof(buf),	"\"ta\"\"le\""},
+		{"ta`le",	TRUE,	sizeof(buf),	"\"ta`le\""},
+		{"index",	FALSE,	sizeof(buf),	"\"index\""},
+		{"ind/ex",	FALSE,	sizeof(buf),	"\"ind/ex\""},
+	};
+
+	for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+		/* Sentinel: the buf_size == 0 row expects buf left as is. */
+		memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
+
+		char*	ret;
+
+		ret = ut_format_name(test_data[i].name,
+				     test_data[i].is_table,
+				     buf,
+				     test_data[i].buf_size);
+
+		ut_a(ret == buf);
+
+		if (strcmp(buf, test_data[i].expected) == 0) {
+			fprintf(stderr,
+				"ut_format_name(%s, %s, buf, %lu), "
+				"expected %s, OK\n",
+				test_data[i].name,
+				test_data[i].is_table ? "TRUE" : "FALSE",
+				test_data[i].buf_size,
+				test_data[i].expected);
+		} else {
+			fprintf(stderr,
+				"ut_format_name(%s, %s, buf, %lu), "
+				"expected %s, ERROR: got %s\n",
+				test_data[i].name,
+				test_data[i].is_table ? "TRUE" : "FALSE",
+				test_data[i].buf_size,
+				test_data[i].expected,
+				buf);
+			ut_error;
+		}
+	}
+}
 #endif /* !DBUG_OFF */
 
 /********************************************************************//**
 Get the upper limit of the MySQL integral and floating-point type.
 @return maximum allowed value for the field */
-static
+UNIV_INTERN
 ulonglong
 innobase_get_int_col_max_value(
 /*===========================*/
@@ -4433,7 +4965,7 @@ innobase_get_int_col_max_value(
 {
 	ulonglong	max_value = 0;
 
-	switch(field->key_type()) {
+	switch (field->key_type()) {
 	/* TINY */
 	case HA_KEYTYPE_BINARY:
 		max_value = 0xFFULL;
@@ -4505,12 +5037,13 @@ innobase_match_index_columns(
 	DBUG_ENTER("innobase_match_index_columns");
 
 	/* Check whether user defined index column count matches */
-	if (key_info->key_parts != index_info->n_user_defined_cols) {
+	if (key_info->user_defined_key_parts !=
+		index_info->n_user_defined_cols) {
 		DBUG_RETURN(FALSE);
 	}
 
 	key_part = key_info->key_part;
-	key_end = key_part + key_info->key_parts;
+	key_end = key_part + key_info->user_defined_key_parts;
 	innodb_idx_fld = index_info->fields;
 	innodb_idx_fld_end = index_info->fields + index_info->n_fields;
 
@@ -4565,13 +5098,13 @@ static
 ibool
 innobase_build_index_translation(
 /*=============================*/
-	const TABLE*		table,	  /*!< in: table in MySQL data
-					  dictionary */
-	dict_table_t*		ib_table, /*!< in: table in Innodb data
-					  dictionary */
-	INNOBASE_SHARE*		share)	  /*!< in/out: share structure
-					  where index translation table
-					  will be constructed in. */
+	const TABLE*		table,	/*!< in: table in MySQL data
+					dictionary */
+	dict_table_t*		ib_table,/*!< in: table in Innodb data
+					dictionary */
+	INNOBASE_SHARE*		share)	/*!< in/out: share structure
+					where index translation table
+					will be constructed in. */
 {
 	ulint		mysql_num_index;
 	ulint		ib_num_index;
@@ -4745,7 +5278,7 @@ ha_innobase::innobase_initialize_autoinc()
 	} else {
 		dict_index_t*	index;
 		const char*	col_name;
-		ulonglong	read_auto_inc;
+		ib_uint64_t	read_auto_inc;
 		ulint		err;
 
 		update_thd(ha_thd());
@@ -4819,11 +5352,11 @@ ha_innobase::open(
 	uint			test_if_locked)	/*!< in: not used */
 {
 	dict_table_t*		ib_table;
-	char			norm_name[1000];
+	char			norm_name[FN_REFLEN];
 	THD*			thd;
 	char*			is_part = NULL;
 	ibool			par_case_name_set = FALSE;
-	char			par_case_name[MAX_FULL_NAME_LEN + 1];
+	char			par_case_name[FN_REFLEN];
 	dict_err_ignore_t	ignore_err = DICT_ERR_IGNORE_NONE;
 
 	DBUG_ENTER("ha_innobase::open");
@@ -4833,12 +5366,8 @@ ha_innobase::open(
 
 	thd = ha_thd();
 
-	/* Under some cases MySQL seems to call this function while
-	holding btr_search_latch. This breaks the latching order as
-	we acquire dict_sys->mutex below and leads to a deadlock. */
-	if (thd != NULL) {
-		innobase_release_temporary_latches(ht, thd);
-	}
+	/* No-op in XtraDB */
+	innobase_release_temporary_latches(ht, thd);
 
 	normalize_table_name(norm_name, name);
 
@@ -4876,7 +5405,30 @@ ha_innobase::open(
 	}
 
 	/* Get pointer to a table object in InnoDB dictionary cache */
-	ib_table = dict_table_get(norm_name, TRUE, ignore_err);
+	ib_table = dict_table_open_on_name(norm_name, FALSE, TRUE, ignore_err);
+
+	if (ib_table
+	    && ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
+		 && table->s->stored_fields != dict_table_get_n_user_cols(ib_table))
+		|| (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
+		    && (table->s->fields
+			!= dict_table_get_n_user_cols(ib_table) - 1)))) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"table %s contains %lu user defined columns "
+			"in InnoDB, but %lu columns in MySQL. Please "
+			"check INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and "
+			REFMAN "innodb-troubleshooting.html "
+			"for how to resolve it",
+			norm_name, (ulong) dict_table_get_n_user_cols(ib_table),
+			(ulong) table->s->fields);
+
+		/* Mark this table as corrupted, so the drop table
+		or force recovery can still use it, but not others. */
+		ib_table->corrupted = true;
+		dict_table_close(ib_table, FALSE, FALSE);
+		ib_table = NULL;
+		is_part = NULL;
+	}
 
 	if (UNIV_UNLIKELY(ib_table && ib_table->is_corrupt &&
 			  srv_pass_corrupt_table <= 1)) {
@@ -4902,13 +5454,13 @@ ha_innobase::open(
 
 			1) If boot against an installation from Windows
 			platform, then its partition table name could
-			be all be in lower case in system tables. So we
-			will need to check lower case name when load table.
+			be in lower case in system tables. So we will
+			need to check lower case name when load table.
 
-			2) If  we boot an installation from other case
+			2) If we boot an installation from other case
 			sensitive platform in Windows, we might need to
-			check the existence of table name without lowering
-			case them in the system table. */
+			check the existence of table name without lower
+			case in the system table. */
 			if (innobase_get_lower_case_table_names() == 1) {
 
 				if (!par_case_name_set) {
@@ -4916,9 +5468,7 @@ ha_innobase::open(
 					/* Check for the table using lower
 					case name, including the partition
 					separator "P" */
-					memcpy(par_case_name, norm_name,
-					       strlen(norm_name));
-					par_case_name[strlen(norm_name)] = 0;
+					strcpy(par_case_name, norm_name);
 					innobase_casedn_str(par_case_name);
 #else
 					/* On Windows platfrom, check
@@ -4931,9 +5481,11 @@ ha_innobase::open(
 					par_case_name_set = TRUE;
 				}
 
-				ib_table = dict_table_get(
-					par_case_name, FALSE, ignore_err);
+				ib_table = dict_table_open_on_name(
+					par_case_name, FALSE, TRUE,
+					ignore_err);
 			}
+
 			if (ib_table) {
 #ifndef __WIN__
 				sql_print_warning("Partition table %s opened "
@@ -4964,21 +5516,13 @@ ha_innobase::open(
 					norm_name);
 		}
 
-		sql_print_error("Cannot find or open table %s from\n"
-				"the internal data dictionary of InnoDB "
-				"though the .frm file for the\n"
-				"table exists. Maybe you have deleted and "
-				"recreated InnoDB data\n"
-				"files but have forgotten to delete the "
-				"corresponding .frm files\n"
-				"of InnoDB tables, or you have moved .frm "
-				"files to another database?\n"
-				"or, the table contains indexes that this "
-				"version of the engine\n"
-				"doesn't support.\n"
-				"See " REFMAN "innodb-troubleshooting.html\n"
-				"how you can resolve the problem.\n",
-				norm_name);
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Cannot open table %s from the internal data "
+			"dictionary of InnoDB though the .frm file "
+			"for the table exists. See "
+			REFMAN "innodb-troubleshooting.html for how "
+			"you can resolve the problem.", norm_name);
+
 		free_share(share);
 		my_errno = ENOENT;
 
@@ -4987,19 +5531,47 @@ ha_innobase::open(
 
 table_opened:
 
-	if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
-		sql_print_error("MySQL is trying to open a table handle but "
-				"the .ibd file for\ntable %s does not exist.\n"
-				"Have you deleted the .ibd file from the "
-				"database directory under\nthe MySQL datadir, "
-				"or have you used DISCARD TABLESPACE?\n"
-				"See " REFMAN "innodb-troubleshooting.html\n"
-				"how you can resolve the problem.\n",
-				norm_name);
+	innobase_copy_frm_flags_from_table_share(ib_table, table->s);
+
+	dict_stats_init(ib_table);
+
+	MONITOR_INC(MONITOR_TABLE_OPEN);
+
+	bool	no_tablespace;
+
+	if (dict_table_is_discarded(ib_table)) {
+
+		ib_senderrf(thd,
+			IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
+			table->s->table_name.str);
+
+		/* Allow an open because a proper DISCARD should have set
+		all the flags and index root page numbers to FIL_NULL that
+		should prevent any DML from running but it should allow DDL
+		operations. */
+
+		no_tablespace = false;
+
+	} else if (ib_table->ibd_file_missing) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN,
+			ER_TABLESPACE_MISSING, norm_name);
+
+		/* This means we have no idea what happened to the tablespace
+		file, best to play it safe. */
+
+		no_tablespace = true;
+	} else {
+		no_tablespace = false;
+	}
+
+	if (!thd_tablespace_op(thd) && no_tablespace) {
 		free_share(share);
 		my_errno = ENOENT;
 
-		dict_table_decrement_handle_count(ib_table, FALSE);
+		dict_table_close(ib_table, FALSE, FALSE);
+
 		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
 	}
 
@@ -5009,7 +5581,6 @@ table_opened:
 	ut_ad(prebuilt->default_rec);
 
 	/* Looks like MySQL-3.23 sometimes has primary key number != 0 */
-
 	primary_key = table->s->primary_key;
 	key_used_on_scan = primary_key;
 
@@ -5037,7 +5608,7 @@ table_opened:
 			if not attended, bring this to the user's attention
 			by printing a warning in addition to log a message
 			in the errorlog */
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NO_SUCH_INDEX,
 					    "InnoDB: Table %s has a "
 					    "primary key in InnoDB data "
@@ -5060,9 +5631,14 @@ table_opened:
 			that user can adopt necessary measures for the
 			mismatch while still being accessible to the table
 			date. */
-			ref_length = table->key_info[0].key_length;
+			if (!table->key_info) {
+				ut_ad(!table->s->keys);
+				ref_length = 0;
+			} else {
+				ref_length = table->key_info[0].key_length;
+			}
 
-			/* Find correspoinding cluster index
+			/* Find corresponding cluster index
 			key length in MySQL's key_info[] array */
 			for (ulint i = 0; i < table->s->keys; i++) {
 				dict_index_t*	index;
@@ -5099,7 +5675,7 @@ table_opened:
 			if not attended, bring this to the user attention
 			by printing a warning in addition to log a message
 			in the errorlog */
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NO_SUCH_INDEX,
 					    "InnoDB: Table %s has no "
 					    "primary key in InnoDB data "
@@ -5128,7 +5704,7 @@ table_opened:
 	}
 
 	/* Index block size in InnoDB: used by MySQL in query optimization */
-	stats.block_size = 16 * 1024;
+	stats.block_size = UNIV_PAGE_SIZE;
 
 	/* Init table lock structure */
 	thr_lock_data_init(&share->lock,&lock,(void*) 0);
@@ -5143,7 +5719,9 @@ table_opened:
 	}
 
 	/* Only if the table has an AUTOINC column. */
-	if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
+	if (prebuilt->table != NULL
+	    && !prebuilt->table->ibd_file_missing
+	    && table->found_next_number_field != NULL) {
 		dict_table_autoinc_lock(prebuilt->table);
 
 		/* Since a table can already be "open" in InnoDB's internal
@@ -5177,6 +5755,8 @@ ha_innobase::clone(
 	new_handler = static_cast<ha_innobase*>(handler::clone(name,
 							       mem_root));
 	if (new_handler) {
+		DBUG_ASSERT(new_handler->prebuilt != NULL);
+
 		new_handler->prebuilt->select_lock_type
 			= prebuilt->select_lock_type;
 	}
@@ -5187,6 +5767,7 @@ ha_innobase::clone(
 UNIV_INTERN
 uint
 ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
 {
 	/* A table format specific index column length check will be performed
 	at ha_innobase::add_index() and row_create_index_for_mysql() */
@@ -5200,17 +5781,17 @@ Closes a handle to an InnoDB table.
 @return	0 */
 UNIV_INTERN
 int
-ha_innobase::close(void)
-/*====================*/
+ha_innobase::close()
+/*================*/
 {
 	THD*	thd;
 
 	DBUG_ENTER("ha_innobase::close");
 
 	thd = ha_thd();
-	if (thd != NULL) {
-		innobase_release_temporary_latches(ht, thd);
-	}
+
+	/* No-op in XtraDB */
+	innobase_release_temporary_latches(ht, thd);
 
 	row_prebuilt_free(prebuilt, FALSE);
 
@@ -5223,6 +5804,8 @@ ha_innobase::close(void)
 
 	free_share(share);
 
+	MONITOR_INC(MONITOR_TABLE_CLOSE);
+
 	/* Tell InnoDB server that there might be work for
 	utility threads: */
 
@@ -5246,43 +5829,13 @@ get_field_offset(
 	return((uint) (field->ptr - table->record[0]));
 }
 
-/**************************************************************//**
-Checks if a field in a record is SQL NULL. Uses the record format
-information in table to track the null bit in record.
-@return	1 if NULL, 0 otherwise */
-static inline
-uint
-field_in_record_is_null(
-/*====================*/
-	TABLE*	table,	/*!< in: MySQL table object */
-	Field*	field,	/*!< in: MySQL field object */
-	char*	record)	/*!< in: a row in MySQL format */
-{
-	int	null_offset;
-
-	if (!field->null_ptr) {
-
-		return(0);
-	}
-
-	null_offset = (uint) ((char*) field->null_ptr
-					- (char*) table->record[0]);
-
-	if (record[null_offset] & field->null_bit) {
-
-		return(1);
-	}
-
-	return(0);
-}
-
 /*************************************************************//**
 InnoDB uses this function to compare two data fields for which the data type
 is such that we must use MySQL code to compare them. NOTE that the prototype
-of this function is in rem0cmp.c in InnoDB source code! If you change this
+of this function is in rem0cmp.cc in InnoDB source code! If you change this
 function, remember to update the prototype there!
 @return	1, 0, -1, if a is greater, equal, less than b, respectively */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_mysql_cmp(
 /*===============*/
@@ -5340,9 +5893,9 @@ innobase_mysql_cmp(
 		changes then: 'b\0\0...' is ordered BEFORE 'b  ...'. Users
 		having indexes on such data need to rebuild their tables! */
 
-		ret = charset->coll->strnncollsp(charset,
-				  a, a_length,
-						 b, b_length, 0);
+		ret = charset->coll->strnncollsp(
+			charset, a, a_length, b, b_length, 0);
+
 		if (ret < 0) {
 			return(-1);
 		} else if (ret > 0) {
@@ -5357,12 +5910,300 @@ innobase_mysql_cmp(
 	return(0);
 }
 
+
+/*************************************************************//**
+Return the CHARSET_INFO matching charset_number for a string-like MySQL type. */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number)	/*!< in: number of the charset */
+{
+	enum_field_types	mysql_tp;
+	CHARSET_INFO*		charset;
+
+	mysql_tp = (enum_field_types) mysql_type;
+
+	switch (mysql_tp) {
+
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+	case MYSQL_TYPE_VAR_STRING:
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+	case MYSQL_TYPE_VARCHAR:
+		/* Use the charset number to pick the right charset struct for
+		the comparison. Since the MySQL function get_charset may be
+		slow before Bar removes the mutex operation there, we first
+		look at 2 common charsets directly. */
+
+		if (charset_number == default_charset_info->number) {
+			charset = default_charset_info;
+		} else if (charset_number == my_charset_latin1.number) {
+			charset = &my_charset_latin1;
+		} else {
+			charset = get_charset(charset_number, MYF(MY_WME));
+
+			if (charset == NULL) {
+			  sql_print_error("InnoDB needs charset %lu for doing "
+					  "a comparison, but MySQL cannot "
+					  "find that charset.",
+					  (ulong) charset_number);
+				ut_a(0);
+			}
+		}
+		break;
+	default:
+		ut_error;
+	}
+
+	return(charset);
+}
+
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. NOTE that the prototype
+of this function is in rem0cmp.cc in InnoDB source code! If you change this
+function, remember to update the prototype there!
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+{
+	CHARSET_INFO*		charset;
+	int			result;
+
+	charset = innobase_get_fts_charset(mysql_type, charset_number);
+
+	result = ha_compare_text(charset, (uchar*) a, a_length,
+				 (uchar*) b, b_length, 1, 0);
+
+	return(result);
+}
+/******************************************************************//**
+Compare two character strings according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+
+	return(ha_compare_text(charset, s1->f_str, s1->f_len,
+			       s2->f_str, s2->f_len, 0, 0));
+}
+/******************************************************************//**
+Compare two character strings case insensitively according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	ulint			newlen;
+
+	my_casedn_str(charset, (char*) s2->f_str);
+
+	newlen = strlen((const char*) s2->f_str);
+
+	return(ha_compare_text(charset, s1->f_str, s1->f_len,
+			       s2->f_str, newlen, 0, 0));
+}
+/******************************************************************//**
+Get the first character's code position for FTS index partition. */
+UNIV_INTERN
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*
+			cs,		/*!< in: Character set */
+	const uchar*	str,		/*!< in: string */
+	const ulint	len)		/*!< in: string length */
+{
+	uchar		mystr[2];
+	ulint		value;
+
+	if (!str || len == 0) {
+		return(0);
+	}
+
+	my_strnxfrm(cs, (uchar*) mystr, 2, str, len);
+
+	value = mach_read_from_2(mystr);
+
+	if (value > 255) {
+		value = value / 256;
+	}
+
+	return(value);
+}
+
+/******************************************************************//**
+Compare a prefix key against a character string according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: prefix key */
+	const void*	p2)		/*!< in: value to compare */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	int			result;
+
+	result = ha_compare_text(charset, s2->f_str, s2->f_len,
+				 s1->f_str, s1->f_len, 1, 0);
+
+	/* We switched s1, s2 position in ha_compare_text. So we need
+	to negate the result */
+	return(-result);
+}
+/******************************************************************//**
+Compare two NUL-terminated character strings according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_string_cmp(
+/*====================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	uchar*			s1 = (uchar*) p1;
+	uchar*			s2 = *(uchar**) p2;
+
+	return(ha_compare_text(charset, s1, strlen((const char*) s1),
+			       s2, strlen((const char*) s2), 0, 0));
+}
+/******************************************************************//**
+Makes all characters in a string lower case. */
+UNIV_INTERN
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in lower case */
+	size_t		src_len,/*!< in: input string length */
+	char*		dst,	/*!< out: buffer for result string */
+	size_t		dst_len)/*!< in: buffer size */
+{
+	if (cs->casedn_multiply == 1) {
+		memcpy(dst, src, src_len);
+		dst[src_len] = 0;
+		my_casedn_str(cs, dst);
+
+		return(strlen(dst));
+	} else {
+		return(cs->cset->casedn(cs, src, src_len, dst, dst_len));
+	}
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X)       0
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@return length of string processed */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	cs,		/*!< in: Character set */
+	const byte*	start,		/*!< in: start of text */
+	const byte*	end,		/*!< in: one character past end of
+					text */
+	fts_string_t*	token,		/*!< out: token's text */
+	ulint*		offset)		/*!< out: offset to token,
+					measured as characters from
+					'start'; never set by this function */
+{
+	int		mbl;
+	const uchar*	doc = start;
+
+	ut_a(cs);
+
+	token->f_n_char = token->f_len = 0;
+
+	for (;;) {
+
+		if (doc >= end) {
+			return(doc - start);
+		}
+
+		int	ctype;
+
+		mbl = cs->cset->ctype(
+			cs, &ctype, doc, (const uchar*) end);
+
+		if (true_word_char(ctype, *doc)) {
+			break;
+		}
+
+		doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+	}
+
+	ulint	mwc = 0;
+	ulint	length = 0;
+
+	token->f_str = const_cast<byte*>(doc);
+
+	while (doc < end) {
+
+		int	ctype;
+
+		mbl = cs->cset->ctype(
+			cs, &ctype, (uchar*) doc, (uchar*) end);
+		if (true_word_char(ctype, *doc)) {
+			mwc = 0;
+		} else if (!misc_word_char(*doc) || mwc) {
+			break;
+		} else {
+			++mwc;
+		}
+
+		++length;
+
+		doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+	}
+
+	token->f_len = (uint) (doc - token->f_str) - mwc;
+	token->f_n_char = length;
+
+	return(doc - start);
+}
+
 /**************************************************************//**
 Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
 VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
 @return	DATA_BINARY, DATA_VARCHAR, ... */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 get_innobase_type_from_mysql_type(
 /*==============================*/
@@ -5409,13 +6250,12 @@ get_innobase_type_from_mysql_type(
 	switch (field->type()) {
 		/* NOTE that we only allow string types in DATA_MYSQL and
 		DATA_VARMYSQL */
-	case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
-	case MYSQL_TYPE_VARCHAR:    /* new >= 5.0.3 true VARCHAR */
+	case MYSQL_TYPE_VAR_STRING:	/* old <= 4.1 VARCHAR */
+	case MYSQL_TYPE_VARCHAR:	/* new >= 5.0.3 true VARCHAR */
 		if (field->binary()) {
 			return(DATA_BINARY);
-		} else if (strcmp(
-				   field->charset()->name,
-				   "latin1_swedish_ci") == 0) {
+		} else if (strcmp(field->charset()->name,
+				  "latin1_swedish_ci") == 0) {
 			return(DATA_VARCHAR);
 		} else {
 			return(DATA_VARMYSQL);
@@ -5424,9 +6264,8 @@ get_innobase_type_from_mysql_type(
 	case MYSQL_TYPE_STRING: if (field->binary()) {
 
 			return(DATA_FIXBINARY);
-		} else if (strcmp(
-				   field->charset()->name,
-				   "latin1_swedish_ci") == 0) {
+		} else if (strcmp(field->charset()->name,
+				  "latin1_swedish_ci") == 0) {
 			return(DATA_CHAR);
 		} else {
 			return(DATA_MYSQL);
@@ -5441,22 +6280,14 @@ get_innobase_type_from_mysql_type(
 	case MYSQL_TYPE_DATE:
 	case MYSQL_TYPE_YEAR:
 	case MYSQL_TYPE_NEWDATE:
-           return(DATA_INT);
-
+		return(DATA_INT);
 	case MYSQL_TYPE_TIME:
 	case MYSQL_TYPE_DATETIME:
 	case MYSQL_TYPE_TIMESTAMP:
-          /*
-            XtraDB should ideally just check field->keytype() and never
-            field->type().  The following check is here to only
-            change the new hires datetime/timestamp/time fields to
-            use DATA_FIXBINARY.  We can't convert this function to
-            just test for field->keytype() as then the check if a
-            table is compatible will fail for old tables.
-          */
-           if (field->key_type() == HA_KEYTYPE_BINARY)
-             return(DATA_FIXBINARY);
-           return(DATA_INT);
+		if (field->key_type() == HA_KEYTYPE_BINARY)
+			return(DATA_FIXBINARY);
+                else
+			return(DATA_INT);
 	case MYSQL_TYPE_FLOAT:
 		return(DATA_FLOAT);
 	case MYSQL_TYPE_DOUBLE:
@@ -5504,7 +6335,7 @@ innobase_read_from_2_little_endian(
 /*===============================*/
 	const uchar*	buf)	/*!< in: from where to read */
 {
-	return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])));
+	return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))));
 }
 
 /*******************************************************************//**
@@ -5522,7 +6353,8 @@ ha_innobase::store_key_val_for_row(
 {
 	KEY*		key_info	= table->key_info + keynr;
 	KEY_PART_INFO*	key_part	= key_info->key_part;
-	KEY_PART_INFO*	end		= key_part + key_info->key_parts;
+	KEY_PART_INFO*	end		=
+		key_part + key_info->user_defined_key_parts;
 	char*		buff_start	= buff;
 	enum_field_types mysql_type;
 	Field*		field;
@@ -5555,7 +6387,7 @@ ha_innobase::store_key_val_for_row(
 	simple memcmp to compare two key values to determine if they are
 	equal. MySQL does this to compare contents of two 'ref' values. */
 
-	bzero(buff, buff_len);
+	memset(buff, 0, buff_len);
 
 	for (; key_part != end; key_part++) {
 		is_null = FALSE;
@@ -5581,7 +6413,7 @@ ha_innobase::store_key_val_for_row(
 			const byte*	data;
 			ulint		key_len;
 			ulint		true_len;
-			CHARSET_INFO*	cs;
+			const CHARSET_INFO* cs;
 			int		error=0;
 
 			key_len = key_part->length;
@@ -5594,11 +6426,11 @@ ha_innobase::store_key_val_for_row(
 			cs = field->charset();
 
 			lenlen = (ulint)
-				(((Field_varstring*)field)->length_bytes);
+				(((Field_varstring*) field)->length_bytes);
 
 			data = row_mysql_read_true_varchar(&len,
 				(byte*) (record
-				+ (ulint)get_field_offset(table, field)),
+				+ (ulint) get_field_offset(table, field)),
 				lenlen);
 
 			true_len = len;
@@ -5608,10 +6440,9 @@ ha_innobase::store_key_val_for_row(
 
 			if (len > 0 && cs->mbmaxlen > 1) {
 				true_len = (ulint) cs->cset->well_formed_len(cs,
-						(const char *) data,
-						(const char *) data + len,
-                                                (uint) (key_len /
-                                                        cs->mbmaxlen),
+						(const char*) data,
+						(const char*) data + len,
+						(uint) (key_len / cs->mbmaxlen),
 						&error);
 			}
 
@@ -5625,7 +6456,7 @@ ha_innobase::store_key_val_for_row(
 			/* The length in a key value is always stored in 2
 			bytes */
 
-			row_mysql_store_true_var_len((byte*)buff, true_len, 2);
+			row_mysql_store_true_var_len((byte*) buff, true_len, 2);
 			buff += 2;
 
 			memcpy(buff, data, true_len);
@@ -5634,7 +6465,7 @@ ha_innobase::store_key_val_for_row(
 			length of the true VARCHAR in the key value, though
 			only len first bytes after the 2 length bytes contain
 			actual data. The rest of the space was reset to zero
-			in the bzero() call above. */
+			in the memset() call above. */
 
 			buff += key_len;
 
@@ -5646,7 +6477,7 @@ ha_innobase::store_key_val_for_row(
 			as BLOB data in innodb. */
 			|| mysql_type == MYSQL_TYPE_GEOMETRY) {
 
-			CHARSET_INFO*	cs;
+			const CHARSET_INFO* cs;
 			ulint		key_len;
 			ulint		true_len;
 			int		error=0;
@@ -5667,7 +6498,7 @@ ha_innobase::store_key_val_for_row(
 
 			blob_data = row_mysql_read_blob_ref(&blob_len,
 				(byte*) (record
-				+ (ulint)get_field_offset(table, field)),
+				+ (ulint) get_field_offset(table, field)),
 					(ulint) field->pack_length());
 
 			true_len = blob_len;
@@ -5680,11 +6511,10 @@ ha_innobase::store_key_val_for_row(
 
 			if (blob_len > 0 && cs->mbmaxlen > 1) {
 				true_len = (ulint) cs->cset->well_formed_len(cs,
-						(const char *) blob_data,
-						(const char *) blob_data
+						(const char*) blob_data,
+						(const char*) blob_data
 							+ blob_len,
-                                                (uint) (key_len /
-                                                        cs->mbmaxlen),
+						(uint) (key_len / cs->mbmaxlen),
 						&error);
 			}
 
@@ -5700,7 +6530,7 @@ ha_innobase::store_key_val_for_row(
 			storage of the number is little-endian */
 
 			innobase_write_to_2_little_endian(
-					(byte*)buff, true_len);
+					(byte*) buff, true_len);
 			buff += 2;
 
 			memcpy(buff, blob_data, true_len);
@@ -5715,7 +6545,7 @@ ha_innobase::store_key_val_for_row(
 			value we store may be also in a column prefix
 			index. */
 
-			CHARSET_INFO*		cs;
+			const CHARSET_INFO*	cs = NULL;
 			ulint			true_len;
 			ulint			key_len;
 			const uchar*		src_start;
@@ -5753,11 +6583,11 @@ ha_innobase::store_key_val_for_row(
 
 					true_len = (ulint)
 						cs->cset->well_formed_len(cs,
-							(const char *)src_start,
-							(const char *)src_start
+							(const char*) src_start,
+							(const char*) src_start
 								+ key_len,
-                                                        (uint) (key_len /
-                                                                cs->mbmaxlen),
+							(uint) (key_len
+								/ cs->mbmaxlen),
 							&error);
 				}
 			}
@@ -5769,6 +6599,7 @@ ha_innobase::store_key_val_for_row(
 
 			if (true_len < key_len) {
 				ulint	pad_len = key_len - true_len;
+				ut_a(cs != NULL);
 				ut_a(!(pad_len % cs->mbminlen));
 
 				cs->cset->fill(cs, buff, pad_len,
@@ -5847,7 +6678,27 @@ build_template_needs_field(
 }
 
 /**************************************************************//**
-Adds a field is to a prebuilt struct 'template'.
+Determines if a field is needed in a prebuilt struct 'template'.
+@return whether the field is needed for index condition pushdown */
+inline
+bool
+build_template_needs_field_in_icp(
+/*==============================*/
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const row_prebuilt_t*	prebuilt,/*!< in: row fetch template */
+	bool			contains,/*!< in: whether the index contains
+					column i */
+	ulint			i)	/*!< in: column number */
+{
+	ut_ad(contains == dict_index_contains_col_or_prefix(index, i));
+
+	return(index == prebuilt->index
+	       ? contains
+	       : dict_index_contains_col_or_prefix(prebuilt->index, i));
+}
+
+/**************************************************************//**
+Adds a field to a prebuilt struct 'template'.
 @return the field template */
 static
 mysql_row_templ_t*
@@ -5880,10 +6731,9 @@ build_template_field(
 		templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
 	}
 
-	if (field->null_ptr) {
+	if (field->real_maybe_null()) {
 		templ->mysql_null_byte_offset =
-			(ulint) ((char*) field->null_ptr
-				 - (char*) table->record[0]);
+			field->null_offset();
 
 		templ->mysql_null_bit_mask = (ulint) field->null_bit;
 	} else {
@@ -5894,16 +6744,16 @@ build_template_field(
 
 	templ->mysql_col_len = (ulint) field->pack_length();
 	templ->type = col->mtype;
-	templ->mysql_type = (ulint)field->type();
+	templ->mysql_type = (ulint) field->type();
 
 	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
 		templ->mysql_length_bytes = (ulint)
-			(((Field_varstring*)field)->length_bytes);
+			(((Field_varstring*) field)->length_bytes);
 	}
 
 	templ->charset = dtype_get_charset_coll(col->prtype);
-	templ->mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
-	templ->mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
+	templ->mbminlen = dict_col_get_mbminlen(col);
+	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
 	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
 
 	if (!dict_index_is_clust(index)
@@ -5940,7 +6790,7 @@ ha_innobase::build_template(
 	ibool		fetch_all_in_key	= FALSE;
 	ibool		fetch_primary_key_cols	= FALSE;
 	ulint		i, sql_idx;
-       
+
 	if (prebuilt->select_lock_type == LOCK_X) {
 		/* We always retrieve the whole clustered index record if we
 		use exclusive row level locks, for example, if the read is
@@ -5969,11 +6819,11 @@ ha_innobase::build_template(
 		} else if (prebuilt->hint_need_to_fetch_extra_cols
 			== ROW_RETRIEVE_PRIMARY_KEY) {
 			/* We must at least fetch all primary key cols. Note
-			   that if the clustered index was internally generated
-			   by InnoDB on the row id (no primary key was
-			   defined), then row_search_for_mysql() will always
-			   retrieve the row id to a special buffer in the
-			   prebuilt struct. */
+			that if the clustered index was internally generated
+			by InnoDB on the row id (no primary key was
+			defined), then row_search_for_mysql() will always
+			retrieve the row id to a special buffer in the
+			prebuilt struct. */
 
 			fetch_primary_key_cols = TRUE;
 		}
@@ -5985,6 +6835,10 @@ ha_innobase::build_template(
 
 	prebuilt->need_to_access_clustered = (index == clust_index);
 
+	/* Either prebuilt->index should be a secondary index, or it
+	should be the clustered index. */
+	ut_ad(dict_index_is_clust(index) == (index == clust_index));
+
 	/* Below we check column by column if we need to access
 	the clustered index. */
 
@@ -6010,12 +6864,12 @@ ha_innobase::build_template(
 
 	if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
 		/* Push down an index condition or an end_range check. */
-	  for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+
+			while (!table->field[sql_idx]->stored_in_db) {
+				sql_idx++;
+			}
 
-	                while (!table->field[sql_idx]->stored_in_db) {
-			        sql_idx++;     
-                        }
-                       
 			const ibool		index_contains
 				= dict_index_contains_col_or_prefix(index, i);
 
@@ -6033,10 +6887,8 @@ ha_innobase::build_template(
 			the subset
 			field->part_of_key.is_set(active_index)
 			which would be acceptable if end_range==NULL. */
-			if (index == prebuilt->index
-				? index_contains
-				: dict_index_contains_col_or_prefix(
-					prebuilt->index, i)) {
+			if (build_template_needs_field_in_icp(
+				    index, prebuilt, index_contains, i)) {
 				/* Needed in ICP */
 				const Field*		field;
 				mysql_row_templ_t*	templ;
@@ -6104,11 +6956,20 @@ ha_innobase::build_template(
 
 				/* Index condition pushdown can be used on
 				all columns of a secondary index, and on
-				the PRIMARY KEY columns. */
-				/* TODO: enable this assertion
-				(but first ensure that end_range is
-				valid here and use an accurate condition
-				for end_range) 
+				the PRIMARY KEY columns. On the clustered
+				index, it must never be used on other than
+				PRIMARY KEY columns, because those columns
+				may be stored off-page, and we will not
+				fetch externally stored columns before
+				checking the index condition. */
+				/* TODO: test the above with an assertion
+				like this. Note that index conditions are
+				currently pushed down as part of the
+				"optimizer phase" while end_range is done
+				as part of the execution phase. Therefore,
+				we were unable to use an accurate condition
+				for end_range in the "if" condition above,
+				and the following assertion would fail.
 				ut_ad(!dict_index_is_clust(prebuilt->index)
 				      || templ->rec_field_no
 				      < prebuilt->index->n_uniq);
@@ -6121,19 +6982,17 @@ ha_innobase::build_template(
 
 		/* Include the fields that are not needed in index condition
 		pushdown. */
-                for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+
+			while (!table->field[sql_idx]->stored_in_db) {
+				sql_idx++;
+			}
 
-		        while (!table->field[sql_idx]->stored_in_db) {
-			        sql_idx++;     
-                        }
-                       
 			const ibool		index_contains
 				= dict_index_contains_col_or_prefix(index, i);
 
-			if (index == prebuilt->index
-				? !index_contains
-				: !dict_index_contains_col_or_prefix(
-					prebuilt->index, i)) {
+			if (!build_template_needs_field_in_icp(
+				    index, prebuilt, index_contains, i)) {
 				/* Not needed in ICP */
 				const Field*	field;
 
@@ -6158,16 +7017,16 @@ ha_innobase::build_template(
 		}
 
 		prebuilt->idx_cond = this;
- 	} else {
+	} else {
 		/* No index condition pushdown */
 		prebuilt->idx_cond = NULL;
 
                 for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 			const Field*	field;
 
-	                while (!table->field[sql_idx]->stored_in_db) {
-			        sql_idx++;     
-                        }
+			while (!table->field[sql_idx]->stored_in_db) {
+				sql_idx++;
+			}
 
 			if (whole_row) {
 				field = table->field[sql_idx];
@@ -6187,14 +7046,16 @@ ha_innobase::build_template(
 			build_template_field(prebuilt, clust_index, index,
 					     table, field, i);
 		}
-        }
+	}
 
 	if (index != clust_index && prebuilt->need_to_access_clustered) {
 		/* Change rec_field_no's to correspond to the clustered index
 		record */
 		for (i = 0; i < prebuilt->n_template; i++) {
+
 			mysql_row_templ_t*	templ
 				= &prebuilt->mysql_template[i];
+
 			templ->rec_field_no = templ->clust_rec_field_no;
 		}
 	}
@@ -6208,11 +7069,13 @@ min value of the autoinc interval. Once that is fixed we can get rid of
 the special lock handling.
 @return	DB_SUCCESS if all OK else error code */
 UNIV_INTERN
-ulint
+dberr_t
 ha_innobase::innobase_lock_autoinc(void)
 /*====================================*/
 {
-	ulint		error = DB_SUCCESS;
+	dberr_t		error = DB_SUCCESS;
+
+	ut_ad(!srv_read_only_mode);
 
 	switch (innobase_autoinc_lock_mode) {
 	case AUTOINC_NO_LOCKING:
@@ -6227,16 +7090,16 @@ ha_innobase::innobase_lock_autoinc(void)
 		etc. type of statement. */
 		if (thd_sql_command(user_thd) == SQLCOM_INSERT
 		    || thd_sql_command(user_thd) == SQLCOM_REPLACE) {
-			dict_table_t*	table = prebuilt->table;
+			dict_table_t*	ib_table = prebuilt->table;
 
 			/* Acquire the AUTOINC mutex. */
-			dict_table_autoinc_lock(table);
+			dict_table_autoinc_lock(ib_table);
 
 			/* We need to check that another transaction isn't
 			already holding the AUTOINC lock on the table. */
-			if (table->n_waiting_or_granted_auto_inc_locks) {
+			if (ib_table->n_waiting_or_granted_auto_inc_locks) {
 				/* Release the mutex to avoid deadlocks. */
-				dict_table_autoinc_unlock(table);
+				dict_table_autoinc_unlock(ib_table);
 			} else {
 				break;
 			}
@@ -6257,19 +7120,19 @@ ha_innobase::innobase_lock_autoinc(void)
 		ut_error;
 	}
 
-	return(ulong(error));
+	return(error);
 }
 
 /********************************************************************//**
 Reset the autoinc value in the table.
 @return	DB_SUCCESS if all went well else error code */
 UNIV_INTERN
-ulint
+dberr_t
 ha_innobase::innobase_reset_autoinc(
 /*================================*/
 	ulonglong	autoinc)	/*!< in: value to store */
 {
-	ulint		error;
+	dberr_t		error;
 
 	error = innobase_lock_autoinc();
 
@@ -6280,7 +7143,7 @@ ha_innobase::innobase_reset_autoinc(
 		dict_table_autoinc_unlock(prebuilt->table);
 	}
 
-	return(ulong(error));
+	return(error);
 }
 
 /********************************************************************//**
@@ -6288,12 +7151,12 @@ Store the autoinc value in the table. The autoinc value is only set if
 it's greater than the existing autoinc value in the table.
 @return	DB_SUCCESS if all went well else error code */
 UNIV_INTERN
-ulint
+dberr_t
 ha_innobase::innobase_set_max_autoinc(
 /*==================================*/
 	ulonglong	auto_inc)	/*!< in: value to store */
 {
-	ulint		error;
+	dberr_t		error;
 
 	error = innobase_lock_autoinc();
 
@@ -6304,7 +7167,7 @@ ha_innobase::innobase_set_max_autoinc(
 		dict_table_autoinc_unlock(prebuilt->table);
 	}
 
-	return(ulong(error));
+	return(error);
 }
 
 /********************************************************************//**
@@ -6317,30 +7180,38 @@ ha_innobase::write_row(
 /*===================*/
 	uchar*	record)	/*!< in: a row in MySQL format */
 {
-	ulint		error = 0;
-        int             error_result= 0;
+	dberr_t		error;
+	int		error_result= 0;
 	ibool		auto_inc_used= FALSE;
 	ulint		sql_command;
 	trx_t*		trx = thd_to_trx(user_thd);
 
 	DBUG_ENTER("ha_innobase::write_row");
 
-	if (prebuilt->trx != trx) {
-	  sql_print_error("The transaction object for the table handle is at "
-			  "%p, but for the current thread it is at %p",
-			  (const void*) prebuilt->trx, (const void*) trx);
+	if (srv_read_only_mode) {
+		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	} else if (prebuilt->trx != trx) {
+		sql_print_error("The transaction object for the table handle "
+				"is at %p, but for the current thread it is at "
+				"%p",
+				(const void*) prebuilt->trx, (const void*) trx);
 
 		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
-		ut_print_buf(stderr, ((const byte*)prebuilt) - 100, 200);
+		ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
 		fputs("\n"
 			"InnoDB: Dump of 200 bytes around ha_data: ",
 			stderr);
 		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
 		putc('\n', stderr);
 		ut_error;
+	} else if (!trx_is_started(trx)) {
+		++trx->will_lock;
 	}
 
-	if (share->ib_table->is_corrupt) {
+	ha_statistic_increment(&SSV::ha_write_count);
+
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -6423,7 +7294,7 @@ no_commit:
 		innobase_get_auto_increment(). */
 		prebuilt->autoinc_error = DB_SUCCESS;
 
-		if ((error = update_auto_increment())) {
+		if ((error_result = update_auto_increment())) {
 			/* We don't want to mask autoinc overflow errors. */
 
 			/* Handle the case where the AUTOINC sub-system
@@ -6434,15 +7305,14 @@ no_commit:
 				my_error(ER_AUTOINC_READ_FAILED, MYF(0));
 				goto func_exit;
 			} else if (prebuilt->autoinc_error != DB_SUCCESS) {
-				error = (int) prebuilt->autoinc_error;
+				error = prebuilt->autoinc_error;
 				goto report_error;
 			}
 
 			/* MySQL errors are passed straight back. except for
-                           HA_ERR_AUTO_INC_READ_FAILED. This can only happen
+                           ER_AUTOINC_READ_FAILED. This can only happen
                            for values out of range.
                          */
-			error_result = (int) error;
 			goto func_exit;
 		}
 
@@ -6458,9 +7328,10 @@ no_commit:
 		build_template(true);
 	}
 
-	innodb_srv_conc_enter_innodb(prebuilt->trx);
+	innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 	error = row_insert_for_mysql((byte*) record, prebuilt);
+	DEBUG_SYNC(user_thd, "ib_after_row_insert");
 
 #ifdef EXTENDED_FOR_USERSTAT
 	if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) {
@@ -6470,7 +7341,6 @@ no_commit:
 
 	/* Handle duplicate key errors */
 	if (auto_inc_used) {
-		ulint		err;
 		ulonglong	auto_inc;
 		ulonglong	col_max_value;
 
@@ -6532,6 +7402,7 @@ set_max_autoinc:
 
 					ulonglong	offset;
 					ulonglong	increment;
+					dberr_t		err;
 
 					offset = prebuilt->autoinc_offset;
 					increment = prebuilt->autoinc_increment;
@@ -6550,20 +7421,33 @@ set_max_autoinc:
 				}
 			}
 			break;
+		default:
+			break;
 		}
 	}
 
-	innodb_srv_conc_exit_innodb(prebuilt->trx);
+	innobase_srv_conc_exit_innodb(prebuilt->trx);
 
 report_error:
-	error_result = convert_error_code_to_mysql((int) error,
+	if (error == DB_TABLESPACE_DELETED) {
+		ib_senderrf(
+			trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_DISCARDED,
+			table->s->table_name.str);
+	}
+
+	error_result = convert_error_code_to_mysql(error,
 						   prebuilt->table->flags,
 						   user_thd);
 
+	if (error_result == HA_FTS_INVALID_DOCID) {
+		my_error(HA_FTS_INVALID_DOCID, MYF(0));
+	}
+
 func_exit:
 	innobase_active_small();
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -6573,9 +7457,9 @@ func_exit:
 /**********************************************************************//**
 Checks which fields have changed in a row and stores information
 of them to an update vector.
-@return	error number or 0 */
+@return	DB_SUCCESS or error code */
 static
-int
+dberr_t
 calc_row_difference(
 /*================*/
 	upd_t*		uvect,		/*!< in/out: update vector */
@@ -6605,6 +7489,12 @@ calc_row_difference(
 	dfield_t	dfield;
 	dict_index_t*	clust_index;
 	uint		sql_idx, innodb_idx= 0;
+	ibool		changes_fts_column = FALSE;
+	ibool		changes_fts_doc_col = FALSE;
+	trx_t*          trx = thd_to_trx(thd);
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+
+	ut_ad(!srv_read_only_mode);
 
 	n_fields = table->s->fields;
 	clust_index = dict_table_get_first_index(prebuilt->table);
@@ -6654,12 +7544,12 @@ calc_row_difference(
 				o_ptr = row_mysql_read_true_varchar(
 					&o_len, o_ptr,
 					(ulint)
-					(((Field_varstring*)field)->length_bytes));
+					(((Field_varstring*) field)->length_bytes));
 
 				n_ptr = row_mysql_read_true_varchar(
 					&n_len, n_ptr,
 					(ulint)
-					(((Field_varstring*)field)->length_bytes));
+					(((Field_varstring*) field)->length_bytes));
 			}
 
 			break;
@@ -6667,14 +7557,24 @@ calc_row_difference(
 			;
 		}
 
-		if (field->null_ptr) {
-			if (field_in_record_is_null(table, field,
-							(char*) old_row)) {
+		if (field_mysql_type == MYSQL_TYPE_LONGLONG
+		    && prebuilt->table->fts
+		    && innobase_strcasecmp(
+			field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
+			doc_id = (doc_id_t) mach_read_from_n_little_endian(
+				n_ptr, 8);
+			if (doc_id == 0) {
+				return(DB_FTS_INVALID_DOCID);
+			}
+		}
+
+
+		if (field->real_maybe_null()) {
+			if (field->is_null_in_record(old_row)) {
 				o_len = UNIV_SQL_NULL;
 			}
 
-			if (field_in_record_is_null(table, field,
-							(char*) new_row)) {
+			if (field->is_null_in_record(new_row)) {
 				n_len = UNIV_SQL_NULL;
 			}
 		}
@@ -6695,7 +7595,7 @@ calc_row_difference(
 
 				buf = row_mysql_store_col_in_innobase_format(
 					&dfield,
-					(byte*)buf,
+					(byte*) buf,
 					TRUE,
 					new_mysql_row_col,
 					col_pack_len,
@@ -6710,17 +7610,123 @@ calc_row_difference(
 			ufield->field_no = dict_col_get_clust_pos(
 				&prebuilt->table->cols[innodb_idx], clust_index);
 			n_changed++;
+
+			/* If an FTS indexed column was changed by this
+			UPDATE then we need to inform the FTS sub-system.
+
+			NOTE: Currently we re-index all FTS indexed columns
+			even if only a subset of the FTS indexed columns
+			have been updated. That is the reason we are
+			checking only once here. Later we will need to
+			note which columns have been updated and do
+			selective processing. */
+			if (prebuilt->table->fts != NULL) {
+				ulint           offset;
+				dict_table_t*   innodb_table;
+
+				innodb_table = prebuilt->table;
+
+				if (!changes_fts_column) {
+					offset = row_upd_changes_fts_column(
+						innodb_table, ufield);
+
+					if (offset != ULINT_UNDEFINED) {
+						changes_fts_column = TRUE;
+					}
+				}
+
+				if (!changes_fts_doc_col) {
+					changes_fts_doc_col =
+					row_upd_changes_doc_id(
+						innodb_table, ufield);
+				}
+			}
 		}
-                if (field->stored_in_db)
-                  innodb_idx++;
+		innodb_idx++;
+	}
+
+	/* If the update changes a column with an FTS index on it, we
+	then add an update column node with a new document id to the
+	other changes. We piggy back our changes on the normal UPDATE
+	to reduce processing and IO overhead. */
+	if (!prebuilt->table->fts) {
+			trx->fts_next_doc_id = 0;
+	} else if (changes_fts_column || changes_fts_doc_col) {
+		dict_table_t*   innodb_table = prebuilt->table;
+
+		ufield = uvect->fields + n_changed;
+
+		if (!DICT_TF2_FLAG_IS_SET(
+			innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+			/* If Doc ID is managed by user, and if any
+			FTS indexed column has been updated, its corresponding
+			Doc ID must also be updated. Otherwise, return
+			error */
+			if (changes_fts_column && !changes_fts_doc_col) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr, " InnoDB: A new Doc ID"
+					" must be supplied while updating"
+					" FTS indexed columns.\n");
+				return(DB_FTS_INVALID_DOCID);
+			}
+
+			/* Doc ID must monotonically increase */
+			ut_ad(innodb_table->fts->cache);
+			if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
+				fprintf(stderr,
+					"InnoDB: FTS Doc ID must be larger than"
+					" "IB_ID_FMT" for table",
+					innodb_table->fts->cache->next_doc_id
+					- 1);
+				ut_print_name(stderr, trx,
+					      TRUE, innodb_table->name);
+				putc('\n', stderr);
+
+				return(DB_FTS_INVALID_DOCID);
+			} else if ((doc_id
+				    - prebuilt->table->fts->cache->next_doc_id)
+				   >= FTS_DOC_ID_MAX_STEP) {
+				fprintf(stderr,
+					"InnoDB: Doc ID "UINT64PF" is too"
+					" big. Its difference with largest"
+					" Doc ID used "UINT64PF" cannot"
+					" exceed or equal to %d\n",
+					doc_id,
+					prebuilt->table->fts->cache->next_doc_id - 1,
+					FTS_DOC_ID_MAX_STEP);
+			}
+
+
+			trx->fts_next_doc_id = doc_id;
+		} else {
+			/* If the Doc ID is a hidden column, it can't be
+			changed by user */
+			ut_ad(!changes_fts_doc_col);
+
+			/* Doc ID column is hidden, a new Doc ID will be
+			generated by following fts_update_doc_id() call */
+			trx->fts_next_doc_id = 0;
+		}
+
+		fts_update_doc_id(
+			innodb_table, ufield, &trx->fts_next_doc_id);
+
+		++n_changed;
+	} else {
+		/* We have a Doc ID column, but none of FTS indexed
+		columns are touched, nor the Doc ID column, so set
+		fts_next_doc_id to UINT64_UNDEFINED, which means do not
+		update the Doc ID column */
+		trx->fts_next_doc_id = UINT64_UNDEFINED;
 	}
 
 	uvect->n_fields = n_changed;
 	uvect->info_bits = 0;
 
-	ut_a(buf <= (byte*)original_upd_buff + buff_len);
+	ut_a(buf <= (byte*) original_upd_buff + buff_len);
 
-	return(0);
+	return(DB_SUCCESS);
 }
 
 /**********************************************************************//**
@@ -6739,13 +7745,20 @@ ha_innobase::update_row(
 	uchar*		new_row)	/*!< in: new row in MySQL format */
 {
 	upd_t*		uvect;
-	int		error = 0;
+	dberr_t		error;
 	trx_t*		trx = thd_to_trx(user_thd);
 
 	DBUG_ENTER("ha_innobase::update_row");
 
 	ut_a(prebuilt->trx == trx);
 
+	if (srv_read_only_mode) {
+		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	} else if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
 	if (upd_buf == NULL) {
 		ut_ad(upd_buf_size == 0);
 
@@ -6763,7 +7776,9 @@ ha_innobase::update_row(
 		}
 	}
 
-	if (share->ib_table->is_corrupt) {
+	ha_statistic_increment(&SSV::ha_update_count);
+
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -6776,15 +7791,19 @@ ha_innobase::update_row(
 	/* Build an update vector from the modified fields in the rows
 	(uses upd_buf of the handle) */
 
-	calc_row_difference(uvect, (uchar*) old_row, new_row, table,
-			    upd_buf, upd_buf_size, prebuilt, user_thd);
+	error = calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+				    upd_buf, upd_buf_size, prebuilt, user_thd);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
 
 	/* This is not a delete */
 	prebuilt->upd_node->is_delete = FALSE;
 
 	ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
 
-	innodb_srv_conc_enter_innodb(trx);
+	innobase_srv_conc_enter_innodb(trx);
 
 	error = row_update_for_mysql((byte*) old_row, prebuilt);
 
@@ -6833,19 +7852,22 @@ ha_innobase::update_row(
 	}
 #endif
 
-	innodb_srv_conc_exit_innodb(trx);
+	innobase_srv_conc_exit_innodb(trx);
 
-	error = convert_error_code_to_mysql(error,
+func_exit:
+	int err = convert_error_code_to_mysql(error,
 					    prebuilt->table->flags, user_thd);
 
-	if (error == 0 /* success */
-	    && uvect->n_fields == 0 /* no columns were updated */) {
+	/* If success and no columns were updated. */
+	if (err == 0 && uvect->n_fields == 0) {
 
 		/* This is the same as success, but instructs
 		MySQL that the row is not really updated and it
 		should not increase the count of updated rows.
 		This is fix for http://bugs.mysql.com/29157 */
-		error = HA_ERR_RECORD_IS_THE_SAME;
+		err = HA_ERR_RECORD_IS_THE_SAME;
+	} else if (err == HA_FTS_INVALID_DOCID) {
+		my_error(HA_FTS_INVALID_DOCID, MYF(0));
 	}
 
 	/* Tell InnoDB server that there might be work for
@@ -6853,11 +7875,11 @@ ha_innobase::update_row(
 
 	innobase_active_small();
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
-	DBUG_RETURN(error);
+	DBUG_RETURN(err);
 }
 
 /**********************************************************************//**
@@ -6869,14 +7891,24 @@ ha_innobase::delete_row(
 /*====================*/
 	const uchar*	record)	/*!< in: a row in MySQL format */
 {
-	int		error = 0;
+	dberr_t		error;
 	trx_t*		trx = thd_to_trx(user_thd);
 
 	DBUG_ENTER("ha_innobase::delete_row");
 
 	ut_a(prebuilt->trx == trx);
 
-	if (share->ib_table->is_corrupt) {
+	if (srv_read_only_mode) {
+		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	} else if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
+	ha_statistic_increment(&SSV::ha_delete_count);
+
+	if (UNIV_UNLIKELY(share && share->ib_table
+			  && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -6888,7 +7920,7 @@ ha_innobase::delete_row(
 
 	prebuilt->upd_node->is_delete = TRUE;
 
-	innodb_srv_conc_enter_innodb(trx);
+	innobase_srv_conc_enter_innodb(trx);
 
 	error = row_update_for_mysql((byte*) record, prebuilt);
 
@@ -6898,21 +7930,20 @@ ha_innobase::delete_row(
 	}
 #endif
 
-	innodb_srv_conc_exit_innodb(trx);
-
-	error = convert_error_code_to_mysql(
-		error, prebuilt->table->flags, user_thd);
+	innobase_srv_conc_exit_innodb(trx);
 
 	/* Tell the InnoDB server that there might be work for
 	utility threads: */
 
 	innobase_active_small();
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share && share->ib_table
+			  && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
-	DBUG_RETURN(error);
+	DBUG_RETURN(convert_error_code_to_mysql(
+			    error, prebuilt->table->flags, user_thd));
 }
 
 /**********************************************************************//**
@@ -6926,8 +7957,6 @@ ha_innobase::unlock_row(void)
 {
 	DBUG_ENTER("ha_innobase::unlock_row");
 
-	ut_ad(prebuilt->trx->state == TRX_ACTIVE);
-
 	/* Consistent read does not take any locks, thus there is
 	nothing to unlock. */
 
@@ -6935,6 +7964,12 @@ ha_innobase::unlock_row(void)
 		DBUG_VOID_RETURN;
 	}
 
+	/* Ideally, this assert must be in the beginning of the function.
+	But there are some calls to this function from the SQL layer when the
+	transaction is in state TRX_STATE_NOT_STARTED.  The check on
+	prebuilt->select_lock_type above gets around this issue. */
+	ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE));
+
 	switch (prebuilt->row_read_type) {
 	case ROW_READ_WITH_LOCKS:
 		if (!srv_locks_unsafe_for_binlog
@@ -7010,8 +8045,8 @@ ha_innobase::index_end(void)
 {
 	int	error	= 0;
 	DBUG_ENTER("index_end");
-	active_index=MAX_KEY;
-	in_range_check_pushed_down= FALSE;
+	active_index = MAX_KEY;
+	in_range_check_pushed_down = FALSE;
 	ds_mrr.dsmrr_close();
 	DBUG_RETURN(error);
 }
@@ -7033,7 +8068,7 @@ convert_search_mode_to_innobase(
 		return(PAGE_CUR_GE);
 	case HA_READ_KEY_OR_PREV:
 		return(PAGE_CUR_LE);
-	case HA_READ_AFTER_KEY:	
+	case HA_READ_AFTER_KEY:
 		return(PAGE_CUR_G);
 	case HA_READ_BEFORE_KEY:
 		return(PAGE_CUR_L);
@@ -7147,7 +8182,7 @@ ha_innobase::index_read(
 	dict_index_t*	index;
 	ulint		match_mode	= 0;
 	int		error;
-	ulint		ret;
+	dberr_t		ret;
 
 	DBUG_ENTER("index_read");
 	DEBUG_SYNC_C("ha_innobase_index_read_begin");
@@ -7157,8 +8192,8 @@ ha_innobase::index_read(
 
 	ha_statistic_increment(&SSV::ha_read_key_count);
 
-	if (UNIV_UNLIKELY(share->ib_table->is_corrupt &&
-			  srv_pass_corrupt_table <= 1)) {
+	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -7174,6 +8209,10 @@ ha_innobase::index_read(
 			    : HA_ERR_TABLE_DEF_CHANGED);
 	}
 
+	if (index->type & DICT_FTS) {
+		DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+	}
+
 	/* Note that if the index for which the search template is built is not
 	necessarily prebuilt->index, but can also be the clustered index */
 
@@ -7218,19 +8257,19 @@ ha_innobase::index_read(
 
 	if (mode != PAGE_CUR_UNSUPP) {
 
-		innodb_srv_conc_enter_innodb(prebuilt->trx);
+		innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 		ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
 					   match_mode, 0);
 
-		innodb_srv_conc_exit_innodb(prebuilt->trx);
+		innobase_srv_conc_exit_innodb(prebuilt->trx);
 	} else {
 
 		ret = DB_UNSUPPORTED;
 	}
 
-	if (UNIV_UNLIKELY(share->ib_table->is_corrupt &&
-			  srv_pass_corrupt_table <= 1)) {
+	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -7238,6 +8277,7 @@ ha_innobase::index_read(
 	case DB_SUCCESS:
 		error = 0;
 		table->status = 0;
+		srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
 #ifdef EXTENDED_FOR_USERSTAT
 		rows_read++;
 		if (active_index < MAX_KEY)
@@ -7252,10 +8292,30 @@ ha_innobase::index_read(
 		error = HA_ERR_KEY_NOT_FOUND;
 		table->status = STATUS_NOT_FOUND;
 		break;
+	case DB_TABLESPACE_DELETED:
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_DISCARDED,
+			table->s->table_name.str);
+
+		table->status = STATUS_NOT_FOUND;
+		error = HA_ERR_NO_SUCH_TABLE;
+		break;
+	case DB_TABLESPACE_NOT_FOUND:
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_MISSING, MYF(0),
+			table->s->table_name.str);
+
+		table->status = STATUS_NOT_FOUND;
+		error = HA_ERR_NO_SUCH_TABLE;
+		break;
 	default:
-		error = convert_error_code_to_mysql((int) ret,
-						    prebuilt->table->flags,
-						    user_thd);
+		error = convert_error_code_to_mysql(
+			ret, prebuilt->table->flags, user_thd);
+
 		table->status = STATUS_NOT_FOUND;
 		break;
 	}
@@ -7348,8 +8408,8 @@ ha_innobase::change_active_index(
 {
 	DBUG_ENTER("change_active_index");
 
-	if (UNIV_UNLIKELY(share->ib_table->is_corrupt &&
-			  srv_pass_corrupt_table <= 1)) {
+	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -7372,8 +8432,8 @@ ha_innobase::change_active_index(
 
 	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
 		if (dict_index_is_corrupted(prebuilt->index)) {
-			char	index_name[MAX_FULL_NAME_LEN + 1];
-			char	table_name[MAX_FULL_NAME_LEN + 1];
+			char index_name[MAX_FULL_NAME_LEN + 1];
+			char table_name[MAX_FULL_NAME_LEN + 1];
 
 			innobase_format_name(
 				index_name, sizeof index_name,
@@ -7384,7 +8444,7 @@ ha_innobase::change_active_index(
 				prebuilt->index->table->name, FALSE);
 
 			push_warning_printf(
-				user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				user_thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_INDEX_CORRUPT,
 				"InnoDB: Index %s for table %s is"
 				" marked as corrupted",
@@ -7392,7 +8452,7 @@ ha_innobase::change_active_index(
 			DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
 		} else {
 			push_warning_printf(
-				user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				user_thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_TABLE_DEF_CHANGED,
 				"InnoDB: insufficient history for index %u",
 				keynr);
@@ -7462,27 +8522,27 @@ ha_innobase::general_fetch(
 	uint	match_mode)	/*!< in: 0, ROW_SEL_EXACT, or
 				ROW_SEL_EXACT_PREFIX */
 {
-	ulint		ret;
-	int		error	= 0;
+	dberr_t	ret;
+	int	error;
 
 	DBUG_ENTER("general_fetch");
 
-	if (UNIV_UNLIKELY(share->ib_table->is_corrupt &&
-			  srv_pass_corrupt_table <= 1)) {
+	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
 	ut_a(prebuilt->trx == thd_to_trx(user_thd));
 
-	innodb_srv_conc_enter_innodb(prebuilt->trx);
+	innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 	ret = row_search_for_mysql(
-		(byte*)buf, 0, prebuilt, match_mode, direction);
+		(byte*) buf, 0, prebuilt, match_mode, direction);
 
-	innodb_srv_conc_exit_innodb(prebuilt->trx);
+	innobase_srv_conc_exit_innodb(prebuilt->trx);
 
-	if (UNIV_UNLIKELY(share->ib_table->is_corrupt &&
-			  srv_pass_corrupt_table <= 1)) {
+	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -7490,6 +8550,7 @@ ha_innobase::general_fetch(
 	case DB_SUCCESS:
 		error = 0;
 		table->status = 0;
+		srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
 #ifdef EXTENDED_FOR_USERSTAT
 		rows_read++;
 		if (active_index < MAX_KEY)
@@ -7504,9 +8565,30 @@ ha_innobase::general_fetch(
 		error = HA_ERR_END_OF_FILE;
 		table->status = STATUS_NOT_FOUND;
 		break;
+	case DB_TABLESPACE_DELETED:
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_DISCARDED,
+			table->s->table_name.str);
+
+		table->status = STATUS_NOT_FOUND;
+		error = HA_ERR_NO_SUCH_TABLE;
+		break;
+	case DB_TABLESPACE_NOT_FOUND:
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_MISSING,
+			table->s->table_name.str);
+
+		table->status = STATUS_NOT_FOUND;
+		error = HA_ERR_NO_SUCH_TABLE;
+		break;
 	default:
 		error = convert_error_code_to_mysql(
-			(int) ret, prebuilt->table->flags, user_thd);
+			ret, prebuilt->table->flags, user_thd);
+
 		table->status = STATUS_NOT_FOUND;
 		break;
 	}
@@ -7692,40 +8774,350 @@ ha_innobase::rnd_pos(
 			length of data in pos has to be ref_length */
 {
 	int		error;
-	uint		keynr	= active_index;
 	DBUG_ENTER("rnd_pos");
 	DBUG_DUMP("key", pos, ref_length);
 
 	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
 
-	if (prebuilt->clust_index_was_generated) {
-		/* No primary key was defined for the table and we
-		generated the clustered index from the row id: the
-		row reference is the row id, not any key value
-		that MySQL knows of */
+	/* Note that we assume the length of the row reference is fixed
+	for the table, and it is == ref_length */
 
-		error = change_active_index(MAX_KEY);
-	} else {
-		error = change_active_index(primary_key);
-	}
+	error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
 
 	if (error) {
 		DBUG_PRINT("error", ("Got error: %d", error));
-		DBUG_RETURN(error);
 	}
 
-	/* Note that we assume the length of the row reference is fixed
-	for the table, and it is == ref_length */
+	DBUG_RETURN(error);
+}
 
-	error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
+/**********************************************************************//**
+Initialize FT index scan
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::ft_init()
+/*==================*/
+{
+	DBUG_ENTER("ft_init");
 
-	if (error) {
-		DBUG_PRINT("error", ("Got error: %d", error));
+	trx_t*	trx = check_trx_exists(ha_thd());
+
+	/* FTS queries are not treated as autocommit non-locking selects.
+	This is because the FTS implementation can acquire locks behind
+	the scenes. This has not been verified but it is safer to treat
+	them as regular read only transactions for now. */
+
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
 	}
 
-	change_active_index(keynr);
+	DBUG_RETURN(rnd_init(false));
+}
 
-	DBUG_RETURN(error);
+/**********************************************************************//**
+Initialize FT index scan
+@return FT_INFO structure if successful or NULL */
+UNIV_INTERN
+FT_INFO*
+ha_innobase::ft_init_ext(
+/*=====================*/
+	uint			flags,	/* in: */
+	uint			keynr,	/* in: */
+	String*			key)	/* in: */
+{
+	trx_t*			trx;
+	dict_table_t*		table;
+	dberr_t			error;
+	byte*			query = (byte*) key->ptr();
+	ulint			query_len = key->length();
+	const CHARSET_INFO*	char_set = key->charset();
+	NEW_FT_INFO*		fts_hdl = NULL;
+	dict_index_t*		index;
+	fts_result_t*		result;
+	char			buf_tmp[8192];
+	ulint			buf_tmp_used;
+	uint			num_errors;
+
+	if (fts_enable_diag_print) {
+		fprintf(stderr, "keynr=%u, '%.*s'\n",
+			keynr, (int) key->length(), (byte*) key->ptr());
+
+		if (flags & FT_BOOL) {
+			fprintf(stderr, "BOOL search\n");
+		} else {
+			fprintf(stderr, "NL search\n");
+		}
+	}
+
+	/* FIXME: utf32 and utf16 are not compatible with some of the
+	string functions used, so convert such queries to utf8 before
+	proceeding. */
+	if (strcmp(char_set->csname, "utf32") == 0
+	    || strcmp(char_set->csname, "utf16") == 0) {
+		buf_tmp_used = innobase_convert_string(
+			buf_tmp, sizeof(buf_tmp) - 1,
+			&my_charset_utf8_general_ci,
+			query, query_len, (CHARSET_INFO*) char_set,
+			&num_errors);
+
+		query = (byte*) buf_tmp;
+		query_len = buf_tmp_used;
+		query[query_len] = 0;
+	}
+
+	trx = prebuilt->trx;
+
+	/* FTS queries are not treated as autocommit non-locking selects.
+	This is because the FTS implementation can acquire locks behind
+	the scenes. This has not been verified but it is safer to treat
+	them as regular read only transactions for now. */
+
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
+	table = prebuilt->table;
+
+	/* Table does not have an FTS index */
+	if (!table->fts || ib_vector_is_empty(table->fts->indexes)) {
+		my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+		return(NULL);
+	}
+
+	if (keynr == NO_SUCH_KEY) {
+		/* FIXME: Investigate the NO_SUCH_KEY usage */
+		index = (dict_index_t*) ib_vector_getp(table->fts->indexes, 0);
+	} else {
+		index = innobase_get_index(keynr);
+	}
+
+	if (!index || index->type != DICT_FTS) {
+		my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+		return(NULL);
+	}
+
+	if (!(table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+		fts_init_index(table, FALSE);
+
+		table->fts->fts_status |= ADDED_TABLE_SYNCED;
+	}
+
+	error = fts_query(trx, index, flags, query, query_len, &result);
+
+	if (error != DB_SUCCESS) {
+		my_error(convert_error_code_to_mysql(error, 0, NULL),
+			MYF(0));
+		return(NULL);
+	}
+
+	/* Allocate FTS handler, and instantiate it before return */
+	fts_hdl = static_cast<NEW_FT_INFO*>(my_malloc(sizeof(NEW_FT_INFO),
+				   MYF(0)));
+
+	fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result);
+	fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result);
+	fts_hdl->ft_prebuilt = prebuilt;
+	fts_hdl->ft_result = result;
+
+	/* FIXME: Re-evaluate the condition when Bug 14469540
+	is resolved */
+	prebuilt->in_fts_query = true;
+
+	return((FT_INFO*) fts_hdl);
+}
+
+/*****************************************************************//**
+Set up search tuple for a query through FTS_DOC_ID_INDEX on
+supplied Doc ID. This is used by MySQL to retrieve the documents
+once the search result (Doc IDs) is available */
+static
+void
+innobase_fts_create_doc_id_key(
+/*===========================*/
+	dtuple_t*	tuple,		/* in/out: prebuilt->search_tuple */
+	const dict_index_t*
+			index,		/* in: index (FTS_DOC_ID_INDEX) */
+	doc_id_t*	doc_id)		/* in/out: doc id to search, value
+					could be changed to storage format
+					used for search. */
+{
+	doc_id_t	temp_doc_id;
+	dfield_t*	dfield = dtuple_get_nth_field(tuple, 0);
+
+	ut_a(dict_index_get_n_unique(index) == 1);
+
+	dtuple_set_n_fields(tuple, index->n_fields);
+	dict_index_copy_types(tuple, index, index->n_fields);
+
+#ifdef UNIV_DEBUG
+	/* The unique Doc ID field should be an eight-bytes integer */
+	dict_field_t*	field = dict_index_get_nth_field(index, 0);
+        ut_a(field->col->mtype == DATA_INT);
+	ut_ad(sizeof(*doc_id) == field->fixed_len);
+	ut_ad(innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME) == 0);
+#endif /* UNIV_DEBUG */
+
+	/* Convert to storage byte order */
+	mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id);
+	*doc_id = temp_doc_id;
+	dfield_set_data(dfield, doc_id, sizeof(*doc_id));
+
+        dtuple_set_n_fields_cmp(tuple, 1);
+
+	for (ulint i = 1; i < index->n_fields; i++) {
+		dfield = dtuple_get_nth_field(tuple, i);
+		dfield_set_null(dfield);
+	}
+}
+
+/**********************************************************************//**
+Fetch next result from the FT result set
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::ft_read(
+/*=================*/
+	uchar*		buf)		/*!< in/out: buf contain result row */
+{
+	fts_result_t*	result;
+	int		error;
+	row_prebuilt_t*	ft_prebuilt;
+
+	ft_prebuilt = ((NEW_FT_INFO*) ft_handler)->ft_prebuilt;
+
+	ut_a(ft_prebuilt == prebuilt);
+
+	result = ((NEW_FT_INFO*) ft_handler)->ft_result;
+
+	if (result->current == NULL) {
+		/* This is the case where the FTS query did not
+		contain any matching documents. */
+		if (result->rankings_by_id != NULL) {
+			/* Now that we have the complete result, we
+			need to sort the document ids on their rank
+			calculation. */
+
+			fts_query_sort_result_on_rank(result);
+
+			result->current = const_cast<ib_rbt_node_t*>(
+				rbt_first(result->rankings_by_rank));
+		} else {
+			ut_a(result->current == NULL);
+		}
+	} else {
+		result->current = const_cast<ib_rbt_node_t*>(
+			rbt_next(result->rankings_by_rank, result->current));
+	}
+
+next_record:
+
+	if (result->current != NULL) {
+		dict_index_t*	index;
+		dtuple_t*	tuple = prebuilt->search_tuple;
+		doc_id_t	search_doc_id;
+
+		/* If we only need information from result we can return
+		   without fetching the table row */
+		if (ft_prebuilt->read_just_key) {
+			table->status= 0;
+			return(0);
+		}
+
+		index = dict_table_get_index_on_name(
+			prebuilt->table, FTS_DOC_ID_INDEX_NAME);
+
+		/* Must find the index */
+		ut_a(index);
+
+		/* Switch to the FTS doc id index */
+		prebuilt->index = index;
+
+		fts_ranking_t*	ranking = rbt_value(
+			fts_ranking_t, result->current);
+
+		search_doc_id = ranking->doc_id;
+
+		/* We pass a pointer of search_doc_id because it will be
+		converted to storage byte order used in the search
+		tuple. */
+		innobase_fts_create_doc_id_key(tuple, index, &search_doc_id);
+
+		innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+		dberr_t ret = row_search_for_mysql(
+			(byte*) buf, PAGE_CUR_GE, prebuilt, ROW_SEL_EXACT, 0);
+
+		innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+		switch (ret) {
+		case DB_SUCCESS:
+			error = 0;
+			table->status = 0;
+			break;
+		case DB_RECORD_NOT_FOUND:
+			result->current = const_cast<ib_rbt_node_t*>(
+				rbt_next(result->rankings_by_rank,
+					 result->current));
+
+			if (!result->current) {
+				/* exhaust the result set, should return
+				HA_ERR_END_OF_FILE just like
+				ha_innobase::general_fetch() and/or
+				ha_innobase::index_first() etc. */
+				error = HA_ERR_END_OF_FILE;
+				table->status = STATUS_NOT_FOUND;
+			} else {
+				goto next_record;
+			}
+			break;
+		case DB_END_OF_INDEX:
+			error = HA_ERR_END_OF_FILE;
+			table->status = STATUS_NOT_FOUND;
+			break;
+		case DB_TABLESPACE_DELETED:
+
+			ib_senderrf(
+				prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLESPACE_DISCARDED,
+				table->s->table_name.str);
+
+			table->status = STATUS_NOT_FOUND;
+			error = HA_ERR_NO_SUCH_TABLE;
+			break;
+		case DB_TABLESPACE_NOT_FOUND:
+
+			ib_senderrf(
+				prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLESPACE_MISSING,
+				table->s->table_name.str);
+
+			table->status = STATUS_NOT_FOUND;
+			error = HA_ERR_NO_SUCH_TABLE;
+			break;
+		default:
+			error = convert_error_code_to_mysql(
+				ret, 0, user_thd);
+
+			table->status = STATUS_NOT_FOUND;
+			break;
+		}
+
+		return(error);
+	}
+
+	return(HA_ERR_END_OF_FILE);
+}
+
+/*************************************************************************
+*/
+
+void
+ha_innobase::ft_end()
+{
+	fprintf(stderr, "ft_end()\n");
+
+	rnd_end();
 }
 
 /*********************************************************************//**
@@ -7756,7 +9148,7 @@ ha_innobase::position(
 
 		memcpy(ref, prebuilt->row_id, len);
 	} else {
-		len = store_key_val_for_row(primary_key, (char*)ref,
+		len = store_key_val_for_row(primary_key, (char*) ref,
 							 ref_length, record);
 	}
 
@@ -7764,8 +9156,8 @@ ha_innobase::position(
 	table. */
 
 	if (len != ref_length) {
-	  sql_print_error("Stored ref len is %lu, but table ref len is %lu",
-			  (ulong) len, (ulong) ref_length);
+		sql_print_error("Stored ref len is %lu, but table ref len is "
+				"%lu", (ulong) len, (ulong) ref_length);
 	}
 }
 
@@ -7776,29 +9168,91 @@ See http://bugs.mysql.com/32710 for expl. why we choose PROCESS. */
 	 && check_global_access(thd, PROCESS_ACL))
 
 /*****************************************************************//**
-Creates a table definition to an InnoDB database. */
+Check whether there exists a column named "FTS_DOC_ID", which is
+reserved for InnoDB FTS Doc ID
+@return true if there exists a "FTS_DOC_ID" column */
 static
+bool
+create_table_check_doc_id_col(
+/*==========================*/
+	trx_t*		trx,		/*!< in: InnoDB transaction handle */
+	const TABLE*	form,		/*!< in: information on table
+					columns and indexes */
+	ulint*		doc_id_col)	/*!< out: Doc ID column number if
+					there exist a FTS_DOC_ID column,
+					ULINT_UNDEFINED if column is of the
+					wrong type/name/size */
+{
+	for (ulint i = 0; i < form->s->fields; i++) {
+		const Field*	field;
+		ulint		col_type;
+		ulint		col_len;
+		ulint		unsigned_type;
+
+		field = form->field[i];
+
+		col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+							     field);
+
+		col_len = field->pack_length();
+
+		if (innobase_strcasecmp(field->field_name,
+					FTS_DOC_ID_COL_NAME) == 0) {
+
+			/* Note the name is case sensitive due to
+			our internal query parser */
+			if (col_type == DATA_INT
+			    && !field->real_maybe_null()
+			    && col_len == sizeof(doc_id_t)
+			    && (strcmp(field->field_name,
+				      FTS_DOC_ID_COL_NAME) == 0)) {
+				*doc_id_col = i;
+			} else {
+				push_warning_printf(
+					trx->mysql_thd,
+					Sql_condition::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: FTS_DOC_ID column must be "
+					"of BIGINT NOT NULL type, and named "
+					"in all capitalized characters");
+				my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+					 field->field_name);
+				*doc_id_col = ULINT_UNDEFINED;
+			}
+
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/*****************************************************************//**
+Creates a table definition to an InnoDB database. */
+static __attribute__((nonnull, warn_unused_result))
 int
 create_table_def(
 /*=============*/
 	trx_t*		trx,		/*!< in: InnoDB transaction handle */
-	TABLE*		form,		/*!< in: information on table
+	const TABLE*	form,		/*!< in: information on table
 					columns and indexes */
 	const char*	table_name,	/*!< in: table name */
-	const char*	path_of_temp_table,/*!< in: if this is a table explicitly
+	const char*	temp_path,	/*!< in: if this is a table explicitly
 					created by the user with the
 					TEMPORARY keyword, then this
 					parameter is the dir path where the
 					table should be placed if we create
 					an .ibd file for it (no .ibd extension
-					in the path, though); otherwise this
-					is NULL */
-	ulint		flags)		/*!< in: table flags */
+					in the path, though). Otherwise this
+					is a zero length-string */
+	const char*	remote_path,	/*!< in: Remote path or zero length-string */
+	ulint		flags,		/*!< in: table flags */
+	ulint		flags2)		/*!< in: table flags2 */
 {
-	Field*		field;
+	THD*		thd = trx->mysql_thd;
 	dict_table_t*	table;
-	ulint		n_cols;
-	int		error;
+	ulint		n_cols, s_cols;
+	dberr_t		err;
 	ulint		col_type;
 	ulint		col_len;
 	ulint		nulls_allowed;
@@ -7807,92 +9261,136 @@ create_table_def(
 	ulint		long_true_varchar;
 	ulint		charset_no;
 	ulint		i;
+	ulint		doc_id_col = 0;
+	ibool		has_doc_id_col = FALSE;
+	mem_heap_t*	heap;
 
 	DBUG_ENTER("create_table_def");
 	DBUG_PRINT("enter", ("table_name: %s", table_name));
 
-	ut_a(trx->mysql_thd != NULL);
+	DBUG_ASSERT(thd != NULL);
 
 	/* MySQL does the name length check. But we do additional check
 	on the name length here */
 	if (strlen(table_name) > MAX_FULL_NAME_LEN) {
 		push_warning_printf(
-			(THD*) trx->mysql_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			thd, Sql_condition::WARN_LEVEL_WARN,
 			ER_TABLE_NAME,
 			"InnoDB: Table Name or Database Name is too long");
 
 		DBUG_RETURN(ER_TABLE_NAME);
 	}
 
+	/* table_name must contain '/'. Later in the code we assert if it
+	does not */
+	if (strcmp(strchr(table_name, '/') + 1,
+		   "innodb_table_monitor") == 0) {
+		push_warning(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			HA_ERR_WRONG_COMMAND,
+			DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+	}
+
 	n_cols = form->s->fields;
+	s_cols = form->s->stored_fields;
+
+	/* Check whether there already exists a FTS_DOC_ID column */
+	if (create_table_check_doc_id_col(trx, form, &doc_id_col)){
+
+		/* Raise error if the Doc ID column is of wrong type or name */
+		if (doc_id_col == ULINT_UNDEFINED) {
+			trx_commit_for_mysql(trx);
+
+			err = DB_ERROR;
+			goto error_ret;
+		} else {
+			has_doc_id_col = TRUE;
+		}
+	}
 
 	/* We pass 0 as the space id, and determine at a lower level the space
 	id where to store the table */
 
-	table = dict_mem_table_create(table_name, 0, form->s->stored_fields, flags);
+	if (flags2 & DICT_TF2_FTS) {
+		/* Adjust for the FTS hidden field */
+		if (!has_doc_id_col) {
+			table = dict_mem_table_create(table_name, 0, s_cols + 1,
+						      flags, flags2);
 
-	if (path_of_temp_table) {
+			/* Set the hidden doc_id column. */
+			table->fts->doc_col = s_cols;
+		} else {
+			table = dict_mem_table_create(table_name, 0, s_cols,
+						      flags, flags2);
+			table->fts->doc_col = doc_id_col;
+		}
+	} else {
+		table = dict_mem_table_create(table_name, 0, s_cols,
+					      flags, flags2);
+	}
+
+	if (flags2 & DICT_TF2_TEMPORARY) {
+		ut_a(strlen(temp_path));
 		table->dir_path_of_temp_table =
-			mem_heap_strdup(table->heap, path_of_temp_table);
+			mem_heap_strdup(table->heap, temp_path);
+	}
+
+	if (DICT_TF_HAS_DATA_DIR(flags)) {
+		ut_a(strlen(remote_path));
+		table->data_dir_path = mem_heap_strdup(table->heap, remote_path);
+	} else {
+		table->data_dir_path = NULL;
 	}
+	heap = mem_heap_create(1000);
 
 	for (i = 0; i < n_cols; i++) {
-		field = form->field[i];
+		Field*	field = form->field[i];
 		if (!field->stored_in_db)
-		  continue;
+			continue;
 
 		col_type = get_innobase_type_from_mysql_type(&unsigned_type,
 							     field);
 
 		if (!col_type) {
 			push_warning_printf(
-				(THD*) trx->mysql_thd,
-				MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				ER_CANT_CREATE_TABLE,
 				"Error creating table '%s' with "
 				"column '%s'. Please check its "
 				"column type and try to re-create "
 				"the table with an appropriate "
 				"column type.",
-				table->name, (char*) field->field_name);
+				table->name, field->field_name);
 			goto err_col;
 		}
 
-		if (field->null_ptr) {
-			nulls_allowed = 0;
-		} else {
-			nulls_allowed = DATA_NOT_NULL;
-		}
-
-		if (field->binary()) {
-			binary_type = DATA_BINARY_TYPE;
-		} else {
-			binary_type = 0;
-		}
+		nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL;
+		binary_type = field->binary() ? DATA_BINARY_TYPE : 0;
 
 		charset_no = 0;
 
 		if (dtype_is_string_type(col_type)) {
 
-			charset_no = (ulint)field->charset()->number;
+			charset_no = (ulint) field->charset()->number;
 
-			if (UNIV_UNLIKELY(charset_no >= 256)) {
+			if (UNIV_UNLIKELY(charset_no > MAX_CHAR_COLL_NUM)) {
 				/* in data0type.h we assume that the
 				number fits in one byte in prtype */
 				push_warning_printf(
-					(THD*) trx->mysql_thd,
-					MYSQL_ERROR::WARN_LEVEL_WARN,
+					thd, Sql_condition::WARN_LEVEL_WARN,
 					ER_CANT_CREATE_TABLE,
 					"In InnoDB, charset-collation codes"
 					" must be below 256."
 					" Unsupported code %lu.",
 					(ulong) charset_no);
+				mem_heap_free(heap);
 				DBUG_RETURN(ER_CANT_CREATE_TABLE);
 			}
 		}
 
-		ut_a(field->type() < 256); /* we assume in dtype_form_prtype()
-					   that this fits in one byte */
+		/* we assume in dtype_form_prtype() that this fits in
+		two bytes */
+		ut_a(field->type() <= MAX_CHAR_COLL_NUM);
 		col_len = field->pack_length();
 
 		/* The MySQL pack length contains 1 or 2 bytes length field
@@ -7903,9 +9401,9 @@ create_table_def(
 		long_true_varchar = 0;
 
 		if (field->type() == MYSQL_TYPE_VARCHAR) {
-			col_len -= ((Field_varstring*)field)->length_bytes;
+			col_len -= ((Field_varstring*) field)->length_bytes;
 
-			if (((Field_varstring*)field)->length_bytes == 2) {
+			if (((Field_varstring*) field)->length_bytes == 2) {
 				long_true_varchar = DATA_LONG_TRUE_VARCHAR;
 			}
 		}
@@ -7917,39 +9415,56 @@ create_table_def(
 				 field->field_name);
 err_col:
 			dict_mem_table_free(table);
+			mem_heap_free(heap);
 			trx_commit_for_mysql(trx);
 
-			error = DB_ERROR;
+			err = DB_ERROR;
 			goto error_ret;
 		}
 
-		dict_mem_table_add_col(table, table->heap,
-			(char*) field->field_name,
+		dict_mem_table_add_col(table, heap,
+			field->field_name,
 			col_type,
 			dtype_form_prtype(
-				(ulint)field->type()
+				(ulint) field->type()
 				| nulls_allowed | unsigned_type
 				| binary_type | long_true_varchar,
 				charset_no),
 			col_len);
 	}
 
-	error = row_create_table_for_mysql(table, trx);
+	/* Add the FTS doc_id hidden column. */
+	if (flags2 & DICT_TF2_FTS && !has_doc_id_col) {
+		fts_add_doc_id_column(table, heap);
+	}
 
-	if (error == DB_DUPLICATE_KEY) {
-		char buf[100];
+	err = row_create_table_for_mysql(table, trx, false);
+
+	mem_heap_free(heap);
+
+	DBUG_EXECUTE_IF("ib_create_err_tablespace_exist",
+			err = DB_TABLESPACE_EXISTS;);
+
+	if (err == DB_DUPLICATE_KEY || err == DB_TABLESPACE_EXISTS) {
+		char display_name[FN_REFLEN];
 		char* buf_end = innobase_convert_identifier(
-			buf, sizeof buf - 1, table_name, strlen(table_name),
-			trx->mysql_thd, TRUE);
+			display_name, sizeof(display_name) - 1,
+			table_name, strlen(table_name),
+			thd, TRUE);
 
 		*buf_end = '\0';
-		my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf);
+
+		my_error(err == DB_DUPLICATE_KEY
+			 ? ER_TABLE_EXISTS_ERROR
+			 : ER_TABLESPACE_EXISTS, MYF(0), display_name);
 	}
 
-error_ret:
-	error = convert_error_code_to_mysql(error, flags, NULL);
+	if (err == DB_SUCCESS && (flags2 & DICT_TF2_FTS)) {
+		fts_optimize_add_table(table);
+	}
 
-	DBUG_RETURN(error);
+error_ret:
+	DBUG_RETURN(convert_error_code_to_mysql(err, flags, thd));
 }
 
 /*****************************************************************//**
@@ -7959,95 +9474,112 @@ int
 create_index(
 /*=========*/
 	trx_t*		trx,		/*!< in: InnoDB transaction handle */
-	TABLE*		form,		/*!< in: information on table
+	const TABLE*	form,		/*!< in: information on table
 					columns and indexes */
 	ulint		flags,		/*!< in: InnoDB table flags */
 	const char*	table_name,	/*!< in: table name */
 	uint		key_num)	/*!< in: index number */
 {
-	Field*		field;
 	dict_index_t*	index;
 	int		error;
-	ulint		n_fields;
-	KEY*		key;
-	KEY_PART_INFO*	key_part;
+	const KEY*	key;
 	ulint		ind_type;
-	ulint		col_type;
-	ulint		prefix_len;
-	ulint		is_unsigned;
-	ulint		i;
-	ulint		j;
 	ulint*		field_lengths;
 
 	DBUG_ENTER("create_index");
 
 	key = form->key_info + key_num;
 
-	n_fields = key->key_parts;
-
 	/* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
 	ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0);
 
+	if (key->flags & HA_FULLTEXT) {
+		index = dict_mem_index_create(table_name, key->name, 0,
+					      DICT_FTS,
+					      key->user_defined_key_parts);
+
+		for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+			KEY_PART_INFO*	key_part = key->key_part + i;
+			dict_mem_index_add_field(
+				index, key_part->field->field_name, 0);
+		}
+
+		DBUG_RETURN(convert_error_code_to_mysql(
+				    row_create_index_for_mysql(
+					    index, trx, NULL),
+				    flags, NULL));
+
+	}
+
 	ind_type = 0;
 
 	if (key_num == form->s->primary_key) {
-		ind_type = ind_type | DICT_CLUSTERED;
+		ind_type |= DICT_CLUSTERED;
 	}
 
-	if (key->flags & HA_NOSAME ) {
-		ind_type = ind_type | DICT_UNIQUE;
+	if (key->flags & HA_NOSAME) {
+		ind_type |= DICT_UNIQUE;
 	}
 
+	field_lengths = (ulint*) my_malloc(
+		key->user_defined_key_parts * sizeof *
+				field_lengths, MYF(MY_FAE));
+
 	/* We pass 0 as the space id, and determine at a lower level the space
 	id where to store the table */
 
 	index = dict_mem_index_create(table_name, key->name, 0,
-				      ind_type, n_fields);
+				      ind_type, key->user_defined_key_parts);
 
-	field_lengths = (ulint*) my_malloc(sizeof(ulint) * n_fields,
-		MYF(MY_FAE));
+	for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+		KEY_PART_INFO*	key_part = key->key_part + i;
+		ulint		prefix_len;
+		ulint		col_type;
+		ulint		is_unsigned;
 
-	for (i = 0; i < n_fields; i++) {
-		key_part = key->key_part + i;
 
-		/* (The flag HA_PART_KEY_SEG denotes in MySQL a column prefix
-		field in an index: we only store a specified number of first
-		bytes of the column to the index field.) The flag does not
-		seem to be properly set by MySQL. Let us fall back on testing
+		/* (The flag HA_PART_KEY_SEG denotes in MySQL a
+		column prefix field in an index: we only store a
+		specified number of first bytes of the column to
+		the index field.) The flag does not seem to be
+		properly set by MySQL. Let us fall back on testing
 		the length of the key part versus the column. */
 
-		field = NULL;
-		for (j = 0; j < form->s->fields; j++) {
+		Field*	field = NULL;
+
+		for (ulint j = 0; j < form->s->fields; j++) {
 
 			field = form->field[j];
 
 			if (0 == innobase_strcasecmp(
-					field->field_name,
-					key_part->field->field_name)) {
+				    field->field_name,
+				    key_part->field->field_name)) {
 				/* Found the corresponding column */
 
-				break;
+				goto found;
 			}
 		}
 
-		ut_a(j < form->s->fields);
-
+		ut_error;
+found:
 		col_type = get_innobase_type_from_mysql_type(
-					&is_unsigned, key_part->field);
+			&is_unsigned, key_part->field);
 
 		if (DATA_BLOB == col_type
-			|| (key_part->length < field->pack_length()
-				&& field->type() != MYSQL_TYPE_VARCHAR)
-			|| (field->type() == MYSQL_TYPE_VARCHAR
-				&& key_part->length < field->pack_length()
-				- ((Field_varstring*)field)->length_bytes)) {
+		    || (key_part->length < field->pack_length()
+			&& field->type() != MYSQL_TYPE_VARCHAR)
+		    || (field->type() == MYSQL_TYPE_VARCHAR
+			&& key_part->length < field->pack_length()
+			- ((Field_varstring*) field)->length_bytes)) {
 
-			prefix_len = key_part->length;
-
-			if (col_type == DATA_INT
-				|| col_type == DATA_FLOAT
-				|| col_type == DATA_DOUBLE
-				|| col_type == DATA_DECIMAL) {
+			switch (col_type) {
+			default:
+				prefix_len = key_part->length;
+				break;
+			case DATA_INT:
+			case DATA_FLOAT:
+			case DATA_DOUBLE:
+			case DATA_DECIMAL:
 				sql_print_error(
 					"MySQL is trying to create a column "
 					"prefix index field, on an "
@@ -8064,16 +9596,19 @@ create_index(
 
 		field_lengths[i] = key_part->length;
 
-		dict_mem_index_add_field(index,
-			(char*) key_part->field->field_name, prefix_len);
+		dict_mem_index_add_field(
+			index, key_part->field->field_name, prefix_len);
 	}
 
+	ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS));
+
 	/* Even though we've defined max_supported_key_part_length, we
 	still do our own checking using field_lengths to be absolutely
 	sure we don't create too long indexes. */
-	error = row_create_index_for_mysql(index, trx, field_lengths);
 
-	error = convert_error_code_to_mysql(error, flags, NULL);
+	error = convert_error_code_to_mysql(
+		row_create_index_for_mysql(index, trx, field_lengths),
+		flags, NULL);
 
 	my_free(field_lengths);
 
@@ -8092,7 +9627,7 @@ create_clustered_index_when_no_primary(
 	const char*	table_name)	/*!< in: table name */
 {
 	dict_index_t*	index;
-	int		error;
+	dberr_t		error;
 
 	/* We pass 0 as the space id, and determine at a lower level the space
 	id where to store the table */
@@ -8102,9 +9637,7 @@ create_clustered_index_when_no_primary(
 
 	error = row_create_index_for_mysql(index, trx, NULL);
 
-	error = convert_error_code_to_mysql(error, flags, NULL);
-
-	return(error);
+	return(convert_error_code_to_mysql(error, flags, NULL));
 }
 
 /*****************************************************************//**
@@ -8137,27 +9670,27 @@ get_row_format_name(
 }
 
 /** If file-per-table is missing, issue warning and set ret false */
-#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE		\
-	if (!srv_file_per_table) {				\
+#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace)\
+	if (!use_tablespace) {					\
 		push_warning_printf(				\
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
-			HA_WRONG_CREATE_OPTION,		\
+			thd, Sql_condition::WARN_LEVEL_WARN,	\
+			ER_ILLEGAL_HA_CREATE_OPTION,		\
 			"InnoDB: ROW_FORMAT=%s requires"	\
 			" innodb_file_per_table.",		\
 			get_row_format_name(row_format));	\
-		ret = FALSE;					\
+		ret = "ROW_FORMAT";					\
 	}
 
 /** If file-format is Antelope, issue warning and set ret false */
 #define CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE			\
-	if (srv_file_format < DICT_TF_FORMAT_ZIP) {		\
+	if (srv_file_format < UNIV_FORMAT_B) {		\
 		push_warning_printf(				\
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
-			HA_WRONG_CREATE_OPTION,		\
+			thd, Sql_condition::WARN_LEVEL_WARN,	\
+			ER_ILLEGAL_HA_CREATE_OPTION,		\
 			"InnoDB: ROW_FORMAT=%s requires"	\
 			" innodb_file_format > Antelope.",	\
 			get_row_format_name(row_format));	\
-		ret = FALSE;					\
+		ret = "ROW_FORMAT";				\
 	}
 
 
@@ -8166,25 +9699,26 @@ Validates the create options. We may build on this function
 in future. For now, it checks two specifiers:
 KEY_BLOCK_SIZE and ROW_FORMAT
 If innodb_strict_mode is not set then this function is a no-op
-@return	TRUE if valid. */
-static
-ibool
-create_options_are_valid(
-/*=====================*/
+@return	NULL if valid, else the name of the offending option. */
+UNIV_INTERN
+const char*
+create_options_are_invalid(
+/*=======================*/
 	THD*		thd,		/*!< in: connection thread. */
 	TABLE*		form,		/*!< in: information on table
 					columns and indexes */
-	HA_CREATE_INFO*	create_info)	/*!< in: create info. */
+	HA_CREATE_INFO*	create_info,	/*!< in: create info. */
+	bool		use_tablespace)	/*!< in: srv_file_per_table */
 {
 	ibool	kbs_specified	= FALSE;
-	ibool	ret		= TRUE;
+	const char*	ret	= NULL;
 	enum row_type	row_format	= form->s->row_type;
 
 	ut_ad(thd != NULL);
 
 	/* If innodb_strict_mode is not set don't do any validation. */
 	if (!(THDVAR(thd, strict_mode))) {
-		return(TRUE);
+		return(NULL);
 	}
 
 	ut_ad(form != NULL);
@@ -8194,62 +9728,80 @@ create_options_are_valid(
 	if (create_info->key_block_size) {
 		kbs_specified = TRUE;
 		switch (create_info->key_block_size) {
+			ulint	kbs_max;
 		case 1:
 		case 2:
 		case 4:
 		case 8:
 		case 16:
 			/* Valid KEY_BLOCK_SIZE, check its dependencies. */
-			if (!srv_file_per_table) {
+			if (!use_tablespace) {
 				push_warning(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					HA_WRONG_CREATE_OPTION,
+					thd, Sql_condition::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
 					"InnoDB: KEY_BLOCK_SIZE requires"
 					" innodb_file_per_table.");
-				ret = FALSE;
+				ret = "KEY_BLOCK_SIZE";
 			}
-			if (srv_file_format < DICT_TF_FORMAT_ZIP) {
+			if (srv_file_format < UNIV_FORMAT_B) {
 				push_warning(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					HA_WRONG_CREATE_OPTION,
+					thd, Sql_condition::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
 					"InnoDB: KEY_BLOCK_SIZE requires"
 					" innodb_file_format > Antelope.");
-					ret = FALSE;
+				ret = "KEY_BLOCK_SIZE";
+			}
+
+			/* The maximum KEY_BLOCK_SIZE (KBS) is 16. But if
+			UNIV_PAGE_SIZE is smaller than 16k, the maximum
+			KBS is also smaller. */
+			kbs_max = ut_min(
+				1 << (UNIV_PAGE_SSIZE_MAX - 1),
+				1 << (PAGE_ZIP_SSIZE_MAX - 1));
+			if (create_info->key_block_size > kbs_max) {
+				push_warning_printf(
+					thd, Sql_condition::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: KEY_BLOCK_SIZE=%ld"
+					" cannot be larger than %ld.",
+					create_info->key_block_size,
+					kbs_max);
+				ret = "KEY_BLOCK_SIZE";
 			}
 			break;
 		default:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: invalid KEY_BLOCK_SIZE = %lu."
 				" Valid values are [1, 2, 4, 8, 16]",
 				create_info->key_block_size);
-			ret = FALSE;
+			ret = "KEY_BLOCK_SIZE";
 			break;
 		}
 	}
-	
+
 	/* Check for a valid Innodb ROW_FORMAT specifier and
 	other incompatibilities. */
 	switch (row_format) {
 	case ROW_TYPE_COMPRESSED:
-		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE;
+		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
 		CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
 		break;
 	case ROW_TYPE_DYNAMIC:
-		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE;
+		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
 		CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
 		/* fall through since dynamic also shuns KBS */
 	case ROW_TYPE_COMPACT:
 	case ROW_TYPE_REDUNDANT:
 		if (kbs_specified) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: cannot specify ROW_FORMAT = %s"
 				" with KEY_BLOCK_SIZE.",
 				get_row_format_name(row_format));
-			ret = FALSE;
+			ret = "KEY_BLOCK_SIZE";
 		}
 		break;
 	case ROW_TYPE_DEFAULT:
@@ -8258,13 +9810,43 @@ create_options_are_valid(
 	case ROW_TYPE_PAGE:
 	case ROW_TYPE_NOT_USED:
 		push_warning(
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-			HA_WRONG_CREATE_OPTION,		\
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,		\
 			"InnoDB: invalid ROW_FORMAT specifier.");
-		ret = FALSE;
+		ret = "ROW_TYPE";
 		break;
 	}
 
+	/* Use DATA DIRECTORY only with file-per-table. */
+	if (create_info->data_file_name && !use_tablespace) {
+		push_warning(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,
+			"InnoDB: DATA DIRECTORY requires"
+			" innodb_file_per_table.");
+		ret = "DATA DIRECTORY";
+	}
+
+	/* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+	if (create_info->data_file_name
+	    && create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+		push_warning(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,
+			"InnoDB: DATA DIRECTORY cannot be used"
+			" for TEMPORARY tables.");
+		ret = "DATA DIRECTORY";
+	}
+
+	/* Do not allow INDEX_DIRECTORY */
+	if (create_info->index_file_name) {
+		push_warning_printf(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,
+			"InnoDB: INDEX DIRECTORY is not supported");
+		ret = "INDEX DIRECTORY";
+	}
+
 	return(ret);
 }
 
@@ -8274,55 +9856,64 @@ UNIV_INTERN
 void
 ha_innobase::update_create_info(
 /*============================*/
-	HA_CREATE_INFO* create_info)	/*!< in/out: create info */
+	HA_CREATE_INFO*	create_info)	/*!< in/out: create info */
 {
-  if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
-    ha_innobase::info(HA_STATUS_AUTO);
-    create_info->auto_increment_value = stats.auto_increment_value;
-  }
+	if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
+		ha_innobase::info(HA_STATUS_AUTO);
+		create_info->auto_increment_value = stats.auto_increment_value;
+	}
+
+	/* Update the DATA DIRECTORY name from SYS_DATAFILES. */
+	dict_get_and_save_data_dir_path(prebuilt->table, false);
+
+	if (prebuilt->table->data_dir_path) {
+		create_info->data_file_name = prebuilt->table->data_dir_path;
+	}
 }
 
 /*****************************************************************//**
-Creates a new table to an InnoDB database.
-@return	error number */
+Initializes the FTS stopword list of a table.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+	dict_table_t*	table,	/*!< in: table that has FTS */
+	trx_t*		trx,	/*!< in: transaction */
+	THD*		thd)	/*!< in: current thread */
+{
+	return(fts_load_stopword(table, trx,
+				 fts_server_stopword_table,
+				 THDVAR(thd, ft_user_stopword_table),
+				 THDVAR(thd, ft_enable_stopword), FALSE));
+}
+
+/*****************************************************************//**
+Parses the table name into a normalized name and, if needed, either a
+temp path or a remote path.
+@return	0 if successful, otherwise, error number */
 UNIV_INTERN
 int
-ha_innobase::create(
-/*================*/
-	const char*	name,		/*!< in: table name */
-	TABLE*		form,		/*!< in: information on table
-					columns and indexes */
-	HA_CREATE_INFO*	create_info)	/*!< in: more information of the
+ha_innobase::parse_table_name(
+/*==========================*/
+	const char*	name,		/*!< in: table name provided */
+	HA_CREATE_INFO*	create_info,	/*!< in: more information of the
 					created table, contains also the
 					create statement string */
+	ulint		flags,		/*!< in: flags*/
+	ulint		flags2,		/*!< in: flags2*/
+	char*		norm_name,	/*!< out: normalized table name */
+	char*		temp_path,	/*!< out: path of a TEMPORARY table, else "" */
+	char*		remote_path)	/*!< out: DATA DIRECTORY path, else "" */
 {
-	int		error;
-	dict_table_t*	innobase_table;
-	trx_t*		parent_trx;
-	trx_t*		trx;
-	int		primary_key_no;
-	uint		i;
-	char		name2[FN_REFLEN];
-	char		norm_name[FN_REFLEN];
 	THD*		thd = ha_thd();
-	ib_int64_t	auto_inc_value;
-	ulint		flags;
-	/* Cache the value of innodb_file_format, in case it is
-	modified by another thread while the table is being created. */
-	const ulint	file_format = srv_file_format;
-	const char*	stmt;
-	size_t		stmt_len;
-	enum row_type	row_format;
-
-	DBUG_ENTER("ha_innobase::create");
-
-	DBUG_ASSERT(thd != NULL);
-	DBUG_ASSERT(create_info != NULL);
+	bool		use_tablespace = flags2 & DICT_TF2_USE_TABLESPACE;
+	DBUG_ENTER("ha_innobase::parse_table_name");
 
 #ifdef __WIN__
 	/* Names passed in from server are in two formats:
 	1. <database_name>/<table_name>: for normal table creation
-	2. full path: for temp table creation, or sym link
+	2. full path: for temp table creation, or DATA DIRECTORY.
 
 	When srv_file_per_table is on and mysqld_embedded is off,
 	check for full path pattern, i.e.
@@ -8331,7 +9922,7 @@ ha_innobase::create(
 	returns error if it is in full path format, but not creating a temp.
 	table. Currently InnoDB does not support symbolic link on Windows. */
 
-	if (srv_file_per_table
+	if (use_tablespace
 	    && !mysqld_embedded
 	    && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
 
@@ -8343,70 +9934,177 @@ ha_innobase::create(
 	}
 #endif
 
-	if (form->s->stored_fields > 1000) {
-		/* The limit probably should be REC_MAX_N_FIELDS - 3 = 1020,
-		but we play safe here */
+	normalize_table_name(norm_name, name);
+	temp_path[0] = '\0';
+	remote_path[0] = '\0';
 
-		DBUG_RETURN(HA_ERR_TO_BIG_ROW);
+	/* A full path is used for TEMPORARY TABLE and DATA DIRECTORY.
+	In the case of:
+	  CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ;
+	We ignore the DATA DIRECTORY. */
+	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+		strncpy(temp_path, name, FN_REFLEN - 1);
 	}
 
-	ut_a(strlen(name) < sizeof(name2));
+	if (create_info->data_file_name) {
+		bool ignore = false;
 
-	strcpy(name2, name);
+		/* Use DATA DIRECTORY only with file-per-table. */
+		if (!use_tablespace) {
+			push_warning(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: DATA DIRECTORY requires"
+				" innodb_file_per_table.");
+			ignore = true;
+		}
 
-	normalize_table_name(norm_name, name2);
+		/* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+		if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+			push_warning(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: DATA DIRECTORY cannot be"
+				" used for TEMPORARY tables.");
+			ignore = true;
+		}
 
-	/* Create the table definition in InnoDB */
+		if (ignore) {
+			my_error(WARN_OPTION_IGNORED, ME_JUST_WARNING,
+				"DATA DIRECTORY");
+		} else {
+			strncpy(remote_path, create_info->data_file_name,
+				FN_REFLEN - 1);
+		}
+	}
 
-	flags = 0;
+	if (create_info->index_file_name) {
+		my_error(WARN_OPTION_IGNORED, ME_JUST_WARNING,
+			"INDEX DIRECTORY");
+	}
 
-	/* Validate create options if innodb_strict_mode is set. */
-	if (!create_options_are_valid(thd, form, create_info)) {
-		DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+	DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Determines InnoDB table flags.
+@retval true if successful, false if error */
+UNIV_INTERN
+bool
+innobase_table_flags(
+/*=================*/
+	const TABLE*		form,		/*!< in: table */
+	const HA_CREATE_INFO*	create_info,	/*!< in: create options
+						of the table */
+	THD*			thd,		/*!< in: connection */
+	bool			use_tablespace,	/*!< in: whether to create
+						outside system tablespace */
+	ulint*			flags,		/*!< out: DICT_TF flags */
+	ulint*			flags2)		/*!< out: DICT_TF2 flags */
+{
+	DBUG_ENTER("innobase_table_flags");
+
+	const char*	fts_doc_id_index_bad = NULL;
+	bool		zip_allowed = true;
+	ulint		zip_ssize = 0;
+	enum row_type	row_format;
+	rec_format_t	innodb_row_format = REC_FORMAT_COMPACT;
+	bool		use_data_dir;
+
+	/* Cache the value of innodb_file_format, in case it is
+	modified by another thread while the table is being created. */
+	const ulint	file_format_allowed = srv_file_format;
+
+	*flags = 0;
+	*flags2 = 0;
+
+	/* Check if there are any FTS indexes defined on this table. */
+	for (uint i = 0; i < form->s->keys; i++) {
+		const KEY*	key = &form->key_info[i];
+
+		if (key->flags & HA_FULLTEXT) {
+			*flags2 |= DICT_TF2_FTS;
+
+			/* We don't support FTS indexes in temporary
+			tables. */
+			if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+
+				my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0));
+				DBUG_RETURN(false);
+			}
+
+			if (key->flags & HA_USES_PARSER) {
+				my_error(ER_INNODB_NO_FT_USES_PARSER, MYF(0));
+                                DBUG_RETURN(false);
+			}
+
+			if (fts_doc_id_index_bad) {
+				goto index_bad;
+			}
+		}
+
+		if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		/* Do a pre-check on FTS DOC ID index */
+		if (!(key->flags & HA_NOSAME)
+		    || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+		    || strcmp(key->key_part[0].field->field_name,
+			      FTS_DOC_ID_COL_NAME)) {
+			fts_doc_id_index_bad = key->name;
+		}
+
+		if (fts_doc_id_index_bad && (*flags2 & DICT_TF2_FTS)) {
+index_bad:
+			my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+				 fts_doc_id_index_bad);
+			DBUG_RETURN(false);
+		}
 	}
 
 	if (create_info->key_block_size) {
-		/* Determine the page_zip.ssize corresponding to the
-		requested page size (key_block_size) in kilobytes. */
-
-		ulint	ssize, ksize;
-		ulint	key_block_size = create_info->key_block_size;
-
-		/*  Set 'flags' to the correct key_block_size.
-		It will be zero if key_block_size is an invalid number.*/
-		for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX;
-		     ssize++, ksize <<= 1) {
-			if (key_block_size == ksize) {
-				flags = ssize << DICT_TF_ZSSIZE_SHIFT
-					| DICT_TF_COMPACT
-					| DICT_TF_FORMAT_ZIP
-					  << DICT_TF_FORMAT_SHIFT;
+		/* The requested compressed page size (key_block_size)
+		is given in kilobytes. If it is a valid number, store
+		that value as the number of log2 shifts from 512 in
+		zip_ssize. Zero means it is not compressed. */
+		ulint zssize;		/* Zip Shift Size */
+		ulint kbsize;		/* Key Block Size */
+		for (zssize = kbsize = 1;
+		     zssize <= ut_min(UNIV_PAGE_SSIZE_MAX,
+				      PAGE_ZIP_SSIZE_MAX);
+		     zssize++, kbsize <<= 1) {
+			if (kbsize == create_info->key_block_size) {
+				zip_ssize = zssize;
 				break;
 			}
 		}
 
-		if (!srv_file_per_table) {
+		/* Make sure compressed row format is allowed. */
+		if (!use_tablespace) {
 			push_warning(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: KEY_BLOCK_SIZE requires"
 				" innodb_file_per_table.");
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 
-		if (file_format < DICT_TF_FORMAT_ZIP) {
+		if (file_format_allowed < UNIV_FORMAT_B) {
 			push_warning(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: KEY_BLOCK_SIZE requires"
 				" innodb_file_format > Antelope.");
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 
-		if (!flags) {
+		if (!zip_allowed
+		    || zssize > ut_min(UNIV_PAGE_SSIZE_MAX,
+				       PAGE_ZIP_SSIZE_MAX)) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: ignoring KEY_BLOCK_SIZE=%lu.",
 				create_info->key_block_size);
 		}
@@ -8414,7 +10112,7 @@ ha_innobase::create(
 
 	row_format = form->s->row_type;
 
-	if (flags) {
+	if (zip_ssize && zip_allowed) {
 		/* if ROW_FORMAT is set to default,
 		automatically change it to COMPRESSED.*/
 		if (row_format == ROW_TYPE_DEFAULT) {
@@ -8427,79 +10125,169 @@ ha_innobase::create(
 			such combinations can be obtained
 			with ALTER TABLE anyway. */
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
 				" unless ROW_FORMAT=COMPRESSED.",
 				create_info->key_block_size);
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 	} else {
-		/* flags == 0 means no KEY_BLOCK_SIZE.*/
-		if (row_format == ROW_TYPE_COMPRESSED) {
-			/* ROW_FORMAT=COMPRESSED without
-			KEY_BLOCK_SIZE implies half the
-			maximum KEY_BLOCK_SIZE. */
-			flags = (DICT_TF_ZSSIZE_MAX - 1)
-				<< DICT_TF_ZSSIZE_SHIFT
-				| DICT_TF_COMPACT
-				| DICT_TF_FORMAT_ZIP
-				<< DICT_TF_FORMAT_SHIFT;
-//#if DICT_TF_ZSSIZE_MAX < 1
-//# error "DICT_TF_ZSSIZE_MAX < 1"
-//#endif
+		/* zip_ssize == 0 means no KEY_BLOCK_SIZE.*/
+		if (row_format == ROW_TYPE_COMPRESSED && zip_allowed) {
+			/* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE
+			implies half the maximum KEY_BLOCK_SIZE(*1k) or
+			UNIV_PAGE_SIZE, whichever is less. */
+			zip_ssize = ut_min(UNIV_PAGE_SSIZE_MAX,
+					   PAGE_ZIP_SSIZE_MAX) - 1;
 		}
 	}
 
+	/* Validate the row format.  Correct it if necessary */
 	switch (row_format) {
 	case ROW_TYPE_REDUNDANT:
+		innodb_row_format = REC_FORMAT_REDUNDANT;
 		break;
+
 	case ROW_TYPE_COMPRESSED:
 	case ROW_TYPE_DYNAMIC:
-		if (!srv_file_per_table) {
+		if (!use_tablespace) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: ROW_FORMAT=%s requires"
 				" innodb_file_per_table.",
 				get_row_format_name(row_format));
-		} else if (file_format < DICT_TF_FORMAT_ZIP) {
+		} else if (file_format_allowed == UNIV_FORMAT_A) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-				HA_WRONG_CREATE_OPTION,
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
 				"InnoDB: ROW_FORMAT=%s requires"
 				" innodb_file_format > Antelope.",
 				get_row_format_name(row_format));
 		} else {
-			flags |= DICT_TF_COMPACT
-			         | (DICT_TF_FORMAT_ZIP
-			            << DICT_TF_FORMAT_SHIFT);
+			innodb_row_format = (row_format == ROW_TYPE_DYNAMIC
+					     ? REC_FORMAT_DYNAMIC
+					     : REC_FORMAT_COMPRESSED);
 			break;
 		}
-
-		/* fall through */
+		zip_allowed = FALSE;
+		/* fall through to set row_format = COMPACT */
 	case ROW_TYPE_NOT_USED:
 	case ROW_TYPE_FIXED:
 	case ROW_TYPE_PAGE:
 		push_warning(
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-			HA_WRONG_CREATE_OPTION,
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,
 			"InnoDB: assuming ROW_FORMAT=COMPACT.");
 	case ROW_TYPE_DEFAULT:
+		/* If we fell through, set row format to Compact. */
+		row_format = ROW_TYPE_COMPACT;
 	case ROW_TYPE_COMPACT:
-		flags = DICT_TF_COMPACT;
 		break;
 	}
 
-	/* Look for a primary key */
+	/* Set the table flags */
+	if (!zip_allowed) {
+		zip_ssize = 0;
+	}
+
+	use_data_dir = use_tablespace
+		       && ((create_info->data_file_name != NULL)
+		       && !(create_info->options & HA_LEX_CREATE_TMP_TABLE));
 
-	primary_key_no= (form->s->primary_key != MAX_KEY ?
-			 (int) form->s->primary_key :
-			 -1);
+	dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir);
+
+	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+		*flags2 |= DICT_TF2_TEMPORARY;
+	}
+
+	if (use_tablespace) {
+		*flags2 |= DICT_TF2_USE_TABLESPACE;
+	}
+
+	DBUG_RETURN(true);
+}
+
+/*****************************************************************//**
+Creates a new table in an InnoDB database.
+@return	error number */
+UNIV_INTERN
+int
+ha_innobase::create(
+/*================*/
+	const char*	name,		/*!< in: table name */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	HA_CREATE_INFO*	create_info)	/*!< in: more information of the
+					created table, contains also the
+					create statement string */
+{
+	int		error;
+	trx_t*		parent_trx;
+	trx_t*		trx;
+	int		primary_key_no;
+	uint		i;
+	char		norm_name[FN_REFLEN];	/* {database}/{tablename} */
+	char		temp_path[FN_REFLEN];	/* absolute path of temp frm */
+	char		remote_path[FN_REFLEN];	/* absolute path of table */
+	THD*		thd = ha_thd();
+	ib_int64_t	auto_inc_value;
+
+	/* Cache the global variable "srv_file_per_table" to a local
+	variable before using it. Note that "srv_file_per_table"
+	is not under dict_sys mutex protection, and could be changed
+	while creating the table. So we read the current value here
+	and make all further decisions based on this. */
+	bool		use_tablespace = srv_file_per_table;
+
+	/* Table flags (flags) and additional table flags (flags2); both are
+	filled in by innobase_table_flags() below. */
+	ulint		flags;
+	ulint		flags2;
+	dict_table_t*	innobase_table = NULL;
+
+	const char*	stmt;
+	size_t		stmt_len;
+
+	DBUG_ENTER("ha_innobase::create");
+
+	DBUG_ASSERT(thd != NULL);
+	DBUG_ASSERT(create_info != NULL);
+
+	if (form->s->fields > REC_MAX_N_USER_FIELDS) {
+		DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS);
+	} else if (srv_read_only_mode) {
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	}
+
+	/* Create the table definition in InnoDB */
+
+	/* Validate create options if innodb_strict_mode is set. */
+	if (create_options_are_invalid(
+			thd, form, create_info, use_tablespace)) {
+		DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+	}
+
+	if (!innobase_table_flags(form, create_info,
+				  thd, use_tablespace,
+				  &flags, &flags2)) {
+		DBUG_RETURN(-1);
+	}
+
+	error = parse_table_name(name, create_info, flags, flags2,
+				 norm_name, temp_path, remote_path);
+	if (error) {
+		DBUG_RETURN(error);
+	}
+
+	/* Look for a primary key */
+	primary_key_no = (form->s->primary_key != MAX_KEY ?
+			  (int) form->s->primary_key :
+			  -1);
 
 	/* Our function innobase_get_mysql_key_number_for_index assumes
 	the primary key is always number 0, if it exists */
-
 	ut_a(primary_key_no == -1 || primary_key_no == 0);
 
 	/* Check for name conflicts (with reserved name) for
@@ -8513,10 +10301,6 @@ ha_innobase::create(
 		DBUG_RETURN(HA_ERR_GENERIC);
 	}
 
-	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
-		flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
-	}
-
 	/* Get the transaction associated with the current thd, or create one
 	if not yet created */
 
@@ -8537,19 +10321,16 @@ ha_innobase::create(
 
 	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
 	or lock waits can happen in it during a table create operation.
-	Drop table etc. do this latching in row0mysql.c. */
+	Drop table etc. do this latching in row0mysql.cc. */
 
 	row_mysql_lock_data_dictionary(trx);
 
-	error = create_table_def(trx, form, norm_name,
-		create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
-		flags);
-
+	error = create_table_def(trx, form, norm_name, temp_path,
+				 remote_path, flags, flags2);
 	if (error) {
 		goto cleanup;
 	}
 
-
 	/* Create the keys */
 
 	if (form->s->keys == 0 || primary_key_no == -1) {
@@ -8573,9 +10354,66 @@ ha_innobase::create(
 		}
 	}
 
+	/* Create the ancillary tables that are common to all FTS indexes on
+	this table. */
+	if (flags2 & DICT_TF2_FTS) {
+		enum fts_doc_id_index_enum	ret;
+
+		innobase_table = dict_table_open_on_name(
+			norm_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+		ut_a(innobase_table);
+
+		/* Check whether there already exists FTS_DOC_ID_INDEX */
+		ret = innobase_fts_check_doc_id_index_in_def(
+			form->s->keys, form->key_info);
+
+		switch (ret) {
+		case FTS_INCORRECT_DOC_ID_INDEX:
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_NAME_FOR_INDEX,
+					    " InnoDB: Index name %s is reserved"
+					    " for the unique index on"
+					    " FTS_DOC_ID column for FTS"
+					    " Document ID indexing"
+					    " on table %s. Please check"
+					    " the index definition to"
+					    " make sure it is of correct"
+					    " type\n",
+					    FTS_DOC_ID_INDEX_NAME,
+					    innobase_table->name);
+
+			if (innobase_table->fts) {
+				fts_free(innobase_table);
+			}
+
+			dict_table_close(innobase_table, TRUE, FALSE);
+			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+				 FTS_DOC_ID_INDEX_NAME);
+			error = -1;
+			goto cleanup;
+		case FTS_EXIST_DOC_ID_INDEX:
+		case FTS_NOT_EXIST_DOC_ID_INDEX:
+			break;
+		}
+
+		dberr_t	err = fts_create_common_tables(
+			trx, innobase_table, norm_name,
+			(ret == FTS_EXIST_DOC_ID_INDEX));
+
+		error = convert_error_code_to_mysql(err, 0, NULL);
+
+		dict_table_close(innobase_table, TRUE, FALSE);
+
+		if (error) {
+			goto cleanup;
+		}
+	}
+
 	for (i = 0; i < form->s->keys; i++) {
 
-		if (i != (uint) primary_key_no) {
+		if (i != static_cast<uint>(primary_key_no)) {
 
 			if ((error = create_index(trx, form, flags,
 						  norm_name, i))) {
@@ -8584,18 +10422,28 @@ ha_innobase::create(
 		}
 	}
 
+	/* Cache all the FTS indexes on this table in the FTS specific
+	structure. They are used for FTS indexed column update handling. */
+	if (flags2 & DICT_TF2_FTS) {
+		fts_t*          fts = innobase_table->fts;
+
+		ut_a(fts != NULL);
+
+		dict_table_get_all_fts_indexes(innobase_table, fts->indexes);
+	}
+
 	stmt = innobase_get_stmt(thd, &stmt_len);
 
 	if (stmt) {
-		error = row_table_add_foreign_constraints(
+		dberr_t	err = row_table_add_foreign_constraints(
 			trx, stmt, stmt_len, norm_name,
 			create_info->options & HA_LEX_CREATE_TMP_TABLE);
 
-		switch (error) {
+		switch (err) {
 
 		case DB_PARENT_NO_INDEX:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_CANNOT_ADD_FOREIGN,
 				"Create table '%s' with foreign key constraint"
 				" failed. There is no index in the referenced"
@@ -8605,16 +10453,18 @@ ha_innobase::create(
 
 		case DB_CHILD_NO_INDEX:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_CANNOT_ADD_FOREIGN,
 				"Create table '%s' with foreign key constraint"
 				" failed. There is no index in the referencing"
 				" table where referencing columns appear"
 				" as the first columns.\n", norm_name);
 			break;
-                }
+		default:
+			break;
+		}
 
-		error = convert_error_code_to_mysql(error, flags, NULL);
+		error = convert_error_code_to_mysql(err, flags, NULL);
 
 		if (error) {
 			goto cleanup;
@@ -8631,11 +10481,15 @@ ha_innobase::create(
 
 	log_buffer_flush_to_disk();
 
-	innobase_table = dict_table_get(norm_name, FALSE,
-					DICT_ERR_IGNORE_NONE);
+	innobase_table = dict_table_open_on_name(
+		norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
 
 	DBUG_ASSERT(innobase_table != 0);
 
+	innobase_copy_frm_flags_from_create_info(innobase_table, create_info);
+
+	dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE);
+
 	if (innobase_table) {
 		/* We update the highest file format in the system table
 		space, if this table has higher file format setting. */
@@ -8645,6 +10499,16 @@ ha_innobase::create(
 			dict_table_get_format(innobase_table));
 	}
 
+	/* Load server stopword into FTS cache */
+	if (flags2 & DICT_TF2_FTS) {
+		if (!innobase_fts_load_stopword(innobase_table, NULL, thd)) {
+			dict_table_close(innobase_table, FALSE, FALSE);
+			srv_active_wake_master_thread();
+			trx_free_for_mysql(trx);
+			DBUG_RETURN(-1);
+		}
+	}
+
 	/* Note: We can't call update_thd() as prebuilt will not be
 	setup at this stage and so we use thd. */
 
@@ -8675,6 +10539,8 @@ ha_innobase::create(
 		dict_table_autoinc_unlock(innobase_table);
 	}
 
+	dict_table_close(innobase_table, FALSE, FALSE);
+
 	/* Tell the InnoDB server that there might be work for
 	utility threads: */
 
@@ -8685,7 +10551,7 @@ ha_innobase::create(
 	DBUG_RETURN(0);
 
 cleanup:
-	innobase_commit_low(trx);
+	trx_rollback_for_mysql(trx);
 
 	row_mysql_unlock_data_dictionary(trx);
 
@@ -8703,9 +10569,8 @@ ha_innobase::discard_or_import_tablespace(
 /*======================================*/
 	my_bool discard)	/*!< in: TRUE if discard, else import */
 {
+	dberr_t		err;
 	dict_table_t*	dict_table;
-	trx_t*		trx;
-	int		err;
 
 	DBUG_ENTER("ha_innobase::discard_or_import_tablespace");
 
@@ -8713,26 +10578,85 @@ ha_innobase::discard_or_import_tablespace(
 	ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
 	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
 
+	if (srv_read_only_mode) {
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	}
+
 	dict_table = prebuilt->table;
-	trx = prebuilt->trx;
 
-	if (discard) {
-		err = row_discard_tablespace_for_mysql(dict_table->name, trx);
+	if (dict_table->space == TRX_SYS_SPACE) {
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLE_IN_SYSTEM_TABLESPACE,
+			table->s->table_name.str);
+
+		DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE);
+	}
+
+	trx_start_if_not_started(prebuilt->trx);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads. */
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	/* Obtain an exclusive lock on the table. */
+	err = row_mysql_lock_table(
+		prebuilt->trx, dict_table, LOCK_X,
+		discard ? "setting table lock for DISCARD TABLESPACE"
+			: "setting table lock for IMPORT TABLESPACE");
+
+	if (err != DB_SUCCESS) {
+		/* unable to lock the table: do nothing */
+	} else if (discard) {
+
+		/* Discarding an already discarded tablespace should be an
+		idempotent operation. Also, if the .ibd file is missing the
+		user may want to set the DISCARD flag in order to IMPORT
+		a new tablespace. */
+
+		if (dict_table->ibd_file_missing) {
+			ib_senderrf(
+				prebuilt->trx->mysql_thd,
+				IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING,
+				table->s->table_name.str);
+		}
+
+		err = row_discard_tablespace_for_mysql(
+			dict_table->name, prebuilt->trx);
+
+	} else if (!dict_table->ibd_file_missing) {
+		/* Commit the transaction in order to
+		release the table lock. */
+		trx_commit_for_mysql(prebuilt->trx);
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_EXISTS, table->s->table_name.str);
+
+		DBUG_RETURN(HA_ERR_TABLE_EXIST);
 	} else {
-		err = row_import_tablespace_for_mysql(dict_table->name, trx);
+		err = row_import_for_mysql(dict_table, prebuilt);
 
-		/* in expanded import mode re-initialize auto_increment again */
-		if ((err == DB_SUCCESS) && srv_expand_import &&
-		    (table->found_next_number_field != NULL)) {
-			dict_table_autoinc_lock(dict_table);
-			innobase_initialize_autoinc();
-			dict_table_autoinc_unlock(dict_table);
+		if (err == DB_SUCCESS) {
+
+			if (table->found_next_number_field) {
+				dict_table_autoinc_lock(dict_table);
+				innobase_initialize_autoinc();
+				dict_table_autoinc_unlock(dict_table);
+			}
+
+			info(HA_STATUS_TIME
+			     | HA_STATUS_CONST
+			     | HA_STATUS_VARIABLE
+			     | HA_STATUS_AUTO);
 		}
 	}
 
-	err = convert_error_code_to_mysql(err, dict_table->flags, NULL);
+	/* Commit the transaction in order to release the table lock. */
+	trx_commit_for_mysql(prebuilt->trx);
 
-	DBUG_RETURN(err);
+	DBUG_RETURN(convert_error_code_to_mysql(err, dict_table->flags, NULL));
 }
 
 /*****************************************************************//**
@@ -8740,19 +10664,24 @@ Deletes all rows of an InnoDB table.
 @return	error number */
 UNIV_INTERN
 int
-ha_innobase::truncate(void)
-/*==============================*/
+ha_innobase::truncate()
+/*===================*/
 {
+	dberr_t		err;
 	int		error;
 
 	DBUG_ENTER("ha_innobase::truncate");
 
+	if (srv_read_only_mode) {
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	}
+
 	/* Get the transaction associated with the current thd, or create one
 	if not yet created, and update prebuilt->trx */
 
 	update_thd(ha_thd());
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -8760,17 +10689,37 @@ ha_innobase::truncate(void)
 		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 	}
 
+	if (!trx_is_started(prebuilt->trx)) {
+		++prebuilt->trx->will_lock;
+	}
 	/* Truncate the table in InnoDB */
 
-	error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
+	err = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
-	error = convert_error_code_to_mysql(error, prebuilt->table->flags,
-					    NULL);
+	switch (err) {
+
+	case DB_TABLESPACE_DELETED:
+	case DB_TABLESPACE_NOT_FOUND:
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			(err == DB_TABLESPACE_DELETED ?
+			ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING),
+			table->s->table_name.str);
+		table->status = STATUS_NOT_FOUND;
+		error = HA_ERR_NO_SUCH_TABLE;
+		break;
 
+	default:
+		error = convert_error_code_to_mysql(
+			err, prebuilt->table->flags,
+			prebuilt->trx->mysql_thd);
+		table->status = STATUS_NOT_FOUND;
+		break;
+	}
 	DBUG_RETURN(error);
 }
 
@@ -8788,11 +10737,11 @@ ha_innobase::delete_table(
 	const char*	name)	/*!< in: table name */
 {
 	ulint	name_len;
-	int	error;
+	dberr_t	err;
 	trx_t*	parent_trx;
 	trx_t*	trx;
-	THD	*thd = ha_thd();
-	char	norm_name[1000];
+	THD*	thd = ha_thd();
+	char	norm_name[FN_REFLEN];
 
 	DBUG_ENTER("ha_innobase::delete_table");
 
@@ -8800,18 +10749,21 @@ ha_innobase::delete_table(
 		"test_normalize_table_name_low",
 		test_normalize_table_name_low();
 	);
+	DBUG_EXECUTE_IF(
+		"test_ut_format_name",
+		test_ut_format_name();
+	);
 
 	/* Strangely, MySQL passes the table name without the '.frm'
 	extension, in contrast to ::create */
 	normalize_table_name(norm_name, name);
 
-	if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) {
+	if (srv_read_only_mode) {
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	} else if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) {
 		DBUG_RETURN(HA_ERR_GENERIC);
 	}
 
-	/* Get the transaction associated with the current thd, or create one
-	if not yet created */
-
 	parent_trx = check_trx_exists(thd);
 
 	/* In case MySQL calls this in the middle of a SELECT query, release
@@ -8831,11 +10783,51 @@ ha_innobase::delete_table(
 
 	ut_a(name_len < 1000);
 
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+	trx->ddl = true;
+
 	/* Drop the table in InnoDB */
+	err = row_drop_table_for_mysql(
+		norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB);
+
+
+	if (err == DB_TABLE_NOT_FOUND
+	    && innobase_get_lower_case_table_names() == 1) {
+		char*	is_part = NULL;
+#ifdef __WIN__
+		is_part = strstr(norm_name, "#p#");
+#else
+		is_part = strstr(norm_name, "#P#");
+#endif /* __WIN__ */
+
+		if (is_part) {
+			char	par_case_name[FN_REFLEN];
 
-	error = row_drop_table_for_mysql(norm_name, trx,
-					 thd_sql_command(thd)
-					 == SQLCOM_DROP_DB);
+#ifndef __WIN__
+			/* Check for the table using lower
+			case name, including the partition
+			separator "P" */
+			strcpy(par_case_name, norm_name);
+			innobase_casedn_str(par_case_name);
+#else
+			/* On the Windows platform, check
+			whether the system tables contain
+			a table whose name was not
+			normalized to lower case */
+			normalize_table_name_low(
+				par_case_name, name, FALSE);
+#endif
+			err = row_drop_table_for_mysql(
+				par_case_name, trx,
+				thd_sql_command(thd) == SQLCOM_DROP_DB);
+		}
+	}
 
 	/* Flush the log to reduce probability that the .frm files and
 	the InnoDB data dictionary get out-of-sync if the user runs
@@ -8852,9 +10844,7 @@ ha_innobase::delete_table(
 
 	trx_free_for_mysql(trx);
 
-	error = convert_error_code_to_mysql(error, 0, NULL);
-
-	DBUG_RETURN(error);
+	DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
 }
 
 /*****************************************************************//**
@@ -8863,11 +10853,11 @@ static
 void
 innobase_drop_database(
 /*===================*/
-	handlerton *hton, /*!< in: handlerton of Innodb */
-	char*	path)	/*!< in: database path; inside InnoDB the name
-			of the last directory in the path is used as
-			the database name: for example, in 'mysql/data/test'
-			the database name is 'test' */
+	handlerton*	hton,	/*!< in: handlerton of Innodb */
+	char*		path)	/*!< in: database path; inside InnoDB the name
+				of the last directory in the path is used as
+				the database name: for example, in
+				'mysql/data/test' the database name is 'test' */
 {
 	ulint	len		= 0;
 	trx_t*	trx;
@@ -8880,6 +10870,10 @@ innobase_drop_database(
 
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
+	if (srv_read_only_mode) {
+		return;
+	}
+
 	/* In the Windows plugin, thd = current_thd is always NULL */
 	if (thd) {
 		trx_t*	parent_trx = check_trx_exists(thd);
@@ -8907,20 +10901,25 @@ innobase_drop_database(
 #ifdef	__WIN__
 	innobase_casedn_str(namebuf);
 #endif
-#if defined __WIN__ && !defined MYSQL_SERVER
-	/* In the Windows plugin, thd = current_thd is always NULL */
-	trx = trx_allocate_for_mysql();
-	trx->mysql_thd = NULL;
-#else
 	trx = innobase_trx_allocate(thd);
-#endif
+
 	if (UNIV_UNLIKELY(trx->fake_changes)) {
 		my_free(namebuf);
 		innobase_commit_low(trx);
 		trx_free_for_mysql(trx);
 		return; /* ignore */
 	}
+
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+
 	row_drop_database_for_mysql(namebuf, trx);
+
 	my_free(namebuf);
 
 	/* Flush the log to reduce probability that the .frm files and
@@ -8937,75 +10936,120 @@ innobase_drop_database(
 	innobase_commit_low(trx);
 	trx_free_for_mysql(trx);
 }
+
 /*********************************************************************//**
 Renames an InnoDB table.
-@return	0 or error code */
-static
-int
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 innobase_rename_table(
 /*==================*/
 	trx_t*		trx,	/*!< in: transaction */
 	const char*	from,	/*!< in: old name of the table */
-	const char*	to,	/*!< in: new name of the table */
-	ibool		lock_and_commit)
-				/*!< in: TRUE=lock data dictionary and commit */
+	const char*	to)	/*!< in: new name of the table */
 {
-	int	error;
-	char*	norm_to;
-	char*	norm_from;
+	dberr_t	error;
+	char	norm_to[FN_REFLEN];
+	char	norm_from[FN_REFLEN];
+
 	DBUG_ENTER("innobase_rename_table");
+	DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
 
-	// Magic number 64 arbitrary
-	norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0));
-	norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0));
+	ut_ad(!srv_read_only_mode);
 
 	normalize_table_name(norm_to, to);
 	normalize_table_name(norm_from, from);
 
 	DEBUG_SYNC_C("innodb_rename_table_ready");
 
+	trx_start_if_not_started(trx);
+
 	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations.  Start the
-	transaction first to avoid a possible deadlock in the server. */
+	no deadlocks can occur then in these operations. */
 
-	trx_start_if_not_started(trx);
-	if (lock_and_commit) {
-		row_mysql_lock_data_dictionary(trx);
-	}
+	row_mysql_lock_data_dictionary(trx);
 
-	/* Flag this transaction as a dictionary operation, so that
-	the data dictionary will be locked in crash recovery. */
-	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	/* The transaction was started above, so it must already be flagged
+	as a locking transaction (trx->will_lock > 0). */
+
+	ut_a(trx->will_lock > 0);
 
 	error = row_rename_table_for_mysql(
-		norm_from, norm_to, trx, lock_and_commit);
+		norm_from, norm_to, trx, TRUE);
 
 	if (error != DB_SUCCESS) {
-		FILE* ef = dict_foreign_err_file;
+		if (error == DB_TABLE_NOT_FOUND
+		    && innobase_get_lower_case_table_names() == 1) {
+			char*	is_part = NULL;
+#ifdef __WIN__
+			is_part = strstr(norm_from, "#p#");
+#else
+			is_part = strstr(norm_from, "#P#");
+#endif /* __WIN__ */
 
-		DBUG_PRINT("info", ("rename failed: %d", error));
-		fputs("InnoDB: Renaming table ", ef);
-		ut_print_name(ef, trx, TRUE, norm_from);
-		fputs(" to ", ef);
-		ut_print_name(ef, trx, TRUE, norm_to);
-		fputs(" failed!\n", ef);
-	}
+			if (is_part) {
+				char	par_case_name[FN_REFLEN];
+#ifndef __WIN__
+				/* Check for the table using lower
+				case name, including the partition
+				separator "P" */
+				strcpy(par_case_name, norm_from);
+				innobase_casedn_str(par_case_name);
+#else
+				/* On the Windows platform, check
+				whether the system tables contain
+				a table whose name was not
+				normalized to lower case */
+				normalize_table_name_low(
+					par_case_name, from, FALSE);
+#endif
+				trx_start_if_not_started(trx);
+				error = row_rename_table_for_mysql(
+					par_case_name, norm_to, trx, TRUE);
+			}
+		}
 
-	if (lock_and_commit) {
-		row_mysql_unlock_data_dictionary(trx);
+		if (error != DB_SUCCESS) {
+			if (!srv_read_only_mode) {
+				FILE* ef = dict_foreign_err_file;
+
+				fputs("InnoDB: Renaming table ", ef);
+				ut_print_name(ef, trx, TRUE, norm_from);
+				fputs(" to ", ef);
+				ut_print_name(ef, trx, TRUE, norm_to);
+				fputs(" failed!\n", ef);
+			}
+		} else {
+#ifndef __WIN__
+			sql_print_warning("Rename partition table %s "
+					  "succeeds after converting to lower "
+					  "case. The table may have "
+					  "been moved from a case "
+					  "in-sensitive file system.\n",
+					  norm_from);
+#else
+			sql_print_warning("Rename partition table %s "
+					  "succeeds after skipping the step to "
+					  "lower case the table name. "
+					  "The table may have been "
+					  "moved from a case sensitive "
+					  "file system.\n",
+					  norm_from);
+#endif /* __WIN__ */
+		}
+	}
 
-		/* Flush the log to reduce probability that the .frm
-		files and the InnoDB data dictionary get out-of-sync
-		if the user runs with innodb_flush_log_at_trx_commit = 0 */
+	row_mysql_unlock_data_dictionary(trx);
 
-		log_buffer_flush_to_disk();
-	}
+	/* Flush the log to reduce probability that the .frm
+	files and the InnoDB data dictionary get out-of-sync
+	if the user runs with innodb_flush_log_at_trx_commit = 0 */
 
-	my_free(norm_to);
-	my_free(norm_from);
+	log_buffer_flush_to_disk();
 
 	DBUG_RETURN(error);
 }
+
 /*********************************************************************//**
 Renames an InnoDB table.
 @return	0 or error code */
@@ -9017,12 +11061,17 @@ ha_innobase::rename_table(
 	const char*	to)	/*!< in: new name of the table */
 {
 	trx_t*	trx;
-	int	error;
+	dberr_t	error;
 	trx_t*	parent_trx;
 	THD*	thd		= ha_thd();
 
 	DBUG_ENTER("ha_innobase::rename_table");
 
+	if (srv_read_only_mode) {
+		ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ERR_TABLE_READONLY);
+	}
+
 	/* Get the transaction associated with the current thd, or create one
 	if not yet created */
 
@@ -9040,7 +11089,11 @@ ha_innobase::rename_table(
 		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 	}
 
-	error = innobase_rename_table(trx, from, to, TRUE);
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+	error = innobase_rename_table(trx, from, to);
 
 	DEBUG_SYNC(thd, "after_innobase_rename_table");
 
@@ -9052,6 +11105,27 @@ ha_innobase::rename_table(
 	innobase_commit_low(trx);
 	trx_free_for_mysql(trx);
 
+	if (error == DB_SUCCESS) {
+		char	norm_from[MAX_FULL_NAME_LEN];
+		char	norm_to[MAX_FULL_NAME_LEN];
+		char	errstr[512];
+		dberr_t	ret;
+
+		normalize_table_name(norm_from, from);
+		normalize_table_name(norm_to, to);
+
+		ret = dict_stats_rename_table(norm_from, norm_to,
+					      errstr, sizeof(errstr));
+
+		if (ret != DB_SUCCESS) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: %s\n", errstr);
+
+			push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+				     ER_LOCK_WAIT_TIMEOUT, errstr);
+		}
+	}
+
 	/* Add a special case to handle the Duplicated Key error
 	and return DB_ERROR instead.
 	This is to avoid a possible SIGSEGV error from mysql error
@@ -9064,15 +11138,13 @@ ha_innobase::rename_table(
 	the dup key error here is due to an existing table whose name
 	is the one we are trying to rename to) and return the generic
 	error code. */
-	if (error == (int) DB_DUPLICATE_KEY) {
+	if (error == DB_DUPLICATE_KEY) {
 		my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
 
 		error = DB_ERROR;
 	}
 
-	error = convert_error_code_to_mysql(error, 0, NULL);
-
-	DBUG_RETURN(error);
+	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
 /*********************************************************************//**
@@ -9084,9 +11156,9 @@ ha_innobase::records_in_range(
 /*==========================*/
 	uint			keynr,		/*!< in: index number */
 	key_range		*min_key,	/*!< in: start key value of the
-						   range, may also be 0 */
+						range, may also be 0 */
 	key_range		*max_key)	/*!< in: range end key val, may
-						   also be 0 */
+						also be 0 */
 {
 	KEY*		key;
 	dict_index_t*	index;
@@ -9095,7 +11167,6 @@ ha_innobase::records_in_range(
 	ib_int64_t	n_rows;
 	ulint		mode1;
 	ulint		mode2;
-        uint key_parts;
 	mem_heap_t*	heap;
 
 	DBUG_ENTER("records_in_range");
@@ -9131,19 +11202,14 @@ ha_innobase::records_in_range(
 		goto func_exit;
 	}
 
-        key_parts= key->key_parts;
-        if ((min_key && min_key->keypart_map>=(key_part_map) (1<<key_parts)) ||
-            (max_key && max_key->keypart_map>=(key_part_map) (1<<key_parts)))
-          key_parts= key->ext_key_parts;
-
-	heap = mem_heap_create(2 * (key_parts * sizeof(dfield_t)
+	heap = mem_heap_create(2 * (key->ext_key_parts * sizeof(dfield_t)
 				    + sizeof(dtuple_t)));
 
-	range_start= dtuple_create(heap, key_parts);
-	dict_index_copy_types(range_start, index, key_parts);
+	range_start= dtuple_create(heap, key->ext_key_parts);
+	dict_index_copy_types(range_start, index, key->ext_key_parts);
 
-	range_end= dtuple_create(heap, key_parts);
-	dict_index_copy_types(range_end, index, key_parts);
+	range_end= dtuple_create(heap, key->ext_key_parts);
+	dict_index_copy_types(range_end, index, key->ext_key_parts);
 
 	row_sel_convert_mysql_key_to_innobase(
 				range_start,
@@ -9209,13 +11275,13 @@ filesort.cc.
 @return	upper bound of rows */
 UNIV_INTERN
 ha_rows
-ha_innobase::estimate_rows_upper_bound(void)
-/*======================================*/
+ha_innobase::estimate_rows_upper_bound()
+/*====================================*/
 {
-	dict_index_t*	index;
-	ulonglong	estimate;
-	ulonglong	local_data_file_length;
-	ulint		stat_n_leaf_pages;
+	const dict_index_t*	index;
+	ulonglong		estimate;
+	ulonglong		local_data_file_length;
+	ulint			stat_n_leaf_pages;
 
 	DBUG_ENTER("estimate_rows_upper_bound");
 
@@ -9225,8 +11291,7 @@ ha_innobase::estimate_rows_upper_bound(void)
 
 	update_thd(ha_thd());
 
-	prebuilt->trx->op_info = (char*)
-				 "calculating upper bound for table rows";
+	prebuilt->trx->op_info = "calculating upper bound for table rows";
 
 	/* In case MySQL calls this in the middle of a SELECT query, release
 	possible adaptive hash latch to avoid deadlocks of threads */
@@ -9242,16 +11307,15 @@ ha_innobase::estimate_rows_upper_bound(void)
 	local_data_file_length =
 		((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE;
 
-
 	/* Calculate a minimum length for a clustered index record and from
 	that an upper bound for the number of rows. Since we only calculate
-	new statistics in row0mysql.c when a table has grown by a threshold
+	new statistics in row0mysql.cc when a table has grown by a threshold
 	factor, we must add a safety factor 2 in front of the formula below. */
 
-	estimate = 2 * local_data_file_length /
-					 dict_index_calc_min_rec_len(index);
+	estimate = 2 * local_data_file_length
+		/ dict_index_calc_min_rec_len(index);
 
-	prebuilt->trx->op_info = (char*)"";
+	prebuilt->trx->op_info = "";
 
 	DBUG_RETURN((ha_rows) estimate);
 }
@@ -9271,7 +11335,32 @@ ha_innobase::scan_time()
 	as a random disk read, that is, we do not divide the following
 	by 10, which would be physically realistic. */
 
-	return((double) (prebuilt->table->stat_clustered_index_size));
+	/* The locking below is disabled for performance reasons. Without
+	it we could end up returning uninitialized value to the caller,
+	which in the worst case could make some query plan go bogus or
+	issue a Valgrind warning. */
+#if 0
+	/* avoid potential lock order violation with dict_table_stats_lock()
+	below */
+	update_thd(ha_thd());
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+#endif
+
+	ulint	stat_clustered_index_size;
+
+#if 0
+	dict_table_stats_lock(prebuilt->table, RW_S_LATCH);
+#endif
+
+	ut_a(prebuilt->table->stat_initialized);
+
+	stat_clustered_index_size = prebuilt->table->stat_clustered_index_size;
+
+#if 0
+	dict_table_stats_unlock(prebuilt->table, RW_S_LATCH);
+#endif
+
+	return((double) stat_clustered_index_size);
 }
 
 /******************************************************************//**
@@ -9307,6 +11396,16 @@ ha_innobase::read_time(
 	return(ranges + (double) rows / (double) total_rows * time_for_scan);
 }
 
+/******************************************************************//**
+Return the size of the InnoDB memory buffer. */
+UNIV_INTERN
+longlong
+ha_innobase::get_memory_buffer_size() const
+/*=======================================*/
+{
+	return(innobase_buffer_pool_size);
+}
+
 UNIV_INTERN
 bool
 ha_innobase::is_corrupt() const
@@ -9327,7 +11426,7 @@ match. In this case, we have to take into account if we generated a
 default clustered index for the table
 @return the key number used inside MySQL */
 static
-unsigned int
+int
 innobase_get_mysql_key_number_for_index(
 /*====================================*/
 	INNOBASE_SHARE*		share,	/*!< in: share structure for index
@@ -9336,15 +11435,16 @@ innobase_get_mysql_key_number_for_index(
 					dictionary */
 	dict_table_t*		ib_table,/*!< in: table in Innodb data
 					dictionary */
-        const dict_index_t*     index)	/*!< in: index */
+	const dict_index_t*	index)	/*!< in: index */
 {
 	const dict_index_t*	ind;
 	unsigned int		i;
 
-        ut_a(index);
+ 	ut_a(index);
 
-	/* If index does not belong to the table of share structure. Search
-	index->table instead */
+	/* If index does not belong to the table object of share structure
+	(ib_table comes from the share structure) search the index->table
+	object instead */
 	if (index->table != ib_table) {
 		i = 0;
 		ind = dict_table_get_first_index(index->table);
@@ -9364,19 +11464,16 @@ innobase_get_mysql_key_number_for_index(
 
 	/* If index translation table exists, we will first check
 	the index through index translation table for a match. */
-        if (share->idx_trans_tbl.index_mapping) {
+	if (share->idx_trans_tbl.index_mapping) {
 		for (i = 0; i < share->idx_trans_tbl.index_count; i++) {
 			if (share->idx_trans_tbl.index_mapping[i] == index) {
 				return(i);
 			}
 		}
 
-		/* If index_count in translation table is set to 0, it
-		is possible we are in the process of rebuilding table,
-		do not spit error in this case */
-		if (share->idx_trans_tbl.index_count) {
-			/* Print an error message if we cannot find the index
-			** in the "index translation table". */
+		/* Print an error message if we cannot find the index in the
+		"index translation table", unless it is a temporary index. */
+		if (*index->name != TEMP_INDEX_PREFIX) {
 			sql_print_error("Cannot find index %s in InnoDB index "
 					"translation table.", index->name);
 		}
@@ -9393,11 +11490,30 @@ innobase_get_mysql_key_number_for_index(
 		if (index == ind) {
 			return(i);
 		}
-        }
+	}
+
+	/* Loop through each index of the table and look for "index" */
+	for (ind = dict_table_get_first_index(ib_table);
+	     ind != NULL;
+	     ind = dict_table_get_next_index(ind)) {
+		if (index == ind) {
+			/* A temp index is internal to InnoDB and is
+			not present in the MySQL index list, so no
+			mismatch warning is needed for it. */
+			if (*(index->name) != TEMP_INDEX_PREFIX) {
+				sql_print_warning(
+					"Find index %s in InnoDB index list "
+					"but not its MySQL index number "
+					"It could be an InnoDB internal index.",
+					index->name);
+			}
+			return(-1);
+		}
+	}
 
 	ut_error;
 
-        return(0);
+	return(-1);
 }
 
 /*********************************************************************//**
@@ -9414,45 +11530,49 @@ innodb_rec_per_key(
 	ha_rows		records)	/*!< in: estimated total records */
 {
 	ha_rows		rec_per_key;
+	ib_uint64_t	n_diff;
+
+	ut_a(index->table->stat_initialized);
 
 	ut_ad(i < dict_index_get_n_unique(index));
 
-	/* Note the stat_n_diff_key_vals[] stores the diff value with
-	n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */
-	if (index->stat_n_diff_key_vals[i + 1] == 0) {
+	n_diff = index->stat_n_diff_key_vals[i];
+
+	if (n_diff == 0) {
 
 		rec_per_key = records;
 	} else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
-		ib_int64_t	num_null;
+		ib_uint64_t	n_null;
+		ib_uint64_t	n_non_null;
 
-		/* Number of rows with NULL value in this
-		field */
-		num_null = records - index->stat_n_non_null_key_vals[i];
+		n_non_null = index->stat_n_non_null_key_vals[i];
 
 		/* In theory, index->stat_n_non_null_key_vals[i]
 		should always be less than the number of records.
 		Since this is statistics value, the value could
 		have slight discrepancy. But we will make sure
 		the number of null values is not a negative number. */
-		num_null = (num_null < 0) ? 0 : num_null;
+		if (records < n_non_null) {
+			n_null = 0;
+		} else {
+			n_null = records - n_non_null;
+		}
 
 		/* If the number of NULL values is the same as or
 		large than that of the distinct values, we could
-		consider that the table consists mostly of NULL value. 
+		consider that the table consists mostly of NULL value.
 		Set rec_per_key to 1. */
-		if (index->stat_n_diff_key_vals[i + 1] <= num_null) {
+		if (n_diff <= n_null) {
 			rec_per_key = 1;
 		} else {
 			/* Need to exclude rows with NULL values from
 			rec_per_key calculation */
-			rec_per_key = (ha_rows)(
-				(records - num_null)
-				/ (index->stat_n_diff_key_vals[i + 1]
-				   - num_null));
+			rec_per_key = (ha_rows)
+				((records - n_null) / (n_diff - n_null));
 		}
 	} else {
-		rec_per_key = (ha_rows)
-			 (records / index->stat_n_diff_key_vals[i + 1]);
+		DEBUG_SYNC_C("after_checking_for_0");
+		rec_per_key = (ha_rows) (records / n_diff);
 	}
 
 	return(rec_per_key);
@@ -9460,20 +11580,18 @@ innodb_rec_per_key(
 
 /*********************************************************************//**
 Returns statistics information of the table to the MySQL interpreter,
-in various fields of the handle object. */
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
 UNIV_INTERN
 int
 ha_innobase::info_low(
 /*==================*/
-	uint	flag,			/*!< in: what information MySQL
-					requests */
-	bool	called_from_analyze)	/* in: TRUE if called from
-					::analyze() */
+	uint	flag,	/*!< in: what information is requested */
+	bool	is_analyze)
 {
 	dict_table_t*	ib_table;
-	dict_index_t*	index;
 	ha_rows		rec_per_key;
-	ib_int64_t	n_rows;
+	ib_uint64_t	n_rows;
 	char		path[FN_REFLEN];
 	os_file_stat_t	stat_info;
 
@@ -9497,58 +11615,50 @@ ha_innobase::info_low(
 	trx_search_latch_release_if_reserved(prebuilt->trx);
 
 	ib_table = prebuilt->table;
+	DBUG_ASSERT(ib_table->n_ref_count > 0);
 
 	if (flag & HA_STATUS_TIME) {
-		if ((called_from_analyze || innobase_stats_on_metadata) && !share->ib_table->is_corrupt) {
-			/* In sql_show we call with this flag: update
-			then statistics so that they are up-to-date */
-
-			if (srv_use_sys_stats_table && !((ib_table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)
-			    && called_from_analyze) {
-				/* If the indexes on the table don't have enough rows in SYS_STATS system table, */
-				/* they need to be created. */
-				dict_index_t*	index;
+		if (is_analyze || innobase_stats_on_metadata) {
+
+			dict_stats_upd_option_t	opt;
+			dberr_t			ret;
 
-				prebuilt->trx->op_info = "confirming rows of SYS_STATS to store statistics";
+			prebuilt->trx->op_info = "updating table statistics";
 
-				ut_a(!trx_is_started(prebuilt->trx));
+			if (dict_stats_is_persistent_enabled(ib_table)) {
 
-				for (index = dict_table_get_first_index(ib_table);
-				     index != NULL;
-				     index = dict_table_get_next_index(index)) {
-					if (dict_is_older_statistics(index)) {
-						row_delete_stats_for_mysql(index, prebuilt->trx);
-						innobase_commit_low(prebuilt->trx);
-					}
-					row_insert_stats_for_mysql(index, prebuilt->trx);
-					innobase_commit_low(prebuilt->trx);
+				if (is_analyze) {
+					opt = DICT_STATS_RECALC_PERSISTENT;
+				} else {
+					/* This is e.g. 'SHOW INDEXES', fetch
+					the persistent stats from disk. */
+					opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
 				}
-
-				ut_a(!trx_is_started(prebuilt->trx));
+			} else {
+				opt = DICT_STATS_RECALC_TRANSIENT;
 			}
 
-			prebuilt->trx->op_info = "updating table statistics";
-
-			DEBUG_SYNC_C("info_before_stats_update");
+			ut_ad(!mutex_own(&dict_sys->mutex));
+			ret = dict_stats_update(ib_table, opt);
 
-			dict_update_statistics(
-				ib_table,
-				FALSE, /* update even if initialized */
-				called_from_analyze,
-				FALSE /* update even if not changed too much */);
+			if (ret != DB_SUCCESS) {
+				prebuilt->trx->op_info = "";
+				DBUG_RETURN(HA_ERR_GENERIC);
+			}
 
-			prebuilt->trx->op_info = "returning various info to MySQL";
+			prebuilt->trx->op_info =
+				"returning various info to MySQL";
 		}
 
 		my_snprintf(path, sizeof(path), "%s/%s%s",
-				mysql_data_home, ib_table->name, reg_ext);
+			    mysql_data_home, ib_table->name, reg_ext);
 
 		unpack_filename(path,path);
 
 		/* Note that we do not know the access time of the table,
 		nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
 
-		if (os_file_get_status(path,&stat_info)) {
+		if (os_file_get_status(path, &stat_info, false) == DB_SUCCESS) {
 			stats.create_time = (ulong) stat_info.ctime;
 		}
 	}
@@ -9556,15 +11666,28 @@ ha_innobase::info_low(
 	if (flag & HA_STATUS_VARIABLE) {
 
 		ulint	page_size;
+		ulint	stat_clustered_index_size;
+		ulint	stat_sum_of_other_index_sizes;
 
-		dict_table_stats_lock(ib_table, RW_S_LATCH);
+		if (!(flag & HA_STATUS_NO_LOCK)) {
+			dict_table_stats_lock(ib_table, RW_S_LATCH);
+		}
+
+		ut_a(ib_table->stat_initialized);
 
 		n_rows = ib_table->stat_n_rows;
 
-		/* Because we do not protect stat_n_rows by any mutex in a
-		delete, it is theoretically possible that the value can be
-		smaller than zero! TODO: fix this race.
+		stat_clustered_index_size
+			= ib_table->stat_clustered_index_size;
+
+		stat_sum_of_other_index_sizes
+			= ib_table->stat_sum_of_other_index_sizes;
 
+		if (!(flag & HA_STATUS_NO_LOCK)) {
+			dict_table_stats_unlock(ib_table, RW_S_LATCH);
+		}
+
+		/*
 		The MySQL optimizer seems to assume in a left join that n_rows
 		is an accurate estimate if it is zero. Of course, it is not,
 		since we do not have any locks on the rows yet at this phase.
@@ -9574,10 +11697,6 @@ ha_innobase::info_low(
 		set. That way SHOW TABLE STATUS will show the best estimate,
 		while the optimizer never sees the table empty. */
 
-		if (n_rows < 0) {
-			n_rows = 0;
-		}
-
 		if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
 			n_rows++;
 		}
@@ -9604,23 +11723,21 @@ ha_innobase::info_low(
 			page_size = UNIV_PAGE_SIZE;
 		}
 
-		stats.records = (ha_rows)n_rows;
+		stats.records = (ha_rows) n_rows;
 		stats.deleted = 0;
 		stats.data_file_length
-			= ((ulonglong) ib_table->stat_clustered_index_size)
+			= ((ulonglong) stat_clustered_index_size)
 			* page_size;
-		stats.index_file_length =
-			((ulonglong) ib_table->stat_sum_of_other_index_sizes)
+		stats.index_file_length
+			= ((ulonglong) stat_sum_of_other_index_sizes)
 			* page_size;
 
-		dict_table_stats_unlock(ib_table, RW_S_LATCH);
-
 		/* Since fsp_get_available_space_in_free_extents() is
 		acquiring latches inside InnoDB, we do not call it if we
 		are asked by MySQL to avoid locking. Another reason to
 		avoid the call is that it uses quite a lot of CPU.
 		See Bug#38185. */
-		if (flag & HA_STATUS_NO_LOCK || !srv_stats_update_need_lock
+		if (flag & HA_STATUS_NO_LOCK
 		    || !(flag & HA_STATUS_VARIABLE_EXTRA)) {
 			/* We do not update delete_length if no
 			locking is requested so the "old" value can
@@ -9646,14 +11763,15 @@ ha_innobase::info_low(
 
 				push_warning_printf(
 					thd,
-					MYSQL_ERROR::WARN_LEVEL_WARN,
+					Sql_condition::WARN_LEVEL_WARN,
 					ER_CANT_GET_STAT,
 					"InnoDB: Trying to get the free "
 					"space for table %s but its "
 					"tablespace has been discarded or "
 					"the .ibd file is missing. Setting "
-					"the free space to zero.",
-					ib_table->name);
+					"the free space to zero. "
+					"(errno: %M)",
+					ib_table->name, errno);
 
 				stats.delete_length = 0;
 			} else {
@@ -9662,12 +11780,13 @@ ha_innobase::info_low(
 		}
 
 		stats.check_time = 0;
-		stats.mrr_length_per_rec= ref_length +  portable_sizeof_char_ptr;
+		stats.mrr_length_per_rec = ref_length + sizeof(void*);
 
 		if (stats.records == 0) {
 			stats.mean_rec_length = 0;
 		} else {
-			stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
+			stats.mean_rec_length = (ulong)
+				(stats.data_file_length / stats.records);
 		}
 	}
 
@@ -9677,10 +11796,41 @@ ha_innobase::info_low(
 		matches up. If prebuilt->clust_index_was_generated
 		holds, InnoDB defines GEN_CLUST_INDEX internally */
 		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
-					- prebuilt->clust_index_was_generated;
+			- prebuilt->clust_index_was_generated;
+		if (table->s->keys < num_innodb_index) {
+			/* If there are too many indexes defined
+			inside InnoDB, ignore those that are being
+			created, because MySQL will only consider
+			the fully built indexes here. */
+
+			for (const dict_index_t* index
+				     = UT_LIST_GET_FIRST(ib_table->indexes);
+			     index != NULL;
+			     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+				/* First, online index creation is
+				completed inside InnoDB, and then
+				MySQL attempts to upgrade the
+				meta-data lock so that it can rebuild
+				the .frm file. If we get here in that
+				time frame, dict_index_is_online_ddl()
+				would not hold and the index would
+				still not be included in TABLE_SHARE. */
+				if (*index->name == TEMP_INDEX_PREFIX) {
+					num_innodb_index--;
+				}
+			}
+
+			if (table->s->keys < num_innodb_index
+			    && innobase_fts_check_doc_id_index(
+				    ib_table, NULL, NULL)
+			    == FTS_EXIST_DOC_ID_INDEX) {
+				num_innodb_index--;
+			}
+		}
 
 		if (table->s->keys != num_innodb_index) {
-			sql_print_error("Table %s contains %lu "
+			sql_print_error("InnoDB: Table %s contains %lu "
 					"indexes inside InnoDB, which "
 					"is different from the number of "
 					"indexes %u defined in the MySQL ",
@@ -9688,7 +11838,11 @@ ha_innobase::info_low(
 					table->s->keys);
 		}
 
-		dict_table_stats_lock(ib_table, RW_S_LATCH);
+		if (!(flag & HA_STATUS_NO_LOCK)) {
+			dict_table_stats_lock(ib_table, RW_S_LATCH);
+		}
+
+		ut_a(ib_table->stat_initialized);
 
 		for (i = 0; i < table->s->keys; i++) {
 			ulong	j;
@@ -9698,7 +11852,7 @@ ha_innobase::info_low(
 			The identity of index (match up index name with
 			that of table->key_info[i]) is already verified in
 			innobase_get_index().  */
-			index = innobase_get_index(i);
+			dict_index_t* index = innobase_get_index(i);
 
 			if (index == NULL) {
 				sql_print_error("Table %s contains fewer "
@@ -9713,18 +11867,29 @@ ha_innobase::info_low(
 				break;
 			}
 
-			for (j = 0; j < table->key_info[i].key_parts; j++) {
+			for (j = 0; j < table->key_info[i].ext_key_parts; j++) {
+
+				if (table->key_info[i].flags & HA_FULLTEXT) {
+					/* The whole concept has no validity
+					for FTS indexes. */
+					table->key_info[i].rec_per_key[j] = 1;
+					continue;
+				}
 
 				if (j + 1 > index->n_uniq) {
 					sql_print_error(
-"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
-"statistics for %lu columns. Have you mixed up .frm files from different "
-"installations? "
-"See " REFMAN "innodb-troubleshooting.html\n",
-							index->name,
-							ib_table->name,
-							(unsigned long)
-							index->n_uniq, j + 1);
+						"Index %s of %s has %lu columns"
+					        " unique inside InnoDB, but "
+						"MySQL is asking statistics for"
+					        " %lu columns. Have you mixed "
+						"up .frm files from different "
+					       	"installations? "
+						"See " REFMAN
+						"innodb-troubleshooting.html\n",
+						index->name,
+						ib_table->name,
+						(unsigned long)
+						index->n_uniq, j + 1);
 					break;
 				}
 
@@ -9742,58 +11907,16 @@ ha_innobase::info_low(
 					rec_per_key = 1;
 				}
 
-				table->key_info[i].rec_per_key[j]=
+				table->key_info[i].rec_per_key[j] =
 				  rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
 				  (ulong) rec_per_key;
 			}
 
-                        KEY *key_info= table->key_info+i; 
-                        key_part_map ext_key_part_map=
-                                             key_info->ext_key_part_map;                               
-
-                        if (key_info->key_parts != key_info->ext_key_parts) {
-
-                                KEY *pk_key_info= key_info+
-                                                  table->s->primary_key;
-                                uint k = key_info->key_parts;
-                                ha_rows k_rec_per_key = rec_per_key;
-                                uint pk_parts = pk_key_info->key_parts;
-                          
-		                index= innobase_get_index(
-                                        table->s->primary_key);
-                                
-                                n_rows= ib_table->stat_n_rows;
-    
-                                for (j = 0; j < pk_parts; j++) {
- 
-				         if (ext_key_part_map & 1<<j) {
-
-                                                rec_per_key =
-						innodb_rec_per_key(index,
-                                                        j, stats.records);
-                               
-				                if (rec_per_key == 0) {
-					                rec_per_key = 1;
-				                }
-                                                else if (rec_per_key > 1) {
-                                                        rec_per_key =
-                                                          (ha_rows)
-                                                          (k_rec_per_key *
-						          (double)rec_per_key /
-							   n_rows);
-						}
-                                                
-				                key_info->rec_per_key[k++]=
-				                rec_per_key >= ~(ulong) 0 ?
-                                                ~(ulong) 0 :
-                                                (ulong) rec_per_key;
-
-					} 
-				}
-			}                                         
 		}
 
-		dict_table_stats_unlock(ib_table, RW_S_LATCH);
+		if (!(flag & HA_STATUS_NO_LOCK)) {
+			dict_table_stats_unlock(ib_table, RW_S_LATCH);
+		}
 	}
 
 	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
@@ -9813,7 +11936,11 @@ ha_innobase::info_low(
 			errkey = innobase_get_mysql_key_number_for_index(
 					share, table, ib_table, err_index);
 		} else {
-			errkey = (unsigned int) prebuilt->trx->error_key_num;
+			errkey = (unsigned int) (
+				(prebuilt->trx->error_key_num
+				 == ULINT_UNDEFINED)
+					? ~0
+					: prebuilt->trx->error_key_num);
 		}
 	}
 
@@ -9829,20 +11956,21 @@ func_exit:
 
 /*********************************************************************//**
 Returns statistics information of the table to the MySQL interpreter,
-in various fields of the handle object. */
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
 UNIV_INTERN
 int
 ha_innobase::info(
 /*==============*/
-	uint	flag)	/*!< in: what information MySQL requests */
+	uint	flag)	/*!< in: what information is requested */
 {
-	return(info_low(flag, false /* not called from analyze */));
+	return(this->info_low(flag, false /* not ANALYZE */));
 }
 
 /**********************************************************************//**
-Updates index cardinalities of the table, based on 8 random dives into
+Updates index cardinalities of the table, based on random dives into
 each index tree. This does NOT calculate exact statistics on the table.
-@return	returns always 0 (success) */
+@return	HA_ADMIN_* error code or HA_ADMIN_OK */
 UNIV_INTERN
 int
 ha_innobase::analyze(
@@ -9850,19 +11978,27 @@ ha_innobase::analyze(
 	THD*		thd,		/*!< in: connection thread handle */
 	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
 {
-	if (share->ib_table->is_corrupt) {
+	int	ret;
+
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		return(HA_ADMIN_CORRUPT);
 	}
 
-	/* Simply call ::info() with all the flags */
-	info_low(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
-		 true /* called from analyze */);
+	/* Simply call this->info_low() with all the flags
+	and request recalculation of the statistics */
+	ret = this->info_low(
+		HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+		true /* this is ANALYZE */);
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		return(HA_ADMIN_CORRUPT);
 	}
 
-	return(0);
+	if (ret != 0) {
+		return(HA_ADMIN_FAILED);
+	}
+
+	return(HA_ADMIN_OK);
 }
 
 /**********************************************************************//**
@@ -9875,7 +12011,25 @@ ha_innobase::optimize(
 	THD*		thd,		/*!< in: connection thread handle */
 	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
 {
-	return(HA_ADMIN_TRY_ALTER);
+	/*FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+	we have to hijack some existing command in order to be able to test
+	the new admin commands added in InnoDB's FTS support. For now, we
+	use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+	InnoDB (so it recreates the table anew), and map it to FTS optimize.
+
+	This works OK otherwise, but MySQL locks the entire table during
+	calls to OPTIMIZE, which is undesirable. */
+
+	if (innodb_optimize_fulltext_only) {
+		if (prebuilt->table->fts && prebuilt->table->fts->cache) {
+			fts_sync_table(prebuilt->table);
+			fts_optimize_table(prebuilt->table);
+		}
+		return(HA_ADMIN_OK);
+	} else {
+
+		return(HA_ADMIN_TRY_ALTER);
+	}
 }
 
 /*******************************************************************//**
@@ -9911,19 +12065,23 @@ ha_innobase::check(
 		build_template(true);
 	}
 
-	if (prebuilt->table->ibd_file_missing) {
-		sql_print_error("InnoDB: Error:\n"
-			"InnoDB: MySQL is trying to use a table handle"
-			" but the .ibd file for\n"
-			"InnoDB: table %s does not exist.\n"
-			"InnoDB: Have you deleted the .ibd file"
-			" from the database directory under\n"
-			"InnoDB: the MySQL datadir, or have you"
-			" used DISCARD TABLESPACE?\n"
-			"InnoDB: Please refer to\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
-			"InnoDB: how you can resolve the problem.\n",
-			prebuilt->table->name);
+	if (dict_table_is_discarded(prebuilt->table)) {
+
+		ib_senderrf(
+			thd,
+			IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_DISCARDED,
+			table->s->table_name.str);
+
+		DBUG_RETURN(HA_ADMIN_CORRUPT);
+
+	} else if (prebuilt->table->ibd_file_missing) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_MISSING,
+			table->s->table_name.str);
+
 		DBUG_RETURN(HA_ADMIN_CORRUPT);
 	}
 
@@ -9947,31 +12105,27 @@ ha_innobase::check(
 	prebuilt->table->corrupted = FALSE;
 
 	/* Enlarge the fatal lock wait timeout during CHECK TABLE. */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold += SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_increment_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold,
+		SRV_SEMAPHORE_WAIT_EXTENSION);
 
 	for (index = dict_table_get_first_index(prebuilt->table);
 	     index != NULL;
 	     index = dict_table_get_next_index(index)) {
 		char	index_name[MAX_FULL_NAME_LEN + 1];
-#if 0
-		fputs("Validating index ", stderr);
-		ut_print_name(stderr, trx, FALSE, index->name);
-		putc('\n', stderr);
-#endif
 
-		/* If this is an index being created, break */
+		/* If this is an index being created or dropped, break */
 		if (*index->name == TEMP_INDEX_PREFIX) {
 			break;
-		}  else if (!btr_validate_index(index, prebuilt->trx)) {
+		} else if (!btr_validate_index(index, prebuilt->trx)) {
 			is_ok = FALSE;
 
 			innobase_format_name(
 				index_name, sizeof index_name,
-				prebuilt->index->name, TRUE);
+				index->name, TRUE);
 
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NOT_KEYFILE,
 					    "InnoDB: The B-tree of"
 					    " index %s is corrupted.",
@@ -9994,7 +12148,8 @@ ha_innobase::check(
 
 			if (dict_index_is_corrupted(prebuilt->index)) {
 				push_warning_printf(
-					user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					user_thd,
+					Sql_condition::WARN_LEVEL_WARN,
 					HA_ERR_INDEX_CORRUPT,
 					"InnoDB: Index %s is marked as"
 					" corrupted",
@@ -10002,7 +12157,8 @@ ha_innobase::check(
 				is_ok = FALSE;
 			} else {
 				push_warning_printf(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					thd,
+					Sql_condition::WARN_LEVEL_WARN,
 					HA_ERR_TABLE_DEF_CHANGED,
 					"InnoDB: Insufficient history for"
 					" index %s",
@@ -10025,15 +12181,15 @@ ha_innobase::check(
 				index_name, sizeof index_name,
 				index->name, TRUE);
 
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_NOT_KEYFILE,
-					    "InnoDB: The B-tree of"
-					    " index %s is corrupted.",
-					    index_name);
+			push_warning_printf(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_NOT_KEYFILE,
+				"InnoDB: The B-tree of"
+				" index %s is corrupted.",
+				index_name);
 			is_ok = FALSE;
-			row_mysql_lock_data_dictionary(prebuilt->trx);
-			dict_set_corrupted(index);
-			row_mysql_unlock_data_dictionary(prebuilt->trx);
+			dict_set_corrupted(
+				index, prebuilt->trx, "CHECK TABLE-check index");
 		}
 
 		if (thd_kill_level(user_thd)) {
@@ -10047,19 +12203,20 @@ ha_innobase::check(
 
 		if (index == dict_table_get_first_index(prebuilt->table)) {
 			n_rows_in_table = n_rows;
-		} else if (n_rows != n_rows_in_table) {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_NOT_KEYFILE,
-					    "InnoDB: Index '%-.200s'"
-					    " contains %lu entries,"
-					    " should be %lu.",
-					    index->name,
-					    (ulong) n_rows,
-					    (ulong) n_rows_in_table);
+		} else if (!(index->type & DICT_FTS)
+			   && (n_rows != n_rows_in_table)) {
+			push_warning_printf(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_NOT_KEYFILE,
+				"InnoDB: Index '%-.200s' contains %lu"
+				" entries, should be %lu.",
+				index->name,
+				(ulong) n_rows,
+				(ulong) n_rows_in_table);
 			is_ok = FALSE;
-			row_mysql_lock_data_dictionary(prebuilt->trx);
-			dict_set_corrupted(index);
-			row_mysql_unlock_data_dictionary(prebuilt->trx);
+			dict_set_corrupted(
+				index, prebuilt->trx,
+				"CHECK TABLE; Wrong count");
 		}
 	}
 
@@ -10070,9 +12227,8 @@ ha_innobase::check(
 		index = dict_table_get_first_index(prebuilt->table);
 
 		if (!dict_index_is_corrupted(index)) {
-			mutex_enter(&dict_sys->mutex);
-			dict_set_corrupted(index);
-			mutex_exit(&dict_sys->mutex);
+			dict_set_corrupted(
+				index, prebuilt->trx, "CHECK TABLE");
 		}
 		prebuilt->table->corrupted = TRUE;
 	}
@@ -10084,23 +12240,24 @@ ha_innobase::check(
 	at every CHECK TABLE */
 
 	if (!btr_search_validate()) {
-		push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
 			     ER_NOT_KEYFILE,
 			     "InnoDB: The adaptive hash index is corrupted.");
 		is_ok = FALSE;
 	}
 
 	/* Restore the fatal lock wait timeout after CHECK TABLE. */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold -= SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_decrement_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold,
+		SRV_SEMAPHORE_WAIT_EXTENSION);
 
 	prebuilt->trx->op_info = "";
 	if (thd_kill_level(user_thd)) {
 		my_error(ER_QUERY_INTERRUPTED, MYF(0));
 	}
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		return(HA_ADMIN_CORRUPT);
 	}
 
@@ -10127,7 +12284,7 @@ ha_innobase::update_table_comment(
 	handle. */
 
 	if (length > 64000 - 3) {
-		return((char*)comment); /* string too long */
+		return((char*) comment); /* string too long */
 	}
 
 	update_thd(ha_thd());
@@ -10142,40 +12299,47 @@ ha_innobase::update_table_comment(
 
 	/* output the data to a temporary file */
 
-	mutex_enter(&srv_dict_tmpfile_mutex);
-	rewind(srv_dict_tmpfile);
+	if (!srv_read_only_mode) {
 
-	fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB",
-		fsp_get_available_space_in_free_extents(
-			prebuilt->table->space));
+		mutex_enter(&srv_dict_tmpfile_mutex);
 
-	dict_print_info_on_foreign_keys(FALSE, srv_dict_tmpfile,
-				prebuilt->trx, prebuilt->table);
-	flen = ftell(srv_dict_tmpfile);
-	if (flen < 0) {
-		flen = 0;
-	} else if (length + flen + 3 > 64000) {
-		flen = 64000 - 3 - length;
-	}
+		rewind(srv_dict_tmpfile);
 
-	/* allocate buffer for the full string, and
-	read the contents of the temporary file */
+		fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB",
+			fsp_get_available_space_in_free_extents(
+				prebuilt->table->space));
 
-	str = (char*) my_malloc(length + flen + 3, MYF(0));
+		dict_print_info_on_foreign_keys(
+			FALSE, srv_dict_tmpfile, prebuilt->trx,
+			prebuilt->table);
 
-	if (str) {
-		char* pos	= str + length;
-		if (length) {
-			memcpy(str, comment, length);
-			*pos++ = ';';
-			*pos++ = ' ';
+		flen = ftell(srv_dict_tmpfile);
+
+		if (flen < 0) {
+			flen = 0;
+		} else if (length + flen + 3 > 64000) {
+			flen = 64000 - 3 - length;
 		}
-		rewind(srv_dict_tmpfile);
-		flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile);
-		pos[flen] = 0;
-	}
 
-	mutex_exit(&srv_dict_tmpfile_mutex);
+		/* allocate buffer for the full string, and
+		read the contents of the temporary file */
+
+		str = (char*) my_malloc(length + flen + 3, MYF(0));
+
+		if (str) {
+			char* pos	= str + length;
+			if (length) {
+				memcpy(str, comment, length);
+				*pos++ = ';';
+				*pos++ = ' ';
+			}
+			rewind(srv_dict_tmpfile);
+			flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile);
+			pos[flen] = 0;
+		}
+
+		mutex_exit(&srv_dict_tmpfile_mutex);
+	}
 
 	prebuilt->trx->op_info = (char*)"";
 
@@ -10192,8 +12356,8 @@ char*
 ha_innobase::get_foreign_key_create_info(void)
 /*==========================================*/
 {
-	char*	str	= 0;
 	long	flen;
+	char*	str	= 0;
 
 	ut_a(prebuilt != NULL);
 
@@ -10211,31 +12375,36 @@ ha_innobase::get_foreign_key_create_info(void)
 
 	trx_search_latch_release_if_reserved(prebuilt->trx);
 
-	mutex_enter(&srv_dict_tmpfile_mutex);
-	rewind(srv_dict_tmpfile);
+	if (!srv_read_only_mode) {
+		mutex_enter(&srv_dict_tmpfile_mutex);
+		rewind(srv_dict_tmpfile);
 
-	/* output the data to a temporary file */
-	dict_print_info_on_foreign_keys(TRUE, srv_dict_tmpfile,
-				prebuilt->trx, prebuilt->table);
-	prebuilt->trx->op_info = (char*)"";
+		/* Output the data to a temporary file */
+		dict_print_info_on_foreign_keys(
+			TRUE, srv_dict_tmpfile, prebuilt->trx,
+			prebuilt->table);
 
-	flen = ftell(srv_dict_tmpfile);
-	if (flen < 0) {
-		flen = 0;
-	}
+		prebuilt->trx->op_info = (char*)"";
 
-	/* allocate buffer for the string, and
-	read the contents of the temporary file */
+		flen = ftell(srv_dict_tmpfile);
 
-	str = (char*) my_malloc(flen + 1, MYF(0));
+		if (flen < 0) {
+			flen = 0;
+		}
 
-	if (str) {
-		rewind(srv_dict_tmpfile);
-		flen = (uint) fread(str, 1, flen, srv_dict_tmpfile);
-		str[flen] = 0;
-	}
+		/* Allocate buffer for the string, and
+		read the contents of the temporary file */
 
-	mutex_exit(&srv_dict_tmpfile_mutex);
+		str = (char*) my_malloc(flen + 1, MYF(0));
+
+		if (str) {
+			rewind(srv_dict_tmpfile);
+			flen = (uint) fread(str, 1, flen, srv_dict_tmpfile);
+			str[flen] = 0;
+		}
+
+		mutex_exit(&srv_dict_tmpfile_mutex);
+	}
 
 	return(str);
 }
@@ -10354,7 +12523,7 @@ get_foreign_key_info(
 
 	f_key_info.referenced_key_name = referenced_key_name;
 
-	pf_key_info = (FOREIGN_KEY_INFO *) thd_memdup(thd, &f_key_info,
+	pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info,
 						      sizeof(FOREIGN_KEY_INFO));
 
 	return(pf_key_info);
@@ -10449,17 +12618,16 @@ ha_innobase::can_switch_engines(void)
 	bool	can_switch;
 
 	DBUG_ENTER("ha_innobase::can_switch_engines");
-
-	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+	update_thd();
 
 	prebuilt->trx->op_info =
 			"determining if there are foreign key constraints";
-	row_mysql_lock_data_dictionary(prebuilt->trx);
+	row_mysql_freeze_data_dictionary(prebuilt->trx);
 
 	can_switch = !UT_LIST_GET_FIRST(prebuilt->table->referenced_list)
 			&& !UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
 
-	row_mysql_unlock_data_dictionary(prebuilt->trx);
+	row_mysql_unfreeze_data_dictionary(prebuilt->trx);
 	prebuilt->trx->op_info = "";
 
 	DBUG_RETURN(can_switch);
@@ -10508,58 +12676,63 @@ ha_innobase::extra(
 	enum ha_extra_function operation)
 			   /*!< in: HA_EXTRA_FLUSH or some other flag */
 {
+	check_trx_exists(ha_thd());
+
 	/* Warning: since it is not sure that MySQL calls external_lock
 	before calling this function, the trx field in prebuilt can be
 	obsolete! */
 
 	switch (operation) {
-		case HA_EXTRA_FLUSH:
-			if (prebuilt->blob_heap) {
-				row_mysql_prebuilt_free_blob_heap(prebuilt);
-			}
-			break;
-		case HA_EXTRA_RESET_STATE:
-			reset_template();
-			thd_to_trx(ha_thd())->duplicates = 0;
-			break;
-		case HA_EXTRA_NO_KEYREAD:
-			prebuilt->read_just_key = 0;
-			break;
-		case HA_EXTRA_KEYREAD:
-			prebuilt->read_just_key = 1;
-			break;
-		case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
-			prebuilt->keep_other_fields_on_keyread = 1;
-			break;
+	case HA_EXTRA_FLUSH:
+		if (prebuilt->blob_heap) {
+			row_mysql_prebuilt_free_blob_heap(prebuilt);
+		}
+		break;
+	case HA_EXTRA_RESET_STATE:
+		reset_template();
+		thd_to_trx(ha_thd())->duplicates = 0;
+		break;
+	case HA_EXTRA_NO_KEYREAD:
+		prebuilt->read_just_key = 0;
+		break;
+	case HA_EXTRA_KEYREAD:
+		prebuilt->read_just_key = 1;
+		break;
+	case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+		prebuilt->keep_other_fields_on_keyread = 1;
+		break;
 
-			/* IMPORTANT: prebuilt->trx can be obsolete in
-			this method, because it is not sure that MySQL
-			calls external_lock before this method with the
-			parameters below.  We must not invoke update_thd()
-			either, because the calling threads may change.
-			CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
-		case HA_EXTRA_INSERT_WITH_UPDATE:
-			thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
-			break;
-		case HA_EXTRA_NO_IGNORE_DUP_KEY:
-			thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
-			break;
-		case HA_EXTRA_WRITE_CAN_REPLACE:
-			thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
-			break;
-		case HA_EXTRA_WRITE_CANNOT_REPLACE:
-			thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
-			break;
-		default:/* Do nothing */
-			;
+		/* IMPORTANT: prebuilt->trx can be obsolete in
+		this method, because it is not sure that MySQL
+		calls external_lock before this method with the
+		parameters below.  We must not invoke update_thd()
+		either, because the calling threads may change.
+		CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
+	case HA_EXTRA_INSERT_WITH_UPDATE:
+		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+		break;
+	case HA_EXTRA_NO_IGNORE_DUP_KEY:
+		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
+		break;
+	case HA_EXTRA_WRITE_CAN_REPLACE:
+		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+		break;
+	case HA_EXTRA_WRITE_CANNOT_REPLACE:
+		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+		break;
+	default:/* Do nothing */
+		;
 	}
 
 	return(0);
 }
 
+/******************************************************************//**
+*/
 UNIV_INTERN
 int
 ha_innobase::reset()
+/*================*/
 {
 	if (prebuilt->blob_heap) {
 		row_mysql_prebuilt_free_blob_heap(prebuilt);
@@ -10610,7 +12783,8 @@ ha_innobase::start_stmt(
 	INSERT, for example. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Reset the AUTOINC statement level counter for multi-row INSERTs. */
 	trx->n_autoinc_rows = 0;
@@ -10644,6 +12818,7 @@ ha_innobase::start_stmt(
 		3) ::init_table_handle_for_HANDLER(), and
 		4) ::transactional_table_lock(). */
 
+		ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
 		prebuilt->select_lock_type = prebuilt->stored_select_lock_type;
 	}
 
@@ -10651,6 +12826,10 @@ ha_innobase::start_stmt(
 
 	innobase_register_trx(ht, thd, trx);
 
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
 	return(0);
 }
 
@@ -10663,13 +12842,16 @@ innobase_map_isolation_level(
 /*=========================*/
 	enum_tx_isolation	iso)	/*!< in: MySQL isolation level code */
 {
-	switch(iso) {
-		case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
-		case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
-		case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
-		case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
-		default: ut_a(0); return(0);
+	switch (iso) {
+	case ISO_REPEATABLE_READ:	return(TRX_ISO_REPEATABLE_READ);
+	case ISO_READ_COMMITTED:	return(TRX_ISO_READ_COMMITTED);
+	case ISO_SERIALIZABLE:		return(TRX_ISO_SERIALIZABLE);
+	case ISO_READ_UNCOMMITTED:	return(TRX_ISO_READ_UNCOMMITTED);
 	}
+
+	ut_error;
+
+	return(0);
 }
 
 /******************************************************************//**
@@ -10701,15 +12883,15 @@ ha_innobase::external_lock(
 	informative error message and return with an error.
 	Note: decide_logging_format would give the same error message,
 	except it cannot give the extra details. */
+
 	if (lock_type == F_WRLCK
 	    && !(table_flags() & HA_BINLOG_STMT_CAPABLE)
 	    && thd_binlog_format(thd) == BINLOG_FORMAT_STMT
 	    && thd_binlog_filter_ok(thd)
-            && thd_sqlcom_can_generate_row_events(thd))
-        {
-		int skip = 0;
+	    && thd_sqlcom_can_generate_row_events(thd)) {
+		bool skip = 0;
 		/* used by test case */
-		DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = 1;);
+		DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;);
 		if (!skip) {
 			my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0),
 			         " InnoDB is limited to row-logging when "
@@ -10719,6 +12901,32 @@ ha_innobase::external_lock(
 		}
 	}
 
+	/* Check for UPDATEs in read-only mode. */
+	if (srv_read_only_mode
+	    && (thd_sql_command(thd) == SQLCOM_UPDATE
+		|| thd_sql_command(thd) == SQLCOM_INSERT
+		|| thd_sql_command(thd) == SQLCOM_REPLACE
+		|| thd_sql_command(thd) == SQLCOM_DROP_TABLE
+		|| thd_sql_command(thd) == SQLCOM_ALTER_TABLE
+		|| thd_sql_command(thd) == SQLCOM_OPTIMIZE
+		|| (thd_sql_command(thd) == SQLCOM_CREATE_TABLE
+		    && lock_type == F_WRLCK)
+		|| thd_sql_command(thd) == SQLCOM_CREATE_INDEX
+		|| thd_sql_command(thd) == SQLCOM_DROP_INDEX
+		|| thd_sql_command(thd) == SQLCOM_DELETE)) {
+
+		if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE)
+		{
+			ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+				    ER_READ_ONLY_MODE);
+			DBUG_RETURN(HA_ERR_TABLE_READONLY);
+		} else {
+			ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+				    ER_READ_ONLY_MODE);
+			DBUG_RETURN(HA_ERR_TABLE_READONLY);
+		}
+
+	}
 
 	trx = prebuilt->trx;
 
@@ -10727,6 +12935,41 @@ ha_innobase::external_lock(
 
 	reset_template();
 
+	switch (prebuilt->table->quiesce) {
+	case QUIESCE_START:
+		/* Check for FLUSH TABLE t WITH READ LOCK; */
+		if (!srv_read_only_mode
+		    && thd_sql_command(thd) == SQLCOM_FLUSH
+		    && lock_type == F_RDLCK) {
+
+			row_quiesce_table_start(prebuilt->table, trx);
+
+			/* Use the transaction instance to track UNLOCK
+			TABLES. It can be done via START TRANSACTION; too
+			implicitly. */
+
+			++trx->flush_tables;
+		}
+		break;
+
+	case QUIESCE_COMPLETE:
+		/* Check for UNLOCK TABLES; implicit or explicit
+		or trx interruption. */
+		if (trx->flush_tables > 0
+		    && (lock_type == F_UNLCK || trx_is_interrupted(trx))) {
+
+			row_quiesce_table_complete(prebuilt->table, trx);
+
+			ut_a(trx->flush_tables > 0);
+			--trx->flush_tables;
+		}
+
+		break;
+
+	case QUIESCE_NONE:
+		break;
+	}
+
 	if (lock_type == F_WRLCK) {
 
 		/* If this is a SELECT, then it is in UPDATE TABLE ...
@@ -10777,13 +13020,13 @@ ha_innobase::external_lock(
 			    && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
 			    && thd_in_lock_tables(thd)) {
 
-				ulint	error = row_lock_table_for_mysql(
+				dberr_t	error = row_lock_table_for_mysql(
 					prebuilt, NULL, 0);
 
 				if (error != DB_SUCCESS) {
-					error = convert_error_code_to_mysql(
-						(int) error, 0, thd);
-					DBUG_RETURN((int) error);
+					DBUG_RETURN(
+						convert_error_code_to_mysql(
+							error, 0, thd));
 				}
 			}
 
@@ -10793,6 +13036,13 @@ ha_innobase::external_lock(
 		trx->n_mysql_tables_in_use++;
 		prebuilt->mysql_has_locked = TRUE;
 
+		if (!trx_is_started(trx)
+		    && (prebuilt->select_lock_type != LOCK_NONE
+			|| prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+			++trx->will_lock;
+		}
+
 		DBUG_RETURN(0);
 	}
 
@@ -10802,11 +13052,12 @@ ha_innobase::external_lock(
 	prebuilt->mysql_has_locked = FALSE;
 
 	/* Release a possible FIFO ticket and search latch. Since we
-	may reserve the kernel mutex, we have to release the search
+	may reserve the trx_sys->mutex, we have to release the search
 	system latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If the MySQL lock count drops to zero we know that the current SQL
 	statement has ended */
@@ -10855,6 +13106,13 @@ ha_innobase::external_lock(
 		}
 	}
 
+	if (!trx_is_started(trx)
+	    && (prebuilt->select_lock_type != LOCK_NONE
+		|| prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+		++trx->will_lock;
+	}
+
 	DBUG_RETURN(0);
 }
 
@@ -10880,23 +13138,27 @@ ha_innobase::transactional_table_lock(
 
 	update_thd(thd);
 
-	if (share->ib_table->is_corrupt) {
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
-	if (prebuilt->table->ibd_file_missing && !thd_tablespace_op(thd)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: MySQL is trying to use a table handle"
-			" but the .ibd file for\n"
-			"InnoDB: table %s does not exist.\n"
-			"InnoDB: Have you deleted the .ibd file"
-			" from the database directory under\n"
-			"InnoDB: the MySQL datadir?"
-			"InnoDB: See " REFMAN
-			"innodb-troubleshooting.html\n"
-			"InnoDB: how you can resolve the problem.\n",
-			prebuilt->table->name);
+	/* Refuse the lock only when the tablespace is actually unusable
+	(discarded, or its .ibd file is missing). A healthy table must fall
+	through to the locking code below instead of failing here. */
+	if (!thd_tablespace_op(thd)
+	    && (dict_table_is_discarded(prebuilt->table)
+		|| prebuilt->table->ibd_file_missing)) {
+
+		if (dict_table_is_discarded(prebuilt->table)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLESPACE_DISCARDED,
+				table->s->table_name.str);
+
+		} else {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLESPACE_MISSING,
+				table->s->table_name.str);
+		}
+
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -10914,11 +13176,12 @@ ha_innobase::transactional_table_lock(
 		prebuilt->select_lock_type = LOCK_S;
 		prebuilt->stored_select_lock_type = LOCK_S;
 	} else {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB error:\n"
-"MySQL is trying to set transactional table lock with corrupted lock type\n"
-"to table %s, lock type %d does not exist.\n",
-				prebuilt->table->name, lock_type);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"MySQL is trying to set transactional table lock "
+			"with corrupted lock type to table %s, lock type "
+			"%d does not exist.",
+			table->s->table_name.str, lock_type);
+
 		DBUG_RETURN(HA_ERR_CRASHED);
 	}
 
@@ -10927,14 +13190,14 @@ ha_innobase::transactional_table_lock(
 	innobase_register_trx(ht, thd, trx);
 
 	if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) {
-		ulint	error = DB_SUCCESS;
+		dberr_t	error;
 
 		error = row_lock_table_for_mysql(prebuilt, NULL, 0);
 
 		if (error != DB_SUCCESS) {
-			error = convert_error_code_to_mysql(
-				(int) error, prebuilt->table->flags, thd);
-			DBUG_RETURN((int) error);
+			DBUG_RETURN(
+				convert_error_code_to_mysql(
+					error, prebuilt->table->flags, thd));
 		}
 
 		if (thd_test_options(
@@ -10955,8 +13218,8 @@ ha_innobase::transactional_table_lock(
 Here we export InnoDB status variables to MySQL. */
 static
 void
-innodb_export_status(void)
-/*======================*/
+innodb_export_status()
+/*==================*/
 {
 	if (innodb_inited) {
 		srv_export_innodb_status();
@@ -10964,15 +13227,16 @@ innodb_export_status(void)
 }
 
 /************************************************************************//**
-Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
-Monitor to the client. */
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
 static
-bool
+int
 innodb_show_status(
 /*===============*/
 	handlerton*	hton,	/*!< in: the innodb handlerton */
-	THD*	thd,	/*!< in: the MySQL query thread of the caller */
-	stat_print_fn *stat_print)
+	THD*		thd,	/*!< in: the MySQL query thread of the caller */
+	stat_print_fn*	stat_print)
 {
 	trx_t*			trx;
 	static const char	truncated_msg[] = "... truncated...\n";
@@ -10984,25 +13248,34 @@ innodb_show_status(
 	DBUG_ENTER("innodb_show_status");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
+	/* We don't create the temp files or associated
+	mutexes in read-only-mode */
+
+	if (srv_read_only_mode) {
+		DBUG_RETURN(0);
+	}
+
 	trx = check_trx_exists(thd);
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE
 	bytes of text. */
 
-	long	flen, usable_len;
 	char*	str;
+	ssize_t	flen, usable_len;
 
 	mutex_enter(&srv_monitor_file_mutex);
 	rewind(srv_monitor_file);
+
 	srv_printf_innodb_monitor(srv_monitor_file, FALSE,
 				  &trx_list_start, &trx_list_end);
-	flen = ftell(srv_monitor_file);
+
 	os_file_set_eof(srv_monitor_file);
 
-	if (flen < 0) {
+	if ((flen = ftell(srv_monitor_file)) < 0) {
 		flen = 0;
 	}
 
@@ -11018,28 +13291,31 @@ innodb_show_status(
 
 	if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
 		mutex_exit(&srv_monitor_file_mutex);
-		DBUG_RETURN(TRUE);
+		DBUG_RETURN(1);
 	}
 
 	rewind(srv_monitor_file);
+
 	if (flen < MAX_STATUS_SIZE) {
 		/* Display the entire output. */
-		flen = (long) fread(str, 1, flen, srv_monitor_file);
+		flen = fread(str, 1, flen, srv_monitor_file);
 	} else if (trx_list_end < (ulint) flen
-			&& trx_list_start < trx_list_end
-			&& trx_list_start + (flen - trx_list_end)
-			< MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+		   && trx_list_start < trx_list_end
+		   && trx_list_start + (flen - trx_list_end)
+		   < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+
 		/* Omit the beginning of the list of active transactions. */
-		long len = (long) fread(str, 1, trx_list_start, srv_monitor_file);
+		ssize_t	len = fread(str, 1, trx_list_start, srv_monitor_file);
+
 		memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
 		len += sizeof truncated_msg - 1;
 		usable_len = (MAX_STATUS_SIZE - 1) - len;
 		fseek(srv_monitor_file, flen - usable_len, SEEK_SET);
-		len += (long) fread(str + len, 1, usable_len, srv_monitor_file);
+		len += fread(str + len, 1, usable_len, srv_monitor_file);
 		flen = len;
 	} else {
 		/* Omit the end of the output. */
-		flen = (long) fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+		flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
 	}
 
 	mutex_exit(&srv_monitor_file_mutex);
@@ -11055,9 +13331,9 @@ innodb_show_status(
 
 /************************************************************************//**
 Implements the SHOW MUTEX STATUS command.
-@return TRUE on failure, FALSE on success. */
+@return 0 on success. */
 static
-bool
+int
 innodb_mutex_show_status(
 /*=====================*/
 	handlerton*	hton,		/*!< in: the innodb handlerton */
@@ -11066,22 +13342,28 @@ innodb_mutex_show_status(
 	stat_print_fn*	stat_print)	/*!< in: function for printing
 					statistics */
 {
-	char buf1[IO_SIZE], buf2[IO_SIZE];
-	mutex_t*	mutex;
+	char		buf1[IO_SIZE];
+	char		buf2[IO_SIZE];
+	ib_mutex_t*	mutex;
 	rw_lock_t*	lock;
 	ulint		block_mutex_oswait_count = 0;
 	ulint		block_lock_oswait_count = 0;
-	mutex_t*	block_mutex = NULL;
+	ib_mutex_t*	block_mutex = NULL;
 	rw_lock_t*	block_lock = NULL;
 #ifdef UNIV_DEBUG
-	ulint	  rw_lock_count= 0;
-	ulint	  rw_lock_count_spin_loop= 0;
-	ulint	  rw_lock_count_spin_rounds= 0;
-	ulint	  rw_lock_count_os_wait= 0;
-	ulint	  rw_lock_count_os_yield= 0;
-	ulonglong rw_lock_wait_time= 0;
+	ulint		rw_lock_count= 0;
+	ulint		rw_lock_count_spin_loop= 0;
+	ulint		rw_lock_count_spin_rounds= 0;
+	ulint		rw_lock_count_os_wait= 0;
+	ulint		rw_lock_count_os_yield= 0;
+	ulonglong	rw_lock_wait_time= 0;
 #endif /* UNIV_DEBUG */
-	uint	  hton_name_len= (uint) strlen(innobase_hton_name), buf1len, buf2len;
+	uint		buf1len;
+	uint		buf2len;
+	uint		hton_name_len;
+
+	hton_name_len = (uint) strlen(innobase_hton_name);
+
 	DBUG_ENTER("innodb_mutex_show_status");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -11098,41 +13380,7 @@ innodb_mutex_show_status(
 			block_mutex_oswait_count += mutex->count_os_wait;
 			continue;
 		}
-#ifdef UNIV_DEBUG
-		if (mutex->mutex_type != 1) {
-			if (mutex->count_using > 0) {
-				buf1len= my_snprintf(buf1, sizeof(buf1),
-					"%s:%s",
-					mutex->cmutex_name,
-					innobase_basename(mutex->cfile_name));
-				buf2len= my_snprintf(buf2, sizeof(buf2),
-					"count=%lu, spin_waits=%lu,"
-					" spin_rounds=%lu, "
-					"os_waits=%lu, os_yields=%lu,"
-					" os_wait_times=%lu",
-					mutex->count_using,
-					mutex->count_spin_loop,
-					mutex->count_spin_rounds,
-					mutex->count_os_wait,
-					mutex->count_os_yield,
-					(ulong) (mutex->lspent_time/1000));
-
-				if (stat_print(thd, innobase_hton_name,
-						hton_name_len, buf1, buf1len,
-						buf2, buf2len)) {
-					mutex_exit(&mutex_list_mutex);
-					DBUG_RETURN(1);
-				}
-			}
-		} else {
-			rw_lock_count += mutex->count_using;
-			rw_lock_count_spin_loop += mutex->count_spin_loop;
-			rw_lock_count_spin_rounds += mutex->count_spin_rounds;
-			rw_lock_count_os_wait += mutex->count_os_wait;
-			rw_lock_count_os_yield += mutex->count_os_yield;
-			rw_lock_wait_time += mutex->lspent_time;
-		}
-#else /* UNIV_DEBUG */
+
 		buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s",
 				     mutex->cmutex_name);
 		buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
@@ -11144,7 +13392,6 @@ innodb_mutex_show_status(
 			mutex_exit(&mutex_list_mutex);
 			DBUG_RETURN(1);
 		}
-#endif /* UNIV_DEBUG */
 	}
 
 	if (block_mutex) {
@@ -11227,34 +13474,54 @@ innodb_mutex_show_status(
 	}
 #endif /* UNIV_DEBUG */
 
-	DBUG_RETURN(FALSE);
+	/* Success */
+	DBUG_RETURN(0);
 }
 
+/************************************************************************//**
+Dispatches a status request to innodb_show_status() or
+innodb_mutex_show_status() according to stat_type. HA_ENGINE_LOGS is not
+handled and reports success. Note: the bool return type seems to be abused
+here, should be an int.
+@return false on success, true if the selected helper returned non-zero */
 static
-bool innobase_show_status(handlerton *hton, THD* thd, 
-                          stat_print_fn* stat_print,
-                          enum ha_stat_type stat_type)
+bool
+innobase_show_status(
+/*=================*/
+	handlerton*		hton,	/*!< in: the innodb handlerton */
+	THD*			thd,	/*!< in: the MySQL query thread
+					of the caller */
+	stat_print_fn*		stat_print,	/*!< in: function for
+					printing statistics */
+	enum ha_stat_type	stat_type)	/*!< in: selects which
+					status helper is invoked */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
 	switch (stat_type) {
 	case HA_ENGINE_STATUS:
-		return innodb_show_status(hton, thd, stat_print);
+		/* The helper returns an int; any non-zero value is an error
+		and is collapsed to true here. */
+		return(innodb_show_status(hton, thd, stat_print) != 0);
+
 	case HA_ENGINE_MUTEX:
-		return innodb_mutex_show_status(hton, thd, stat_print);
-	default:
-		return(FALSE);
+		/* The helper returns an int; any non-zero value is an error
+		and is collapsed to true here. */
+		return(innodb_mutex_show_status(hton, thd, stat_print) != 0);
+
+	case HA_ENGINE_LOGS:
+		/* Not handled: falls out of the switch and reports success */
+		break;
 	}
+
+	/* Success */
+	return(false);
 }
 
 /************************************************************************//**
- Handling the shared INNOBASE_SHARE structure that is needed to provide table
- locking.
-****************************************************************************/
-
-static INNOBASE_SHARE* get_share(const char* table_name)
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+	const char*	table_name)
 {
-	INNOBASE_SHARE *share;
+	INNOBASE_SHARE*	share;
+
 	mysql_mutex_lock(&innobase_share_mutex);
 
 	ulint	fold = ut_fold_string(table_name);
@@ -11271,7 +13538,7 @@ static INNOBASE_SHARE* get_share(const char* table_name)
 		/* TODO: invoke HASH_MIGRATE if innobase_open_tables
 		grows too big */
 
-		share = (INNOBASE_SHARE *) my_malloc(sizeof(*share)+length+1,
+		share = (INNOBASE_SHARE*) my_malloc(sizeof(*share)+length+1,
 			MYF(MY_FAE | MY_ZEROFILL));
 
 		share->table_name = (char*) memcpy(share + 1,
@@ -11294,7 +13561,13 @@ static INNOBASE_SHARE* get_share(const char* table_name)
 	return(share);
 }
 
-static void free_share(INNOBASE_SHARE* share)
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+	INNOBASE_SHARE*	share)	/*!< in/own: table share to free */
 {
 	mysql_mutex_lock(&innobase_share_mutex);
 
@@ -11370,7 +13643,7 @@ ha_innobase::store_lock(
 	if (lock_type != TL_IGNORE
 	    && trx->n_mysql_tables_in_use == 0) {
 		trx->isolation_level = innobase_map_isolation_level(
-			(enum_tx_isolation) thd_tx_isolation(thd));
+                        (enum_tx_isolation) thd_tx_isolation(thd));
 
 		if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
 		    && trx->global_read_view) {
@@ -11386,12 +13659,54 @@ ha_innobase::store_lock(
 	const bool in_lock_tables = thd_in_lock_tables(thd);
 	const uint sql_command = thd_sql_command(thd);
 
-	if (sql_command == SQLCOM_DROP_TABLE) {
+	if (srv_read_only_mode
+	    && (sql_command == SQLCOM_UPDATE
+		|| sql_command == SQLCOM_INSERT
+		|| sql_command == SQLCOM_REPLACE
+		|| sql_command == SQLCOM_DROP_TABLE
+		|| sql_command == SQLCOM_ALTER_TABLE
+		|| sql_command == SQLCOM_OPTIMIZE
+		|| (sql_command == SQLCOM_CREATE_TABLE
+		    && (lock_type >= TL_WRITE_CONCURRENT_INSERT
+			 && lock_type <= TL_WRITE))
+		|| sql_command == SQLCOM_CREATE_INDEX
+		|| sql_command == SQLCOM_DROP_INDEX
+		|| sql_command == SQLCOM_DELETE)) {
+
+		ib_senderrf(trx->mysql_thd,
+			    IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+	} else if (sql_command == SQLCOM_FLUSH
+		   && lock_type == TL_READ_NO_INSERT) {
+
+		/* Check for FLUSH TABLES ... WITH READ LOCK */
+
+		/* Note: This call can fail, but there is no way to return
+		the error to the caller. We simply ignore it for now here
+		and push the error code to the caller where the error is
+		detected in the function. */
+
+		dberr_t	err = row_quiesce_set_state(
+			prebuilt->table, QUIESCE_START, trx);
+
+		ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED);
+
+		if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+			prebuilt->select_lock_type = LOCK_S;
+			prebuilt->stored_select_lock_type = LOCK_S;
+		} else {
+			prebuilt->select_lock_type = LOCK_NONE;
+			prebuilt->stored_select_lock_type = LOCK_NONE;
+		}
+
+	/* Check for DROP TABLE */
+	} else if (sql_command == SQLCOM_DROP_TABLE) {
 
 		/* MySQL calls this function in DROP TABLE though this table
 		handle may belong to another thd that is running a query. Let
-		us in that case skip any changes to the prebuilt struct. */ 
+		us in that case skip any changes to the prebuilt struct. */
 
+	/* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */
 	} else if ((lock_type == TL_READ && in_lock_tables)
 		   || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
 		   || lock_type == TL_READ_WITH_SHARED_LOCKS
@@ -11417,18 +13732,19 @@ ha_innobase::store_lock(
 		unexpected if an obsolete consistent read view would be
 		used. */
 
-		ulint	isolation_level;
+		/* Use consistent read for checksum table */
 
-		isolation_level = trx->isolation_level;
-
-		if ((srv_locks_unsafe_for_binlog
-		     || isolation_level <= TRX_ISO_READ_COMMITTED)
-		    && isolation_level != TRX_ISO_SERIALIZABLE
-		    && (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT)
-		    && (sql_command == SQLCOM_INSERT_SELECT
-			|| sql_command == SQLCOM_REPLACE_SELECT
-			|| sql_command == SQLCOM_UPDATE
-			|| sql_command == SQLCOM_CREATE_TABLE)) {
+		/* CHECKSUM TABLE and ANALYZE TABLE always read without
+		locks. The ANALYZE test restores what the removed
+		"else if" branch did; it had been replaced by a second,
+		redundant SQLCOM_CHECKSUM comparison. */
+		if (sql_command == SQLCOM_CHECKSUM
+		    || sql_command == SQLCOM_ANALYZE
+		    || ((srv_locks_unsafe_for_binlog
+			|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+			&& trx->isolation_level != TRX_ISO_SERIALIZABLE
+			&& (lock_type == TL_READ
+			    || lock_type == TL_READ_NO_INSERT)
+			&& (sql_command == SQLCOM_INSERT_SELECT
+			    || sql_command == SQLCOM_REPLACE_SELECT
+			    || sql_command == SQLCOM_UPDATE
+			    || sql_command == SQLCOM_CREATE_TABLE))) {
 
 			/* If we either have innobase_locks_unsafe_for_binlog
 			option set or this session is using READ COMMITTED
@@ -11442,12 +13758,6 @@ ha_innobase::store_lock(
 
 			prebuilt->select_lock_type = LOCK_NONE;
 			prebuilt->stored_select_lock_type = LOCK_NONE;
-		} else if (sql_command == SQLCOM_CHECKSUM ||
-                           sql_command == SQLCOM_ANALYZE) {
-			/* Use consistent read for checksum table */
-
-			prebuilt->select_lock_type = LOCK_NONE;
-			prebuilt->stored_select_lock_type = LOCK_NONE;
 		} else {
 			prebuilt->select_lock_type = LOCK_S;
 			prebuilt->stored_select_lock_type = LOCK_S;
@@ -11531,6 +13841,13 @@ ha_innobase::store_lock(
 
 	*to++= &lock;
 
+	if (!trx_is_started(trx)
+	    && (prebuilt->select_lock_type != LOCK_NONE
+	        || prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+		++trx->will_lock;
+	}
+
 	return(to);
 }
 
@@ -11540,13 +13857,13 @@ the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked
 on return and all relevant locks acquired.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 ha_innobase::innobase_get_autoinc(
 /*==============================*/
 	ulonglong*	value)		/*!< out: autoinc value */
 {
- 	*value = 0;
- 
+	*value = 0;
+
 	prebuilt->autoinc_error = innobase_lock_autoinc();
 
 	if (prebuilt->autoinc_error == DB_SUCCESS) {
@@ -11597,24 +13914,22 @@ ha_innobase::innobase_peek_autoinc(void)
 }
 
 /*********************************************************************//**
-This function initializes the auto-inc counter if it has not been
-initialized yet. This function does not change the value of the auto-inc
-counter if it already has been initialized. Returns the value of the
-auto-inc counter in *first_value, and ULONGLONG_MAX in *nb_reserved_values (as
-we have a table-level lock). offset, increment, nb_desired_values are ignored.
-*first_value is set to -1 if error (deadlock or lock wait timeout) */
+Returns the value of the auto-inc counter in *first_value; on failure
+*first_value is set to ~0 instead. */
 UNIV_INTERN
 void
 ha_innobase::get_auto_increment(
 /*============================*/
-        ulonglong	offset,              /*!< in: table autoinc offset */
-        ulonglong	increment,           /*!< in: table autoinc increment */
-        ulonglong	nb_desired_values,   /*!< in: number of values reqd */
-        ulonglong	*first_value,        /*!< out: the autoinc value */
-        ulonglong	*nb_reserved_values) /*!< out: count of reserved values */
+	ulonglong	offset,			/*!< in: table autoinc offset */
+	ulonglong	increment,		/*!< in: table autoinc
+						increment */
+	ulonglong	nb_desired_values,	/*!< in: number of values
+						reqd */
+	ulonglong*	first_value,		/*!< out: the autoinc value */
+	ulonglong*	nb_reserved_values)	/*!< out: count of reserved
+						values */
 {
 	trx_t*		trx;
-	ulint		error;
+	dberr_t		error;
 	ulonglong	autoinc = 0;
 
 	/* Prepare prebuilt->trx in the table handle */
@@ -11728,18 +14043,15 @@ ha_innobase::reset_auto_increment(
 {
 	DBUG_ENTER("ha_innobase::reset_auto_increment");
 
-	int	error;
+	dberr_t	error;
 
 	update_thd(ha_thd());
 
 	error = row_lock_table_autoinc_for_mysql(prebuilt);
 
 	if (error != DB_SUCCESS) {
-		error = convert_error_code_to_mysql(error,
-						    prebuilt->table->flags,
-						    user_thd);
-
-		DBUG_RETURN(error);
+		DBUG_RETURN(convert_error_code_to_mysql(
+				    error, prebuilt->table->flags, user_thd));
 	}
 
 	/* The next value can never be 0. */
@@ -11752,10 +14064,14 @@ ha_innobase::reset_auto_increment(
 	DBUG_RETURN(0);
 }
 
-/* See comment in handler.cc */
+/*******************************************************************//**
+See comment in handler.cc */
 UNIV_INTERN
 bool
-ha_innobase::get_error_message(int error, String *buf)
+ha_innobase::get_error_message(
+/*===========================*/
+	int	error,
+	String*	buf)
 {
 	trx_t*	trx = check_trx_exists(ha_thd());
 
@@ -11766,6 +14082,64 @@ ha_innobase::get_error_message(int error, String *buf)
 }
 
 /*******************************************************************//**
+  Reports which table and key caused the duplicate entry behind a
+  HA_ERR_FOREIGN_DUPLICATE_KEY error.
+
+  When trx_get_error_info() has no index for the transaction, this
+  method returns false and leaves child_table_name and child_key_name
+  exactly as they were.
+
+  @param child_table_name[out]    Table name
+  @param child_table_name_len[in] Table name buffer size
+  @param child_key_name[out]      Key name
+  @param child_key_name_len[in]   Key name buffer size
+
+  @retval  true                  both names were available and were
+                                 written into the corresponding
+                                 out parameters.
+  @retval  false                 the names were not available,
+                                 the out parameters were not touched.
+*/
+bool
+ha_innobase::get_foreign_dup_key(
+/*=============================*/
+	char*	child_table_name,
+	uint	child_table_name_len,
+	char*	child_key_name,
+	uint	child_key_name_len)
+{
+	ut_a(prebuilt->trx != NULL);
+	ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+	const dict_index_t*	dup_index = trx_get_error_info(prebuilt->trx);
+
+	if (dup_index == NULL) {
+		/* No index to report: the out buffers stay untouched. */
+		return(false);
+	}
+
+	/* Table name: drop everything up to and including the first '/'
+	(if there is one), then convert the rest from the filename-safe
+	encoding to system_charset_info. */
+	char*	slash = strchr(dup_index->table->name, '/');
+	char*	bare_name = (slash != NULL)
+		? slash + 1
+		: dup_index->table->name;
+
+	uint	written = filename_to_tablename(
+		bare_name, child_table_name, child_table_name_len);
+
+	child_table_name[written] = '\0';
+
+	/* Index name: copied verbatim, bounded by the caller's buffer. */
+	ut_snprintf(child_key_name, child_key_name_len, "%s", dup_index->name);
+
+	return(true);
+}
+
+
+/*******************************************************************//**
 Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
 If there is no explicitly declared non-null unique key or a primary key, then
 InnoDB internally uses the row id as the primary key.
@@ -11799,7 +14173,7 @@ ha_innobase::cmp_ref(
 	key_part = table->key_info[table->s->primary_key].key_part;
 
 	key_part_end = key_part
-			+ table->key_info[table->s->primary_key].key_parts;
+			+ table->key_info[table->s->primary_key].user_defined_key_parts;
 
 	for (; key_part != key_part_end; ++key_part) {
 		field = key_part->field;
@@ -11818,8 +14192,8 @@ ha_innobase::cmp_ref(
 
 			ref1 += 2;
 			ref2 += 2;
-			result = ((Field_blob*)field)->cmp( ref1, len1,
-                                                            ref2, len2);
+			result = ((Field_blob*) field)->cmp(
+				ref1, len1, ref2, len2);
 		} else {
 			result = field->key_cmp(ref1, ref2);
 		}
@@ -11844,11 +14218,10 @@ my_bool
 ha_innobase::register_query_cache_table(
 /*====================================*/
 	THD*		thd,		/*!< in: user thread handle */
-	char*		table_key,	/*!< in: concatenation of database name,
-					the null character NUL,
-					and the table name */
-	uint		key_length,	/*!< in: length of the full name, i.e.
-					len(dbname) + len(tablename) + 1 */
+	char*		table_key,	/*!< in: normalized path to the  
+					table */
+	uint		key_length,	/*!< in: length of the normalized
+					path to the table */
 	qc_engine_callback*
 			call_back,	/*!< out: pointer to function for
 					checking if query caching
@@ -11862,16 +14235,22 @@ ha_innobase::register_query_cache_table(
 							 engine_data));
 }
 
+/*******************************************************************//**
+Get the bin log name. */
 UNIV_INTERN
-char*
+const char*
 ha_innobase::get_mysql_bin_log_name()
+/*=================================*/
 {
 	return(trx_sys_mysql_bin_log_name);
 }
 
+/*******************************************************************//**
+Get the bin log offset (or file position). */
 UNIV_INTERN
 ulonglong
 ha_innobase::get_mysql_bin_log_pos()
+/*================================*/
 {
 	/* trx... is ib_int64_t, which is a typedef for a 64-bit integer
 	(__int64 or longlong) so it's ok to cast it to ulonglong. */
@@ -11885,7 +14264,7 @@ characters for prefix indexes using a multibyte character set. The function
 finds charset information and returns length of prefix_len characters in the
 index field in bytes.
 @return	number of bytes occupied by the first n characters */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_get_at_most_n_mbchars(
 /*===========================*/
@@ -11956,16 +14335,16 @@ static
 int
 innobase_xa_prepare(
 /*================*/
-        handlerton*	hton,	/*!< in: InnoDB handlerton */
-	THD*		thd,	/*!< in: handle to the MySQL thread of
-				the user whose XA transaction should
-				be prepared */
-	bool		all)	/*!< in: TRUE - commit transaction
-				FALSE - the current SQL statement
-				ended */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be prepared */
+	bool		prepare_trx)	/*!< in: true - prepare transaction
+					false - the current SQL statement
+					ended */
 {
-	int error = 0;
-	trx_t* trx = check_trx_exists(thd);
+	int		error = 0;
+	trx_t*		trx = check_trx_exists(thd);
 
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -11979,10 +14358,11 @@ innobase_xa_prepare(
 
 	if (UNIV_UNLIKELY(trx->fake_changes)) {
 
-		if (all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT
-					      | OPTION_BEGIN))) {
+		if (prepare_trx
+		    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT
+					  | OPTION_BEGIN))) {
 
-			thd->stmt_da->reset_diagnostics_area();
+			thd->get_stmt_da()->reset_diagnostics_area();
 			return(HA_ERR_WRONG_COMMAND);
 		}
 		return(0);
@@ -11991,11 +14371,12 @@ innobase_xa_prepare(
 	thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
 	trx_search_latch_release_if_reserved(trx);
-	innodb_srv_conc_force_exit_innodb(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
 
@@ -12003,7 +14384,7 @@ innobase_xa_prepare(
 				"but transaction is active");
 	}
 
-	if (all
+	if (prepare_trx
 	    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
 
 		/* We were instructed to prepare the whole transaction, or
@@ -12011,27 +14392,12 @@ innobase_xa_prepare(
 
 		ut_ad(trx_is_registered_for_2pc(trx));
 
-		/* Update the replication position info in current trx.  This
-		is different from the binlog position update that happens
-		during XA COMMIT.  In contrast to that, the slave position is
-		an actual part of the changes made by this transaction and thus
-		must be updated in the XA PREPARE stage.  Since the trx sys
-		header page changes are not undo-logged, again store this
-		position in a different field in the XA COMMIT stage, so that
-		it might be used in case of rollbacks. */
-
-		/* Since currently there might be only one slave SQL thread, we
-		don't need to take any precautions (e.g. prepare_commit_mutex)
-		to ensure position ordering.  Revisit this in 5.6 which has
-		both the multi-threaded replication to cause us problems and
-		the group commit to solve them.  */
-
-		innobase_copy_repl_coords_to_trx(thd, trx);
-
-		error = (int) trx_prepare_for_mysql(trx);
+		trx_prepare_for_mysql(trx);
 
 		DBUG_EXECUTE_IF("crash_innodb_after_prepare",
 				DBUG_SUICIDE(););
+
+		error = 0;
 	} else {
 		/* We just mark the SQL statement ended and do not do a
 		transaction prepare */
@@ -12039,7 +14405,7 @@ innobase_xa_prepare(
 		/* If we had reserved the auto-inc lock for some
 		table in this SQL statement we release it now */
 
-		row_unlock_table_autoinc_for_mysql(trx);
+		lock_unlock_table_autoinc(trx);
 
 		/* Store the current undo_no of the transaction so that we
 		know where to roll back if we have to roll back the next
@@ -12053,6 +14419,24 @@ innobase_xa_prepare(
 
 	srv_active_wake_master_thread();
 
+	if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
+	    && (prepare_trx
+		|| !thd_test_options(
+			thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+		/* For ibbackup to work the order of transactions in binlog
+		and InnoDB must be the same. Consider the situation
+
+		  thread1> prepare; write to binlog; ...
+			  <context switch>
+		  thread2> prepare; write to binlog; commit
+		  thread1>			     ... commit
+
+                The server guarantees that writes to the binary log
+                and commits are in the same order, so we do not have
+                to handle this case. */
+	}
+
 	return(error);
 }
 
@@ -12124,22 +14508,6 @@ innobase_rollback_by_xid(
 	if (trx) {
 		int	ret = innobase_rollback_trx(trx);
 		trx_free_for_background(trx);
-
-		if (innobase_overwrite_relay_log_info) {
-
-			/* On rollback of a prepared transaction revert the
-			current slave positions to the ones recorded by the
-			last COMMITTed transaction.  This has an effect of
-			undoing the position change caused by the transaction
-			being rolled back.  Assumes single-threaded slave SQL
-			thread.  If the server has non-master write traffic
-			with XA rollbacks, this will cause additional spurious
-			slave info log overwrites, which should be harmless. */
-
-			trx_sys_print_committed_mysql_master_log_pos();
-			innobase_do_overwrite_relay_log_info();
-		}
-
 		return(ret);
 	} else {
 		return(XAER_NOTA);
@@ -12156,8 +14524,8 @@ static
 void*
 innobase_create_cursor_view(
 /*========================*/
-        handlerton *hton, /*!< in: innobase hton */
-	THD* thd)	  /*!< in: user thread handle */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd)	/*!< in: user thread handle */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -12172,9 +14540,9 @@ static
 void
 innobase_close_cursor_view(
 /*=======================*/
-        handlerton *hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview)/*!< in: Consistent read view to be closed */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd,	/*!< in: user thread handle */
+	void*		curview)/*!< in: Consistent read view to be closed */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -12191,9 +14559,9 @@ static
 void
 innobase_set_cursor_view(
 /*=====================*/
-        handlerton *hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview)/*!< in: Consistent cursor view to be set */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd,	/*!< in: user thread handle */
+	void*		curview)/*!< in: Consistent cursor view to be set */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -12202,190 +14570,209 @@ innobase_set_cursor_view(
 }
 
 /*******************************************************************//**
-If col_name is not NULL, check whether the named column is being
-renamed in the table. If col_name is not provided, check
-whether any one of columns in the table is being renamed.
-@return true if the column is being renamed */
-static
+Check whether existing data is compatible with the changed definition. */
+UNIV_INTERN
 bool
-check_column_being_renamed(
-/*=======================*/
-	const TABLE*	table,		/*!< in: MySQL table */
-	const char*	col_name)	/*!< in: name of the column */
+ha_innobase::check_if_incompatible_data(
+/*====================================*/
+	HA_CREATE_INFO*	info,
+	uint		table_changes)
 {
-	uint		k;
-	Field*		field;
+	innobase_copy_frm_flags_from_create_info(prebuilt->table, info);
 
-	for (k = 0; k < table->s->fields; k++) {
-		field = table->field[k];
+	if (table_changes != IS_EQUAL_YES) {
 
-		if (field->flags & FIELD_IS_RENAMED) {
+		return(COMPATIBLE_DATA_NO);
+	}
 
-			/* If col_name is not provided, return
-			if the field is marked as being renamed. */
-			if (!col_name) {
-				return(true);
-			}
+	/* Check that auto_increment value was not changed */
+	if ((info->used_fields & HA_CREATE_USED_AUTO) &&
+		info->auto_increment_value != 0) {
 
-			/* If col_name is provided, return only
-			if names match */
-			if (innobase_strcasecmp(field->field_name,
-						col_name) == 0) {
-				return(true);
-			}
-		}
+		return(COMPATIBLE_DATA_NO);
 	}
 
-	return(false);
-}
+	/* Check that row format didn't change */
+	if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT)
+	    && info->row_type != get_row_type()) {
 
-/*******************************************************************//**
-Check whether any of the given columns is being renamed in the table.
-@return true if any of col_names is being renamed in table */
-static
-bool
-column_is_being_renamed(
-/*====================*/
-	TABLE*		table,		/*!< in: MySQL table */
-	uint		n_cols,		/*!< in: number of columns */
-	const char**	col_names)	/*!< in: names of the columns */
-{
-	uint		j;
+		return(COMPATIBLE_DATA_NO);
+	}
 
-	for (j = 0; j < n_cols; j++) {
-		if (check_column_being_renamed(table, col_names[j])) {
-			return(true);
-		}
+	/* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
+	if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
+		return(COMPATIBLE_DATA_NO);
 	}
 
-	return(false);
+	return(COMPATIBLE_DATA_YES);
 }
 
-/***********************************************************************
-Check whether a column in table "table" is being renamed and if this column
-is part of a foreign key, either part of another table, referencing this
-table or part of this table, referencing another table. */
+/****************************************************************//**
+Update the system variable innodb_io_capacity_max using the "saved"
+value. This function is registered as a callback with MySQL. */
 static
-bool
-foreign_key_column_is_being_renamed(
-/*================================*/
-					/* out: true if a column that
-					participates in a foreign key definition
-					is being renamed */
-	row_prebuilt_t*	prebuilt,	/* in: InnoDB prebuilt struct */
-	TABLE*		table)		/* in: MySQL table */
+void
+innodb_io_capacity_max_update(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
-	dict_foreign_t*	foreign;
-
-	/* check whether there are foreign keys at all */
-	if (UT_LIST_GET_LEN(prebuilt->table->foreign_list) == 0
-	    && UT_LIST_GET_LEN(prebuilt->table->referenced_list) == 0) {
-		/* no foreign keys involved with prebuilt->table */
-
-		return(false);
-	}
-
-	row_mysql_lock_data_dictionary(prebuilt->trx);
-
-	/* Check whether any column in the foreign key constraints which refer
-	to this table is being renamed. */
-	for (foreign = UT_LIST_GET_FIRST(prebuilt->table->referenced_list);
-	     foreign != NULL;
-	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
-
-		if (column_is_being_renamed(table, foreign->n_fields,
-					    foreign->referenced_col_names)) {
-
-			row_mysql_unlock_data_dictionary(prebuilt->trx);
-			return(true);
-		}
+	ulong	in_val = *static_cast<const ulong*>(save);
+	if (in_val < srv_io_capacity) {
+		in_val = srv_io_capacity;
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "innodb_io_capacity_max cannot be"
+				    " set lower than innodb_io_capacity.");
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Setting innodb_io_capacity_max to %lu",
+				    srv_io_capacity);
 	}
 
-	/* Check whether any column in the foreign key constraints in the
-	table is being renamed. */
-	for (foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
-	     foreign != NULL;
-	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
-
-		if (column_is_being_renamed(table, foreign->n_fields,
-					    foreign->foreign_col_names)) {
+	srv_max_io_capacity = in_val;
+}
 
-			row_mysql_unlock_data_dictionary(prebuilt->trx);
-			return(true);
-		}
+/****************************************************************//**
+Update the system variable innodb_io_capacity using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_io_capacity_update(
+/*======================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulong	in_val = *static_cast<const ulong*>(save);
+	if (in_val > srv_max_io_capacity) {
+		in_val = srv_max_io_capacity;
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "innodb_io_capacity cannot be set"
+				    " higher than innodb_io_capacity_max.");
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Setting innodb_io_capacity to %lu",
+				    srv_max_io_capacity);
 	}
 
-	row_mysql_unlock_data_dictionary(prebuilt->trx);
-
-	return(false);
+	srv_io_capacity = in_val;
 }
 
-UNIV_INTERN
-bool
-ha_innobase::check_if_incompatible_data(
-	HA_CREATE_INFO*	info,
-	uint		table_changes)
+/****************************************************************//**
+Update the system variable innodb_log_arch_expire_sec using
+the "saved" value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_log_archive_expire_update(
+/*==============================*/
+	THD*				thd,		/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,		/*!< in: pointer to
+							system variable */
+	void*				var_ptr,	/*!< out: unused */
+	const void*			save)		/*!< in: immediate result
+							from check function */
 {
-	enum row_type row_type, info_row_type;
-	DBUG_ENTER("ha_innobase::check_if_incompatible_data");
-
-	if (table_changes != IS_EQUAL_YES) {
-
-		DBUG_PRINT("info", ("table_changes != IS_EQUAL_YES "
-				    "-> COMPATIBLE_DATA_NO"));
-		DBUG_RETURN(COMPATIBLE_DATA_NO);
-	}
-
-	/* Check that auto_increment value was not changed */
-	if ((info->used_fields & HA_CREATE_USED_AUTO) &&
-		info->auto_increment_value != 0) {
+	srv_log_arch_expire_sec = *(ulint*) save;
+}
 
-		DBUG_PRINT("info", ("auto_increment_value changed -> "
-				    "COMPATIBLE_DATA_NO"));
-		DBUG_RETURN(COMPATIBLE_DATA_NO);
-	}
+static
+void
+innodb_log_archive_update(
+/*======================*/
+	THD*				thd,
+	struct st_mysql_sys_var*	var,
+	void*				var_ptr,
+	const void*			save)
+{
+	my_bool	in_val = *static_cast<const my_bool*>(save);
 
-	/* For column rename operation, MySQL does not supply enough
-	information (new column name etc.) for InnoDB to make appropriate
-	system metadata change. To avoid system metadata inconsistency,
-	currently we can just request a table rebuild/copy by returning
-	COMPATIBLE_DATA_NO */
-	if (check_column_being_renamed(table, NULL)) {
-        	DBUG_RETURN(COMPATIBLE_DATA_NO);
+	if (in_val) {
+		/* turn archiving on */
+		srv_log_archive_on = innobase_log_archive = 1;
+		log_archive_archivelog();
+	} else {
+		/* turn archiving off */
+		srv_log_archive_on = innobase_log_archive = 0;
+		log_archive_noarchivelog();
 	}
+}
 
-	/* Check if a column participating in a foreign key is being renamed.
-	There is no mechanism for updating InnoDB foreign key definitions. */
-	if (foreign_key_column_is_being_renamed(prebuilt, table)) {
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_update(
+/*==============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulong	in_val = *static_cast<const ulong*>(save);
+	if (in_val < srv_max_dirty_pages_pct_lwm) {
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "innodb_max_dirty_pages_pct cannot be"
+				    " set lower than"
+				    " innodb_max_dirty_pages_pct_lwm.");
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Lowering"
+				    " innodb_max_dirty_page_pct_lwm to %lu",
+				    in_val);
 
-		DBUG_RETURN(COMPATIBLE_DATA_NO);
+		srv_max_dirty_pages_pct_lwm = in_val;
 	}
 
-	/* Check that row format didn't change */
-	row_type = get_row_type();
-	info_row_type = info->row_type;
-	/* Default is compact. */
-	if (info_row_type == ROW_TYPE_DEFAULT)
-		info_row_type = ROW_TYPE_COMPACT;
-	if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) &&
-	    row_type != ((info->row_type == ROW_TYPE_DEFAULT)
-				? ROW_TYPE_COMPACT : info->row_type)) {
-
-		DBUG_PRINT("info", ("get_row_type()=%d != info->row_type=%d -> "
-				    "COMPATIBLE_DATA_NO",
-				    row_type, info->row_type));
-		DBUG_RETURN(COMPATIBLE_DATA_NO);
-	}
+	srv_max_buf_pool_modified_pct = in_val;
+}
 
-	/* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
-	if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
-		DBUG_PRINT("info", ("HA_CREATE_USED_KEY_BLOCK_SIZE -> "
-				    "COMPATIBLE_DATA_NO"));
-		DBUG_RETURN(COMPATIBLE_DATA_NO);
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct_lwm using the
+"saved" value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_lwm_update(
+/*==================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulong	in_val = *static_cast<const ulong*>(save);
+	if (in_val > srv_max_buf_pool_modified_pct) {
+		in_val = srv_max_buf_pool_modified_pct;
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "innodb_max_dirty_pages_pct_lwm"
+				    " cannot be set higher than"
+				    " innodb_max_dirty_pages_pct.");
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Setting innodb_max_dirty_page_pct_lwm"
+				    " to %lu",
+				    in_val);
 	}
 
-	DBUG_PRINT("info", (" -> COMPATIBLE_DATA_YES"));
-	DBUG_RETURN(COMPATIBLE_DATA_YES);
+	srv_max_dirty_pages_pct_lwm = in_val;
 }
 
 /************************************************************//**
@@ -12409,13 +14796,13 @@ innobase_file_format_name_lookup(
 	/* Check for valid parse. */
 	if (*endp == '\0' && *format_name != '\0') {
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 
 			return(format_id);
 		}
 	} else {
 
-		for (format_id = 0; format_id <= DICT_TF_FORMAT_MAX;
+		for (format_id = 0; format_id <= UNIV_FORMAT_MAX;
 		     format_id++) {
 			const char*	name;
 
@@ -12428,7 +14815,7 @@ innobase_file_format_name_lookup(
 		}
 	}
 
-	return(DICT_TF_FORMAT_MAX + 1);
+	return(UNIV_FORMAT_MAX + 1);
 }
 
 /************************************************************//**
@@ -12445,7 +14832,7 @@ innobase_file_format_validate_and_set(
 
 	format_id = innobase_file_format_name_lookup(format_max);
 
-	if (format_id < DICT_TF_FORMAT_MAX + 1) {
+	if (format_id < UNIV_FORMAT_MAX + 1) {
 		srv_max_file_format_at_startup = format_id;
 
 		return((int) format_id);
@@ -12484,7 +14871,7 @@ innodb_file_format_name_validate(
 		format_id = innobase_file_format_name_lookup(
 			file_format_input);
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 
 			/* Save a pointer to the name in the
 			'file_format_name_map' constant array. */
@@ -12526,7 +14913,7 @@ innodb_file_format_name_update(
 
 		format_id = innobase_file_format_name_lookup(format_name);
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 			srv_file_format = format_id;
 		}
 	}
@@ -12534,6 +14921,7 @@ innodb_file_format_name_update(
 	*static_cast<const char**>(var_ptr)
 		= trx_sys_file_format_id_to_name(srv_file_format);
 }
+
 /*************************************************************//**
 Check if valid argument to innodb_file_format_max. This function
 is registered as a callback with MySQL.
@@ -12569,19 +14957,19 @@ innodb_file_format_max_validate(
 			'file_format_name_map' constant array. */
 			*static_cast<const char**>(save) =
 			    trx_sys_file_format_id_to_name(
-						(uint)format_id);
+						(uint) format_id);
 
 			return(0);
 
 		} else {
 			push_warning_printf(thd,
-			  MYSQL_ERROR::WARN_LEVEL_WARN,
+			  Sql_condition::WARN_LEVEL_WARN,
 			  ER_WRONG_ARGUMENTS,
 			  "InnoDB: invalid innodb_file_format_max "
 			  "value; can be any format up to %s "
 			  "or equivalent id of %d",
-			  trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX),
-			  DICT_TF_FORMAT_MAX);
+			  trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX),
+			  UNIV_FORMAT_MAX);
 		}
 	}
 
@@ -12596,13 +14984,13 @@ static
 void
 innodb_file_format_max_update(
 /*==========================*/
-	THD*				thd,		/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,		/*!< in: pointer to
-							system variable */
-	void*				var_ptr,	/*!< out: where the
-							formal string goes */
-	const void*			save)		/*!< in: immediate result
-							from check function */
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
 	const char*	format_name_in;
 	const char**	format_name_out;
@@ -12620,9 +15008,9 @@ innodb_file_format_max_update(
 
 	format_id = innobase_file_format_name_lookup(format_name_in);
 
-	if (format_id > DICT_TF_FORMAT_MAX) {
+	if (format_id > UNIV_FORMAT_MAX) {
 		/* DEFAULT is "on", which is invalid at runtime. */
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    ER_WRONG_ARGUMENTS,
 				    "Ignoring SET innodb_file_format=%s",
 				    format_name_in);
@@ -12640,6 +15028,171 @@ innodb_file_format_max_update(
 	}
 }
 
+/*************************************************************//**
+Check whether valid argument given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	stopword_table_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	trx_t*		trx;
+	int		ret = 1;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	stopword_table_name = value->val_str(value, buff, &len);
+
+	trx = check_trx_exists(thd);
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Validate that the stopword table (if supplied) exists and
+	is of the right format */
+	if (!stopword_table_name
+	    || fts_valid_stopword_table(stopword_table_name)) {
+		*static_cast<const char**>(save) = stopword_table_name;
+		ret = 0;
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update global variable fts_server_stopword_table with the "saved"
+stopword table name value. This function is registered as a callback
+with MySQL. */
+static
+void
+innodb_stopword_table_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const char*	stopword_table_name;
+	char*		old;
+
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	stopword_table_name = *static_cast<const char*const*>(save);
+	old = *(char**) var_ptr;
+
+	if (stopword_table_name) {
+		*(char**) var_ptr =  my_strdup(stopword_table_name,  MYF(0));
+	} else {
+		*(char**) var_ptr = NULL;
+	}
+
+	if (old) {
+		my_free(old);
+	}
+
+	fts_server_stopword_table = *(char**) var_ptr;
+}
+
+/*************************************************************//**
+Check whether valid argument given to "innodb_fts_internal_tbl_name"
+This function is registered as a callback with MySQL.
+@return 0 if the name is NULL or the table has an FTS index, else 1 */
+static
+int
+innodb_internal_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	table_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	int		ret = 1;
+	dict_table_t*	user_table;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	table_name = value->val_str(value, buff, &len);
+
+	if (!table_name) {
+		*static_cast<const char**>(save) = NULL;
+		return(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE);
+
+	if (user_table) {
+		if (dict_table_has_fts_index(user_table)) {
+			*static_cast<const char**>(save) = table_name;
+			ret = 0;
+		}
+
+		dict_table_close(user_table, FALSE, TRUE);
+	}
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update global variable "fts_internal_tbl_name" with the "saved"
+table name value. This function is registered as a callback
+with MySQL. */
+static
+void
+innodb_internal_table_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const char*	table_name;
+	char*		old;
+
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	table_name = *static_cast<const char*const*>(save);
+	old = *(char**) var_ptr;
+
+	if (table_name) {
+		*(char**) var_ptr =  my_strdup(table_name,  MYF(0));
+	} else {
+		*(char**) var_ptr = NULL;
+	}
+
+	if (old) {
+		my_free(old);
+	}
+
+	fts_internal_tbl_name = *(char**) var_ptr;
+}
+
 /****************************************************************//**
 Update the system variable innodb_adaptive_hash_index using the "saved"
 value. This function is registered as a callback with MySQL. */
@@ -12647,13 +15200,13 @@ static
 void
 innodb_adaptive_hash_index_update(
 /*==============================*/
-	THD*				thd,		/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,		/*!< in: pointer to
-							system variable */
-	void*				var_ptr,	/*!< out: where the
-							formal string goes */
-	const void*			save)		/*!< in: immediate result
-							from check function */
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
 	if (*(my_bool*) save) {
 		btr_search_enable();
@@ -12663,6 +15216,30 @@ innodb_adaptive_hash_index_update(
 }
 
 /****************************************************************//**
+Update the system variable innodb_cmp_per_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_cmp_per_index_update(
+/*========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	/* Reset the stats whenever we enable the table
+	INFORMATION_SCHEMA.innodb_cmp_per_index. */
+	if (!srv_cmp_per_index_enabled && *(my_bool*) save) {
+		page_zip_reset_stat_per_index();
+	}
+
+	srv_cmp_per_index_enabled = !!(*(my_bool*) save);
+}
+
+/****************************************************************//**
 Update the system variable innodb_old_blocks_pct using the "saved"
 value. This function is registered as a callback with MySQL. */
 static
@@ -12681,6 +15258,27 @@ innodb_old_blocks_pct_update(
 		*static_cast<const uint*>(save), TRUE);
 }
 
+/****************************************************************//**
+Update the system variable innodb_change_buffer_max_size using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffer_max_size_update(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innobase_change_buffer_max_size =
+			(*static_cast<const uint*>(save));
+	ibuf_max_size_update(innobase_change_buffer_max_size);
+}
+
+
 /*************************************************************//**
 Find the corresponding ibuf_use_t value that indexes into
 innobase_change_buffering_values[] array for the input
@@ -12701,7 +15299,7 @@ innodb_find_change_buffering_value(
 		/* found a match */
 		if (!innobase_strcasecmp(
 			input_name, innobase_change_buffering_values[use])) {
-			return((ibuf_use_t)use);
+			return((ibuf_use_t) use);
 		}
 	}
 
@@ -12783,10 +15381,528 @@ innodb_change_buffering_update(
 		 *static_cast<const char*const*>(save);
 }
 
-#ifndef DBUG_OFF
+/*************************************************************//**
+Emit a deprecation warning for innodb_stats_sample_pages and copy the
+saved value into srv_stats_transient_sample_pages. */
+static
+void
+innodb_stats_sample_pages_update(
+/*=============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+#define STATS_SAMPLE_PAGES_DEPRECATED_MSG \
+	"Using innodb_stats_sample_pages is deprecated and " \
+	"the variable may be removed in future releases. " \
+	"Please use innodb_stats_transient_sample_pages " \
+	"instead."
+
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+		     HA_ERR_WRONG_COMMAND, STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Warning: %s\n",
+		STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+	srv_stats_transient_sample_pages =
+		*static_cast<const unsigned long long*>(save);
+}
+
+/****************************************************************//**
+Apply "set_option" to a single monitor counter: turn it on or off,
+or reset its value(s). */
+static
+void
+innodb_monitor_set_option(
+/*======================*/
+	const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor
+					to set */
+	mon_option_t	set_option)	/*!< in: whether to turn on/off
+					or reset the counter */
+{
+	monitor_id_t	monitor_id = monitor_info->monitor_id;
+
+	/* If module type is MONITOR_GROUP_MODULE, it cannot be
+	turned on/off individually. It should never use this
+	function to set options */
+	ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE));
+
+	switch (set_option) {
+	case MONITOR_TURN_ON:
+		MONITOR_ON(monitor_id);
+		MONITOR_INIT(monitor_id);
+		MONITOR_SET_START(monitor_id);
+
+		/* If the monitor to be turned on uses an
+		existing monitor counter (status variable),
+		do special processing to remember the existing
+		counter value. */
+		if (monitor_info->monitor_type
+		    & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				monitor_id, MONITOR_TURN_ON);
+		}
+		break;
+
+	case MONITOR_TURN_OFF:
+		if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				monitor_id, MONITOR_TURN_OFF);
+		}
+
+		MONITOR_OFF(monitor_id);
+		MONITOR_SET_OFF(monitor_id);
+		break;
+
+	case MONITOR_RESET_VALUE:
+		srv_mon_reset(monitor_id);
+		break;
+
+	case MONITOR_RESET_ALL_VALUE:
+		srv_mon_reset_all(monitor_id);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/****************************************************************//**
+Find matching InnoDB monitor counters and update their status
+according to the "set_option",  turn on/off or reset specified
+monitor counter. */
+static
+void
+innodb_monitor_update_wildcard(
+/*===========================*/
+	const char*	name,		/*!< in: monitor name to match */
+	mon_option_t	set_option)	/*!< in: the set option, whether
+					to turn on/off or reset the counter */
+{
+	ut_a(name);
+
+	for (ulint use = 0; use < NUM_MONITOR; use++) {
+		ulint		type;
+		monitor_id_t	monitor_id = static_cast<monitor_id_t>(use);
+		monitor_info_t*	monitor_info;
+
+		if (!innobase_wildcasecmp(
+			srv_mon_get_name(monitor_id), name)) {
+			monitor_info = srv_mon_get_info(monitor_id);
+
+			type = monitor_info->monitor_type;
+
+			/* If the monitor counter is of MONITOR_MODULE
+			type, skip it. Except for those also marked with
+			MONITOR_GROUP_MODULE flag, which can be turned
+			on only as a module. */
+			if (!(type & MONITOR_MODULE)
+			     && !(type & MONITOR_GROUP_MODULE)) {
+				innodb_monitor_set_option(monitor_info,
+							  set_option);
+			}
+
+			/* Need to special handle counters marked with
+			MONITOR_GROUP_MODULE, turn on the whole module if
+			any one of it comes here. Currently, only
+			"module_buf_page" is marked with MONITOR_GROUP_MODULE */
+			if (type & MONITOR_GROUP_MODULE) {
+				if ((monitor_id >= MONITOR_MODULE_BUF_PAGE)
+				     && (monitor_id < MONITOR_MODULE_OS)) {
+					if (set_option == MONITOR_TURN_ON
+					    && MONITOR_IS_ON(
+						MONITOR_MODULE_BUF_PAGE)) {
+						continue;
+					}
+
+					srv_mon_set_module_control(
+						MONITOR_MODULE_BUF_PAGE,
+						set_option);
+				} else {
+					/* If new monitor is added with
+					MONITOR_GROUP_MODULE, it needs
+					to be added here. */
+					ut_ad(0);
+				}
+			}
+		}
+	}
+}
+
+/*************************************************************//**
+Given a configuration variable name, find the corresponding monitor counter.
+@return	monitor ID on an exact name match, MONITOR_WILDCARD_MATCH if
+the name contains '%', MONITOR_NO_MATCH otherwise */
+static
+ulint
+innodb_monitor_id_by_name_get(
+/*==========================*/
+	const char*	name)	/*!< in: monitor counter name */
+{
+	ut_a(name);
+
+	/* Search for the wildcard character '%' in the name; if
+	found, we treat it as a wildcard match. We do not search for the
+	single character wildcard '_' since our monitor names already contain
+	that character. To avoid confusion, we require the user to include
+	at least one '%' character to activate the wildcard search. */
+	if (strchr(name, '%')) {
+		return(MONITOR_WILDCARD_MATCH);
+	}
+
+	/* Not a wildcard match, check for an exact match */
+	for (ulint i = 0; i < NUM_MONITOR; i++) {
+		if (!innobase_strcasecmp(
+			name, srv_mon_get_name(static_cast<monitor_id_t>(i)))) {
+			return(i);
+		}
+	}
+
+	return(MONITOR_NO_MATCH);
+}
+/*************************************************************//**
+Validate that the passed in monitor name matches at least one
+monitor counter name with wildcard compare.
+@return	TRUE if at least one monitor name matches */
+static
+ibool
+innodb_monitor_validate_wildcard_name(
+/*==================================*/
+	const char*	name)	/*!< in: monitor name pattern */
+{
+	for (ulint i = 0; i < NUM_MONITOR; i++) {
+		if (!innobase_wildcasecmp(
+			srv_mon_get_name(static_cast<monitor_id_t>(i)), name)) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+/*************************************************************//**
+Validate the passed in monitor name, find and save the
+corresponding monitor name in the function parameter "save".
+@return	0 if monitor name is valid */
+static
+int
+innodb_monitor_valid_byname(
+/*========================*/
+	void*			save,	/*!< out: immediate result
+					for update function */
+	const char*		name)	/*!< in: incoming monitor name */
+{
+	ulint		use;
+	monitor_info_t*	monitor_info;
+
+	if (!name) {
+		return(1);
+	}
+
+	use = innodb_monitor_id_by_name_get(name);
+
+	/* The name is neither an exact monitor name nor a wildcard */
+	if (use == MONITOR_NO_MATCH) {
+		return(1);
+	}
+
+	if (use < NUM_MONITOR) {
+		monitor_info = srv_mon_get_info((monitor_id_t) use);
+
+		/* If the monitor counter is marked with
+		MONITOR_GROUP_MODULE flag, then this counter
+		cannot be turned on/off individually, instead
+		it shall be turned on/off as a group using
+		its module name */
+		if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE)
+		    && (!(monitor_info->monitor_type & MONITOR_MODULE))) {
+			sql_print_warning(
+				"Monitor counter '%s' cannot"
+				" be turned on/off individually."
+				" Please use its module name"
+				" to turn on/off the counters"
+				" in the module as a group.\n",
+				name);
+
+			return(1);
+		}
+
+	} else {
+		ut_a(use == MONITOR_WILDCARD_MATCH);
+
+		/* For wildcard match, if there is not a single monitor
+		counter name that matches, treat it as an invalid
+		value for the system configuration variables */
+		if (!innodb_monitor_validate_wildcard_name(name)) {
+			return(1);
+		}
+	}
+
+	/* Save the configure name for innodb_monitor_update() */
+	*static_cast<const char**>(save) = name;
+
+	return(0);
+}
+/*************************************************************//**
+Validate passed-in "value" is a valid monitor counter name.
+This function is registered as a callback with MySQL.
+@return	0 for valid name */
+static
+int
+innodb_monitor_validate(
+/*====================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	name;
+	char*		monitor_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	int		ret;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	name = value->val_str(value, buff, &len);
+
+	/* "name" could point to memory from MySQL or to buff[].
+	Always duplicate it into memory that we allocate here,
+	so that another callback function, innodb_monitor_update(),
+	can access it and free it appropriately */
+	if (name) {
+		monitor_name = my_strdup(name, MYF(0));
+	} else {
+		return(1);
+	}
+
+	ret = innodb_monitor_valid_byname(save, monitor_name);
+
+	if (ret) {
+		/* Validation failed */
+		my_free(monitor_name);
+	} else {
+		/* monitor_name will be freed in separate callback function
+		innodb_monitor_update(). Assert "save" point to
+		the "monitor_name" variable */
+		ut_ad(*static_cast<char**>(save) == monitor_name);
+	}
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable (or _disable, _reset,
+_reset_all) according to "set_option": turn on/off or reset the
+specified monitor counter(s). */
+static
+void
+innodb_monitor_update(
+/*==================*/
+	THD*			thd,		/*!< in: thread handle */
+	void*			var_ptr,	/*!< out: where the
+						formal string goes */
+	const void*		save,		/*!< in: immediate result
+						from check function */
+	mon_option_t		set_option,	/*!< in: the set option,
+						whether to turn on/off or
+						reset the counter */
+	ibool			free_mem)	/*!< in: whether we will
+						need to free the memory */
+{
+	monitor_info_t*	monitor_info;
+	ulint		monitor_id;
+	ulint		err_monitor = 0;
+	const char*	name;
+
+	ut_a(save != NULL);
+
+	name = *static_cast<const char*const*>(save);
+
+	if (!name) {
+		monitor_id = MONITOR_DEFAULT_START;
+	} else {
+		monitor_id = innodb_monitor_id_by_name_get(name);
+
+		/* Double check we have a valid monitor ID */
+		if (monitor_id == MONITOR_NO_MATCH) {
+			return;
+		}
+	}
+
+	if (monitor_id == MONITOR_DEFAULT_START) {
+		/* If user set the variable to "default", we will
+		print a message and make this set operation a "noop".
+		The check is being made here is because "set default"
+		does not go through validation function */
+		if (thd) {
+			push_warning_printf(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_NO_DEFAULT,
+				"Default value is not defined for "
+				"this set option. Please specify "
+				"correct counter or module name.");
+		} else {
+			sql_print_error(
+				"Default value is not defined for "
+				"this set option. Please specify "
+				"correct counter or module name.\n");
+		}
+
+		if (var_ptr) {
+			*(const char**) var_ptr = NULL;
+		}
+	} else if (monitor_id == MONITOR_WILDCARD_MATCH) {
+		innodb_monitor_update_wildcard(name, set_option);
+	} else {
+		monitor_info = srv_mon_get_info(
+			static_cast<monitor_id_t>(monitor_id));
+
+		ut_a(monitor_info);
+
+		/* If the monitor is already turned on, someone could
+		already be collecting monitor data; exit and ask the user
+		to turn off the monitor before turning it on again. */
+		if (set_option == MONITOR_TURN_ON
+		    && MONITOR_IS_ON(monitor_id)) {
+			err_monitor = monitor_id;
+			goto exit;
+		}
+
+		if (var_ptr) {
+			*(const char**) var_ptr = monitor_info->monitor_name;
+		}
+
+		/* Depending on the monitor name is for a module or
+		a counter, process counters in the whole module or
+		individual counter. */
+		if (monitor_info->monitor_type & MONITOR_MODULE) {
+			srv_mon_set_module_control(
+				static_cast<monitor_id_t>(monitor_id),
+				set_option);
+		} else {
+			innodb_monitor_set_option(monitor_info, set_option);
+		}
+	}
+exit:
+	/* err_monitor is set only when we tried to turn on a monitor
+	that has already been turned on. Print related
+	information */
+	if (err_monitor) {
+		sql_print_warning("Monitor %s is already enabled.",
+				  srv_mon_get_name((monitor_id_t) err_monitor));
+	}
+
+	if (free_mem && name) {
+		my_free((void*) name);
+	}
+
+	return;
+}
+
+#ifdef __WIN__
+/*************************************************************//**
+Validate if passed-in "value" is a valid value for
+innodb_buffer_pool_filename. On Windows, file names with colon (:)
+are not allowed.
+
+@return	0 for valid name */
+static
+int
+innodb_srv_buf_dump_filename_validate(
+/*==================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	buf_name;
+	char		buff[OS_FILE_MAX_PATH];
+	int		len= sizeof(buff);
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	buf_name = value->val_str(value, buff, &len);
+
+	if (buf_name) {
+		if (is_filename_allowed(buf_name, len, FALSE)){
+			*static_cast<const char**>(save) = buf_name;
+			return(0);
+		} else {
+			push_warning_printf(thd,
+				Sql_condition::WARN_LEVEL_WARN,
+				ER_WRONG_ARGUMENTS,
+				"InnoDB: innodb_buffer_pool_filename "
+				"cannot have colon (:) in the file name.");
+
+		}
+	}
+
+	return(1);
+}
+#else /* __WIN__ */
+# define innodb_srv_buf_dump_filename_validate NULL
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
 static char* srv_buffer_pool_evict;
 
 /****************************************************************//**
+Evict all uncompressed pages of compressed tables from the buffer pool.
+Keep the compressed pages in the buffer pool.
+@return whether all uncompressed pages were evicted */
+static __attribute__((warn_unused_result))
+bool
+innodb_buffer_pool_evict_uncompressed(void)
+/*=======================================*/
+{
+	bool	all_evicted = true;
+
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool = &buf_pool_ptr[i];
+
+		mutex_enter(&buf_pool->LRU_list_mutex);
+
+		for (buf_block_t* block = UT_LIST_GET_LAST(
+			     buf_pool->unzip_LRU);
+		     block != NULL; ) {
+			buf_block_t*	prev_block = UT_LIST_GET_PREV(
+				unzip_LRU, block);
+			ut_ad(buf_block_get_state(block)
+			      == BUF_BLOCK_FILE_PAGE);
+			ut_ad(block->in_unzip_LRU_list);
+			ut_ad(block->page.in_LRU_list);
+
+			mutex_enter(&block->mutex);
+			if (!buf_LRU_free_page(&block->page, false)) {
+				mutex_exit(&block->mutex);
+				all_evicted = false;
+			} else {
+				mutex_exit(&block->mutex);
+				mutex_enter(&buf_pool->LRU_list_mutex);
+			}
+
+			block = prev_block;
+		}
+
+		mutex_exit(&buf_pool->LRU_list_mutex);
+	}
+
+	return(all_evicted);
+}
+
+/****************************************************************//**
 Called on SET GLOBAL innodb_buffer_pool_evict=...
 Handles some values specially, to evict pages from the buffer pool.
 SET GLOBAL innodb_buffer_pool_evict='uncompressed'
@@ -12803,61 +15919,373 @@ innodb_buffer_pool_evict_update(
 {
 	if (const char* op = *static_cast<const char*const*>(save)) {
 		if (!strcmp(op, "uncompressed")) {
-			/* Evict all uncompressed pages of compressed
-			tables from the buffer pool. Keep the compressed
-			pages in the buffer pool. */
+			for (uint tries = 0; tries < 10000; tries++) {
+				if (innodb_buffer_pool_evict_uncompressed()) {
+					return;
+				}
 
-			for (ulint i = 0; i < srv_buf_pool_instances; i++) {
-				buf_pool_t*	buf_pool = &buf_pool_ptr[i];
-				ibool have_LRU_mutex = TRUE;
+				os_thread_sleep(10000);
+			}
 
-				//buf_pool_mutex_enter(buf_pool);
-				mutex_enter(&buf_pool->LRU_list_mutex);
+			/* We failed to evict all uncompressed pages. */
+			ut_ad(0);
+		}
+	}
+}
+#endif /* UNIV_DEBUG */
 
-				for (buf_block_t* block = UT_LIST_GET_LAST(
-					     buf_pool->unzip_LRU);
-				     block != NULL; ) {
-
-					buf_block_t*	prev_block
-						= UT_LIST_GET_PREV(unzip_LRU,
-								   block);
-					ut_ad(buf_block_get_state(block)
-					      == BUF_BLOCK_FILE_PAGE);
-					ut_ad(block->in_unzip_LRU_list);
-					ut_ad(block->page.in_LRU_list);
-
-					mutex_enter(&block->mutex);
-					buf_LRU_free_block(&block->page,
-							   FALSE, &have_LRU_mutex);
-					mutex_exit(&block->mutex);
-					block = prev_block;
-				}
+/****************************************************************//**
+Update the system variable innodb_monitor_enable and enable
+specified monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_enable_monitor_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON, TRUE);
+}
 
-				if (have_LRU_mutex) {
-					mutex_exit(&buf_pool->LRU_list_mutex);
-				}
-				//buf_pool_mutex_exit(buf_pool);
-			}
+/****************************************************************//**
+Update the system variable innodb_monitor_disable and turn
+off specified monitor counter. */
+static
+void
+innodb_disable_monitor_update(
+/*==========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset and reset
+specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_monitor_update(
+/*========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset_all and reset
+all value related monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_all_monitor_update(
+/*============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE,
+			      TRUE);
+}
+
+/****************************************************************//**
+Update log_checksum_algorithm_ptr with a pointer to the function corresponding
+to a given checksum algorithm. */
+static
+void
+innodb_log_checksum_algorithm_update(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	srv_checksum_algorithm_t	algorithm;
+
+	algorithm = (srv_checksum_algorithm_t)
+		(*static_cast<const ulong*>(save));
+
+	/* Make sure we are the only log user */
+	mutex_enter(&log_sys->mutex);
+
+	switch (algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		log_checksum_algorithm_ptr=log_block_calc_checksum_innodb;
+		break;
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+		log_checksum_algorithm_ptr=log_block_calc_checksum_crc32;
+		break;
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		log_checksum_algorithm_ptr=log_block_calc_checksum_none;
+		break;
+	default:
+		ut_a(0);
+	}
+
+	srv_log_checksum_algorithm = algorithm;
+
+	mutex_exit(&log_sys->mutex);
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+The user can list the monitor counters/groups to be enabled by specifying
+"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
+in the server configuration file or at the command line. The
+separator can be ";", "," or a space. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+	char*	str)	/*!< in/out: monitor counter enable list */
+{
+	static const char*	sep = " ;,";
+	char*			last;
+
+	ut_a(str);
+
+	/* Walk through the string, separate each monitor counter
+	and/or counter group name, and call innodb_monitor_update()
+	if the name validates. Please note that "str" is
+	changed by strtok_r() as it walks through it. */
+	for (char* option = strtok_r(str, sep, &last);
+	     option;
+	     option = strtok_r(NULL, sep, &last)) {
+		ulint	ret;
+		char*	option_name;
+
+		ret = innodb_monitor_valid_byname(&option_name, option);
+
+		/* The name is validated if ret == 0 */
+		if (!ret) {
+			innodb_monitor_update(NULL, NULL, &option,
+					      MONITOR_TURN_ON, FALSE);
+		} else {
+			sql_print_warning("Invalid monitor counter"
+					  " name: '%s'", option);
 		}
 	}
 }
-#endif /* !DBUG_OFF */
 
-static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff)
+#ifdef UNIV_LINUX
+
+/****************************************************************//**
+Update the innodb_sched_priority_cleaner variable and set the thread
+priority accordingly.  */
+static
+void
+innodb_sched_priority_cleaner_update(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+	ulint	actual_priority;
+
+	if (srv_read_only_mode) {
+
+		return;
+	}
+
+	ut_ad(buf_page_cleaner_is_active);
+	actual_priority = os_thread_set_priority(srv_cleaner_tid, priority);
+	if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Failed to set the page cleaner thread "
+				    "priority to %lu,  "
+				    "the current priority is %lu", priority,
+				    actual_priority);
+	} else {
+
+		srv_sched_priority_cleaner = priority;
+	}
+}
+
+#if defined(UNIV_DEBUG) || (UNIV_PERF_DEBUG)
+
+/****************************************************************//**
+Update the innodb_sched_priority_purge variable and set the thread
+priorities accordingly.  */
+static
+void
+innodb_sched_priority_purge_update(
+/*===============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
-  innodb_export_status();
-  var->type= SHOW_ARRAY;
-  var->value= (char *) &innodb_status_variables;
-  return 0;
+	ulint	priority = *static_cast<const ulint *>(save);
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	ut_ad(purge_sys->state == PURGE_STATE_RUN);
+	for (ulint i = 0; i < srv_n_purge_threads; i++) {
+
+		ulint actual_priority
+			= os_thread_set_priority(srv_purge_tids[i], priority);
+		if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_ARGUMENTS,
+					    "Failed to set the purge "
+					    "thread priority to %lu, the "
+					    "current priority is %lu, "
+					    "aborting priority update",
+					    priority, actual_priority);
+			return;
+		}
+	}
+
+	srv_sched_priority_purge = priority;
 }
 
-/*********************************************************************//**
+/****************************************************************//**
+Update the innodb_sched_priority_io variable and set the thread
+priorities accordingly.  */
+static
+void
+innodb_sched_priority_io_update(
+/*============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+					        system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+
+	for (ulint i = 0; i < srv_n_file_io_threads; i++) {
+
+		ulint actual_priority = os_thread_set_priority(srv_io_tids[i],
+							       priority);
+
+		if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_ARGUMENTS,
+					    "Failed to set the I/O "
+					    "thread priority to %lu, the "
+					    "current priority is %lu, "
+					    "aborting priority update",
+					    priority, actual_priority);
+			return;
+		}
+	}
+
+	srv_sched_priority_io = priority;
+}
+
+/****************************************************************//**
+Set the master thread priority to the saved value and, on success,
+store it in srv_sched_priority_master. No-op in read-only mode.  */
+static
+void
+innodb_sched_priority_master_update(
+/*================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+	ulint	actual_priority;
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	actual_priority = os_thread_set_priority(srv_master_tid, priority);
+	if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Failed to set the master thread "
+				    "priority to %lu,  "
+				    "the current priority is %lu", priority,
+				    actual_priority);
+	} else {
+
+		srv_sched_priority_master = priority;
+	}
+}
+
+#endif /* defined(UNIV_DEBUG) || (UNIV_PERF_DEBUG) */
+
+#endif /* UNIV_LINUX */
+
+/****************************************************************//**
+Callback for MySQL: calls innodb_export_status() and publishes
+innodb_status_variables as a SHOW_ARRAY. @return 0 */
+static
+int
+show_innodb_vars(
+/*=============*/
+	THD*		thd,	/*!< in: thread handle (not used here) */
+	SHOW_VAR*	var,	/*!< out: type and value are set here */
+	char*		buff)	/*!< in: buffer (not used here) */
+{
+	innodb_export_status();
+	var->type = SHOW_ARRAY;
+	var->value = (char*) &innodb_status_variables;
+
+	return(0);
+}
+
+/****************************************************************//**
 This function checks each index name for a table against reserved
 system default primary index name 'GEN_CLUST_INDEX'. If a name
 matches, this function pushes an warning message to the client,
 and returns true.
 @return true if the index name matches the reserved name */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 bool
 innobase_index_name_is_reserved(
 /*============================*/
@@ -12876,7 +16304,7 @@ innobase_index_name_is_reserved(
 					innobase_index_reserve_name) == 0) {
 			/* Push warning to mysql */
 			push_warning_printf(thd,
-					    MYSQL_ERROR::WARN_LEVEL_WARN,
+					    Sql_condition::WARN_LEVEL_WARN,
 					    ER_WRONG_NAME_FOR_INDEX,
 					    "Cannot Create Index with name "
 					    "'%s'. The name is reserved "
@@ -12895,8 +16323,35 @@ innobase_index_name_is_reserved(
 }
 
 /***********************************************************************
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*============================*/
+		FT_INFO * fts_hdl)	/*!< in: FTS handler */
+{
+	row_prebuilt_t*	ft_prebuilt;
+	fts_result_t*	result;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+
+	if (ft_prebuilt->read_just_key) {
+		fts_ranking_t*  ranking =
+			rbt_value(fts_ranking_t, result->current);
+		return(ranking->rank);
+	}
+
+	/* Retrieve the ranking value for doc_id with value of
+	prebuilt->fts_doc_id */
+	return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+/***********************************************************************
 functions for kill session of idle transaction */
-extern "C"
 ibool
 innobase_thd_is_idle(
 /*=================*/
@@ -12909,7 +16364,6 @@ innobase_thd_is_idle(
 #endif
 }
 
-extern "C"
 ib_int64_t
 innobase_thd_get_start_time(
 /*========================*/
@@ -12922,7 +16376,28 @@ innobase_thd_get_start_time(
 #endif
 }
 
-extern "C"
+/***********************************************************************
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+		FT_INFO * fts_hdl)
+{
+	fts_result_t*	result;
+
+	((NEW_FT_INFO*) fts_hdl)->ft_prebuilt->in_fts_query = false;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	fts_query_free_result(result);
+
+	my_free((uchar*) fts_hdl);
+
+	return;
+}
+
+UNIV_INTERN
 void
 innobase_thd_kill(
 /*==============*/
@@ -12935,17 +16410,80 @@ innobase_thd_kill(
 #endif
 }
 
-extern "C"
-ulong
-innobase_thd_get_thread_id(
-/*=======================*/
-	const void*	thd)
+/***********************************************************************
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*======================*/
+		FT_INFO*	fts_hdl,	/*!< in: FTS handler */
+		uchar*		record,		/*!< in: Unused */
+		uint		len)		/*!< in: Unused */
 {
-	return(thd_get_thread_id((const THD*) thd));
+	row_prebuilt_t*	ft_prebuilt;
+	fts_result_t*	result;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	/* Retrieve the ranking value for doc_id with value of
+	prebuilt->fts_doc_id */
+	return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
 }
 
 #ifdef UNIV_DEBUG
+static my_bool	innodb_purge_run_now = TRUE;
+static my_bool	innodb_purge_stop_now = TRUE;
 static my_bool	innodb_log_checkpoint_now = TRUE;
+static my_bool	innodb_track_redo_log_now = TRUE;
+
+/****************************************************************//**
+Set the purge state to RUN. If purge is disabled then it
+is a no-op. This function is registered as a callback with MySQL. */
+static
+void
+purge_run_now_set(
+/*==============*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) {
+		trx_purge_run();
+	}
+}
+
+/****************************************************************//**
+Set the purge state to STOP. If purge is disabled then it
+is a no-op. This function is registered as a callback with MySQL. */
+static
+void
+purge_stop_now_set(
+/*===============*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) {
+		trx_purge_stop();
+	}
+}
 
 /****************************************************************//**
 Force innodb to checkpoint. */
@@ -12954,19 +16492,19 @@ void
 checkpoint_now_set(
 /*===============*/
 	THD*				thd	/*!< in: thread handle */
-	__attribute__((unused)),
+					__attribute__((unused)),
 	struct st_mysql_sys_var*	var	/*!< in: pointer to system
 						variable */
-	__attribute__((unused)),
+					__attribute__((unused)),
 	void*				var_ptr	/*!< out: where the formal
 						string goes */
-	__attribute__((unused)),
+					__attribute__((unused)),
 	const void*			save)	/*!< in: immediate result from
 						check function */
 {
 	if (*(my_bool*) save) {
 		while (log_sys->last_checkpoint_lsn < log_sys->lsn) {
-			log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+			log_make_checkpoint_at(LSN_MAX, TRUE);
 			fil_flush_file_spaces(FIL_LOG);
 		}
 		fil_write_flushed_lsn_to_data_files(log_sys->lsn, 0);
@@ -12974,8 +16512,6 @@ checkpoint_now_set(
 	}
 }
 
-static my_bool	innodb_track_redo_log_now = TRUE;
-
 /****************************************************************//**
 Force log tracker to track the log synchronously.  */
 static
@@ -12983,15 +16519,15 @@ void
 track_redo_log_now_set(
 /*===================*/
 	THD*				thd	/*!< in: thread handle */
-	__attribute__((unused)),
+					__attribute__((unused)),
 	struct st_mysql_sys_var*	var	/*!< in: pointer to system
-						variable */
-	__attribute__((unused)),
+						  variable */
+					__attribute__((unused)),
 	void*				var_ptr	/*!< out: where the formal
-						string goes */
-	__attribute__((unused)),
+						  string goes */
+					__attribute__((unused)),
 	const void*			save)	/*!< in: immediate result from
-						check function */
+						  check function */
 {
 	if (*(my_bool*) save && srv_track_changed_pages) {
 
@@ -13002,37 +16538,233 @@ track_redo_log_now_set(
 
 #endif /* UNIV_DEBUG */
 
+/***********************************************************************
+@return version of the extended FTS API */
+uint
+innobase_fts_get_version()
+/*======================*/
+{
+	/* Currently this doesn't make much sense, as returning
+	HA_CAN_FULLTEXT_EXT automatically means this version is supported.
+	This is supposed to ease future extensions.  */
+	return(2);
+}
+
+/***********************************************************************
+@return Which part of the extended FTS API is supported */
+ulonglong
+innobase_fts_flags()
+/*================*/
+{
+	return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT);
+}
+
+
+/***********************************************************************
+Find and Retrieve the FTS doc_id for the current result row
+@return the document ID */
+ulonglong
+innobase_fts_retrieve_docid(
+/*========================*/
+		FT_INFO_EXT * fts_hdl)	/*!< in: FTS handler */
+{
+	row_prebuilt_t* ft_prebuilt;
+	fts_result_t*	result;
+
+	ft_prebuilt = ((NEW_FT_INFO *)fts_hdl)->ft_prebuilt;
+	result = ((NEW_FT_INFO *)fts_hdl)->ft_result;
+
+	if (ft_prebuilt->read_just_key) {
+		fts_ranking_t* ranking =
+			rbt_value(fts_ranking_t, result->current);
+		return(ranking->doc_id);
+	}
+
+	return(ft_prebuilt->fts_doc_id);
+}
+
+
+ulong
+innobase_thd_get_thread_id(
+/*=======================*/
+	const void*	thd)
+{
+	return(thd_get_thread_id((const THD*) thd));
+}
+
+
+
+/***********************************************************************
+Find and retrieve the size of the current result
+@return number of matching rows */
+ulonglong
+innobase_fts_count_matches(
+/*=======================*/
+	FT_INFO_EXT* fts_hdl)	/*!< in: FTS handler */
+{
+	NEW_FT_INFO*	handle = (NEW_FT_INFO *) fts_hdl;
+
+	if (handle->ft_result->rankings_by_id != 0) {
+		return rbt_size(handle->ft_result->rankings_by_id);
+	} else {
+		return(0);
+	}
+}
+
+/* These variables are never read by InnoDB or changed. They are a kind of
+dummies that are needed by the MySQL infrastructure to call
+buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort()
+by the user by doing:
+  SET GLOBAL innodb_buffer_pool_dump_now=ON;
+  SET GLOBAL innodb_buffer_pool_load_now=ON;
+  SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+  SELECT @@innodb_buffer_pool_dump_now;
+  SELECT @@innodb_buffer_pool_load_now;
+  SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool	innodb_buffer_pool_dump_now = FALSE;
+static my_bool	innodb_buffer_pool_load_now = FALSE;
+static my_bool	innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		buf_dump_start();
+	}
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save) {
+		buf_load_start();
+	}
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save) {
+		buf_load_abort();
+	}
+}
+
 static SHOW_VAR innodb_status_variables_export[]= {
-  {"Innodb",                   (char*) &show_innodb_vars, SHOW_FUNC},
-  {NullS, NullS, SHOW_LONG}
+	{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+	{NullS, NullS, SHOW_LONG}
 };
 
 static struct st_mysql_storage_engine innobase_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
 /* plugin options */
+
+static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for page checksumming. Possible values are "
+  "CRC32 (hardware accelerated if the CPU supports it) "
+    "write crc32, allow any of the other checksums to match when reading; "
+  "STRICT_CRC32 "
+    "write crc32, do not allow other algorithms to match when reading; "
+  "INNODB "
+    "write a software calculated checksum, allow any other checksums "
+    "to match when reading; "
+  "STRICT_INNODB "
+    "write a software calculated checksum, do not allow other algorithms "
+    "to match when reading; "
+  "NONE "
+    "write a constant magic number, do not do any checksum verification "
+    "when reading (same as innodb_checksums=OFF); "
+  "STRICT_NONE "
+    "write a constant magic number, do not allow values other than that "
+    "magic number when reading; "
+  "Files updated when this option is set to crc32 or strict_crc32 will "
+  "not be readable by MySQL versions older than 5.6.3",
+  NULL, NULL, SRV_CHECKSUM_ALGORITHM_INNODB,
+  &innodb_checksum_algorithm_typelib);
+
+
+static MYSQL_SYSVAR_ENUM(log_checksum_algorithm, srv_log_checksum_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for log block checksums. Possible values are "
+  "CRC32 (hardware accelerated if the CPU supports it) "
+    "write crc32, allow any of the other checksums to match when reading; "
+  "STRICT_CRC32 "
+    "write crc32, do not allow other algorithms to match when reading; "
+  "INNODB "
+    "write a software calculated checksum, allow any other checksums "
+    "to match when reading; "
+  "STRICT_INNODB "
+    "write a software calculated checksum, do not allow other algorithms "
+    "to match when reading; "
+  "NONE "
+    "write a constant magic number, do not do any checksum verification "
+    "when reading (same as innodb_checksums=OFF); "
+  "STRICT_NONE "
+    "write a constant magic number, do not allow values other than that "
+    "magic number when reading; "
+  "Logs created when this option is set to crc32/strict_crc32/none/strict_none "
+  "will not be readable by any MySQL version or Percona Server versions that do"
+  " not support this feature",
+  NULL, innodb_log_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_INNODB,
+  &innodb_checksum_algorithm_typelib);
+
+
 static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. Use innodb_checksum_algorithm=NONE instead of setting "
+  "this to OFF. "
   "Enable InnoDB checksums validation (enabled by default). "
   "Disable with --skip-innodb-checksums.",
   NULL, NULL, TRUE);
 
-static MYSQL_SYSVAR_BOOL(fast_checksum, innobase_fast_checksum,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "DEPRECATED. #### WARNING #### : This feature is DEPRECATED and WILL "
-  "be removed in Percona Server 5.6. "
-  "Change the algorithm of checksum for the whole of datapage to 4-bytes word based. "
-  "The original checksum is checked after the new one. It may be slow for reading page"
-  " which has orginal checksum. Overwrite the page or recreate the InnoDB database, "
-  "if you want the entire benefit for performance at once. "
-  "#### Attention: The checksum is not compatible for normal or disabled version! ####",
-  NULL, NULL, FALSE);
-
-static MYSQL_SYSVAR_ULONG(page_size, innobase_page_size,
-  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "###EXPERIMENTAL###: The universal page size of the database. Changing for created database is not supported. Use on your own risk!",
-  NULL, NULL, (1 << 14), (1 << 12), (1 << UNIV_PAGE_SIZE_SHIFT_MAX), 0);
-
 static MYSQL_SYSVAR_ULONG(log_block_size, innobase_log_block_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "###EXPERIMENTAL###: The log block size of the transaction log file. Changing for created log file is not supported. Use on your own risk!",
@@ -13044,17 +16776,6 @@ static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
   "The common part for InnoDB table spaces.",
   NULL, NULL, NULL);
 
-static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "Output statistics of recovery process after it.",
-  NULL, NULL, FALSE);
-
-static MYSQL_SYSVAR_BOOL(recovery_update_relay_log, innobase_overwrite_relay_log_info,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "During InnoDB crash recovery on slave overwrite relay-log.info "
-  "to align master log file position if information in InnoDB and relay-log.info is different.",
-  NULL, NULL, FALSE);
-
 static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
   "Enable InnoDB doublewrite buffer (enabled by default). "
@@ -13078,31 +16799,61 @@ static MYSQL_SYSVAR_BOOL(use_fallocate, innobase_use_fallocate,
 static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
   PLUGIN_VAR_RQCMDARG,
   "Number of IOPs the server can do. Tunes the background IO rate",
-  NULL, NULL, 200, 100, ~0UL, 0);
+  NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
+  PLUGIN_VAR_RQCMDARG,
+  "Limit to which innodb_io_capacity can be inflated.",
+  NULL, innodb_io_capacity_max_update,
+  SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
+  SRV_MAX_IO_CAPACITY_LIMIT, 0);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Set purge state to RUN",
+  NULL, purge_run_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(purge_stop_now, innodb_purge_stop_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Set purge state to STOP",
+  NULL, purge_stop_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Force checkpoint now",
+  NULL, checkpoint_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(track_redo_log_now,
+  innodb_track_redo_log_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Force log tracker to catch up with checkpoint now",
+  NULL, track_redo_log_now_set, FALSE);
+#endif /* UNIV_DEBUG */
 
 static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
   PLUGIN_VAR_OPCMDARG,
   "Number of UNDO log pages to purge in one batch from the history list.",
   NULL, NULL,
-  20,			/* Default setting */
+  300,			/* Default setting */
   1,			/* Minimum value */
   5000, 0);		/* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(rollback_segments, srv_rollback_segments,
-  PLUGIN_VAR_OPCMDARG,
-  "Number of UNDO logs to use.",
+static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Purge threads can be from 1 to 32. Default is 1.",
   NULL, NULL,
-  128,			/* Default setting */
+  1,			/* Default setting */
   1,			/* Minimum value */
-  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+  SRV_MAX_N_PURGE_THREADS, 0);		/* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Purge threads can be either 0 or 1.",
+  "Size of the mutex/lock wait array.",
   NULL, NULL,
   1,			/* Default setting */
-  0,			/* Minimum value */
-  1, 0);		/* Maximum value */
+  1,			/* Minimum value */
+  1024, 0);		/* Maximum value */
 
 static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
   PLUGIN_VAR_OPCMDARG,
@@ -13113,7 +16864,7 @@ static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
 static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
   PLUGIN_VAR_NOCMDARG,
   "Stores each InnoDB table to an .ibd file in the database dir.",
-  NULL, NULL, FALSE);
+  NULL, NULL, TRUE);
 
 static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name,
   PLUGIN_VAR_RQCMDARG,
@@ -13134,7 +16885,7 @@ static MYSQL_SYSVAR_BOOL(file_format_check, innobase_file_format_check,
 
 /* If a new file format is introduced, the file format
 name needs to be updated accordingly. Please refer to
-file_format_name_map[] defined in trx0sys.c for the next
+file_format_name_map[] defined in trx0sys.cc for the next
 file format name. */
 static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max,
   PLUGIN_VAR_OPCMDARG,
@@ -13142,6 +16893,18 @@ static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max,
   innodb_file_format_max_validate,
   innodb_file_format_max_update, "Antelope");
 
+static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table,
+  PLUGIN_VAR_OPCMDARG,
+  "The user supplied stopword table name.",
+  innodb_stopword_table_validate,
+  innodb_stopword_table_update,
+  NULL);
+
+static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout,
+  PLUGIN_VAR_OPCMDARG,
+  "Write and flush logs every (n) second.",
+  NULL, NULL, 1, 0, 2700, 0);
+
 /* Changed to the THDVAR */
 //static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
 //  PLUGIN_VAR_OPCMDARG,
@@ -13171,6 +16934,8 @@ static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
 
 static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases. "
+  "Please use READ COMMITTED transaction isolation level instead. "
   "Force InnoDB to not use next-key locking, to use only row-level locking.",
   NULL, NULL, FALSE);
 
@@ -13190,29 +16955,61 @@ static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir,
   "Where full logs should be archived.", NULL, NULL, NULL);
 
 static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE);
+  PLUGIN_VAR_OPCMDARG,
+  "Set to 1 if you want to have logs archived.",
+  NULL, innodb_log_archive_update, FALSE);
 #endif /* UNIV_LOG_ARCHIVE */
 
-static MYSQL_SYSVAR_STR(log_group_home_dir, innobase_log_group_home_dir,
+static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Path to InnoDB log files.", NULL, NULL, NULL);
 
+static MYSQL_SYSVAR_ULONG(log_arch_expire_sec,
+  srv_log_arch_expire_sec, PLUGIN_VAR_OPCMDARG,
+  "Expiration time for archived innodb transaction logs.",
+  NULL, innodb_log_archive_expire_update, 0, 0, ~0UL, 0);
+
 static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
   PLUGIN_VAR_RQCMDARG,
   "Percentage of dirty pages allowed in bufferpool.",
-  NULL, NULL, 75, 0, 99, 0);
+  NULL, innodb_max_dirty_pages_pct_update, 75, 0, 99, 0);
+
+static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct_lwm,
+  srv_max_dirty_pages_pct_lwm,
+  PLUGIN_VAR_RQCMDARG,
+  "Percentage of dirty pages at which flushing kicks in.",
+  NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99, 0);
+
+static MYSQL_SYSVAR_ULONG(adaptive_flushing_lwm,
+  srv_adaptive_flushing_lwm,
+  PLUGIN_VAR_RQCMDARG,
+  "Percentage of log capacity below which no adaptive flushing happens.",
+  NULL, NULL, 10, 0, 70, 0);
 
 static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
   PLUGIN_VAR_NOCMDARG,
   "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
   NULL, NULL, TRUE);
 
+static MYSQL_SYSVAR_ULONG(flushing_avg_loops,
+  srv_flushing_avg_loops,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of iterations over which the background flushing is averaged.",
+  NULL, NULL, 30, 1, 1000, 0);
+
 static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
   PLUGIN_VAR_RQCMDARG,
   "Desired maximum length of the purge queue (0 = no limit)",
   NULL, NULL, 0, 0, ~0UL, 0);
 
+static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay,
+   PLUGIN_VAR_RQCMDARG,
+   "Maximum delay of user threads in micro-seconds",
+   NULL, NULL,
+   0L,			/* Default setting */
+   0L,			/* Minimum value */
+   10000000UL, 0);	/* Maximum value */
+
 static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
   "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
@@ -13220,45 +17017,47 @@ static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
 
 static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
-  "Enable SHOW INNODB STATUS output in the innodb_status.<pid> file",
+  "Enable SHOW ENGINE INNODB STATUS output in the innodb_status.<pid> file",
   NULL, NULL, FALSE);
 
 static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
   PLUGIN_VAR_OPCMDARG,
-  "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)",
-  NULL, NULL, TRUE);
+  "Enable statistics gathering for metadata commands such as "
+  "SHOW TABLE STATUS for tables that use transient statistics (off by default)",
+  NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages,
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages,
   PLUGIN_VAR_RQCMDARG,
-  "The number of index pages to sample when calculating statistics (default 8)",
-  NULL, NULL, 8, 1, ~0ULL, 0);
+  "Deprecated, use innodb_stats_transient_sample_pages instead",
+  NULL, innodb_stats_sample_pages_update, 8, 1, ~0ULL, 0);
 
-static MYSQL_SYSVAR_ULINT(stats_auto_update, srv_stats_auto_update,
+static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
+  srv_stats_transient_sample_pages,
   PLUGIN_VAR_RQCMDARG,
-  "Enable/Disable InnoDB's auto update statistics of indexes. "
-  "(except for ANALYZE TABLE command) 0:disable 1:enable",
-  NULL, NULL, 1, 0, 1, 0);
+  "The number of leaf index pages to sample when calculating transient "
+  "statistics (if persistent statistics are not used, default 8)",
+  NULL, NULL, 8, 1, ~0ULL, 0);
 
-static MYSQL_SYSVAR_ULINT(stats_update_need_lock, srv_stats_update_need_lock,
-  PLUGIN_VAR_RQCMDARG,
-  "Enable/Disable InnoDB's update statistics which needs to lock dictionary. "
-  "e.g. Data_free.",
-  NULL, NULL, 1, 0, 1, 0);
+static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB persistent statistics enabled for all tables unless overridden "
+  "at table level",
+  NULL, NULL, TRUE);
 
-static MYSQL_SYSVAR_BOOL(use_sys_stats_table, innobase_use_sys_stats_table,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "Enable to use SYS_STATS system table to store statistics statically, "
-  "And avoids to calculate statistics at every first open of the tables. "
-  "This option may make the opportunities of update statistics less. "
-  "So you should use ANALYZE TABLE command intentionally.",
-  NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB automatic recalculation of persistent statistics enabled for all "
+  "tables unless overridden at table level (automatic recalculation is only "
+  "done when InnoDB decides that the table has changed too much and needs a "
+  "new statistics)",
+  NULL, NULL, TRUE);
 
-#ifdef UNIV_DEBUG
-static MYSQL_SYSVAR_ULONG(persistent_stats_root_page,
-  innobase_sys_stats_root_page, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Override the SYS_STATS root page id, 0 = no override (for testing only)",
-  NULL, NULL, 0, 0, ULONG_MAX, 0);
-#endif
+static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages,
+  srv_stats_persistent_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of leaf index pages to sample when calculating persistent "
+  "statistics (by ANALYZE, default 20)",
+  NULL, NULL, 20, 1, ~0ULL, 0);
 
 static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
   PLUGIN_VAR_OPCMDARG,
@@ -13270,7 +17069,8 @@ static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
 reasons. This limitation can be easily removed later. */
 static MYSQL_SYSVAR_ULINT(adaptive_hash_index_partitions, btr_search_index_num,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Number of InnoDB adaptive hash index partitions (default 1: disable partitioning)",
+  "Number of InnoDB adaptive hash index partitions (default 1: disable "
+  "partitioning)",
   NULL, NULL, 1, 1, sizeof(ulint) * 8, 0);
 
 static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
@@ -13279,22 +17079,33 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
   "innodb_thread_concurrency is reached (0 by default)",
   NULL, NULL, 0, 0, ~0UL, 0);
 
+static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
+  PLUGIN_VAR_RQCMDARG,
+  "Compression level used for compressed row format.  0 is no compression"
+  ", 1 is fastest, 9 is best compression and default is 6.",
+  NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
+
+static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages,
+       PLUGIN_VAR_OPCMDARG,
+  "Enables/disables the logging of entire compressed page images."
+  " InnoDB logs the compressed pages to prevent corruption if"
+  " the zlib compression algorithm changes."
+  " When turned OFF, InnoDB will assume that the zlib"
+  " compression algorithm doesn't change.",
+  NULL, NULL, TRUE);
+
 static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases, "
+  "together with the option innodb_use_sys_malloc and with the InnoDB's "
+  "internal memory allocator. "
   "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.",
   NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024);
 
 static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment,
   PLUGIN_VAR_RQCMDARG,
   "Data file autoextend increment in megabytes",
-  NULL, NULL, 8L, 1L, 1000L, 0);
-
-#ifndef DBUG_OFF
-static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict,
-  PLUGIN_VAR_RQCMDARG,
-  "Evict pages from the InnoDB buffer pool.",
-  NULL, innodb_buffer_pool_evict_update, "");
-#endif /* !DBUG_OFF */
+  NULL, NULL, 64L, 1L, 1000L, 0);
 
 static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -13307,20 +17118,185 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_buf_pool_populate,
   "established by the buffer pool memory region. Disabled by default.",
   NULL, NULL, FALSE);
 
+static MYSQL_SYSVAR_ENUM(foreground_preflush, srv_foreground_preflush,
+  PLUGIN_VAR_OPCMDARG,
+  "The algorithm InnoDB uses for the query threads at sync preflush.  "
+  "Possible values are "
+  "SYNC_PREFLUSH: perform a sync preflush as Oracle MySQL; "
+  "EXPONENTIAL_BACKOFF: (default) wait for the page cleaner flush.",
+  NULL, NULL, SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF,
+  &innodb_foreground_preflush_typelib);
+
+#ifdef UNIV_LINUX
+
+static MYSQL_SYSVAR_ULONG(sched_priority_cleaner, srv_sched_priority_cleaner,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the cleaner thread scheduling",
+  NULL, innodb_sched_priority_cleaner_update, 19, 0, 39, 0);
+
+#endif /* UNIV_LINUX */
+
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
+  NULL, NULL, 16, 1, MAX_PAGE_HASH_LOCKS, 0);
+
+static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Number of pages reserved in doublewrite buffer for batch flushing",
+  NULL, NULL, 120, 1, 127, 0);
+
+#ifdef UNIV_LINUX
+
+static MYSQL_SYSVAR_ULONG(sched_priority_purge, srv_sched_priority_purge,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the purge thread scheduling",
+  NULL, innodb_sched_priority_purge_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_ULONG(sched_priority_io, srv_sched_priority_io,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the I/O handler thread scheduling",
+  NULL, innodb_sched_priority_io_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_ULONG(sched_priority_master, srv_sched_priority_master,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the master thread scheduling",
+  NULL, innodb_sched_priority_master_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_BOOL(priority_purge, srv_purge_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make purge coordinator and worker threads acquire shared resources with "
+  "priority", NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_io, srv_io_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make I/O threads acquire shared resources with priority",
+   NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_cleaner, srv_cleaner_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make buffer pool cleaner thread acquire shared resources with priority",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_master, srv_master_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make master thread acquire shared resources with priority",
+   NULL, NULL, FALSE);
+
+#endif /* UNIV_LINUX */
+
+static MYSQL_SYSVAR_ULINT(cleaner_max_lru_time, srv_cleaner_max_lru_time,
+  PLUGIN_VAR_RQCMDARG,
+  "The maximum time limit for a single LRU tail flush iteration by the page "
+  "cleaner thread in milliseconds",
+  NULL, NULL, 1000, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULINT(cleaner_max_flush_time, srv_cleaner_max_flush_time,
+  PLUGIN_VAR_RQCMDARG,
+  "The maximum time limit for a single flush list flush iteration by the page "
+  "cleaner thread in milliseconds",
+  NULL, NULL, 1000, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULINT(cleaner_flush_chunk_size,
+  srv_cleaner_flush_chunk_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Divide page cleaner flush list flush batches into chunks of this size",
+  NULL, NULL, 100, 1, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULINT(cleaner_lru_chunk_size,
+  srv_cleaner_lru_chunk_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Divide page cleaner LRU list flush batches into chunks of this size",
+  NULL, NULL, 100, 1, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULINT(cleaner_free_list_lwm, srv_cleaner_free_list_lwm,
+  PLUGIN_VAR_RQCMDARG,
+  "Page cleaner will keep on flushing the same buffer pool instance if its "
+  "free list length is below this percentage of innodb_lru_scan_depth",
+  NULL, NULL, 10, 0, 100, 0);
+
+static MYSQL_SYSVAR_BOOL(cleaner_eviction_factor, srv_cleaner_eviction_factor,
+  PLUGIN_VAR_OPCMDARG,
+  "Make page cleaner LRU flushes use evicted instead of flushed page counts "
+  "for its heuristics",
+  NULL, NULL, FALSE);
+
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+
+static MYSQL_SYSVAR_ENUM(cleaner_lsn_age_factor,
+  srv_cleaner_lsn_age_factor,
+  PLUGIN_VAR_OPCMDARG,
+  "The formula for LSN age factor for page cleaner adaptive flushing. "
+  "LEGACY: Original Oracle MySQL 5.6 formula. "
+  "HIGH_CHECKPOINT: (the default) Percona Server 5.6 formula.",
+  NULL, NULL, SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT,
+  &innodb_cleaner_lsn_age_factor_typelib);
+
+static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm,
+  srv_empty_free_list_algorithm,
+  PLUGIN_VAR_OPCMDARG,
+  "The algorithm to use for empty free list handling.  Allowed values: "
+  "LEGACY: Original Oracle MySQL 5.6 handling with single page flushes; "
+  "BACKOFF: (default) Wait until cleaner produces a free page.",
+  NULL, NULL, SRV_EMPTY_FREE_LIST_BACKOFF,
+  &innodb_empty_free_list_algorithm_typelib);
+
 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
-  NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L);
+  NULL, NULL, 0L, 0L, MAX_BUFFER_POOLS, 1L);
+
+static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
+  "Filename to/from which to dump/load the InnoDB buffer pool",
+  innodb_srv_buf_dump_filename_validate, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT);
 
-static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, innobase_buffer_pool_shm_key,
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now,
+  PLUGIN_VAR_RQCMDARG,
+  "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename",
+  NULL, buffer_pool_dump_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown,
+  PLUGIN_VAR_RQCMDARG,
+  "Dump the buffer pool into a file named @@innodb_buffer_pool_filename",
+  NULL, NULL, FALSE);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict,
+  PLUGIN_VAR_RQCMDARG,
+  "Evict pages from the buffer pool",
+  NULL, innodb_buffer_pool_evict_update, "");
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now,
+  PLUGIN_VAR_RQCMDARG,
+  "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename",
+  NULL, buffer_pool_load_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort,
+  PLUGIN_VAR_RQCMDARG,
+  "Abort a currently running load of the buffer pool",
+  NULL, buffer_pool_load_abort, FALSE);
+
+/* there is no point in changing this during runtime, thus readonly */
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "[Deprecated option] no effect",
-  NULL, NULL, 0, 0, INT_MAX32, 0);
+  "Load the buffer pool from a file named @@innodb_buffer_pool_filename",
+  NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "[Deprecated option] no effect",
-  NULL, NULL, TRUE);
+static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
+  PLUGIN_VAR_RQCMDARG,
+  "How deep to scan LRU to keep it clean",
+  NULL, NULL, 1024, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
+  PLUGIN_VAR_OPCMDARG,
+  "Set to 0 (don't flush neighbors from buffer pool),"
+  " 1 (flush contiguous neighbors from buffer pool)"
+  " or 2 (flush neighbors from buffer pool),"
+  " when flushing a block",
+  NULL, NULL, 1, 0, 2, 0);
 
 static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
   PLUGIN_VAR_RQCMDARG,
@@ -13330,7 +17306,7 @@ static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
 static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
   PLUGIN_VAR_RQCMDARG,
   "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
-  NULL, NULL, 500L, 1L, ~0UL, 0);
+  NULL, NULL, 5000L, 1L, ~0UL, 0);
 
 #ifdef EXTENDED_FOR_KILLIDLE
 #define kill_idle_help_text "If non-zero value, the idle session with transaction which is idle over the value in seconds is killed by InnoDB."
@@ -13345,6 +17321,73 @@ static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
   "Number of file I/O threads in InnoDB.",
   NULL, NULL, 4, 4, 64, 0);
 
+static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print,
+  PLUGIN_VAR_OPCMDARG,
+  "Whether to enable additional FTS diagnostic printout ",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
+  PLUGIN_VAR_OPCMDARG,
+  "Whether to disable OS system file cache for sort I/O",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
+  PLUGIN_VAR_NOCMDARG,
+  "FTS internal auxiliary table to be checked",
+  innodb_internal_table_validate,
+  innodb_internal_table_update, NULL);
+
+static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search cache size in bytes",
+  NULL, NULL, 8000000, 1600000, 80000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_total_cache_size, fts_max_total_cache_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Total memory allocated for InnoDB Fulltext Search cache",
+  NULL, NULL, 640000000, 32000000, 1600000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_result_cache_limit, fts_result_cache_limit,
+  PLUGIN_VAR_RQCMDARG,
+  "InnoDB Fulltext search query result cache limit in bytes",
+  NULL, NULL, 2000000000L, 1000000L, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search minimum token size in characters",
+  NULL, NULL, 3, 0, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search maximum token size in characters",
+  NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0);
+
+
+static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB Fulltext search number of words to optimize for each optimize table call ",
+  NULL, NULL, 2000, 1000, 10000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number",
+  NULL, NULL, 2, 1, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Memory buffer size for index creation",
+  NULL, NULL, 1048576, 65536, 64<<20, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Maximum modification log file size for online index creation",
+  NULL, NULL, 128<<20, 65536, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only,
+  PLUGIN_VAR_NOCMDARG,
+  "Only optimize the Fulltext index of the table",
+  NULL, NULL, FALSE);
+
 static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of background read I/O threads in InnoDB.",
@@ -13355,11 +17398,24 @@ static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads,
   "Number of background write I/O threads in InnoDB.",
   NULL, NULL, 4, 1, 64, 0);
 
-static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery,
+static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Helps to save your data in case the disk image of the database becomes corrupt.",
   NULL, NULL, 0, 0, 6, 0);
 
+#ifndef DBUG_OFF
+static MYSQL_SYSVAR_ULONG(force_recovery_crash, srv_force_recovery_crash,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Kills the server during crash recovery.",
+  NULL, NULL, 0, 0, 10, 0);
+#endif /* !DBUG_OFF */
+
+static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Page size to use for all InnoDB tablespaces.",
+  NULL, NULL, UNIV_PAGE_SIZE_DEF,
+  UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0);
+
 static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "The size of the buffer which InnoDB uses to write log to the log files on disk.",
@@ -13368,17 +17424,19 @@ static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
 static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Size of each log file in a log group.",
-  NULL, NULL, 5*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L);
+  NULL, NULL, 48*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L);
 
-static MYSQL_SYSVAR_LONG(log_files_in_group, innobase_log_files_in_group,
+static MYSQL_SYSVAR_ULONG(log_files_in_group, srv_n_log_files,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Number of log files in the log group. InnoDB writes to the files in a circular fashion. Value 3 is recommended here.",
-  NULL, NULL, 2, 2, 100, 0);
+  "Number of log files in the log group. InnoDB writes to the files in a circular fashion.",
+  NULL, NULL, 2, 2, SRV_N_LOG_FILES_MAX, 0);
 
+/* Note that the default and minimum values are set to 0 to
+detect if the option is passed and print deprecation message */
 static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.",
-  NULL, NULL, 1, 1, 10, 0);
+  NULL, NULL, 0, 0, 10, 0);
 
 static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
   PLUGIN_VAR_RQCMDARG,
@@ -13389,13 +17447,13 @@ static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms,
   PLUGIN_VAR_RQCMDARG,
   "Move blocks to the 'new' end of the buffer pool if the first access"
   " was at least this many milliseconds ago."
-  " The timeout is disabled if 0 (the default).",
-  NULL, NULL, 0, 0, UINT_MAX32, 0);
+  " The timeout is disabled if 0.",
+  NULL, NULL, 1000, 0, UINT_MAX32, 0);
 
 static MYSQL_SYSVAR_LONG(open_files, innobase_open_files,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "How many files at the maximum InnoDB keeps open at the same time.",
-  NULL, NULL, 300L, 10L, LONG_MAX, 0);
+  NULL, NULL, 0L, 0L, LONG_MAX, 0);
 
 static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
   PLUGIN_VAR_RQCMDARG,
@@ -13407,31 +17465,65 @@ static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay,
   "Maximum delay between polling for a spin lock (6 by default)",
   NULL, NULL, 6L, 0L, ~0UL, 0);
 
-static MYSQL_SYSVAR_BOOL(thread_concurrency_timer_based,
-  innobase_thread_concurrency_timer_based,
-  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
-  "Use InnoDB timer based concurrency throttling. ",
-  NULL, NULL, FALSE);
-
 static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
   PLUGIN_VAR_RQCMDARG,
   "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
   NULL, NULL, 0, 0, 1000, 0);
 
+#ifdef HAVE_ATOMIC_BUILTINS
+static MYSQL_SYSVAR_ULONG(
+  adaptive_max_sleep_delay, srv_adaptive_max_sleep_delay,
+  PLUGIN_VAR_RQCMDARG,
+  "The upper limit of the sleep delay in usec. Value of 0 disables it.",
+  NULL, NULL,
+  150000,			/* Default setting */
+  0,				/* Minimum value */
+  1000000, 0);			/* Maximum value */
+#endif /* HAVE_ATOMIC_BUILTINS */
+
 static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
   PLUGIN_VAR_RQCMDARG,
-  "Time of innodb thread sleeping before joining InnoDB queue (usec). Value 0 disable a sleep",
-  NULL, NULL, 10000L, 0L, ~0UL, 0);
+  "Time of innodb thread sleeping before joining InnoDB queue (usec). "
+  "Value 0 disable a sleep",
+  NULL, NULL,
+  10000L,
+  0L,
+  ~0UL, 0);
 
 static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Path to individual files and their sizes.",
   NULL, NULL, NULL);
 
-static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file,
+static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "Path to special datafile for doublewrite buffer. (default is "": not used) ### ONLY FOR EXPERTS!!! ###",
-  NULL, NULL, NULL);
+  "Directory where undo tablespace files live, this path can be absolute.",
+  NULL, NULL, ".");
+
+static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of undo tablespaces to use. ",
+  NULL, NULL,
+  0L,			/* Default setting */
+  0L,			/* Minimum value */
+  126L, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(undo_logs, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use.",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+
+/* Alias for innodb_undo_logs, this config variable is deprecated. */
+static MYSQL_SYSVAR_ULONG(rollback_segments, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use (deprecated).",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
 
 static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -13451,6 +17543,8 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
 
 static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases, "
+  "together with the InnoDB's internal memory allocator. "
   "Use OS memory allocator instead of InnoDB's internal memory allocator",
   NULL, NULL, TRUE);
 
@@ -13459,6 +17553,37 @@ static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
   "Use native AIO if supported on this platform.",
   NULL, NULL, TRUE);
 
+static MYSQL_SYSVAR_BOOL(api_enable_binlog, ib_binlog_enabled,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable binlog for applications direct access InnoDB through InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_enable_mdl, ib_mdl_enabled,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable MDL for applications direct access InnoDB through InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_disable_rowlock, ib_disable_row_lock,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Disable row lock when direct access InnoDB through InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(api_trx_level, ib_trx_level_setting,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB API transaction isolation level",
+  NULL, NULL,
+  0,		/* Default setting */
+  0,		/* Minimum value */
+  3, 0);	/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(api_bk_commit_interval, ib_bk_commit_interval,
+  PLUGIN_VAR_OPCMDARG,
+  "Background commit interval in seconds",
+  NULL, NULL,
+  5,		/* Default setting */
+  1,		/* Minimum value */
+  1024 * 1024 * 1024, 0);	/* Maximum value */
+
 static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
   PLUGIN_VAR_RQCMDARG,
   "Buffer changes to reduce random access: "
@@ -13466,6 +17591,14 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
   innodb_change_buffering_validate,
   innodb_change_buffering_update, "all");
 
+static MYSQL_SYSVAR_UINT(change_buffer_max_size,
+  innobase_change_buffer_max_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Maximum on-disk size of change buffer in terms of percentage"
+  " of the buffer pool.",
+  NULL, innodb_change_buffer_max_size_update,
+  CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
+
 static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
    PLUGIN_VAR_RQCMDARG,
   "Specifies how InnoDB index statistics collection code should "
@@ -13502,6 +17635,12 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
   PLUGIN_VAR_RQCMDARG,
   "Debug flags for InnoDB change buffering (0=none, 2=crash at merge)",
   NULL, NULL, 0, 0, 2, 0);
+
+static MYSQL_SYSVAR_BOOL(disable_background_merge,
+  srv_ibuf_disable_background_merge,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_RQCMDARG, /* NOTE(review): NOCMDARG and RQCMDARG name opposite argument policies; confirm which is intended */
+  "Disable change buffering merges by the master thread",
+  NULL, NULL, FALSE);
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
@@ -13515,6 +17654,59 @@ static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
   "trigger a readahead.",
   NULL, NULL, 56, 0, 64, 0);
 
+static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Turn on a monitor counter",
+  innodb_monitor_validate,
+  innodb_enable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Turn off a monitor counter",
+  innodb_monitor_validate,
+  innodb_disable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Reset a monitor counter",
+  innodb_monitor_validate,
+  innodb_reset_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Reset all values for a monitor counter",
+  innodb_monitor_validate,
+  innodb_reset_all_monitor_update, NULL);
+
+static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks,
+  PLUGIN_VAR_OPCMDARG,
+  "Print all deadlocks to MySQL error log (off by default)",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct,
+  zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG,
+  "If the compression failure rate of a table is greater than this number"
+  " more padding is added to the pages to reduce the failures. A value of"
+  " zero implies no padding",
+  NULL, NULL, 5, 0, 100, 0);
+
+static MYSQL_SYSVAR_ULONG(compression_pad_pct_max,
+  zip_pad_max, PLUGIN_VAR_OPCMDARG,
+  "Percentage of empty space on a data page that can be reserved"
+  " to make the page compressible.",
+  NULL, NULL, 50, 0, 75, 0);
+
+static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Start InnoDB in read only mode (off by default)",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled,
+  PLUGIN_VAR_OPCMDARG,
+  "Enable INFORMATION_SCHEMA.innodb_cmp_per_index, "
+  "may have negative impact on performance (off by default)",
+  NULL, innodb_cmp_per_index_update, FALSE);
+
 #ifdef UNIV_DEBUG
 static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_NOCMDOPT,
@@ -13534,150 +17726,6 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug,
   NULL, NULL, FALSE);
 #endif /* UNIV_DEBUG */
 
-static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
-  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
-  "The maximum size of the insert buffer. (in bytes)",
-  NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
-
-static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
-  PLUGIN_VAR_RQCMDARG,
-  "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
-  NULL, NULL, 1, 0, 1, 0);
-
-static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
-  PLUGIN_VAR_RQCMDARG,
-  "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
-  NULL, NULL, 100, 100, 999999999, 0);
-
-static MYSQL_SYSVAR_ULINT(checkpoint_age_target, srv_checkpoint_age_target,
-  PLUGIN_VAR_RQCMDARG,
-  "Control soft limit of checkpoint age. (0 : not control)",
-  NULL, NULL, 0, 0, ~0UL, 0);
-
-static
-void
-innodb_flush_neighbor_pages_update(
-  THD* thd,
-  struct st_mysql_sys_var* var,
-  void* var_ptr,
-  const void* save)
-{
-  *(long *)var_ptr = (*(long *)save) % 3;
-}
-
-const char *flush_neighbor_pages_names[]=
-{
-  "none", /* 0 */
-  "area",
-  "cont", /* 2 */
-  /* For compatibility with the older patch */
-  "0", /* "none" + 3 */
-  "1", /* "area" + 3 */
-  "2", /* "cont" + 3 */
-  NullS
-};
-
-TYPELIB flush_neighbor_pages_typelib=
-{
-  array_elements(flush_neighbor_pages_names) - 1,
-  "flush_neighbor_pages_typelib",
-  flush_neighbor_pages_names,
-  NULL
-};
-
-static MYSQL_SYSVAR_ENUM(flush_neighbor_pages, srv_flush_neighbor_pages,
-  PLUGIN_VAR_RQCMDARG, "Neighbor page flushing behaviour: none: do not flush, "
-                       "[area]: flush selected pages one-by-one, "
-                       "cont: flush a contiguous block of pages", NULL,
-  innodb_flush_neighbor_pages_update, 1, &flush_neighbor_pages_typelib);
-
-static
-void
-innodb_read_ahead_update(
-  THD* thd,
-  struct st_mysql_sys_var*     var,
-  void*        var_ptr,
-  const void*  save)
-{
-  *(long *)var_ptr= (*(long *)save) & 3;
-}
-const char *read_ahead_names[]=
-{
-  "none", /* 0 */
-  "random",
-  "linear",
-  "both", /* 3 */
-  /* For compatibility of the older patch */
-  "0", /* 4 ("none" + 4) */
-  "1",
-  "2",
-  "3", /* 7 ("both" + 4) */
-  NullS
-};
-TYPELIB read_ahead_typelib=
-{
-  array_elements(read_ahead_names) - 1, "read_ahead_typelib",
-  read_ahead_names, NULL
-};
-static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
-  PLUGIN_VAR_RQCMDARG,
-  "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
-  NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
-
-static
-void
-innodb_adaptive_flushing_method_update(
-  THD* thd,
-  struct st_mysql_sys_var*     var,
-  void*        var_ptr,
-  const void*  save)
-{
-  *(long *)var_ptr= (*(long *)save) % 3;
-}
-const char *adaptive_flushing_method_names[]=
-{
-  "native", /* 0 */
-  "estimate", /* 1 */
-  "keep_average", /* 2 */
-  /* For compatibility of the older patch */
-  "0", /* 3 ("none" + 3) */
-  "1", /* 4 ("estimate" + 3) */
-  "2", /* 5 ("keep_average" + 3) */
-  NullS
-};
-TYPELIB adaptive_flushing_method_typelib=
-{
-  array_elements(adaptive_flushing_method_names) - 1, "adaptive_flushing_method_typelib",
-  adaptive_flushing_method_names, NULL
-};
-static MYSQL_SYSVAR_ENUM(adaptive_flushing_method, srv_adaptive_flushing_method,
-  PLUGIN_VAR_RQCMDARG,
-  "Choose method of innodb_adaptive_flushing. (native, [estimate], keep_average)",
-  NULL, innodb_adaptive_flushing_method_update, 1, &adaptive_flushing_method_typelib);
-
-static MYSQL_SYSVAR_ULONG(import_table_from_xtrabackup, srv_expand_import,
-  PLUGIN_VAR_RQCMDARG,
-  "Enable/Disable converting automatically *.ibd files when import tablespace.",
-  NULL, NULL, 0, 0, 1, 0);
-
-static MYSQL_SYSVAR_ULINT(dict_size_limit, srv_dict_size_limit,
-  PLUGIN_VAR_RQCMDARG,
-  "Limit the allocated memory for dictionary cache. (0: unlimited)",
-  NULL, NULL, 0, 0, LONG_MAX, 0);
-
-static MYSQL_SYSVAR_UINT(buffer_pool_restore_at_startup, srv_auto_lru_dump,
-  PLUGIN_VAR_RQCMDARG,
-  "Time in seconds between automatic buffer pool dumps. "
-  "0 (the default) disables automatic dumps.",
-  NULL, NULL, 0, 0, UINT_MAX32, 0);
-
-static MYSQL_SYSVAR_BOOL(blocking_buffer_pool_restore,
-  innobase_blocking_lru_restore,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Block XtraDB startup process until buffer pool is full restored from a "
-  "dump file (if present). Disabled by default.",
-  NULL, NULL, FALSE);
-
 const char *corrupt_table_action_names[]=
 {
   "assert", /* 0 */
@@ -13698,26 +17746,6 @@ static	MYSQL_SYSVAR_ENUM(corrupt_table_action, srv_pass_corrupt_table,
   "except for the deletion. Possible options are 'assert', 'warn' & 'salvage'",
   NULL, NULL, 0, &corrupt_table_action_typelib);
 
-static MYSQL_SYSVAR_ULINT(lazy_drop_table, srv_lazy_drop_table,
-  PLUGIN_VAR_RQCMDARG,
-  "[Deprecated option] no effect",
-  NULL, NULL, 0, 0, 1, 0);
-
-#ifdef UNIV_DEBUG
-
-static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
-  PLUGIN_VAR_OPCMDARG,
-  "Force checkpoint now",
-  NULL, checkpoint_now_set, FALSE);
-
-static MYSQL_SYSVAR_BOOL(track_redo_log_now,
-  innodb_track_redo_log_now,
-  PLUGIN_VAR_OPCMDARG,
-  "Force log tracker to catch up with checkpoint now",
-  NULL, track_redo_log_now_set, FALSE);
-
-#endif /* UNIV_DEBUG */
-
 static MYSQL_SYSVAR_BOOL(locking_fake_changes, srv_fake_changes_locks,
   PLUGIN_VAR_NOCMDARG,
   "###EXPERIMENTAL### if enabled, transactions will get S row locks instead "
@@ -13725,41 +17753,46 @@ static MYSQL_SYSVAR_BOOL(locking_fake_changes, srv_fake_changes_locks,
   "not take any locks at all.",
   NULL, NULL, TRUE);
 
-static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks,
-  PLUGIN_VAR_OPCMDARG,
-  "Print all deadlocks to MySQL error log (off by default)",
-  NULL, NULL, FALSE);
-
 static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
   "Print stacktrace on long semaphore wait (off by default supported only on linux)",
   NULL, NULL, FALSE);
 
 static struct st_mysql_sys_var* innobase_system_variables[]= {
-  MYSQL_SYSVAR(page_size),
   MYSQL_SYSVAR(log_block_size),
   MYSQL_SYSVAR(additional_mem_pool_size),
+  MYSQL_SYSVAR(api_trx_level),
+  MYSQL_SYSVAR(api_bk_commit_interval),
   MYSQL_SYSVAR(autoextend_increment),
-#ifndef DBUG_OFF
-  MYSQL_SYSVAR(buffer_pool_evict),
-#endif /* !DBUG_OFF */
   MYSQL_SYSVAR(buffer_pool_size),
   MYSQL_SYSVAR(buffer_pool_populate),
   MYSQL_SYSVAR(buffer_pool_instances),
-  MYSQL_SYSVAR(buffer_pool_shm_key),
-  MYSQL_SYSVAR(buffer_pool_shm_checksum),
+  MYSQL_SYSVAR(buffer_pool_filename),
+  MYSQL_SYSVAR(buffer_pool_dump_now),
+  MYSQL_SYSVAR(buffer_pool_dump_at_shutdown),
+#ifdef UNIV_DEBUG
+  MYSQL_SYSVAR(buffer_pool_evict),
+#endif /* UNIV_DEBUG */
+  MYSQL_SYSVAR(buffer_pool_load_now),
+  MYSQL_SYSVAR(buffer_pool_load_abort),
+  MYSQL_SYSVAR(buffer_pool_load_at_startup),
+  MYSQL_SYSVAR(lru_scan_depth),
+  MYSQL_SYSVAR(flush_neighbors),
+  MYSQL_SYSVAR(checksum_algorithm),
+  MYSQL_SYSVAR(log_checksum_algorithm),
   MYSQL_SYSVAR(checksums),
-  MYSQL_SYSVAR(fast_checksum),
   MYSQL_SYSVAR(commit_concurrency),
   MYSQL_SYSVAR(concurrency_tickets),
+  MYSQL_SYSVAR(compression_level),
   MYSQL_SYSVAR(kill_idle_transaction),
   MYSQL_SYSVAR(data_file_path),
-  MYSQL_SYSVAR(doublewrite_file),
   MYSQL_SYSVAR(data_home_dir),
   MYSQL_SYSVAR(doublewrite),
+  MYSQL_SYSVAR(api_enable_binlog),
+  MYSQL_SYSVAR(api_enable_mdl),
+  MYSQL_SYSVAR(api_disable_rowlock),
   MYSQL_SYSVAR(use_atomic_writes),
   MYSQL_SYSVAR(use_fallocate),
-  MYSQL_SYSVAR(recovery_stats),
   MYSQL_SYSVAR(fast_shutdown),
   MYSQL_SYSVAR(file_io_threads),
   MYSQL_SYSVAR(read_io_threads),
@@ -13768,10 +17801,22 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(file_format),
   MYSQL_SYSVAR(file_format_check),
   MYSQL_SYSVAR(file_format_max),
+  MYSQL_SYSVAR(flush_log_at_timeout),
   MYSQL_SYSVAR(flush_log_at_trx_commit),
   MYSQL_SYSVAR(use_global_flush_log_at_trx_commit),
   MYSQL_SYSVAR(flush_method),
   MYSQL_SYSVAR(force_recovery),
+#ifndef DBUG_OFF
+  MYSQL_SYSVAR(force_recovery_crash),
+#endif /* !DBUG_OFF */
+  MYSQL_SYSVAR(ft_cache_size),
+  MYSQL_SYSVAR(ft_total_cache_size),
+  MYSQL_SYSVAR(ft_result_cache_limit),
+  MYSQL_SYSVAR(ft_enable_stopword),
+  MYSQL_SYSVAR(ft_max_token_size),
+  MYSQL_SYSVAR(ft_min_token_size),
+  MYSQL_SYSVAR(ft_num_word_optimize),
+  MYSQL_SYSVAR(ft_sort_pll_degree),
   MYSQL_SYSVAR(large_prefix),
   MYSQL_SYSVAR(force_load_corrupted),
   MYSQL_SYSVAR(locks_unsafe_for_binlog),
@@ -13779,28 +17824,38 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
 #ifdef UNIV_LOG_ARCHIVE
   MYSQL_SYSVAR(log_arch_dir),
   MYSQL_SYSVAR(log_archive),
+  MYSQL_SYSVAR(log_arch_expire_sec),
 #endif /* UNIV_LOG_ARCHIVE */
+  MYSQL_SYSVAR(page_size),
   MYSQL_SYSVAR(log_buffer_size),
   MYSQL_SYSVAR(log_file_size),
   MYSQL_SYSVAR(log_files_in_group),
   MYSQL_SYSVAR(log_group_home_dir),
+  MYSQL_SYSVAR(log_compressed_pages),
   MYSQL_SYSVAR(max_dirty_pages_pct),
+  MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
+  MYSQL_SYSVAR(adaptive_flushing_lwm),
   MYSQL_SYSVAR(adaptive_flushing),
+  MYSQL_SYSVAR(flushing_avg_loops),
   MYSQL_SYSVAR(max_purge_lag),
+  MYSQL_SYSVAR(max_purge_lag_delay),
   MYSQL_SYSVAR(mirrored_log_groups),
   MYSQL_SYSVAR(old_blocks_pct),
   MYSQL_SYSVAR(old_blocks_time),
   MYSQL_SYSVAR(open_files),
-  MYSQL_SYSVAR(recovery_update_relay_log),
+  MYSQL_SYSVAR(optimize_fulltext_only),
   MYSQL_SYSVAR(rollback_on_timeout),
+  MYSQL_SYSVAR(ft_aux_table),
+  MYSQL_SYSVAR(ft_enable_diag_print),
+  MYSQL_SYSVAR(ft_server_stopword_table),
+  MYSQL_SYSVAR(ft_user_stopword_table),
+  MYSQL_SYSVAR(disable_sort_file_cache),
   MYSQL_SYSVAR(stats_on_metadata),
-  MYSQL_SYSVAR(stats_auto_update),
-  MYSQL_SYSVAR(stats_update_need_lock),
-  MYSQL_SYSVAR(use_sys_stats_table),
-#ifdef UNIV_DEBUG
-  MYSQL_SYSVAR(persistent_stats_root_page),
-#endif
   MYSQL_SYSVAR(stats_sample_pages),
+  MYSQL_SYSVAR(stats_transient_sample_pages),
+  MYSQL_SYSVAR(stats_persistent),
+  MYSQL_SYSVAR(stats_persistent_sample_pages),
+  MYSQL_SYSVAR(stats_auto_recalc),
   MYSQL_SYSVAR(adaptive_hash_index),
   MYSQL_SYSVAR(adaptive_hash_index_partitions),
   MYSQL_SYSVAR(stats_method),
@@ -13808,58 +17863,91 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(status_file),
   MYSQL_SYSVAR(strict_mode),
   MYSQL_SYSVAR(support_xa),
+  MYSQL_SYSVAR(sort_buffer_size),
+  MYSQL_SYSVAR(online_alter_log_max_size),
   MYSQL_SYSVAR(sync_spin_loops),
   MYSQL_SYSVAR(spin_wait_delay),
   MYSQL_SYSVAR(table_locks),
   MYSQL_SYSVAR(thread_concurrency),
-  MYSQL_SYSVAR(thread_concurrency_timer_based),
+#ifdef HAVE_ATOMIC_BUILTINS
+  MYSQL_SYSVAR(adaptive_max_sleep_delay),
+#endif /* HAVE_ATOMIC_BUILTINS */
   MYSQL_SYSVAR(thread_sleep_delay),
   MYSQL_SYSVAR(autoinc_lock_mode),
   MYSQL_SYSVAR(show_verbose_locks),
   MYSQL_SYSVAR(show_locks_held),
   MYSQL_SYSVAR(version),
-  MYSQL_SYSVAR(ibuf_max_size),
-  MYSQL_SYSVAR(ibuf_active_contract),
-  MYSQL_SYSVAR(ibuf_accel_rate),
-  MYSQL_SYSVAR(checkpoint_age_target),
-  MYSQL_SYSVAR(flush_neighbor_pages),
-  MYSQL_SYSVAR(read_ahead),
-  MYSQL_SYSVAR(adaptive_flushing_method),
-  MYSQL_SYSVAR(import_table_from_xtrabackup),
-  MYSQL_SYSVAR(dict_size_limit),
   MYSQL_SYSVAR(use_sys_malloc),
   MYSQL_SYSVAR(use_native_aio),
   MYSQL_SYSVAR(change_buffering),
+  MYSQL_SYSVAR(change_buffer_max_size),
   MYSQL_SYSVAR(track_changed_pages),
   MYSQL_SYSVAR(max_bitmap_file_size),
   MYSQL_SYSVAR(max_changed_pages),
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
   MYSQL_SYSVAR(change_buffering_debug),
+  MYSQL_SYSVAR(disable_background_merge),
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
   MYSQL_SYSVAR(random_read_ahead),
   MYSQL_SYSVAR(read_ahead_threshold),
+  MYSQL_SYSVAR(read_only),
   MYSQL_SYSVAR(io_capacity),
-  MYSQL_SYSVAR(buffer_pool_restore_at_startup),
-  MYSQL_SYSVAR(blocking_buffer_pool_restore),
+  MYSQL_SYSVAR(io_capacity_max),
+  MYSQL_SYSVAR(monitor_enable),
+  MYSQL_SYSVAR(monitor_disable),
+  MYSQL_SYSVAR(monitor_reset),
+  MYSQL_SYSVAR(monitor_reset_all),
   MYSQL_SYSVAR(purge_threads),
   MYSQL_SYSVAR(purge_batch_size),
+#ifdef UNIV_DEBUG
+  MYSQL_SYSVAR(purge_run_now),
+  MYSQL_SYSVAR(purge_stop_now),
+  MYSQL_SYSVAR(log_checkpoint_now),
+  MYSQL_SYSVAR(track_redo_log_now),
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_LINUX
+  MYSQL_SYSVAR(sched_priority_cleaner),
+#endif
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+  MYSQL_SYSVAR(page_hash_locks),
+  MYSQL_SYSVAR(doublewrite_batch_size),
+#ifdef UNIV_LINUX
+  MYSQL_SYSVAR(sched_priority_purge),
+  MYSQL_SYSVAR(sched_priority_io),
+  MYSQL_SYSVAR(sched_priority_master),
+  MYSQL_SYSVAR(priority_purge),
+  MYSQL_SYSVAR(priority_io),
+  MYSQL_SYSVAR(priority_cleaner),
+  MYSQL_SYSVAR(priority_master),
+#endif /* UNIV_LINUX */
+  MYSQL_SYSVAR(cleaner_max_lru_time),
+  MYSQL_SYSVAR(cleaner_max_flush_time),
+  MYSQL_SYSVAR(cleaner_flush_chunk_size),
+  MYSQL_SYSVAR(cleaner_lru_chunk_size),
+  MYSQL_SYSVAR(cleaner_free_list_lwm),
+  MYSQL_SYSVAR(cleaner_eviction_factor),
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+  MYSQL_SYSVAR(cleaner_lsn_age_factor),
+  MYSQL_SYSVAR(foreground_preflush),
+  MYSQL_SYSVAR(empty_free_list_algorithm),
+  MYSQL_SYSVAR(print_all_deadlocks),
+  MYSQL_SYSVAR(cmp_per_index_enabled),
+  MYSQL_SYSVAR(undo_logs),
   MYSQL_SYSVAR(rollback_segments),
+  MYSQL_SYSVAR(undo_directory),
+  MYSQL_SYSVAR(undo_tablespaces),
+  MYSQL_SYSVAR(sync_array_size),
+  MYSQL_SYSVAR(compression_failure_threshold_pct),
+  MYSQL_SYSVAR(compression_pad_pct_max),
 #ifdef UNIV_DEBUG
   MYSQL_SYSVAR(trx_rseg_n_slots_debug),
   MYSQL_SYSVAR(limit_optimistic_insert_debug),
   MYSQL_SYSVAR(trx_purge_view_update_only_debug),
 #endif /* UNIV_DEBUG */
   MYSQL_SYSVAR(corrupt_table_action),
-  MYSQL_SYSVAR(lazy_drop_table),
   MYSQL_SYSVAR(fake_changes),
   MYSQL_SYSVAR(locking_fake_changes),
-  MYSQL_SYSVAR(merge_sort_block_size),
-  MYSQL_SYSVAR(print_all_deadlocks),
   MYSQL_SYSVAR(use_stacktrace),
-#ifdef UNIV_DEBUG
-  MYSQL_SYSVAR(log_checkpoint_now),
-  MYSQL_SYSVAR(track_redo_log_now),
-#endif /* UNIV_DEBUG */
   NULL
 };
 
@@ -13879,8 +17967,9 @@ maria_declare_plugin(xtradb)
   INNODB_VERSION_STR,         /* string version */
   MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
 },
-i_s_innodb_rseg,
-i_s_innodb_undo_logs,
+i_s_xtradb_read_view,
+i_s_xtradb_internal_hash_tables,
+i_s_xtradb_rseg,
 i_s_innodb_trx,
 i_s_innodb_locks,
 i_s_innodb_lock_waits,
@@ -13888,6 +17977,18 @@ i_s_innodb_cmp,
 i_s_innodb_cmp_reset,
 i_s_innodb_cmpmem,
 i_s_innodb_cmpmem_reset,
+i_s_innodb_cmp_per_index,
+i_s_innodb_cmp_per_index_reset,
+i_s_innodb_buffer_page,
+i_s_innodb_buffer_page_lru,
+i_s_innodb_buffer_stats,
+i_s_innodb_metrics,
+i_s_innodb_ft_default_stopword,
+i_s_innodb_ft_deleted,
+i_s_innodb_ft_being_deleted,
+i_s_innodb_ft_config,
+i_s_innodb_ft_index_cache,
+i_s_innodb_ft_index_table,
 i_s_innodb_sys_tables,
 i_s_innodb_sys_tablestats,
 i_s_innodb_sys_indexes,
@@ -13895,17 +17996,9 @@ i_s_innodb_sys_columns,
 i_s_innodb_sys_fields,
 i_s_innodb_sys_foreign,
 i_s_innodb_sys_foreign_cols,
-i_s_innodb_sys_stats,
-i_s_innodb_table_stats,
-i_s_innodb_index_stats,
-i_s_innodb_buffer_pool_pages,
-i_s_innodb_buffer_pool_pages_index,
-i_s_innodb_buffer_pool_pages_blob,
-i_s_innodb_admin_command,
-i_s_innodb_changed_pages,
-i_s_innodb_buffer_page,
-i_s_innodb_buffer_page_lru,
-i_s_innodb_buffer_stats
+i_s_innodb_sys_tablespaces,
+i_s_innodb_sys_datafiles,
+i_s_innodb_changed_pages
 maria_declare_plugin_end;
 
 /** @brief Initialize the default value of innodb_commit_concurrency.
@@ -13919,16 +18012,31 @@ to 0, even if it was initially set to nonzero at the command line
 or configuration file. */
 static
 void
-innobase_commit_concurrency_init_default(void)
-/*==========================================*/
+innobase_commit_concurrency_init_default()
+/*======================================*/
 {
 	MYSQL_SYSVAR_NAME(commit_concurrency).def_val
 		= innobase_commit_concurrency;
 }
 
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max()
+/*=================================*/
+{
+	MYSQL_SYSVAR_NAME(undo_logs).max_val
+		= MYSQL_SYSVAR_NAME(undo_logs).def_val
+		= srv_available_undo_logs;
+}
+
 #ifdef UNIV_COMPILE_TEST_FUNCS
 
-typedef struct innobase_convert_name_test_struct {
+struct innobase_convert_name_test_t {
 	char*		buf;
 	ulint		buflen;
 	const char*	id;
@@ -13937,7 +18045,7 @@ typedef struct innobase_convert_name_test_struct {
 	ibool		file_id;
 
 	const char*	expected;
-} innobase_convert_name_test_t;
+};
 
 void
 test_innobase_convert_name()
@@ -14045,12 +18153,229 @@ test_innobase_convert_name()
 		}
 	}
 }
+
 #endif /* UNIV_COMPILE_TEST_FUNCS */
 
+/**
+ * Index Condition Pushdown interface implementation
+ */
+
+/*************************************************************//**
+InnoDB index condition pushdown check; calls handler_index_cond_check().
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+	void*	file)	/*!< in/out: pointer to ha_innobase, passed on as is */
+{
+  return handler_index_cond_check(file);
+}
+
+/** Push down an index condition; the condition and key number are saved.
+* @param[in] keyno	MySQL key number (must not be MAX_KEY)
+* @param[in] idx_cond	Index condition to be checked (must not be NULL)
+* @return Part of idx_cond which the handler will not evaluate (always NULL)
+*/
+UNIV_INTERN
+class Item*
+ha_innobase::idx_cond_push(
+	uint		keyno,
+	class Item*	idx_cond)
+{
+	DBUG_ENTER("ha_innobase::idx_cond_push");
+	DBUG_ASSERT(keyno != MAX_KEY);
+	DBUG_ASSERT(idx_cond != NULL);
+
+	pushed_idx_cond = idx_cond;
+	pushed_idx_cond_keyno = keyno;
+	in_range_check_pushed_down = TRUE;
+	/* We will evaluate the condition entirely, so no remainder is returned */
+	DBUG_RETURN(NULL);
+}
+
+/******************************************************************//**
+Use this when the args are passed as is to the errmsg-utf8.txt format
+string that innobase_get_err_msg() returns for the error code.
+
+Push a message to the client. Except for IB_LOG_LEVEL_ERROR, which is
+reported through my_printf_error() instead, this is a wrapper around:
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_senderrf(
+/*========*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: level; FATAL ends in ut_error */
+	ib_uint32_t	code,		/*!< in: MySQL error code */
+	...)				/*!< in: args for the message format */
+{
+	char*		str;
+	va_list         args;
+	const char*	format = innobase_get_err_msg(code);
+
+	/* If the caller wants to push a message to the client then
+	the caller must pass a valid session handle. */
+
+	ut_a(thd != 0);
+
+	/* The error code must exist in the errmsg-utf8.txt file. */
+	ut_a(format != 0);
+
+	va_start(args, code);
+
+#ifdef __WIN__
+	int		size = _vscprintf(format, args) + 1;
+	str = static_cast<char*>(malloc(size));
+	str[size - 1] = 0x0;
+	vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+	(void) vasprintf(&str, format, args);
+#else
+	/* Use a fixed length string. NOTE(review): allocation is unchecked. */
+	str = static_cast<char*>(malloc(BUFSIZ));
+	my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+	Sql_condition::enum_warning_level	l;
+
+	l = Sql_condition::WARN_LEVEL_NOTE;
+
+	switch(level) {
+	case IB_LOG_LEVEL_INFO:
+		break;
+	case IB_LOG_LEVEL_WARN:
+		l = Sql_condition::WARN_LEVEL_WARN;
+		break;
+	case IB_LOG_LEVEL_ERROR:
+		/* A hard error: raise it here; push_warning_printf() is skipped. */
+		my_printf_error(code, "%s", MYF(0), str);
+		break;
+	case IB_LOG_LEVEL_FATAL:
+		l = Sql_condition::WARN_LEVEL_END;
+		break;
+	}
+
+	if (level != IB_LOG_LEVEL_ERROR) {
+		push_warning_printf(thd, l, code, "InnoDB: %s", str);
+	}
+
+	va_end(args);
+	free(str);
+
+	if (level == IB_LOG_LEVEL_FATAL) {
+		ut_error;
+	}
+}
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a message to the client through ib_senderrf(), which wraps:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_errf(
+/*====*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: warning level */
+	ib_uint32_t	code,		/*!< in: MySQL error code */
+	const char*	format,		/*!< in: printf format */
+	...)				/*!< in: args for format */
+{
+	char*		str;
+	va_list         args;
+
+	/* If the caller wants to push a message to the client then
+	the caller must pass a valid session handle. */
+
+	ut_a(thd != 0);
+	ut_a(format != 0);
+
+	va_start(args, format);
+
+#ifdef __WIN__
+	int		size = _vscprintf(format, args) + 1;
+	str = static_cast<char*>(malloc(size));
+	str[size - 1] = 0x0;
+	vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+	(void) vasprintf(&str, format, args);
+#else
+	/* Use a fixed length string. NOTE(review): allocation is unchecked. */
+	str = static_cast<char*>(malloc(BUFSIZ));
+	my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+	ib_senderrf(thd, level, code, str);
+
+	va_end(args);
+	free(str);
+}
+
+/******************************************************************//**
+Write a message to the MySQL log, prefixed with "InnoDB: "; FATAL hits ut_error */
+UNIV_INTERN
+void
+ib_logf(
+/*====*/
+	ib_log_level_t	level,		/*!< in: level; picks the sql_print_* call */
+	const char*	format,		/*!< in: printf format */
+	...)				/*!< in: args for format */
+{
+	char*		str;
+	va_list         args;
+
+	va_start(args, format);
+
+#ifdef __WIN__
+	int		size = _vscprintf(format, args) + 1;
+	str = static_cast<char*>(malloc(size));
+	str[size - 1] = 0x0;
+	vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+	(void) vasprintf(&str, format, args);
+#else
+	/* Use a fixed length string. NOTE(review): allocation is unchecked. */
+	str = static_cast<char*>(malloc(BUFSIZ));
+	my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+	switch(level) {
+	case IB_LOG_LEVEL_INFO:
+		sql_print_information("InnoDB: %s", str);
+		break;
+	case IB_LOG_LEVEL_WARN:
+		sql_print_warning("InnoDB: %s", str);
+		break;
+	case IB_LOG_LEVEL_ERROR:
+		sql_print_error("InnoDB: %s", str);
+		break;
+	case IB_LOG_LEVEL_FATAL:
+		sql_print_error("InnoDB: %s", str);
+		break;
+	}
+
+	va_end(args);
+	free(str);
+
+	if (level == IB_LOG_LEVEL_FATAL) {
+		ut_error;
+	}
+}
+
 /**********************************************************************
 Converts an identifier from my_charset_filename to UTF-8 charset.
 @return result string length, as returned by strconvert() */
-extern "C"
 uint
 innobase_convert_to_filename_charset(
 /*=================================*/
@@ -14062,7 +18387,24 @@ innobase_convert_to_filename_charset(
 	CHARSET_INFO*	cs_to = &my_charset_filename;
 	CHARSET_INFO*	cs_from = system_charset_info;
 
-	return(strconvert(cs_from, from, cs_to, to, len, &errors));
+	return(strconvert(cs_from, from, strlen(from), cs_to, to, len, &errors));
+}
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to system_charset_info.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+	char*		to,	/* out: converted identifier */
+	const char*	from,	/* in: NUL-terminated identifier to convert */
+	ulint		len,	/* in: length of 'to', in bytes */
+	uint*		errors)	/* out: error return, filled in by strconvert() */
+{
+	CHARSET_INFO*	cs1 = &my_charset_filename;
+	CHARSET_INFO*	cs2 = system_charset_info;
+
+	return(strconvert(cs1, from, strlen(from), cs2, to, len, errors));
 }
 
 
@@ -14090,7 +18432,7 @@ ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                                  void *seq_init_param,  
                                                  uint n_ranges, uint *bufsz,
                                                  uint *flags, 
-                                                 COST_VECT *cost)
+                                                 Cost_estimate *cost)
 {
   /* See comments in ha_myisam::multi_range_read_info_const */
   ds_mrr.init(this, table);
@@ -14105,7 +18447,7 @@ ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 
 ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                            uint key_parts, uint *bufsz, 
-                                           uint *flags, COST_VECT *cost)
+                                           uint *flags, Cost_estimate *cost)
 {
   ds_mrr.init(this, table);
   ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, 
@@ -14127,29 +18469,3 @@ bool ha_innobase::is_thd_killed()
   return thd_kill_level(user_thd);
 }
 
-/**
- * Index Condition Pushdown interface implementation
- */
-
-/** Attempt to push down an index condition.
-* @param[in] keyno	MySQL key number
-* @param[in] idx_cond	Index condition to be checked
-* @return idx_cond if pushed; NULL if not pushed
-*/
-UNIV_INTERN
-class Item*
-ha_innobase::idx_cond_push(
-	uint		keyno,
-	class Item*	idx_cond)
-{
-	DBUG_ENTER("ha_innobase::idx_cond_push");
-	DBUG_ASSERT(keyno != MAX_KEY);
-	DBUG_ASSERT(idx_cond != NULL);
-
-	pushed_idx_cond = idx_cond;
-	pushed_idx_cond_keyno = keyno;
-	in_range_check_pushed_down = TRUE;
-	/* Table handler will check the entire condition */
-	DBUG_RETURN(NULL);
-}
-
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index b46e6a382e1..773a9b6b04d 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,20 +23,18 @@ this program; if not, write to the Free Software Foundation, Inc.,
   Innodb
 */
 
-#ifdef USE_PRAGMA_INTERFACE
-#pragma interface			/* gcc class implementation */
-#endif
+#include "dict0stats.h"
 
 /* Structure defines translation table between mysql index and innodb
 index structures */
-typedef struct innodb_idx_translate_struct {
+struct innodb_idx_translate_t {
 	ulint		index_count;	/*!< number of valid index entries
 					in the index_mapping array */
 	ulint		array_size;	/*!< array size of index_mapping */
 	dict_index_t**	index_mapping;	/*!< index pointer array directly
 					maps to index in Innodb from MySQL
 					array index */
-} innodb_idx_translate_t;
+};
 
 
 /** InnoDB table share */
@@ -56,15 +54,8 @@ typedef struct st_innobase_share {
 } INNOBASE_SHARE;
 
 
-/** InnoDB B-tree index */
-struct dict_index_struct;
-/** Prebuilt structures in an Innobase table handle used within MySQL */
-struct row_prebuilt_struct;
-
-/** InnoDB B-tree index */
-typedef struct dict_index_struct dict_index_t;
-/** Prebuilt structures in an Innobase table handle used within MySQL */
-typedef struct row_prebuilt_struct row_prebuilt_t;
+/** Prebuilt structures in an InnoDB table handle used within MySQL */
+struct row_prebuilt_t;
 
 /** The class defining a handle to an Innodb table */
 class ha_innobase: public handler
@@ -104,15 +95,13 @@ class ha_innobase: public handler
 	void update_thd();
 	int change_active_index(uint keynr);
 	int general_fetch(uchar* buf, uint direction, uint match_mode);
-	ulint innobase_lock_autoinc();
+	dberr_t innobase_lock_autoinc();
 	ulonglong innobase_peek_autoinc();
-	ulint innobase_set_max_autoinc(ulonglong auto_inc);
-	ulint innobase_reset_autoinc(ulonglong auto_inc);
-	ulint innobase_get_autoinc(ulonglong* value);
-	ulint innobase_update_autoinc(ulonglong	auto_inc);
+	dberr_t innobase_set_max_autoinc(ulonglong auto_inc);
+	dberr_t innobase_reset_autoinc(ulonglong auto_inc);
+	dberr_t innobase_get_autoinc(ulonglong* value);
 	void innobase_initialize_autoinc();
 	dict_index_t* innobase_get_index(uint keynr);
-	int info_low(uint flag, bool called_from_analyze);
 
 	/* Init values for the class: */
  public:
@@ -124,6 +113,7 @@ class ha_innobase: public handler
 	*/
 	enum row_type get_row_type() const;
 
+	const char* table_type() const;
 	const char* index_type(uint key_number);
 	Table_flags table_flags() const;
 	ulong index_flags(uint idx, uint part, bool all_parts) const;
@@ -137,6 +127,7 @@ class ha_innobase: public handler
 	int close(void);
 	double scan_time();
 	double read_time(uint index, uint ranges, ha_rows rows);
+	longlong get_memory_buffer_size() const;
 	my_bool is_fake_change_enabled(THD *thd);
 	bool is_corrupt() const;
 
@@ -165,13 +156,18 @@ class ha_innobase: public handler
 	int rnd_next(uchar *buf);
 	int rnd_pos(uchar * buf, uchar *pos);
 
+	int ft_init();
+	void ft_end();
+	FT_INFO *ft_init_ext(uint flags, uint inx, String* key);
+	int ft_read(uchar* buf);
+
 	void position(const uchar *record);
 	int info(uint);
 	int analyze(THD* thd,HA_CHECK_OPT* check_opt);
 	int optimize(THD* thd,HA_CHECK_OPT* check_opt);
 	int discard_or_import_tablespace(my_bool discard);
 	int extra(enum ha_extra_function operation);
-        int reset();
+	int reset();
 	int external_lock(THD *thd, int lock_type);
 	int transactional_table_lock(THD *thd, int lock_type);
 	int start_stmt(THD *thd, thr_lock_type lock_type);
@@ -181,6 +177,13 @@ class ha_innobase: public handler
 	ha_rows estimate_rows_upper_bound();
 
 	void update_create_info(HA_CREATE_INFO* create_info);
+	int parse_table_name(const char*name,
+			     HA_CREATE_INFO* create_info,
+			     ulint flags,
+			     ulint flags2,
+			     char* norm_name,
+			     char* temp_path,
+			     char* remote_path);
 	int create(const char *name, register TABLE *form,
 					HA_CREATE_INFO *create_info);
 	int truncate();
@@ -205,7 +208,7 @@ class ha_innobase: public handler
 	int reset_auto_increment(ulonglong value);
 
 	virtual bool get_error_message(int error, String *buf);
-
+	virtual bool get_foreign_dup_key(char*, uint, char*, uint);
 	uint8 table_cache_type();
 	/*
 	  ask handler about permission to cache table during query registration
@@ -214,17 +217,80 @@ class ha_innobase: public handler
 					   uint key_length,
 					   qc_engine_callback *call_back,
 					   ulonglong *engine_data);
-	static char *get_mysql_bin_log_name();
+	static const char *get_mysql_bin_log_name();
 	static ulonglong get_mysql_bin_log_pos();
 	bool primary_key_is_clustered();
 	int cmp_ref(const uchar *ref1, const uchar *ref2);
-	/** Fast index creation (smart ALTER TABLE) @see handler0alter.cc @{ */
-	int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys,
-		      handler_add_index **add);
-	int final_add_index(handler_add_index *add, bool commit);
-	int prepare_drop_index(TABLE *table_arg, uint *key_num,
-			       uint num_of_keys);
-	int final_drop_index(TABLE *table_arg);
+	/** On-line ALTER TABLE interface @see handler0alter.cc @{ */
+
+	/** Check if InnoDB supports a particular alter table in-place
+	@param altered_table	TABLE object for new version of table.
+	@param ha_alter_info	Structure describing changes to be done
+	by ALTER TABLE and holding data used during in-place alter.
+
+	@retval HA_ALTER_INPLACE_NOT_SUPPORTED	Not supported
+	@retval HA_ALTER_INPLACE_NO_LOCK	Supported
+	@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE
+						Supported, but requires lock
+						during main phase and exclusive
+						lock during prepare phase.
+	@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+						Supported, prepare phase
+						requires exclusive lock.
+	*/
+	enum_alter_inplace_result check_if_supported_inplace_alter(
+		TABLE*			altered_table,
+		Alter_inplace_info*	ha_alter_info);
+	/** Allows InnoDB to update internal structures with concurrent
+	writes blocked (provided that check_if_supported_inplace_alter()
+	did not return HA_ALTER_INPLACE_NO_LOCK).
+	This will be invoked before inplace_alter_table().
+
+	@param altered_table	TABLE object for new version of table.
+	@param ha_alter_info	Structure describing changes to be done
+	by ALTER TABLE and holding data used during in-place alter.
+
+	@retval true		Failure
+	@retval false		Success
+	*/
+	bool prepare_inplace_alter_table(
+		TABLE*			altered_table,
+		Alter_inplace_info*	ha_alter_info);
+
+	/** Alter the table structure in-place with operations
+	specified using HA_ALTER_FLAGS and Alter_inplace_information.
+	The level of concurrency allowed during this operation depends
+	on the return value from check_if_supported_inplace_alter().
+
+	@param altered_table	TABLE object for new version of table.
+	@param ha_alter_info	Structure describing changes to be done
+	by ALTER TABLE and holding data used during in-place alter.
+
+	@retval true		Failure
+	@retval false		Success
+	*/
+	bool inplace_alter_table(
+		TABLE*			altered_table,
+		Alter_inplace_info*	ha_alter_info);
+
+	/** Commit or rollback the changes made during
+	prepare_inplace_alter_table() and inplace_alter_table() inside
+	the storage engine. Note that the allowed level of concurrency
+	during this operation will be the same as for
+	inplace_alter_table() and thus might be higher than during
+	prepare_inplace_alter_table(). (E.g. concurrent writes were
+	blocked during prepare, but might not be during commit).
+	@param altered_table	TABLE object for new version of table.
+	@param ha_alter_info	Structure describing changes to be done
+	by ALTER TABLE and holding data used during in-place alter.
+	@param commit		true => Commit, false => Rollback.
+	@retval true		Failure
+	@retval false		Success
+	*/
+	bool commit_inplace_alter_table(
+		TABLE*			altered_table,
+		Alter_inplace_info*	ha_alter_info,
+		bool			commit);
 	/** @} */
 	bool check_if_incompatible_data(HA_CREATE_INFO *info,
 					uint table_changes);
@@ -242,6 +308,8 @@ private:
 	@see build_template() */
 	inline void reset_template();
 
+	int info_low(uint, bool);
+
 public:
 	/** @name Multi Range Read interface @{ */
 	/** Initialize multi range read @see DsMrr_impl::dsmrr_init
@@ -250,11 +318,11 @@ public:
 	* @param n_ranges
 	* @param mode
 	* @param buf
-        */
+	*/
 	int multi_range_read_init(RANGE_SEQ_IF* seq,
 				  void* seq_init_param,
-                                  uint n_ranges, uint mode,
-                                  HANDLER_BUFFER *buf);
+				  uint n_ranges, uint mode,
+				  HANDLER_BUFFER* buf);
 	/** Process next multi range read @see DsMrr_impl::dsmrr_next
 	* @param range_info
 	*/
@@ -270,23 +338,23 @@ public:
 	* @param flags
 	* @param cost
 	*/
-        ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
-                                            void *seq_init_param, 
-                                            uint n_ranges, uint *bufsz,
-                                            uint *flags, COST_VECT *cost);
+	ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF* seq,
+					   void* seq_init_param,
+					   uint n_ranges, uint* bufsz,
+					   uint* flags, Cost_estimate* cost);
 	/** Initialize multi range read and get information.
 	* @see DsMrr_impl::dsmrr_info
 	* @param keyno
-        * @param n_ranges
-        * @param keys
-        * @param key_parts
+	* @param n_ranges
+	* @param keys
+	* @param key_parts
 	* @param bufsz
 	* @param flags
 	* @param cost
 	*/
-        ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                      uint key_parts, uint *bufsz, 
-                                      uint *flags, COST_VECT *cost);
+	ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+				      uint key_parts, uint* bufsz, uint* flags,
+				      Cost_estimate* cost);
         int multi_range_read_explain_info(uint mrr_mode,
                                           char *str, size_t size);
 
@@ -302,8 +370,7 @@ public:
 
 private:
 	/** The multi range read session object */
-        DsMrr_impl ds_mrr;
-
+	DsMrr_impl ds_mrr;
 	/* @} */
 };
 
@@ -316,6 +383,7 @@ the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
 #endif
 
 extern "C" {
+
 struct charset_info_st *thd_charset(MYSQL_THD thd);
 LEX_STRING *thd_query_string(MYSQL_THD thd);
 
@@ -363,50 +431,225 @@ bool thd_binlog_filter_ok(const MYSQL_THD thd);
   @return 1 the query may generate row changes, 0 otherwise.
 */
 bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
-}
+
+/**
+  Gets information on the durability property requested by
+  a thread.
+  @param  thd   Thread handle
+  @return a durability property.
+*/
+enum durability_properties thd_get_durability_property(const MYSQL_THD thd);
+
+/** Is strict sql_mode set.
+@param thd	Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise.
+*/
+bool thd_is_strict_mode(const MYSQL_THD thd)
+__attribute__((nonnull));
+} /* extern "C" */
 
 /** Get the file name and position of the MySQL binlog corresponding to the
  * current commit.
  */
 extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
 
-typedef struct trx_struct trx_t;
-/********************************************************************//**
-@file handler/ha_innodb.h
-Converts an InnoDB error code to a MySQL error code and also tells to MySQL
-about a possible transaction rollback inside InnoDB caused by a lock wait
-timeout or a deadlock.
-@return	MySQL error code */
-extern "C"
-int
-convert_error_code_to_mysql(
-/*========================*/
-	int		error,	/*!< in: InnoDB error code */
-	ulint		flags,	/*!< in: InnoDB table flags, or 0 */
-	MYSQL_THD	thd);	/*!< in: user thread handle or NULL */
+struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/* Structure Returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+	struct _ft_vft		*please;
+	struct _ft_vft_ext	*could_you;
+	row_prebuilt_t*		ft_prebuilt;
+	fts_result_t*		ft_result;
+} NEW_FT_INFO;
 
 /*********************************************************************//**
 Allocates an InnoDB transaction for a MySQL handler object.
 @return	InnoDB transaction handle */
-extern "C"
 trx_t*
 innobase_trx_allocate(
 /*==================*/
 	MYSQL_THD	thd);	/*!< in: user thread handle */
 
-
 /*********************************************************************//**
 This function checks each index name for a table against reserved
 system default primary index name 'GEN_CLUST_INDEX'. If a name
 matches, this function pushes an warning message to the client,
 and returns true.
 @return true if the index name matches the reserved name */
-extern "C"
+UNIV_INTERN
 bool
 innobase_index_name_is_reserved(
 /*============================*/
 	THD*		thd,		/*!< in/out: MySQL connection */
 	const KEY*	key_info,	/*!< in: Indexes to be created */
-	ulint		num_of_keys);	/*!< in: Number of indexes to
+	ulint		num_of_keys)	/*!< in: Number of indexes to
 					be created. */
+	__attribute__((nonnull, warn_unused_result));
 
+/*****************************************************************//**
+Determines InnoDB table flags.
+@retval true if successful, false if error */
+UNIV_INTERN
+bool
+innobase_table_flags(
+/*=================*/
+	const TABLE*		form,		/*!< in: table */
+	const HA_CREATE_INFO*	create_info,	/*!< in: information
+						on table columns and indexes */
+	THD*			thd,		/*!< in: connection */
+	bool			use_tablespace,	/*!< in: whether to create
+						outside system tablespace */
+	ulint*			flags,		/*!< out: DICT_TF flags */
+	ulint*			flags2)		/*!< out: DICT_TF2 flags */
+	__attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in future. For now, it checks two specifiers:
+KEY_BLOCK_SIZE and ROW_FORMAT
+If innodb_strict_mode is not set then this function is a no-op
+@return	NULL if valid, string if not. */
+UNIV_INTERN
+const char*
+create_options_are_invalid(
+/*=======================*/
+	THD*		thd,		/*!< in: connection thread. */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	HA_CREATE_INFO*	create_info,	/*!< in: create info. */
+	bool		use_tablespace)	/*!< in: srv_file_per_table */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*==========================*/
+	FT_INFO*	fts_hdl);	/*!< in: FTS handler */
+
+/*********************************************************************//**
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*======================*/
+	FT_INFO*	fts_hdl,	/*!< in: FTS handler */
+	uchar*		record,		/*!< in: Unused */
+	uint		len);		/*!< in: Unused */
+/*********************************************************************//**
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+	FT_INFO*	fts_hdl)	/*!< in: FTS handler */
+	__attribute__((nonnull));
+/*****************************************************************//**
+Initialize the table FTS stopword list
+@return TRUE if success */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: Table has the FTS */
+	trx_t*		trx,		/*!< in: transaction */
+	THD*		thd)		/*!< in: current thread */
+	__attribute__((nonnull(1,3), warn_unused_result));
+
+/** Some defines for innobase_fts_check_doc_id_index() return value */
+enum fts_doc_id_index_enum {
+	FTS_INCORRECT_DOC_ID_INDEX,
+	FTS_EXIST_DOC_ID_INDEX,
+	FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+	const dict_table_t*	table,		/*!< in: table definition */
+	const TABLE*		altered_table,	/*!< in: MySQL table
+						that is being altered */
+	ulint*			fts_doc_col_no)	/*!< out: The column number for
+						Doc ID */
+	__attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+	ulint		n_key,		/*!< in: Number of keys */
+	const KEY*	key_info)	/*!< in: Key definitions */
+	__attribute__((nonnull, warn_unused_result));
+
+/***********************************************************************
+@return version of the extended FTS API */
+uint
+innobase_fts_get_version();
+
+/***********************************************************************
+@return Which part of the extended FTS API is supported */
+ulonglong
+innobase_fts_flags();
+
+/***********************************************************************
+Find and Retrieve the FTS doc_id for the current result row
+@return the document ID */
+ulonglong
+innobase_fts_retrieve_docid(
+/*============================*/
+	FT_INFO_EXT*	fts_hdl);	/*!< in: FTS handler */
+
+/***********************************************************************
+Find and retrieve the size of the current result
+@return number of matching rows */
+ulonglong
+innobase_fts_count_matches(
+/*============================*/
+	FT_INFO_EXT*	fts_hdl);	/*!< in: FTS handler */
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+extern const char innobase_index_reserve_name[];
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const HA_CREATE_INFO*	create_info);	/*!< in: create info */
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const TABLE_SHARE*	table_share);	/*!< in: table share */
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
index 9ffa295257d..9c535285d1e 100644
--- a/storage/xtradb/handler/handler0alter.cc
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -23,22 +23,1029 @@ Smart ALTER TABLE
 
 #include <unireg.h>
 #include <mysqld_error.h>
-#include <sql_class.h>
-#include <sql_lex.h>                            // SQLCOM_CREATE_INDEX
+#include <log.h>
+#include <debug_sync.h>
 #include <innodb_priv.h>
+#include <sql_alter.h>
+#include <sql_class.h>
 
-extern "C" {
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
 #include "log0log.h"
+#include "rem0types.h"
+#include "row0log.h"
 #include "row0merge.h"
 #include "srv0srv.h"
 #include "trx0trx.h"
 #include "trx0roll.h"
 #include "ha_prototypes.h"
 #include "handler0alter.h"
-}
+#include "srv0mon.h"
+#include "fts0priv.h"
+#include "pars0pars.h"
 
 #include "ha_innodb.h"
 
+/** Operations for creating secondary indexes (no rebuild needed) */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_CREATE
+	= Alter_inplace_info::ADD_INDEX
+	| Alter_inplace_info::ADD_UNIQUE_INDEX;
+
+/** Operations for rebuilding a table in place */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_REBUILD
+	= Alter_inplace_info::ADD_PK_INDEX
+	| Alter_inplace_info::DROP_PK_INDEX
+	| Alter_inplace_info::CHANGE_CREATE_OPTION
+	/* For CHANGE_CREATE_OPTION, innobase_need_rebuild() decides whether a rebuild is really needed */
+	| Alter_inplace_info::ALTER_COLUMN_NULLABLE
+	| Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE
+	| Alter_inplace_info::ALTER_COLUMN_ORDER
+	| Alter_inplace_info::DROP_COLUMN
+	| Alter_inplace_info::ADD_COLUMN
+	/* Deliberately left out; check_if_supported_inplace_alter() rejects these:
+	| Alter_inplace_info::ALTER_COLUMN_TYPE
+	| Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH
+	*/
+	;
+
+/** Operations that require changes to data (index creation or rebuild) */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_DATA
+	= INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+
+/** Operations for altering a table that InnoDB does not care about */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_IGNORE
+	= Alter_inplace_info::ALTER_COLUMN_DEFAULT
+	| Alter_inplace_info::ALTER_COLUMN_COLUMN_FORMAT
+	| Alter_inplace_info::ALTER_COLUMN_STORAGE_TYPE
+	| Alter_inplace_info::ALTER_RENAME;
+
+/** Operations on foreign key definitions (changing the schema only) */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_FOREIGN_OPERATIONS
+	= Alter_inplace_info::DROP_FOREIGN_KEY
+	| Alter_inplace_info::ADD_FOREIGN_KEY;
+
+/** Operations that InnoDB cares about and can perform without rebuild */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_NOREBUILD
+	= INNOBASE_ONLINE_CREATE
+	| INNOBASE_FOREIGN_OPERATIONS
+	| Alter_inplace_info::DROP_INDEX
+	| Alter_inplace_info::DROP_UNIQUE_INDEX
+	| Alter_inplace_info::ALTER_COLUMN_NAME;
+
+/** Map an InnoDB error code to a MySQL error and report it to the client by invoking my_error(). */
+static UNIV_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+	dberr_t		error,	/*!< in: InnoDB error code */
+	const char*	table,	/*!< in: table name */
+	ulint		flags)	/*!< in: table flags */
+{
+	switch (error) {
+	case DB_MISSING_HISTORY:
+		my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+		break;
+	case DB_RECORD_NOT_FOUND:
+		my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+		break;
+	case DB_DEADLOCK:
+		my_error(ER_LOCK_DEADLOCK, MYF(0));
+		break;
+	case DB_LOCK_WAIT_TIMEOUT:
+		my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+		break;
+	case DB_INTERRUPTED:
+		my_error(ER_QUERY_INTERRUPTED, MYF(0));
+		break;
+	case DB_OUT_OF_MEMORY:
+		my_error(ER_OUT_OF_RESOURCES, MYF(0));
+		break;
+	case DB_OUT_OF_FILE_SPACE:
+		my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+		break;
+	case DB_TOO_BIG_INDEX_COL:
+		my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+			 DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+		break;
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+		my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+		break;
+	case DB_LOCK_TABLE_FULL:
+		my_error(ER_LOCK_TABLE_FULL, MYF(0));
+		break;
+	case DB_UNDO_RECORD_TOO_BIG:
+		my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+		break;
+	case DB_CORRUPTION:
+		my_error(ER_NOT_KEYFILE, MYF(0), table);
+		break;
+	case DB_TOO_BIG_RECORD:
+		my_error(ER_TOO_BIG_ROWSIZE, MYF(0),
+			 page_get_free_space_of_empty(
+				 flags & DICT_TF_COMPACT) / 2);
+		break;
+	case DB_INVALID_NULL:
+		/* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+		my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+		break;
+#ifdef UNIV_DEBUG
+	case DB_SUCCESS:
+	case DB_DUPLICATE_KEY:
+	case DB_TABLESPACE_EXISTS:
+	case DB_ONLINE_LOG_TOO_BIG:
+		/* These codes should not be passed here; without UNIV_DEBUG they take the default branch. */
+		ut_error;
+#endif /* UNIV_DEBUG */
+	default:
+		my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
+		break;
+	}
+}
+
+/** Check whether a MySQL table has at least one FULLTEXT index.
+@param table		MySQL table whose key definitions are scanned
+@return			true if any key carries the HA_FULLTEXT flag */
+static
+bool
+innobase_fulltext_exist(
+/*====================*/
+	const TABLE*	table)
+{
+	bool	found = false;
+
+	for (uint k = 0; !found && k < table->s->keys; k++) {
+		found = (table->key_info[k].flags & HA_FULLTEXT) != 0;
+	}
+
+	return(found);
+}
+
+/*******************************************************************//**
+Decide whether the requested ALTER TABLE has to rebuild the table.
+@param ha_alter_info		the DDL operation
+@return true if the table must be rebuilt, false otherwise */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_need_rebuild(
+/*==================*/
+	const Alter_inplace_info*	ha_alter_info)
+{
+	const Alter_inplace_info::HA_ALTER_FLAGS	flags
+		= ha_alter_info->handler_flags;
+
+	if (flags == Alter_inplace_info::CHANGE_CREATE_OPTION) {
+		/* Only a ROW_FORMAT or KEY_BLOCK_SIZE change counts. */
+		return((ha_alter_info->create_info->used_fields
+			& (HA_CREATE_USED_ROW_FORMAT
+			   | HA_CREATE_USED_KEY_BLOCK_SIZE)) != 0);
+	}
+
+	return((flags & INNOBASE_ALTER_REBUILD) != 0);
+}
+
+/** Check if InnoDB supports a particular alter table in-place
+@param altered_table	TABLE object for new version of table.
+@param ha_alter_info	Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval HA_ALTER_INPLACE_NOT_SUPPORTED	Not supported
+@retval HA_ALTER_INPLACE_NO_LOCK	Supported
+@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE Supported, but requires
+a shared lock during main phase and exclusive lock during prepare phase.
+@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE	Supported, prepare phase
+requires exclusive lock (any transactions that have accessed the table
+must commit or roll back first, and no transactions can access the table
+while prepare_inplace_alter_table() is executing)
+*/
+UNIV_INTERN
+enum_alter_inplace_result
+ha_innobase::check_if_supported_inplace_alter(
+/*==========================================*/
+	TABLE*			altered_table,
+	Alter_inplace_info*	ha_alter_info)
+{
+	DBUG_ENTER("check_if_supported_inplace_alter");
+
+	if (srv_read_only_mode) {
+		ha_alter_info->unsupported_reason =
+			innobase_get_err_msg(ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	} else if (srv_created_new_raw || srv_force_recovery) {
+		ha_alter_info->unsupported_reason =
+			innobase_get_err_msg(ER_READ_ONLY_MODE);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	if (altered_table->s->stored_fields > REC_MAX_N_USER_FIELDS) {
+		/* Deny the inplace ALTER TABLE. MySQL will try to
+		re-create the table and ha_innobase::create() will
+		return an error too. This is how we effectively
+		deny adding too many columns to a table. */
+		ha_alter_info->unsupported_reason =
+			innobase_get_err_msg(ER_TOO_MANY_FIELDS);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	update_thd();
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	if (ha_alter_info->handler_flags
+	    & ~(INNOBASE_INPLACE_IGNORE
+		| INNOBASE_ALTER_NOREBUILD
+		| INNOBASE_ALTER_REBUILD)) {
+		if (ha_alter_info->handler_flags
+		    & (Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH
+		       | Alter_inplace_info::ALTER_COLUMN_TYPE)) {
+			ha_alter_info->unsupported_reason = innobase_get_err_msg(
+				ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE);
+		}
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* Only support online add foreign key constraint when
+	check_foreigns is turned off */
+	if ((ha_alter_info->handler_flags
+	     & Alter_inplace_info::ADD_FOREIGN_KEY)
+	    && prebuilt->trx->check_foreigns) {
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+		DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK);
+	}
+
+	/* Only support NULL -> NOT NULL change if strict table sql_mode
+	is set. Fall back to COPY for conversion if not strict tables.
+	In-Place will fail with an error when trying to convert
+	NULL to a NOT NULL value. */
+	if ((ha_alter_info->handler_flags
+	     & Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE)
+	    && !thd_is_strict_mode(user_thd)) {
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* InnoDB cannot IGNORE when creating unique indexes. IGNORE
+	should silently delete some duplicate rows. Our inplace_alter
+	code will not delete anything from existing indexes. */
+	if (ha_alter_info->ignore
+	    && (ha_alter_info->handler_flags
+		& (Alter_inplace_info::ADD_PK_INDEX
+		   | Alter_inplace_info::ADD_UNIQUE_INDEX))) {
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* DROP PRIMARY KEY is only allowed in combination with ADD
+	PRIMARY KEY. */
+	if ((ha_alter_info->handler_flags
+	     & (Alter_inplace_info::ADD_PK_INDEX
+		| Alter_inplace_info::DROP_PK_INDEX))
+	    == Alter_inplace_info::DROP_PK_INDEX) {
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* If a column changes from NOT NULL to NULL and there
+	is an implicit primary key on that column, the table
+	has to be rebuilt. Such a change should only go
+	through the "Copy" method. */
+	if ((ha_alter_info->handler_flags
+	     & Alter_inplace_info::ALTER_COLUMN_NULLABLE)) {
+		uint primary_key = altered_table->s->primary_key;
+
+		/* See if MYSQL table has no pk but we do.*/
+		if (UNIV_UNLIKELY(primary_key >= MAX_KEY)
+		    && !row_table_got_default_clust_index(prebuilt->table)) {
+			ha_alter_info->unsupported_reason = innobase_get_err_msg(
+				ER_PRIMARY_CANT_HAVE_NULL);
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	/*
+	  InnoDB in different MariaDB versions was generating different mtype
+	  codes for certain types. In some cases the signed/unsigned bit was
+	  generated differently too. Online ALTER would change the
+	  mtype/unsigned_flag (to what the current code generates) without
+	  changing the underlying data representation, and it might result
+	  in data corruption. Don't do online ALTER if they are wrong.
+
+	  Fields that are not stored in the database have no InnoDB column,
+	  so the InnoDB column number (icol) only advances for stored fields.
+	*/
+	for (ulint i = 0, icol = 0; i < table->s->fields; i++) {
+		const Field*		field = table->field[i];
+		ulint		unsigned_flag;
+		if (!field->stored_in_db) {
+			continue;
+		}
+		const dict_col_t*	col = dict_table_get_nth_col(prebuilt->table, icol);
+		icol++;
+		if (col->mtype != get_innobase_type_from_mysql_type(
+			    &unsigned_flag, field)
+		    || (col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	/* We should be able to do the operation in-place.
+	See if we can do it online (LOCK=NONE). */
+	bool	online = true;
+
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+
+	/* Fix the key parts. */
+	for (KEY* new_key = ha_alter_info->key_info_buffer;
+	     new_key < ha_alter_info->key_info_buffer
+		     + ha_alter_info->key_count;
+	     new_key++) {
+		for (KEY_PART_INFO* key_part = new_key->key_part;
+		     key_part < new_key->key_part + new_key->user_defined_key_parts;
+		     key_part++) {
+			const Create_field*	new_field;
+
+			DBUG_ASSERT(key_part->fieldnr
+				    < altered_table->s->fields);
+
+			cf_it.rewind();
+			for (uint fieldnr = 0; (new_field = cf_it++);
+			     fieldnr++) {
+				if (fieldnr == key_part->fieldnr) {
+					break;
+				}
+			}
+
+			DBUG_ASSERT(new_field);
+
+			key_part->field = altered_table->field[
+				key_part->fieldnr];
+			/* In some special cases InnoDB emits "false"
+			duplicate key errors with NULL key values. Let
+			us play safe and ensure that we can correctly
+			print key values even in such cases. */
+			key_part->null_offset = key_part->field->null_offset();
+			key_part->null_bit = key_part->field->null_bit;
+
+			if (new_field->field) {
+				/* This is an existing column. */
+				continue;
+			}
+
+			/* This is an added column. */
+			DBUG_ASSERT(ha_alter_info->handler_flags
+				    & Alter_inplace_info::ADD_COLUMN);
+
+			/* We cannot replace a hidden FTS_DOC_ID
+			with a user-visible FTS_DOC_ID. */
+			if (prebuilt->table->fts
+			    && innobase_fulltext_exist(altered_table)
+			    && !my_strcasecmp(
+				    system_charset_info,
+				    key_part->field->field_name,
+				    FTS_DOC_ID_COL_NAME)) {
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS);
+				DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+			}
+
+			DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check)
+				     == Field::NEXT_NUMBER)
+				    == !!(key_part->field->flags
+					  & AUTO_INCREMENT_FLAG));
+
+			if (key_part->field->flags & AUTO_INCREMENT_FLAG) {
+				/* We cannot assign an AUTO_INCREMENT
+				column values during online ALTER. */
+				DBUG_ASSERT(key_part->field == altered_table
+					    -> found_next_number_field);
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC);
+				online = false;
+			}
+		}
+	}
+
+	DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+		    <= table->s->stored_fields);
+	DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+		    < dict_table_get_n_user_cols(prebuilt->table));
+
+	if (prebuilt->table->fts
+	    && innobase_fulltext_exist(altered_table)) {
+		/* FULLTEXT indexes are supposed to remain. */
+		/* Disallow DROP INDEX FTS_DOC_ID_INDEX */
+
+		for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+			if (!my_strcasecmp(
+				    system_charset_info,
+				    ha_alter_info->index_drop_buffer[i]->name,
+				    FTS_DOC_ID_INDEX_NAME)) {
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+				DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+			}
+		}
+
+		/* InnoDB can have a hidden FTS_DOC_ID_INDEX on a
+		visible FTS_DOC_ID column as well. Prevent dropping or
+		renaming the FTS_DOC_ID. */
+
+		for (Field** fp = table->field; *fp; fp++) {
+			if (!((*fp)->flags
+			      & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) {
+				continue;
+			}
+
+			if (!my_strcasecmp(
+				    system_charset_info,
+				    (*fp)->field_name,
+				    FTS_DOC_ID_COL_NAME)) {
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+				DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+			}
+		}
+	}
+
+	prebuilt->trx->will_lock++;
+
+	if (!online) {
+		/* We already determined that the operation cannot be
+		done online; it will be reported as needing a shared lock. */
+	} else if (((ha_alter_info->handler_flags
+		     & Alter_inplace_info::ADD_PK_INDEX)
+		    || innobase_need_rebuild(ha_alter_info))
+		   && (innobase_fulltext_exist(altered_table)
+		       || (prebuilt->table->flags2
+			   & DICT_TF2_FTS_HAS_DOC_ID))) {
+		/* Refuse to rebuild the table online, if
+		fulltext indexes are to survive the rebuild,
+		or if the table contains a hidden FTS_DOC_ID column. */
+		online = false;
+		/* If the table already contains fulltext indexes,
+		refuse to rebuild the table natively altogether. */
+		if (prebuilt->table->fts) {
+			ha_alter_info->unsupported_reason = innobase_get_err_msg(
+				ER_INNODB_FT_LIMIT);
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+	} else if ((ha_alter_info->handler_flags
+		    & Alter_inplace_info::ADD_INDEX)) {
+		/* Building a full-text index requires a lock.
+		We could do without a lock if the table already contains
+		an FTS_DOC_ID column, but in that case we would have
+		to apply the modification log to the full-text indexes. */
+
+		for (uint i = 0; i < ha_alter_info->index_add_count; i++) {
+			const KEY* key =
+				&ha_alter_info->key_info_buffer[
+					ha_alter_info->index_add_buffer[i]];
+			if (key->flags & HA_FULLTEXT) {
+				DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+					      & ~(HA_FULLTEXT
+						  | HA_PACK_KEY
+						  | HA_GENERATED_KEY
+						  | HA_BINARY_PACK_KEY)));
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+				online = false;
+				break;
+			}
+		}
+	}
+
+	DBUG_RETURN(online
+		    ? HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+		    : HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
+}
+
+/*************************************************************//**
+Initialize the dict_foreign_t structure with the supplied info.
+@return true if initialized, false if foreign->id duplicates an existing one */
+static __attribute__((nonnull(1,3,5,7)))
+bool
+innobase_init_foreign(
+/*==================*/
+	dict_foreign_t*	foreign,		/*!< in/out: structure to
+						initialize */
+	char*		constraint_name,	/*!< in: constraint name, or
+						NULL if none was given */
+	dict_table_t*	table,			/*!< in: foreign table */
+	dict_index_t*	index,			/*!< in: foreign key index */
+	const char**	column_names,		/*!< in: foreign key column
+						names */
+	ulint		num_field,		/*!< in: number of columns */
+	const char*	referenced_table_name,	/*!< in: referenced table
+						name */
+	dict_table_t*	referenced_table,	/*!< in: referenced table */
+	dict_index_t*	referenced_index,	/*!< in: referenced index */
+	const char**	referenced_column_names,/*!< in: referenced column
+						names */
+	ulint		referenced_num_field)	/*!< in: number of referenced columns; NOTE(review): the copy
+						loop below runs to n_fields, so presumably == num_field */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+        if (constraint_name) {
+                ulint   db_len;
+
+                /* Concatenate 'databasename/' and the constraint name
+                specified by the user: we regard the constraint as belonging
+                to the same MySQL 'database' as the table itself. The result
+                is stored in foreign->id. */
+
+                db_len = dict_get_db_name_len(table->name);
+
+                foreign->id = static_cast<char*>(mem_heap_alloc(
+                        foreign->heap, db_len + strlen(constraint_name) + 2));
+
+                ut_memcpy(foreign->id, table->name, db_len);
+                foreign->id[db_len] = '/';
+                strcpy(foreign->id + db_len + 1, constraint_name);
+
+		/* Check if any existing foreign key has the same id;
+		this is needed only if the user supplied the constraint name */
+
+		for (const dict_foreign_t* existing_foreign
+			= UT_LIST_GET_FIRST(table->foreign_list);
+		     existing_foreign != 0;
+		     existing_foreign = UT_LIST_GET_NEXT(
+			     foreign_list, existing_foreign)) {
+
+			if (ut_strcmp(existing_foreign->id, foreign->id) == 0) {
+				return(false);
+			}
+		}
+        }
+
+        foreign->foreign_table = table;
+        foreign->foreign_table_name = mem_heap_strdup(
+                foreign->heap, table->name);
+        dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+        foreign->foreign_index = index;
+        foreign->n_fields = (unsigned int) num_field;
+
+        foreign->foreign_col_names = static_cast<const char**>(
+                mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+        for (ulint i = 0; i < foreign->n_fields; i++) {
+                foreign->foreign_col_names[i] = mem_heap_strdup(
+                        foreign->heap, column_names[i]);
+        }
+
+	foreign->referenced_index = referenced_index;
+	foreign->referenced_table = referenced_table;
+
+	foreign->referenced_table_name = mem_heap_strdup(
+		foreign->heap, referenced_table_name);
+        dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+        foreign->referenced_col_names = static_cast<const char**>(
+                mem_heap_alloc(foreign->heap,
+			       referenced_num_field * sizeof(void*)));
+
+        for (ulint i = 0; i < foreign->n_fields; i++) {
+                foreign->referenced_col_names[i]
+                        = mem_heap_strdup(foreign->heap,
+					  referenced_column_names[i]);
+        }
+
+	return(true);
+}
+
+/*************************************************************//**
+Check that the referential actions of a foreign key are sensible
+@return true if they are, false if SET NULL targets a NOT NULL column */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+	const dict_foreign_t*	foreign)	/*!< in: foreign key */
+{
+	const ulint	set_null_mask = DICT_FOREIGN_ON_UPDATE_SET_NULL
+		| DICT_FOREIGN_ON_DELETE_SET_NULL;
+
+	/* Nothing to verify without a foreign key index, or when
+	neither of the actions is SET NULL. */
+	if (!foreign->foreign_index
+	    || !(foreign->type & set_null_mask)) {
+		return(true);
+	}
+
+	/* It is not sensible to define SET NULL if a column
+	is not allowed to be NULL. */
+	for (ulint col_no = 0; col_no < foreign->n_fields; col_no++) {
+		const dict_col_t*	col = dict_index_get_nth_col(
+			foreign->foreign_index, col_no);
+		if (col->prtype & DATA_NOT_NULL) {
+			return(false);
+		}
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Set the InnoDB foreign key type flags from the MySQL ON DELETE/UPDATE options
+@return true if the resulting flags pass innobase_check_fk_option() */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_set_foreign_key_option(
+/*============================*/
+	dict_foreign_t*	foreign,	/*!< in/out: InnoDB foreign key */
+	Foreign_key*	fk_key)		/*!< in: foreign key info from
+					MySQL */
+{
+	ut_ad(!foreign->type);
+	/* The ON DELETE action initializes the flags... */
+	switch (fk_key->delete_opt) {
+	case Foreign_key::FK_OPTION_SET_NULL:
+		foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL;
+		break;
+	case Foreign_key::FK_OPTION_CASCADE:
+		foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE;
+		break;
+	case Foreign_key::FK_OPTION_DEFAULT:
+	case Foreign_key::FK_OPTION_RESTRICT:
+	case Foreign_key::FK_OPTION_NO_ACTION:
+		foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION;
+		break;
+	}
+	/* ...and the ON UPDATE action is merged into them. */
+	switch (fk_key->update_opt) {
+	case Foreign_key::FK_OPTION_SET_NULL:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+		break;
+	case Foreign_key::FK_OPTION_CASCADE:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+		break;
+	case Foreign_key::FK_OPTION_DEFAULT:
+	case Foreign_key::FK_OPTION_RESTRICT:
+	case Foreign_key::FK_OPTION_NO_ACTION:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+		break;
+	}
+
+	return(innobase_check_fk_option(foreign));
+}
+
+/*******************************************************************//**
+Look among the indexes that are being created for one that a foreign
+key constraint on the given columns can make use of. A key qualifies
+when it has at least n_cols user-defined key parts and each of its
+first n_cols key parts
+(1) covers the whole column (column prefixes are rejected), and
+(2) is on the column named col_names[j], as compared with
+innobase_strcasecmp().
+@return	the first usable key in add[] order, or NULL if none found */
+static __attribute__((nonnull, warn_unused_result))
+const KEY*
+innobase_find_equiv_index(
+/*======================*/
+	const char*const*	col_names,
+					/*!< in: column names */
+	uint			n_cols,	/*!< in: number of columns */
+	const KEY*		keys,	/*!< in: index information */
+	const uint*		add,	/*!< in: indexes being created */
+	uint			n_add)	/*!< in: number of indexes to create */
+{
+	for (const uint* idx = add; idx < add + n_add; idx++) {
+		const KEY&	key = keys[*idx];
+		/* A key with too few parts can never match. */
+		bool		usable
+			= key.user_defined_key_parts >= n_cols;
+
+		for (uint j = 0; usable && j < n_cols; j++) {
+			const KEY_PART_INFO&	part = key.key_part[j];
+			const Field*		field = part.field;
+			uint32			full_len = field->pack_length();
+
+			/* The MySQL pack length contains 1 or 2 bytes
+			length field for a true VARCHAR. */
+			if (field->type() == MYSQL_TYPE_VARCHAR) {
+				const Field_varstring*	varchar
+					= static_cast<const Field_varstring*>(
+						field);
+				full_len -= varchar->length_bytes;
+			}
+
+			/* Column prefix indexes cannot be used for
+			FOREIGN KEY constraints, and the column name
+			has to agree. */
+			usable = part.length >= full_len
+				&& !innobase_strcasecmp(
+					col_names[j], field->field_name);
+		}
+
+		if (usable) {
+			return(&key);
+		}
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Find the first index that is not a DICT_FTS index, that
+dict_foreign_qualify_index() accepts for the given columns, and that
+is not in drop_index[]. @return matching index, NULL if not found */
+static __attribute__((nonnull(1,2,6), warn_unused_result))
+dict_index_t*
+innobase_find_fk_index(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+					/*!< in: alter table info */
+	dict_table_t*		table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	dict_index_t**		drop_index,
+					/*!< in: indexes to be dropped */
+	ulint			n_drop_index,
+					/*!< in: size of drop_index[] */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols) /*!< in: number of columns */
+{
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if ((index->type & DICT_FTS)
+		    || !dict_foreign_qualify_index(
+			    table, col_names, columns, n_cols,
+			    index, NULL, true, 0)) {
+			continue;
+		}
+
+		/* Skip to-be-dropped indexes. */
+		bool	to_be_dropped = false;
+
+		for (ulint i = 0; !to_be_dropped && i < n_drop_index; i++) {
+			to_be_dropped = (drop_index[i] == index);
+		}
+
+		if (!to_be_dropped) {
+			return(index);
+		}
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Create InnoDB foreign key structure from MySQL alter_info
+@retval true if successful
+@retval false on error (will call my_error()) */
+static __attribute__((nonnull(1,2,3,7,8), warn_unused_result))
+bool
+innobase_get_foreign_key_info(
+/*==========================*/
+	Alter_inplace_info*
+			ha_alter_info,	/*!< in: alter table info */
+	const TABLE_SHARE*
+			table_share,	/*!< in: the TABLE_SHARE */
+	dict_table_t*	table,		/*!< in: table */
+	const char**	col_names,	/*!< in: column names, or NULL
+					to use table->col_names */
+	dict_index_t**	drop_index,	/*!< in: indexes to be dropped */
+	ulint		n_drop_index,	/*!< in: size of drop_index[] */
+	dict_foreign_t**add_fk,		/*!< out: foreign constraint added */
+	ulint*		n_add_fk,	/*!< out: number of foreign
+					constraints added */
+	const trx_t*	trx)		/*!< in: user transaction */
+{
+	Key*		key;
+	Foreign_key*	fk_key;
+	dict_table_t*	referenced_table = NULL;
+	char*		referenced_table_name = NULL;
+	ulint		num_fk = 0;
+	Alter_info*	alter_info = ha_alter_info->alter_info;
+
+	*n_add_fk = 0;
+
+	List_iterator<Key> key_iterator(alter_info->key_list);
+
+	while ((key=key_iterator++)) {
+		if (key->type != Key::FOREIGN_KEY) {
+			continue;
+		}
+
+		const char*	column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	index = NULL;
+		const char*	referenced_column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	referenced_index = NULL;
+		ulint		num_col = 0;
+		ulint		referenced_num_col = 0;
+		bool		correct_option;
+		char*		db_namep = NULL;
+		char*		tbl_namep = NULL;
+		ulint		db_name_len = 0;
+		ulint		tbl_name_len = 0;
+#ifdef __WIN__
+		char		db_name[MAX_DATABASE_NAME_LEN];
+		char		tbl_name[MAX_TABLE_NAME_LEN];
+#endif
+
+		fk_key = static_cast<Foreign_key*>(key);
+
+		if (fk_key->columns.elements > 0) {
+			ulint	i = 0;
+			Key_part_spec* column;
+			List_iterator<Key_part_spec> key_part_iterator(
+				fk_key->columns);
+
+			/* Get all the foreign key column info for the
+			current table */
+			while ((column = key_part_iterator++)) {
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				column_names[i] = column->field_name.str;
+				i++;
+			}
+
+			index = innobase_find_fk_index(
+				ha_alter_info,
+				table, col_names,
+				drop_index, n_drop_index,
+				column_names, i);
+
+			/* MySQL would add a index in the creation
+			list if no such index for foreign table,
+			so we have to use DBUG_EXECUTE_IF to simulate
+			the scenario */
+			DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+					index = NULL;);
+
+			/* Check whether there exist such
+			index in the the index create clause */
+			if (!index && !innobase_find_equiv_index(
+				    column_names, i,
+				    ha_alter_info->key_info_buffer,
+				    ha_alter_info->index_add_buffer,
+				    ha_alter_info->index_add_count)) {
+				my_error(ER_FK_NO_INDEX_CHILD, MYF(0),
+					 fk_key->name.str
+					 ? fk_key->name.str : "",
+					 table_share->table_name.str);
+				goto err_exit;
+			}
+
+			num_col = i;
+		}
+
+		add_fk[num_fk] = dict_mem_foreign_create();
+
+#ifndef __WIN__
+		tbl_namep = fk_key->ref_table.str;
+		tbl_name_len = fk_key->ref_table.length;
+		db_namep = fk_key->ref_db.str;
+		db_name_len = fk_key->ref_db.length;
+#else
+		ut_ad(fk_key->ref_table.str);
+
+		memcpy(tbl_name, fk_key->ref_table.str,
+		       fk_key->ref_table.length);
+		tbl_name[fk_key->ref_table.length] = 0;
+		innobase_casedn_str(tbl_name);
+		tbl_name_len = strlen(tbl_name);
+		tbl_namep = &tbl_name[0];
+
+		if (fk_key->ref_db.str != NULL) {
+			memcpy(db_name, fk_key->ref_db.str,
+			       fk_key->ref_db.length);
+			db_name[fk_key->ref_db.length] = 0;
+			innobase_casedn_str(db_name);
+			db_name_len = strlen(db_name);
+			db_namep = &db_name[0];
+		}
+#endif
+		mutex_enter(&dict_sys->mutex);
+
+		referenced_table_name = dict_get_referenced_table(
+			table->name,
+			db_namep,
+			db_name_len,
+			tbl_namep,
+			tbl_name_len,
+			&referenced_table,
+			add_fk[num_fk]->heap);
+
+		/* Test the case when referenced_table failed to
+		open, if trx->check_foreigns is not set, we should
+		still be able to add the foreign key */
+		DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+				referenced_table = NULL;);
+
+		if (!referenced_table && trx->check_foreigns) {
+			mutex_exit(&dict_sys->mutex);
+			my_error(ER_FK_CANNOT_OPEN_PARENT, MYF(0), tbl_namep);
+			goto err_exit;
+		}
+
+		if (fk_key->ref_columns.elements > 0) {
+			ulint	i = 0;
+			Key_part_spec* column;
+			List_iterator<Key_part_spec> key_part_iterator(
+				fk_key->ref_columns);
+
+			while ((column = key_part_iterator++)) {
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				referenced_column_names[i] =
+					column->field_name.str;
+				i++;
+			}
+
+			if (referenced_table) {
+				referenced_index =
+					dict_foreign_find_index(
+						referenced_table, 0,
+						referenced_column_names,
+						i, index,
+						TRUE, FALSE);
+
+				DBUG_EXECUTE_IF(
+					"innodb_test_no_reference_idx",
+					referenced_index = NULL;);
+
+				/* Check whether there exist such
+				index in the the index create clause */
+				if (!referenced_index) {
+					mutex_exit(&dict_sys->mutex);
+					my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+						 fk_key->name.str
+						 ? fk_key->name.str : "",
+						 tbl_namep);
+					goto err_exit;
+				}
+			} else {
+				ut_a(!trx->check_foreigns);
+			}
+
+			referenced_num_col = i;
+		} else {
+			/* No referenced columns: referenced_column_names[] is unset. */
+			mutex_exit(&dict_sys->mutex);
+			my_error(ER_CANNOT_ADD_FOREIGN, MYF(0));
+			goto err_exit;
+		}
+
+		if (!innobase_init_foreign(
+			    add_fk[num_fk], fk_key->name.str,
+			    table, index, column_names,
+			    num_col, referenced_table_name,
+			    referenced_table, referenced_index,
+			    referenced_column_names, referenced_num_col)) {
+			mutex_exit(&dict_sys->mutex);
+			my_error(ER_FK_DUP_NAME, MYF(0),
+				 add_fk[num_fk]->id);
+			goto err_exit;
+		}
+
+		mutex_exit(&dict_sys->mutex);
+
+		correct_option = innobase_set_foreign_key_option(
+			add_fk[num_fk], fk_key);
+
+		DBUG_EXECUTE_IF("innodb_test_wrong_fk_option",
+				correct_option = false;);
+
+		if (!correct_option) {
+			my_error(ER_FK_INCORRECT_OPTION, MYF(0),
+				 table_share->table_name.str,
+				 add_fk[num_fk]->id);
+			goto err_exit;
+		}
+
+		num_fk++;
+	}
+
+	*n_add_fk = num_fk;
+
+	return(true);
+err_exit:
+	/* add_fk[num_fk] may not have been created yet; NULL entries are
+	skipped. NOTE(review): assumes the caller zero-filled add_fk[]. */
+	for (ulint i = 0; i <= num_fk; i++) {
+		if (add_fk[i]) {
+			dict_foreign_free(add_fk[i]);
+		}
+	}
+
+	return(false);
+}
+
 /*************************************************************//**
 Copies an InnoDB column to a MySQL field.  This function is
 adapted from row_sel_field_store_in_mysql_format(). */
@@ -91,10 +1098,9 @@ innobase_col_to_mysql(
 		break;
 
 	case DATA_BLOB:
-		/* Store a pointer to the BLOB buffer to dest: the BLOB was
-		already copied to the buffer in row_sel_store_mysql_rec */
-
-		row_mysql_store_blob_ref(dest, flen, data, len);
+		/* Skip MySQL BLOBs when reporting an erroneous row
+		during index creation or table rebuild. */
+		field->set_null();
 		break;
 
 #ifdef UNIV_DEBUG
@@ -131,32 +1137,38 @@ innobase_col_to_mysql(
 
 /*************************************************************//**
 Copies an InnoDB record to table->record[0]. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_rec_to_mysql(
 /*==================*/
-	TABLE*			table,		/*!< in/out: MySQL table */
-	const rec_t*		rec,		/*!< in: record */
-	const dict_index_t*	index,		/*!< in: index */
-	const ulint*		offsets)	/*!< in: rec_get_offsets(
-						rec, index, ...) */
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(
+					rec, index, ...) */
 {
-	uint	n_fields	= table->s->fields;
-	uint	i;
+	uint	n_fields	= table->s->stored_fields;
+        uint    sql_idx         = 0;
 
-	ut_ad(n_fields == dict_table_get_n_user_cols(index->table));
+	ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+	      - !!(DICT_TF2_FLAG_IS_SET(index->table,
+					DICT_TF2_FTS_HAS_DOC_ID)));
 
-	for (i = 0; i < n_fields; i++) {
-		Field*		field	= table->field[i];
+	for (uint i = 0; i < n_fields; i++, sql_idx++) {
+		Field*		field;
 		ulint		ipos;
 		ulint		ilen;
 		const uchar*	ifield;
 
+                while (!((field= table->field[sql_idx])->stored_in_db))
+                          sql_idx++;
+
 		field->reset();
 
 		ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
 
-		if (UNIV_UNLIKELY(ipos == ULINT_UNDEFINED)) {
+		if (ipos == ULINT_UNDEFINED
+		    || rec_offs_nth_extern(offsets, ipos)) {
 null_field:
 			field->set_null();
 			continue;
@@ -180,8 +1192,95 @@ null_field:
 }
 
 /*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+UNIV_INTERN
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+{
+	uint	n_fields	= table->s->stored_fields;
+        uint    sql_idx         = 0;
+
+	ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+	      - !!(DICT_TF2_FLAG_IS_SET(index->table,
+					DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++, sql_idx++) {
+		Field*		field;
+		ulint		ipos;
+
+                while (!((field= table->field[sql_idx])->stored_in_db))
+                          sql_idx++;
+
+		field->reset();
+
+		ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
+
+		if (ipos == ULINT_UNDEFINED
+		    || dfield_is_ext(&fields[ipos])
+		    || dfield_is_null(&fields[ipos])) {
+
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			const dfield_t*	df	= &fields[ipos];
+
+			innobase_col_to_mysql(
+				dict_field_get_col(
+					dict_index_get_nth_field(index, ipos)),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+}
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+UNIV_INTERN
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+{
+	uint  n_fields	= table->s->stored_fields;
+        uint  sql_idx   = 0;
+
+	/* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */
+	ut_ad(row->n_fields == dict_table_get_n_cols(itab));
+	ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS
+	      - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++, sql_idx++) {
+		Field*          field;
+		const dfield_t*	df	= dtuple_get_nth_field(row, i);
+
+                while (!((field= table->field[sql_idx])->stored_in_db))
+                          sql_idx++;
+
+		field->reset();
+
+		if (dfield_is_ext(df) || dfield_is_null(df)) {
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			innobase_col_to_mysql(
+				dict_table_get_nth_col(itab, i),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+}
+
+/*************************************************************//**
 Resets table->record[0]. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_rec_reset(
 /*===============*/
@@ -195,66 +1294,29 @@ innobase_rec_reset(
 	}
 }
 
-/******************************************************************//**
-Removes the filename encoding of a database and table name. */
-static
-void
-innobase_convert_tablename(
-/*=======================*/
-	char*	s)	/*!< in: identifier; out: decoded identifier */
-{
-	uint	errors;
-
-	char*	slash = strchr(s, '/');
-
-	if (slash) {
-		char*	t;
-		/* Temporarily replace the '/' with NUL. */
-		*slash = 0;
-		/* Convert the database name. */
-		strconvert(&my_charset_filename, s, system_charset_info,
-			   s, slash - s + 1, &errors);
-
-		t = s + strlen(s);
-		ut_ad(slash >= t);
-		/* Append a  '.' after the database name. */
-		*t++ = '.';
-		slash++;
-		/* Convert the table name. */
-		strconvert(&my_charset_filename, slash, system_charset_info,
-			   t, slash - t + strlen(slash), &errors);
-	} else {
-		strconvert(&my_charset_filename, s,
-			   system_charset_info, s, strlen(s), &errors);
-	}
-}
-
 /*******************************************************************//**
 This function checks that index keys are sensible.
 @return	0 or error number */
-static
+static __attribute__((nonnull, warn_unused_result))
 int
 innobase_check_index_keys(
 /*======================*/
-	const KEY*		key_info,	/*!< in: Indexes to be
-						created */
-	ulint			num_of_keys,	/*!< in: Number of
-						indexes to be created */
-	const dict_table_t*	table)		/*!< in: Existing indexes */
+	const Alter_inplace_info*	info,
+				/*!< in: indexes to be created or dropped */
+	const dict_table_t*		innodb_table)
+				/*!< in: Existing indexes */
 {
-	ulint		key_num;
-
-	ut_ad(key_info);
-	ut_ad(num_of_keys);
-
-	for (key_num = 0; key_num < num_of_keys; key_num++) {
-		const KEY&	key = key_info[key_num];
+	for (uint key_num = 0; key_num < info->index_add_count;
+	     key_num++) {
+		const KEY&	key = info->key_info_buffer[
+			info->index_add_buffer[key_num]];
 
 		/* Check that the same index name does not appear
 		twice in indexes to be created. */
 
 		for (ulint i = 0; i < key_num; i++) {
-			const KEY&	key2 = key_info[i];
+			const KEY&	key2 = info->key_info_buffer[
+				info->index_add_buffer[i]];
 
 			if (0 == strcmp(key.name, key2.name)) {
 				my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
@@ -266,23 +1328,36 @@ innobase_check_index_keys(
 
 		/* Check that the same index name does not already exist. */
 
-		for (const dict_index_t* index
-			     = dict_table_get_first_index(table);
-		     index; index = dict_table_get_next_index(index)) {
+		const dict_index_t* index;
 
-			if (0 == strcmp(key.name, index->name)) {
-				my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
-					 key.name);
+		for (index = dict_table_get_first_index(innodb_table);
+		     index; index = dict_table_get_next_index(index)) {
 
-				return(ER_WRONG_NAME_FOR_INDEX);
+			if (!strcmp(key.name, index->name)) {
+				break;
 			}
 		}
 
-		/* Check that MySQL does not try to create a column
-		prefix index field on an inappropriate data type and
-		that the same column does not appear twice in the index. */
+		if (index) {
+			/* If a key by the same name is being created and
+			dropped, the name clash is OK. */
+			for (uint i = 0; i < info->index_drop_count;
+			     i++) {
+				const KEY*	drop_key
+					= info->index_drop_buffer[i];
 
-		for (ulint i = 0; i < key.key_parts; i++) {
+				if (0 == strcmp(key.name, drop_key->name)) {
+					goto name_ok;
+				}
+			}
+
+			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key.name);
+
+			return(ER_WRONG_NAME_FOR_INDEX);
+		}
+
+name_ok:
+		for (ulint i = 0; i < key.user_defined_key_parts; i++) {
 			const KEY_PART_INFO&	key_part1
 				= key.key_part[i];
 			const Field*		field
@@ -297,6 +1372,10 @@ innobase_check_index_keys(
 			case DATA_FLOAT:
 			case DATA_DOUBLE:
 			case DATA_DECIMAL:
+				/* Check that MySQL does not try to
+				create a column prefix index field on
+				an inappropriate data type. */
+
 				if (field->type() == MYSQL_TYPE_VARCHAR) {
 					if (key_part1.length
 					    >= field->pack_length()
@@ -316,17 +1395,19 @@ innobase_check_index_keys(
 				return(ER_WRONG_KEY_COLUMN);
 			}
 
+			/* Check that the same column does not appear
+			twice in the index. */
+
 			for (ulint j = 0; j < i; j++) {
 				const KEY_PART_INFO&	key_part2
 					= key.key_part[j];
 
-				if (strcmp(key_part1.field->field_name,
-					   key_part2.field->field_name)) {
+				if (key_part1.fieldnr != key_part2.fieldnr) {
 					continue;
 				}
 
 				my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
-					 key_part1.field->field_name);
+					 field->field_name);
 				return(ER_WRONG_KEY_COLUMN);
 			}
 		}
@@ -337,16 +1418,19 @@ innobase_check_index_keys(
 
 /*******************************************************************//**
 Create index field definition for key part */
-static
+static __attribute__((nonnull(2,3)))
 void
 innobase_create_index_field_def(
 /*============================*/
-	KEY_PART_INFO*		key_part,	/*!< in: MySQL key definition */
-	mem_heap_t*		heap,		/*!< in: memory heap */
-	merge_index_field_t*	index_field)	/*!< out: index field
+	const TABLE*		altered_table,	/*!< in: MySQL table that is
+						being altered, or NULL
+						if a new clustered index is
+						not being created */
+	const KEY_PART_INFO*	key_part,	/*!< in: MySQL key definition */
+	index_field_t*		index_field)	/*!< out: index field
 						definition for key_part */
 {
-	Field*		field;
+	const Field*	field;
 	ibool		is_unsigned;
 	ulint		col_type;
 
@@ -355,9 +1439,13 @@ innobase_create_index_field_def(
 	ut_ad(key_part);
 	ut_ad(index_field);
 
-	field = key_part->field;
+	field = altered_table
+		? altered_table->field[key_part->fieldnr]
+		: key_part->field;
 	ut_a(field);
 
+	index_field->col_no = key_part->fieldnr;
+
 	col_type = get_innobase_type_from_mysql_type(&is_unsigned, field);
 
 	if (DATA_BLOB == col_type
@@ -365,51 +1453,55 @@ innobase_create_index_field_def(
 		&& field->type() != MYSQL_TYPE_VARCHAR)
 	    || (field->type() == MYSQL_TYPE_VARCHAR
 		&& key_part->length < field->pack_length()
-			- ((Field_varstring*)field)->length_bytes)) {
+			- ((Field_varstring*) field)->length_bytes)) {
 
 		index_field->prefix_len = key_part->length;
 	} else {
 		index_field->prefix_len = 0;
 	}
 
-	index_field->field_name = mem_heap_strdup(heap, field->field_name);
-
 	DBUG_VOID_RETURN;
 }
 
 /*******************************************************************//**
 Create index definition for key */
-static
+static __attribute__((nonnull))
 void
 innobase_create_index_def(
 /*======================*/
-	KEY*			key,		/*!< in: key definition */
-	bool			new_primary,	/*!< in: TRUE=generating
-						a new primary key
+	const TABLE*		altered_table,	/*!< in: MySQL table that is
+						being altered */
+	const KEY*		keys,		/*!< in: key definitions */
+	ulint			key_number,	/*!< in: MySQL key number */
+	bool			new_clustered,	/*!< in: true if generating
+						a new clustered index
 						on the table */
-	bool			key_primary,	/*!< in: TRUE if this key
-						is a primary key */
-	merge_index_def_t*	index,		/*!< out: index definition */
+	bool			key_clustered,	/*!< in: true if this is
+						the new clustered index */
+	index_def_t*		index,		/*!< out: index definition */
 	mem_heap_t*		heap)		/*!< in: heap where memory
 						is allocated */
 {
-	ulint	i;
-	ulint	len;
-	ulint	n_fields = key->key_parts;
-	char*	index_name;
+	const KEY*	key = &keys[key_number];
+	ulint		i;
+	ulint		len;
+	ulint		n_fields = key->user_defined_key_parts;
+	char*		index_name;
 
 	DBUG_ENTER("innobase_create_index_def");
+	DBUG_ASSERT(!key_clustered || new_clustered);
 
-	index->fields = (merge_index_field_t*) mem_heap_alloc(
-		heap, n_fields * sizeof *index->fields);
+	index->fields = static_cast<index_field_t*>(
+		mem_heap_alloc(heap, n_fields * sizeof *index->fields));
 
 	index->ind_type = 0;
+	index->key_number = key_number;
 	index->n_fields = n_fields;
 	len = strlen(key->name) + 1;
-	index->name = index_name = (char*) mem_heap_alloc(heap,
-							  len + !new_primary);
+	index->name = index_name = static_cast<char*>(
+		mem_heap_alloc(heap, len + !new_clustered));
 
-	if (UNIV_LIKELY(!new_primary)) {
+	if (!new_clustered) {
 		*index_name++ = TEMP_INDEX_PREFIX;
 	}
 
@@ -419,83 +1511,239 @@ innobase_create_index_def(
 		index->ind_type |= DICT_UNIQUE;
 	}
 
-	if (key_primary) {
+	if (key_clustered) {
+		DBUG_ASSERT(!(key->flags & HA_FULLTEXT));
 		index->ind_type |= DICT_CLUSTERED;
+	} else if (key->flags & HA_FULLTEXT) {
+		DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+			      & ~(HA_FULLTEXT
+				  | HA_PACK_KEY
+				  | HA_BINARY_PACK_KEY)));
+		DBUG_ASSERT(!(key->flags & HA_NOSAME));
+		DBUG_ASSERT(!index->ind_type);
+		index->ind_type |= DICT_FTS;
+	}
+
+	if (!new_clustered) {
+		altered_table = NULL;
 	}
 
 	for (i = 0; i < n_fields; i++) {
-		innobase_create_index_field_def(&key->key_part[i], heap,
-						&index->fields[i]);
+		innobase_create_index_field_def(
+			altered_table, &key->key_part[i], &index->fields[i]);
 	}
 
 	DBUG_VOID_RETURN;
 }
 
 /*******************************************************************//**
-Copy index field definition */
+Check whether the table has the FTS_DOC_ID column
+@return whether there exists an FTS_DOC_ID column */
 static
-void
-innobase_copy_index_field_def(
+bool
+innobase_fts_check_doc_id_col(
 /*==========================*/
-	const dict_field_t*	field,		/*!< in: definition to copy */
-	merge_index_field_t*	index_field)	/*!< out: copied definition */
+	const dict_table_t*	table,  /*!< in: InnoDB table with
+					fulltext index */
+	const TABLE*		altered_table,
+					/*!< in: MySQL table with
+					fulltext index */
+	ulint*			fts_doc_col_no)
+					/*!< out: The column number for
+					Doc ID, or ULINT_UNDEFINED
+					if it is of wrong type */
 {
-	DBUG_ENTER("innobase_copy_index_field_def");
-	DBUG_ASSERT(field != NULL);
-	DBUG_ASSERT(index_field != NULL);
+	*fts_doc_col_no = ULINT_UNDEFINED;
+
+	const uint n_cols = altered_table->s->stored_fields;
+        uint sql_idx = 0;
+	uint i;
+
+	for (i = 0; i < n_cols; i++, sql_idx++) {
+		const Field*	field;
+                while (!((field= altered_table->field[sql_idx])->
+                                 stored_in_db))
+                          sql_idx++;
+		if (my_strcasecmp(system_charset_info,
+				  field->field_name, FTS_DOC_ID_COL_NAME)) {
+			continue;
+		}
 
-	index_field->field_name = field->name;
-	index_field->prefix_len = field->prefix_len;
+		if (strcmp(field->field_name, FTS_DOC_ID_COL_NAME)) {
+			my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+				 field->field_name);
+		} else if (field->type() != MYSQL_TYPE_LONGLONG
+			   || field->pack_length() != 8
+			   || field->real_maybe_null()
+			   || !(field->flags & UNSIGNED_FLAG)) {
+			my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, MYF(0),
+				 field->field_name);
+		} else {
+			*fts_doc_col_no = i;
+		}
 
-	DBUG_VOID_RETURN;
+		return(true);
+	}
+
+	if (!table) {
+		return(false);
+	}
+
+	for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) {
+		const char*     name = dict_table_get_col_name(table, i);
+
+		if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+#ifdef UNIV_DEBUG
+			const dict_col_t*       col;
+
+			col = dict_table_get_nth_col(table, i);
+
+			/* Because the FTS_DOC_ID does not exist in
+			the MySQL data dictionary, this must be the
+			internally created FTS_DOC_ID column. */
+			ut_ad(col->mtype == DATA_INT);
+			ut_ad(col->len == 8);
+			ut_ad(col->prtype & DATA_NOT_NULL);
+			ut_ad(col->prtype & DATA_UNSIGNED);
+#endif /* UNIV_DEBUG */
+			*fts_doc_col_no = i;
+			return(true);
+		}
+	}
+
+	return(false);
 }
 
 /*******************************************************************//**
-Copy index definition for the index */
-static
-void
-innobase_copy_index_def(
-/*====================*/
-	const dict_index_t*	index,	/*!< in: index definition to copy */
-	merge_index_def_t*	new_index,/*!< out: Index definition */
-	mem_heap_t*		heap)	/*!< in: heap where allocated */
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return	the status of the FTS_DOC_ID index */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+	const dict_table_t*	table,		/*!< in: table definition */
+	const TABLE*		altered_table,	/*!< in: MySQL table
+						that is being altered */
+	ulint*			fts_doc_col_no)	/*!< out: The column number for
+						Doc ID, or ULINT_UNDEFINED
+						if it is being created in
+						ha_alter_info */
 {
-	ulint	n_fields;
-	ulint	i;
+	const dict_index_t*	index;
+	const dict_field_t*	field;
+
+	if (altered_table) {
+		/* Check if a unique index with the name of
+		FTS_DOC_ID_INDEX_NAME is being created. */
+
+		for (uint i = 0; i < altered_table->s->keys; i++) {
+			const KEY& key = altered_table->key_info[i];
 
-	DBUG_ENTER("innobase_copy_index_def");
+			if (innobase_strcasecmp(
+				    key.name, FTS_DOC_ID_INDEX_NAME)) {
+				continue;
+			}
+
+			if ((key.flags & HA_NOSAME)
+			    && key.user_defined_key_parts == 1
+			    && !strcmp(key.name, FTS_DOC_ID_INDEX_NAME)
+			    && !strcmp(key.key_part[0].field->field_name,
+				       FTS_DOC_ID_COL_NAME)) {
+				if (fts_doc_col_no) {
+					*fts_doc_col_no = ULINT_UNDEFINED;
+				}
+				return(FTS_EXIST_DOC_ID_INDEX);
+			} else {
+				return(FTS_INCORRECT_DOC_ID_INDEX);
+			}
+		}
+	}
 
-	/* Note that we take only those fields that user defined to be
-	in the index.  In the internal representation more colums were
-	added and those colums are not copied .*/
+	if (!table) {
+		return(FTS_NOT_EXIST_DOC_ID_INDEX);
+	}
 
-	n_fields = index->n_user_defined_cols;
+	for (index = dict_table_get_first_index(table);
+	     index; index = dict_table_get_next_index(index)) {
 
-	new_index->fields = (merge_index_field_t*) mem_heap_alloc(
-		heap, n_fields * sizeof *new_index->fields);
+		/* Check if there exists a unique index with the name of
+		FTS_DOC_ID_INDEX_NAME */
+		if (innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
 
-	/* When adding a PRIMARY KEY, we may convert a previous
-	clustered index to a secondary index (UNIQUE NOT NULL). */
-	new_index->ind_type = index->type & ~DICT_CLUSTERED;
-	new_index->n_fields = n_fields;
-	new_index->name = index->name;
+		if (!dict_index_is_unique(index)
+		    || dict_index_get_n_unique(index) > 1
+		    || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
 
-	for (i = 0; i < n_fields; i++) {
-		innobase_copy_index_field_def(&index->fields[i],
-					      &new_index->fields[i]);
+		/* Check whether the index has FTS_DOC_ID as its
+		first column */
+		field = dict_index_get_nth_field(index, 0);
+
+		/* The column would be of a BIGINT data type */
+		if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0
+		    && field->col->mtype == DATA_INT
+		    && field->col->len == 8
+		    && field->col->prtype & DATA_NOT_NULL) {
+			if (fts_doc_col_no) {
+				*fts_doc_col_no = dict_col_get_no(field->col);
+			}
+			return(FTS_EXIST_DOC_ID_INDEX);
+		} else {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
 	}
 
-	DBUG_VOID_RETURN;
+
+	/* Not found */
+	return(FTS_NOT_EXIST_DOC_ID_INDEX);
 }
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return	FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+	ulint		n_key,		/*!< in: Number of keys */
+	const KEY*	key_info)	/*!< in: Key definition */
+{
+	/* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index
+	list */
+	for (ulint j = 0; j < n_key; j++) {
+		const KEY*	key = &key_info[j];
 
+		if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		/* Validate FTS_DOC_ID_INDEX: it must be unique, named exactly
+		"FTS_DOC_ID_INDEX", and on the single column "FTS_DOC_ID" */
+		if (!(key->flags & HA_NOSAME)
+		    || key->user_defined_key_parts != 1
+		    || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+		    || strcmp(key->key_part[0].field->field_name,
+			      FTS_DOC_ID_COL_NAME)) {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
+
+		return(FTS_EXIST_DOC_ID_INDEX);
+	}
+
+	return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
 /*******************************************************************//**
 Create an index table where indexes are ordered as follows:
 
 IF a new primary key is defined for the table THEN
 
 	1) New primary key
-	2) Original secondary indexes
-	3) New secondary indexes
+	2) The remaining keys in key_info
 
 ELSE
 
@@ -503,387 +1751,1368 @@ ELSE
 
 ENDIF
 
-
-@return	key definitions or NULL */
-static
-merge_index_def_t*
-innobase_create_key_def(
-/*====================*/
-	trx_t*		trx,		/*!< in: trx */
-	const dict_table_t*table,		/*!< in: table definition */
-	mem_heap_t*	heap,		/*!< in: heap where space for key
-					definitions are allocated */
-	KEY*		key_info,	/*!< in: Indexes to be created */
-	ulint&		n_keys)		/*!< in/out: Number of indexes to
-					be created */
+@return	key definitions */
+static __attribute__((nonnull, warn_unused_result, malloc))
+index_def_t*
+innobase_create_key_defs(
+/*=====================*/
+	mem_heap_t*			heap,
+			/*!< in/out: memory heap where space for key
+			definitions are allocated */
+	const Alter_inplace_info*	ha_alter_info,
+			/*!< in: alter operation */
+	const TABLE*			altered_table,
+			/*!< in: MySQL table that is being altered */
+	ulint&				n_add,
+			/*!< in/out: number of indexes to be created */
+	ulint&				n_fts_add,
+			/*!< out: number of FTS indexes to be created */
+	bool				got_default_clust,
+			/*!< in: whether the table lacks a primary key */
+	ulint&				fts_doc_id_col,
+			/*!< in/out: The column number for Doc ID */
+	bool&				add_fts_doc_id,
+			/*!< in/out: whether we need to add new DOC ID
+			column for FTS index */
+	bool&				add_fts_doc_idx)
+			/*!< in/out: whether we need to add new DOC ID
+			index for FTS index */
 {
-	ulint			i = 0;
-	merge_index_def_t*	indexdef;
-	merge_index_def_t*	indexdefs;
+	index_def_t*		indexdef;
+	index_def_t*		indexdefs;
 	bool			new_primary;
+	const uint*const	add
+		= ha_alter_info->index_add_buffer;
+	const KEY*const		key_info
+		= ha_alter_info->key_info_buffer;
 
-	DBUG_ENTER("innobase_create_key_def");
-
-	indexdef = indexdefs = (merge_index_def_t*)
-		mem_heap_alloc(heap, sizeof *indexdef
-			       * (n_keys + UT_LIST_GET_LEN(table->indexes)));
+	DBUG_ENTER("innobase_create_key_defs");
+	DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx);
+	DBUG_ASSERT(ha_alter_info->index_add_count == n_add);
 
 	/* If there is a primary key, it is always the first index
-	defined for the table. */
+	defined for the table. */
 
-	new_primary = !my_strcasecmp(system_charset_info,
-				     key_info->name, "PRIMARY");
+	new_primary = n_add > 0
+		&& !my_strcasecmp(system_charset_info,
+				  key_info[*add].name, "PRIMARY");
+	n_fts_add = 0;
 
 	/* If there is a UNIQUE INDEX consisting entirely of NOT NULL
 	columns and if the index does not contain column prefix(es)
 	(only prefix/part of the column is indexed), MySQL will treat the
 	index as a PRIMARY KEY unless the table already has one. */
 
-	if (!new_primary && (key_info->flags & HA_NOSAME)
-	    && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG))
-	    && row_table_got_default_clust_index(table)) {
-		uint	key_part = key_info->key_parts;
+	if (n_add > 0 && !new_primary && got_default_clust
+	    && (key_info[*add].flags & HA_NOSAME)
+	    && !(key_info[*add].flags & HA_KEY_HAS_PART_KEY_SEG)) {
+		uint	key_part = key_info[*add].user_defined_key_parts;
 
-		new_primary = TRUE;
+		new_primary = true;
 
 		while (key_part--) {
-			if (key_info->key_part[key_part].key_type
-			    & FIELDFLAG_MAYBE_NULL) {
-				new_primary = FALSE;
+			const uint	maybe_null
+				= key_info[*add].key_part[key_part].key_type
+				& FIELDFLAG_MAYBE_NULL;
+			DBUG_ASSERT(!maybe_null
+				    == !key_info[*add].key_part[key_part].
+				    field->real_maybe_null());
+
+			if (maybe_null) {
+				new_primary = false;
 				break;
 			}
 		}
 	}
 
-	if (new_primary) {
-		const dict_index_t*	index;
+	const bool rebuild = new_primary || add_fts_doc_id
+		|| innobase_need_rebuild(ha_alter_info);
+	/* Reserve one more space if new_primary is true, and we might
+	need to add the FTS_DOC_ID_INDEX */
+	indexdef = indexdefs = static_cast<index_def_t*>(
+		mem_heap_alloc(
+			heap, sizeof *indexdef
+			* (ha_alter_info->key_count
+			   + rebuild
+			   + got_default_clust)));
+
+	if (rebuild) {
+		ulint	primary_key_number;
+
+		if (new_primary) {
+			DBUG_ASSERT(n_add > 0);
+			primary_key_number = *add;
+		} else if (got_default_clust) {
+			/* Create the GEN_CLUST_INDEX */
+			index_def_t*	index = indexdef++;
+
+			index->fields = NULL;
+			index->n_fields = 0;
+			index->ind_type = DICT_CLUSTERED;
+			index->name = mem_heap_strdup(
+				heap, innobase_index_reserve_name);
+			index->key_number = ~0;
+			primary_key_number = ULINT_UNDEFINED;
+			goto created_clustered;
+		} else {
+			primary_key_number = 0;
+		}
 
 		/* Create the PRIMARY key index definition */
-		innobase_create_index_def(&key_info[i++], TRUE, TRUE,
-					  indexdef++, heap);
+		innobase_create_index_def(
+			altered_table, key_info, primary_key_number,
+			TRUE, TRUE, indexdef++, heap);
 
-		row_mysql_lock_data_dictionary(trx);
+created_clustered:
+		n_add = 1;
 
-		index = dict_table_get_first_index(table);
+		for (ulint i = 0; i < ha_alter_info->key_count; i++) {
+			if (i == primary_key_number) {
+				continue;
+			}
+			/* Copy the index definitions. */
+			innobase_create_index_def(
+				altered_table, key_info, i, TRUE, FALSE,
+				indexdef, heap);
 
-		/* Copy the index definitions of the old table.  Skip
-		the old clustered index if it is a generated clustered
-		index or a PRIMARY KEY.  If the clustered index is a
-		UNIQUE INDEX, it must be converted to a secondary index. */
+			if (indexdef->ind_type & DICT_FTS) {
+				n_fts_add++;
+			}
 
-		if (dict_index_get_nth_col(index, 0)->mtype == DATA_SYS
-		    || !my_strcasecmp(system_charset_info,
-				      index->name, "PRIMARY")) {
-			index = dict_table_get_next_index(index);
+			indexdef++;
+			n_add++;
 		}
 
-		while (index) {
-			innobase_copy_index_def(index, indexdef++, heap);
-			index = dict_table_get_next_index(index);
+		if (n_fts_add > 0) {
+			if (!add_fts_doc_id
+			    && !innobase_fts_check_doc_id_col(
+				    NULL, altered_table,
+				    &fts_doc_id_col)) {
+				fts_doc_id_col =
+                                  altered_table->s->stored_fields;
+				add_fts_doc_id = true;
+			}
+
+			if (!add_fts_doc_idx) {
+				fts_doc_id_index_enum	ret;
+				ulint			doc_col_no;
+
+				ret = innobase_fts_check_doc_id_index(
+					NULL, altered_table, &doc_col_no);
+
+				/* This should have been checked before */
+				ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX);
+
+				if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) {
+					add_fts_doc_idx = true;
+				} else {
+					ut_ad(ret == FTS_EXIST_DOC_ID_INDEX);
+					ut_ad(doc_col_no == ULINT_UNDEFINED
+					      || doc_col_no == fts_doc_id_col);
+				}
+			}
 		}
+	} else {
+		/* Create definitions for added secondary indexes. */
 
-		row_mysql_unlock_data_dictionary(trx);
-	}
+		for (ulint i = 0; i < n_add; i++) {
+			innobase_create_index_def(
+				altered_table, key_info, add[i], FALSE, FALSE,
+				indexdef, heap);
 
-	/* Create definitions for added secondary indexes. */
+			if (indexdef->ind_type & DICT_FTS) {
+				n_fts_add++;
+			}
 
-	while (i < n_keys) {
-		innobase_create_index_def(&key_info[i++], new_primary, FALSE,
-					  indexdef++, heap);
+			indexdef++;
+		}
 	}
 
-	n_keys = indexdef - indexdefs;
+	DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+	if (add_fts_doc_idx) {
+		index_def_t*	index = indexdef++;
+
+		index->fields = static_cast<index_field_t*>(
+			mem_heap_alloc(heap, sizeof *index->fields));
+		index->n_fields = 1;
+		index->fields->col_no = fts_doc_id_col;
+		index->fields->prefix_len = 0;
+		index->ind_type = DICT_UNIQUE;
+
+		if (rebuild) {
+			index->name = mem_heap_strdup(
+				heap, FTS_DOC_ID_INDEX_NAME);
+			ut_ad(!add_fts_doc_id
+			      || fts_doc_id_col == altered_table->s->stored_fields);
+		} else {
+			char*	index_name;
+			index->name = index_name = static_cast<char*>(
+				mem_heap_alloc(
+					heap,
+					1 + sizeof FTS_DOC_ID_INDEX_NAME));
+			*index_name++ = TEMP_INDEX_PREFIX;
+			memcpy(index_name, FTS_DOC_ID_INDEX_NAME,
+			       sizeof FTS_DOC_ID_INDEX_NAME);
+		}
+
+		/* TODO: assign a real MySQL key number for this */
+		index->key_number = ULINT_UNDEFINED;
+		n_add++;
+	}
 
+	DBUG_ASSERT(indexdef > indexdefs);
+	DBUG_ASSERT((ulint) (indexdef - indexdefs)
+		    <= ha_alter_info->key_count
+		    + add_fts_doc_idx + got_default_clust);
+	DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
 	DBUG_RETURN(indexdefs);
 }
 
 /*******************************************************************//**
 Check each index column size, make sure they do not exceed the max limit
-@return	HA_ERR_INDEX_COL_TOO_LONG if index column size exceeds limit */
-static
-int
+@return	true if index column size exceeds limit */
+static __attribute__((nonnull, warn_unused_result))
+bool
 innobase_check_column_length(
 /*=========================*/
-	const dict_table_t*table,	/*!< in: table definition */
+	ulint		max_col_len,	/*!< in: maximum column length */
 	const KEY*	key_info)	/*!< in: Indexes to be created */
 {
-	ulint	max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
-
-	for (ulint key_part = 0; key_part < key_info->key_parts; key_part++) {
+	for (ulint key_part = 0; key_part < key_info->user_defined_key_parts; key_part++) {
 		if (key_info->key_part[key_part].length > max_col_len) {
-			my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), max_col_len);
-			return(HA_ERR_INDEX_COL_TOO_LONG);
+			return(true);
 		}
 	}
-	return(0);
+	return(false);
 }
 
-/*******************************************************************//**
-Create a temporary tablename using query id, thread id, and id
-@return	temporary tablename */
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+	/** Dummy query graph */
+	que_thr_t*	thr;
+	/** reference to the prebuilt struct of the creating instance */
+	row_prebuilt_t*&prebuilt;
+	/** InnoDB indexes being created */
+	dict_index_t**	add_index;
+	/** MySQL key numbers for the InnoDB indexes that are being created */
+	const ulint*	add_key_numbers;
+	/** number of InnoDB indexes being created */
+	ulint		num_to_add_index;
+	/** InnoDB indexes being dropped */
+	dict_index_t**	drop_index;
+	/** number of InnoDB indexes being dropped */
+	const ulint	num_to_drop_index;
+	/** InnoDB foreign key constraints being dropped */
+	dict_foreign_t** drop_fk;
+	/** number of InnoDB foreign key constraints being dropped */
+	const ulint	num_to_drop_fk;
+	/** InnoDB foreign key constraints being added */
+	dict_foreign_t** add_fk;
+	/** number of InnoDB foreign key constraints being added */
+	const ulint	num_to_add_fk;
+	/** whether to create the indexes online */
+	bool		online;
+	/** memory heap */
+	mem_heap_t*	heap;
+	/** dictionary transaction */
+	trx_t*		trx;
+	/** original table (if rebuilt, differs from new_table) */
+	dict_table_t*	old_table;
+	/** table where the indexes are being created or dropped */
+	dict_table_t*	new_table;
+	/** mapping of old column numbers to new ones, or NULL */
+	const ulint*	col_map;
+	/** new column names, or NULL if nothing was renamed */
+	const char**	col_names;
+	/** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+	const ulint	add_autoinc;
+	/** default values of ADD COLUMN, or NULL */
+	const dtuple_t*	add_cols;
+	/** autoinc sequence to use */
+	ib_sequence_t	sequence;
+	/** maximum auto-increment value */
+	ulonglong	max_autoinc;
+	/** temporary table name to use for old table when renaming tables */
+	const char*	tmp_name;
+
+	ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
+				dict_index_t** drop_arg,
+				ulint num_to_drop_arg,
+				dict_foreign_t** drop_fk_arg,
+				ulint num_to_drop_fk_arg,
+				dict_foreign_t** add_fk_arg,
+				ulint num_to_add_fk_arg,
+				bool online_arg,
+				mem_heap_t* heap_arg,
+				dict_table_t* new_table_arg,
+				const char** col_names_arg,
+				ulint add_autoinc_arg,
+				ulonglong autoinc_col_min_value_arg,
+				ulonglong autoinc_col_max_value_arg) :
+		inplace_alter_handler_ctx(),
+		prebuilt (prebuilt_arg),
+		add_index (0), add_key_numbers (0), num_to_add_index (0),
+		drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
+		drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
+		add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
+		online (online_arg), heap (heap_arg), trx (0),
+		old_table (prebuilt_arg->table),
+		new_table (new_table_arg),
+		col_map (0), col_names (col_names_arg),
+		add_autoinc (add_autoinc_arg),
+		add_cols (0),
+		sequence(prebuilt->trx->mysql_thd,
+			 autoinc_col_min_value_arg, autoinc_col_max_value_arg),
+		max_autoinc (0),
+		tmp_name (0)
+	{
+#ifdef UNIV_DEBUG
+		for (ulint i = 0; i < num_to_add_index; i++) {
+			ut_ad(!add_index[i]->to_be_dropped);
+		}
+		for (ulint i = 0; i < num_to_drop_index; i++) {
+			ut_ad(drop_index[i]->to_be_dropped);
+		}
+#endif /* UNIV_DEBUG */
+
+		thr = pars_complete_graph_for_exec(NULL, prebuilt->trx, heap);
+	}
+
+	~ha_innobase_inplace_ctx()
+	{
+		mem_heap_free(heap);
+	}
+
+	/** Determine if the table will be rebuilt.
+	@return whether the table will be rebuilt */
+	bool need_rebuild () const { return(old_table != new_table); }
+
+private:
+	// Disable copying
+	ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&);
+	ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&);
+};
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
 static
-char*
-innobase_create_temporary_tablename(
-/*================================*/
-	mem_heap_t*	heap,		/*!< in: memory heap */
-	char		id,		/*!< in: identifier [0-9a-zA-Z] */
-	const char*     table_name)	/*!< in: table name */
+void
+online_retry_drop_indexes_low(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction */
 {
-	char*			name;
-	ulint			len;
-	static const char	suffix[] = "@0023 "; /* "# " */
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+	/* We can have table->n_ref_count > 1, because other threads
+	may have prebuilt->table pointing to the table. However, these
+	other threads should be between statements, waiting for the
+	next statement to execute, or for a meta-data lock. */
+	ut_ad(table->n_ref_count >= 1);
+
+	if (table->drop_aborted) {
+		row_merge_drop_indexes(trx, table, TRUE);
+	}
+}
 
-	len = strlen(table_name);
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static __attribute__((nonnull))
+void
+online_retry_drop_indexes(
+/*======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	THD*		user_thd)	/*!< in/out: MySQL connection */
+{
+	if (table->drop_aborted) {
+		trx_t*	trx = innobase_trx_allocate(user_thd);
 
-	name = (char*) mem_heap_alloc(heap, len + sizeof suffix);
-	memcpy(name, table_name, len);
-	memcpy(name + len, suffix, sizeof suffix);
-	name[len + (sizeof suffix - 2)] = id;
+		trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
 
-	return(name);
+		row_mysql_lock_data_dictionary(trx);
+		online_retry_drop_indexes_low(table, trx);
+		trx_commit_for_mysql(trx);
+		row_mysql_unlock_data_dictionary(trx);
+		trx_free_for_mysql(trx);
+	}
+
+#ifdef UNIV_DEBUG
+	mutex_enter(&dict_sys->mutex);
+	dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE);
+	mutex_exit(&dict_sys->mutex);
+	ut_a(!table->drop_aborted);
+#endif /* UNIV_DEBUG */
 }
 
-class ha_innobase_add_index : public handler_add_index
+/********************************************************************//**
+Commit a dictionary transaction and drop any indexes that we were not
+able to free previously due to open table handles. */
+static __attribute__((nonnull))
+void
+online_retry_drop_indexes_with_trx(
+/*===============================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction */
 {
-public:
-	/** table where the indexes are being created */
-	dict_table_t* indexed_table;
-	ha_innobase_add_index(TABLE* table, KEY* key_info, uint num_of_keys,
-			      dict_table_t* indexed_table_arg) :
-		handler_add_index(table, key_info, num_of_keys),
-		indexed_table (indexed_table_arg) {}
-	~ha_innobase_add_index() {}
-};
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
 
-/*******************************************************************//**
-Create indexes.
-@return	0 or error number */
-UNIV_INTERN
-int
-ha_innobase::add_index(
-/*===================*/
-	TABLE*			table,		/*!< in: Table where indexes
-						are created */
-	KEY*			key_info,	/*!< in: Indexes
-						to be created */
-	uint			num_of_keys,	/*!< in: Number of indexes
-						to be created */
-	handler_add_index**	add)		/*!< out: context */
+	/* Now that the dictionary is being locked, check if we can
+	drop any incompletely created indexes that may have been left
+	behind in rollback_inplace_alter_table() earlier. */
+	if (table->drop_aborted) {
+
+		trx->table_id = 0;
+
+		trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+		online_retry_drop_indexes_low(table, trx);
+		trx_commit_for_mysql(trx);
+	}
+}
+
+/** Determines if InnoDB is dropping a foreign key constraint.
+@param foreign		the constraint
+@param drop_fk		constraints being dropped
+@param n_drop_fk	number of constraints that are being dropped
+@return whether the constraint is being dropped */
+inline __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_dropping_foreign(
+/*======================*/
+	const dict_foreign_t*	foreign,
+	dict_foreign_t**	drop_fk,
+	ulint			n_drop_fk)
 {
-	dict_index_t**	index;		/*!< Index to be created */
-	dict_table_t*	indexed_table;	/*!< Table where indexes are created */
-	merge_index_def_t* index_defs;	/*!< Index definitions */
-	mem_heap_t*     heap;		/*!< Heap for index definitions */
-	trx_t*		trx;		/*!< Transaction */
-	ulint		num_of_idx;
-	ulint		num_created	= 0;
-	ibool		dict_locked	= FALSE;
-	ulint		new_primary;
-	int		error;
+	while (n_drop_fk--) {
+		if (*drop_fk++ == foreign) {
+			return(true);
+		}
+	}
 
-	DBUG_ENTER("ha_innobase::add_index");
-	ut_a(table);
-	ut_a(key_info);
-	ut_a(num_of_keys);
+	return(false);
+}
 
-	*add = NULL;
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param user_table	InnoDB table as it is before the ALTER operation
+@param col_name		Name of the column being altered
+@param drop_fk		constraints being dropped
+@param n_drop_fk	number of constraints that are being dropped
+@param drop		true=drop column, false=set NOT NULL
+@retval true		Not allowed (will call my_error())
+@retval false		Allowed
+*/
+static __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_check_foreigns_low(
+/*========================*/
+	const dict_table_t*	user_table,
+	dict_foreign_t**	drop_fk,
+	ulint			n_drop_fk,
+	const char*		col_name,
+	bool			drop)
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* Check if any FOREIGN KEY constraints are defined on this
+	column. */
+	for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+		     user_table->foreign_list);
+	     foreign;
+	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+		if (!drop && !(foreign->type
+			       & (DICT_FOREIGN_ON_DELETE_SET_NULL
+				  | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+			continue;
+		}
 
-	if (srv_created_new_raw || srv_force_recovery) {
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+		if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+			continue;
+		}
+
+		for (unsigned f = 0; f < foreign->n_fields; f++) {
+			if (!strcmp(foreign->foreign_col_names[f],
+				    col_name)) {
+				my_error(drop
+					 ? ER_FK_COLUMN_CANNOT_DROP
+					 : ER_FK_COLUMN_NOT_NULL, MYF(0),
+					 col_name, foreign->id);
+				return(true);
+			}
+		}
 	}
 
-	update_thd();
+	if (!drop) {
+		/* SET NULL clauses on foreign key constraints of
+		child tables affect the child tables, not the parent table.
+		The column can be NOT NULL in the parent table. */
+		return(false);
+	}
 
-	/* In case MySQL calls this in the middle of a SELECT query, release
-	possible adaptive hash latch to avoid deadlocks of threads. */
-	trx_search_latch_release_if_reserved(prebuilt->trx);
+	/* Check if any FOREIGN KEY constraints in other tables are
+	referring to the column that is being dropped. */
+	for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+		     user_table->referenced_list);
+	     foreign;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+		if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+			continue;
+		}
 
-	if (UNIV_UNLIKELY(prebuilt->trx->fake_changes)) {
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+		for (unsigned f = 0; f < foreign->n_fields; f++) {
+			char display_name[FN_REFLEN];
+
+			if (strcmp(foreign->referenced_col_names[f],
+				   col_name)) {
+				continue;
+			}
+
+			char* buf_end = innobase_convert_name(
+				display_name, (sizeof display_name) - 1,
+				foreign->foreign_table_name,
+				strlen(foreign->foreign_table_name),
+				NULL, TRUE);
+			*buf_end = '\0';
+			my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD,
+				 MYF(0), col_name, foreign->id,
+				 display_name);
+
+			return(true);
+		}
 	}
 
-	/* Check if the index name is reserved. */
-	if (innobase_index_name_is_reserved(user_thd, key_info, num_of_keys)) {
-		DBUG_RETURN(ER_WRONG_NAME_FOR_INDEX);
+	return(false);
+}
+
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param old_table	MySQL table as it is before the ALTER operation
+@param user_table	InnoDB table as it is before the ALTER operation
+@param drop_fk		constraints being dropped
+@param n_drop_fk	number of constraints that are being dropped
+@retval true		Not allowed (will call my_error())
+@retval false		Allowed
+*/
+static __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_check_foreigns(
+/*====================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		old_table,
+	const dict_table_t*	user_table,
+	dict_foreign_t**	drop_fk,
+	ulint			n_drop_fk)
+{
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+
+	for (Field** fp = old_table->field; *fp; fp++) {
+		cf_it.rewind();
+		const Create_field* new_field;
+
+		ut_ad(!(*fp)->real_maybe_null()
+		      == !!((*fp)->flags & NOT_NULL_FLAG));
+
+		while ((new_field = cf_it++)) {
+			if (new_field->field == *fp) {
+				break;
+			}
+		}
+
+		if (!new_field || (new_field->flags & NOT_NULL_FLAG)) {
+			if (innobase_check_foreigns_low(
+				    user_table, drop_fk, n_drop_fk,
+				    (*fp)->field_name, !new_field)) {
+				return(true);
+			}
+		}
 	}
 
-	indexed_table = dict_table_get(prebuilt->table->name, FALSE,
-				       DICT_ERR_IGNORE_NONE);
+	return(false);
+}
+
+/** Convert a default value for ADD COLUMN.
+
+@param heap	Memory heap where allocated
+@param dfield	InnoDB data field to copy to
+@param field	MySQL value for the column
+@param comp	nonzero if in compact format */
+static __attribute__((nonnull))
+void
+innobase_build_col_map_add(
+/*=======================*/
+	mem_heap_t*	heap,
+	dfield_t*	dfield,
+	const Field*	field,
+	ulint		comp)
+{
+	if (field->is_real_null()) {
+		dfield_set_null(dfield);
+		return;
+	}
+
+	ulint	size	= field->pack_length();
+
+	byte*	buf	= static_cast<byte*>(mem_heap_alloc(heap, size));
 
-	if (UNIV_UNLIKELY(!indexed_table)) {
-		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+	row_mysql_store_col_in_innobase_format(
+		dfield, buf, TRUE, field->ptr, size, comp);
+}
+
+/** Construct the translation table for reordering, dropping or
+adding columns.
+
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param table		MySQL table as it is before the ALTER operation
+@param new_table	InnoDB table corresponding to MySQL altered_table
+@param old_table	InnoDB table corresponding to MySQL table
+@param add_cols		Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap		Memory heap where allocated
+@return	array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static __attribute__((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	const dict_table_t*	new_table,
+	const dict_table_t*	old_table,
+	dtuple_t*		add_cols,
+	mem_heap_t*		heap)
+{
+        uint old_i, old_innobase_i;
+	DBUG_ENTER("innobase_build_col_map");
+	DBUG_ASSERT(altered_table != table);
+	DBUG_ASSERT(new_table != old_table);
+	DBUG_ASSERT(dict_table_get_n_cols(new_table)
+		    >= altered_table->s->stored_fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(dict_table_get_n_cols(old_table)
+		    >= table->s->stored_fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(!!add_cols == !!(ha_alter_info->handler_flags
+				     & Alter_inplace_info::ADD_COLUMN));
+	DBUG_ASSERT(!add_cols || dtuple_get_n_fields(add_cols)
+		    == dict_table_get_n_cols(new_table));
+
+	ulint*	col_map = static_cast<ulint*>(
+		mem_heap_alloc(heap, old_table->n_cols * sizeof *col_map));
+
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	uint i = 0, sql_idx = 0;
+
+	/* Any dropped columns will map to ULINT_UNDEFINED. */
+	for (old_innobase_i = 0;
+             old_innobase_i + DATA_N_SYS_COLS < old_table->n_cols;
+	     old_innobase_i++) {
+		col_map[old_innobase_i] = ULINT_UNDEFINED;
 	}
 
-	ut_a(indexed_table == prebuilt->table);
+	while (const Create_field* new_field = cf_it++) {
+                if (!new_field->stored_in_db)
+                {
+                  sql_idx++;
+                  continue;
+                }
+		for (old_i = 0, old_innobase_i= 0;
+                     table->field[old_i];
+                     old_i++) {
+			const Field* field = table->field[old_i];
+                        if (!table->field[old_i]->stored_in_db)
+                          continue;
+			if (new_field->field == field) {
+				col_map[old_innobase_i] = i;
+				goto found_col;
+			}
+                        old_innobase_i++;
+		}
 
-	if (indexed_table->tablespace_discarded) {
-		DBUG_RETURN(-1);
+		innobase_build_col_map_add(
+			heap, dtuple_get_nth_field(add_cols, i),
+			altered_table->s->field[sql_idx],
+			dict_table_is_comp(new_table));
+found_col:
+		i++;
+                sql_idx++;
 	}
 
-	/* Check that index keys are sensible */
-	error = innobase_check_index_keys(key_info, num_of_keys, prebuilt->table);
+	DBUG_ASSERT(i == altered_table->s->stored_fields);
+
+	i = table->s->stored_fields;
+
+	/* Add the InnoDB hidden FTS_DOC_ID column, if any. */
+	if (i + DATA_N_SYS_COLS < old_table->n_cols) {
+		/* There should be exactly one extra field,
+		the FTS_DOC_ID. */
+		DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+						 DICT_TF2_FTS_HAS_DOC_ID));
+		DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+		DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+					    old_table, table->s->stored_fields),
+				    FTS_DOC_ID_COL_NAME));
+		if (altered_table->s->stored_fields + DATA_N_SYS_COLS
+		    < new_table->n_cols) {
+			DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			DBUG_ASSERT(altered_table->s->stored_fields
+				    + DATA_N_SYS_COLS + 1
+				    == new_table->n_cols);
+			col_map[i] = altered_table->s->stored_fields;
+		} else {
+			DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			col_map[i] = ULINT_UNDEFINED;
+		}
+
+		i++;
+	} else {
+		DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+				    old_table,
+				    DICT_TF2_FTS_HAS_DOC_ID));
+	}
 
-	if (UNIV_UNLIKELY(error)) {
-		DBUG_RETURN(error);
+	for (; i < old_table->n_cols; i++) {
+		col_map[i] = i + new_table->n_cols - old_table->n_cols;
 	}
 
-	/* Check each index's column length to make sure they do not
-	exceed limit */
-	for (ulint i = 0; i < num_of_keys; i++) {
-		error = innobase_check_column_length(prebuilt->table,
-						     &key_info[i]);
+	DBUG_RETURN(col_map);
+}
+
+/** Drop newly created FTS index related auxiliary tables during
+FIC create index process, before fts_add_index is called
+@param table    table that was being rebuilt online
+@param trx	transaction
+@return		DB_SUCCESS if successful, otherwise last error code
+*/
+static
+dberr_t
+innobase_drop_fts_index_table(
+/*==========================*/
+        dict_table_t*   table,
+	trx_t*		trx)
+{
+	dberr_t		ret_err = DB_SUCCESS;
+
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+		if (index->type & DICT_FTS) {
+			dberr_t	err;
 
-		if (error) {
-			DBUG_RETURN(error);
+			err = fts_drop_index_tables(trx, index);
+
+			if (err != DB_SUCCESS) {
+				ret_err = err;
+			}
 		}
 	}
 
-	heap = mem_heap_create(1024);
-	trx_start_if_not_started(prebuilt->trx);
+	return(ret_err);
+}
+
+/** Get the new column names if any columns were renamed
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param user_table	InnoDB table as it is before the ALTER operation
+@param heap		Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static __attribute__((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const dict_table_t*	user_table,
+	mem_heap_t*		heap)
+{
+	const char**		cols;
+	uint			i;
+
+	DBUG_ENTER("innobase_get_col_names");
+	DBUG_ASSERT(user_table->n_def > altered_table->s->fields);
+	DBUG_ASSERT(ha_alter_info->handler_flags
+		    & Alter_inplace_info::ALTER_COLUMN_NAME);
+
+	cols = static_cast<const char**>(
+		mem_heap_alloc(heap, user_table->n_def * sizeof *cols));
+
+	for (i = 0; i < altered_table->s->fields; i++) {
+		const Field*	field = altered_table->field[i];
+		cols[i] = field->field_name;
+	}
+
+	/* Copy the internal column names. */
+	cols[i] = dict_table_get_col_name(user_table, i);
+
+	while (++i < user_table->n_def) {
+		cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+	}
+
+	DBUG_RETURN(cols);
+}
+
+/** Update internal structures with concurrent writes blocked,
+while preparing ALTER TABLE.
+
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param old_table	MySQL table as it is before the ALTER operation
+@param table_name	Table name in MySQL
+@param flags		Table and tablespace flags
+@param flags2		Additional table flags
+@param fts_doc_id_col	The column number of FTS_DOC_ID
+@param add_fts_doc_id	Flag: add column FTS_DOC_ID?
+@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)?
+
+@retval true		Failure
+@retval false		Success
+*/
+static __attribute__((warn_unused_result, nonnull(1,2,3,4)))
+bool
+prepare_inplace_alter_table_dict(
+/*=============================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		old_table,
+	const char*		table_name,
+	ulint			flags,
+	ulint			flags2,
+	ulint			fts_doc_id_col,
+	bool			add_fts_doc_id,
+	bool			add_fts_doc_id_idx)
+{
+	bool			dict_locked	= false;
+	ulint*			add_key_nums;	/* MySQL key numbers */
+	index_def_t*		index_defs;	/* index definitions */
+	dict_table_t*		user_table;
+	dict_index_t*		fts_index	= NULL;
+	ulint			new_clustered	= 0;
+	dberr_t			error;
+	ulint			num_fts_index;
+	ha_innobase_inplace_ctx*ctx;
+        uint                    sql_idx;
+
+	DBUG_ENTER("prepare_inplace_alter_table_dict");
+
+	ctx = static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED)
+		    == (ctx->sequence.m_max_value > 0));
+	DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index);
+	DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk);
+	DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx);
+	DBUG_ASSERT(!add_fts_doc_id_idx
+		    || innobase_fulltext_exist(altered_table));
+	DBUG_ASSERT(!ctx->add_cols);
+	DBUG_ASSERT(!ctx->add_index);
+	DBUG_ASSERT(!ctx->add_key_numbers);
+	DBUG_ASSERT(!ctx->num_to_add_index);
+
+	user_table = ctx->new_table;
+
+	trx_start_if_not_started_xa(ctx->prebuilt->trx);
 
 	/* Create a background transaction for the operations on
 	the data dictionary tables. */
-	trx = innobase_trx_allocate(user_thd);
-	if (UNIV_UNLIKELY(trx->fake_changes)) {
-		mem_heap_free(heap);
-		trx_general_rollback_for_mysql(trx, NULL);
-		trx_free_for_mysql(trx);
+	ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd);
+
+	if (UNIV_UNLIKELY(ctx->trx->fake_changes)) {
+		trx_rollback_to_savepoint(ctx->trx, NULL);
+		trx_free_for_mysql(ctx->trx);
 		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 	}
 
-	trx_start_if_not_started(trx);
+	trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
 
 	/* Create table containing all indexes to be built in this
-	alter table add index so that they are in the correct order
+	ALTER TABLE ADD INDEX so that they are in the correct order
 	in the table. */
 
-	num_of_idx = num_of_keys;
+	ctx->num_to_add_index = ha_alter_info->index_add_count;
+
+	index_defs = innobase_create_key_defs(
+		ctx->heap, ha_alter_info, altered_table, ctx->num_to_add_index,
+		num_fts_index,
+		row_table_got_default_clust_index(ctx->new_table),
+		fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx);
+
+	new_clustered = DICT_CLUSTERED & index_defs[0].ind_type;
+
+	if (num_fts_index > 1) {
+		my_error(ER_INNODB_FT_LIMIT, MYF(0));
+		goto error_handled;
+	}
 
-	index_defs = innobase_create_key_def(
-		trx, prebuilt->table, heap, key_info, num_of_idx);
+	if (!ctx->online) {
+		/* This is not an online operation (LOCK=NONE). */
+	} else if (ctx->add_autoinc == ULINT_UNDEFINED
+		   && num_fts_index == 0
+		   && (!innobase_need_rebuild(ha_alter_info)
+		       || !innobase_fulltext_exist(altered_table))) {
+		/* InnoDB can perform an online operation (LOCK=NONE). */
+	} else {
+		/* This should have been blocked in
+		check_if_supported_inplace_alter(). */
+		ut_ad(0);
+		my_error(ER_NOT_SUPPORTED_YET, MYF(0),
+			 thd_query_string(ctx->prebuilt->trx->mysql_thd)->str);
+		goto error_handled;
+	}
 
-	new_primary = DICT_CLUSTERED & index_defs[0].ind_type;
+	/* The primary index would be rebuilt if a FTS Doc ID
+	column is to be added, and the primary index definition
+	is just copied from old table and stored in index_defs[0] */
+	DBUG_ASSERT(!add_fts_doc_id || new_clustered);
+	DBUG_ASSERT(!!new_clustered ==
+		    (innobase_need_rebuild(ha_alter_info)
+		     || add_fts_doc_id));
 
 	/* Allocate memory for dictionary index definitions */
 
-	index = (dict_index_t**) mem_heap_alloc(
-		heap, num_of_idx * sizeof *index);
+	ctx->add_index = static_cast<dict_index_t**>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+			       * sizeof *ctx->add_index));
+	ctx->add_key_numbers = add_key_nums = static_cast<ulint*>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+			       * sizeof *ctx->add_key_numbers));
 
-	/* Flag this transaction as a dictionary operation, so that
-	the data dictionary will be locked in crash recovery. */
-	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	/* This transaction should be a dictionary operation, so that
+	the data dictionary will be locked during crash recovery. */
+
+	ut_ad(ctx->trx->dict_operation == TRX_DICT_OP_INDEX);
 
 	/* Acquire a lock on the table before creating any indexes. */
-	error = row_merge_lock_table(prebuilt->trx, prebuilt->table,
-				     new_primary ? LOCK_X : LOCK_S);
 
-	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+	if (ctx->online) {
+		error = DB_SUCCESS;
+	} else {
+		error = row_merge_lock_table(
+			ctx->prebuilt->trx, ctx->new_table, LOCK_S);
+
+		if (error != DB_SUCCESS) {
 
-		goto error_handling;
+			goto error_handling;
+		}
 	}
 
 	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
 	or lock waits can happen in it during an index create operation. */
 
-	row_mysql_lock_data_dictionary(trx);
-	dict_locked = TRUE;
+	row_mysql_lock_data_dictionary(ctx->trx);
+	dict_locked = true;
+
+	/* Wait for background stats processing to stop using the table that
+	we are going to alter. We know bg stats will not start using it again
+	until we are holding the data dict locked and we are holding it here
+	at least until checking ut_ad(user_table->n_ref_count == 1) below.
+	XXX what may happen if bg stats opens the table after we
+	have unlocked data dictionary below? */
+	dict_stats_wait_bg_to_stop_using_table(user_table, ctx->trx);
+
+	online_retry_drop_indexes_low(ctx->new_table, ctx->trx);
+
+	ut_d(dict_table_check_for_dup_indexes(
+		     ctx->new_table, CHECK_ABORTED_OK));
+
+	/* If a new clustered index is defined for the table we need
+	to rebuild the table with a temporary name. */
+
+	if (new_clustered) {
+		const char*	new_table_name
+			= dict_mem_create_temporary_tablename(
+				ctx->heap,
+				ctx->new_table->name,
+				ctx->new_table->id);
+		ulint		n_cols;
+		dtuple_t*	add_cols;
+
+		if (innobase_check_foreigns(
+			    ha_alter_info, altered_table, old_table,
+			    user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
+			goto new_clustered_failed;
+		}
 
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+		n_cols = altered_table->s->stored_fields;
 
-	/* If a new primary key is defined for the table we need
-	to drop the original table and rebuild all indexes. */
+		if (add_fts_doc_id) {
+			n_cols++;
+			DBUG_ASSERT(flags2 & DICT_TF2_FTS);
+			DBUG_ASSERT(add_fts_doc_id_idx);
+			flags2 |= DICT_TF2_FTS_ADD_DOC_ID
+				| DICT_TF2_FTS_HAS_DOC_ID
+				| DICT_TF2_FTS;
+		}
 
-	if (UNIV_UNLIKELY(new_primary)) {
-		/* This transaction should be the only one
-		operating on the table. */
-		ut_a(prebuilt->table->n_mysql_handles_opened == 1);
+		DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS));
 
-		char*	new_table_name = innobase_create_temporary_tablename(
-			heap, '1', prebuilt->table->name);
+		/* Create the table. */
+		trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE);
 
-		/* Clone the table. */
-		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-		indexed_table = row_merge_create_temporary_table(
-			new_table_name, index_defs, prebuilt->table, trx);
+		if (dict_table_get_low(new_table_name)) {
+			my_error(ER_TABLE_EXISTS_ERROR, MYF(0),
+				 new_table_name);
+			goto new_clustered_failed;
+		}
 
-		if (!indexed_table) {
+		/* The initial space id 0 may be overridden later. */
+		ctx->new_table = dict_mem_table_create(
+			new_table_name, 0, n_cols, flags, flags2);
+		/* The rebuilt table (ctx->new_table) will use the renamed
+		column names. */
+		ctx->col_names = NULL;
+
+		if (DICT_TF_HAS_DATA_DIR(flags)) {
+			ctx->new_table->data_dir_path =
+				mem_heap_strdup(ctx->new_table->heap,
+				user_table->data_dir_path);
+		}
 
-			switch (trx->error_state) {
-			case DB_TABLESPACE_ALREADY_EXISTS:
-			case DB_DUPLICATE_KEY:
-				innobase_convert_tablename(new_table_name);
-				my_error(HA_ERR_TABLE_EXIST, MYF(0),
-					 new_table_name);
-				error = HA_ERR_TABLE_EXIST;
-				break;
-			default:
-				error = convert_error_code_to_mysql(
-					trx->error_state,
-					prebuilt->table->flags,
-					user_thd);
+                sql_idx= 0;
+		for (uint i = 0; i < altered_table->s->stored_fields; i++, sql_idx++) {
+			const Field*	field;
+                        while (!((field= altered_table->field[sql_idx])->
+                                 stored_in_db))
+                          sql_idx++;
+			ulint		is_unsigned;
+			ulint		field_type
+				= (ulint) field->type();
+			ulint		col_type
+				= get_innobase_type_from_mysql_type(
+					&is_unsigned, field);
+			ulint		charset_no;
+			ulint		col_len;
+
+			/* we assume in dtype_form_prtype() that this
+			fits in two bytes */
+			ut_a(field_type <= MAX_CHAR_COLL_NUM);
+
+			if (!field->real_maybe_null()) {
+				field_type |= DATA_NOT_NULL;
+			}
+
+			if (field->binary()) {
+				field_type |= DATA_BINARY_TYPE;
+			}
+
+			if (is_unsigned) {
+				field_type |= DATA_UNSIGNED;
+			}
+
+			if (dtype_is_string_type(col_type)) {
+				charset_no = (ulint) field->charset()->number;
+
+				if (charset_no > MAX_CHAR_COLL_NUM) {
+					dict_mem_table_free(
+						ctx->new_table);
+					my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+						 field->field_name);
+					goto new_clustered_failed;
+				}
+			} else {
+				charset_no = 0;
+			}
+
+			col_len = field->pack_length();
+
+			/* The MySQL pack length contains 1 or 2 bytes
+			length field for a true VARCHAR. Let us
+			subtract that, so that the InnoDB column
+			length in the InnoDB data dictionary is the
+			real maximum byte length of the actual data. */
+
+			if (field->type() == MYSQL_TYPE_VARCHAR) {
+				uint32	length_bytes
+					= static_cast<const Field_varstring*>(
+						field)->length_bytes;
+
+				col_len -= length_bytes;
+
+				if (length_bytes == 2) {
+					field_type |= DATA_LONG_TRUE_VARCHAR;
+				}
+			}
+
+			if (dict_col_name_is_reserved(field->field_name)) {
+				dict_mem_table_free(ctx->new_table);
+				my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+					 field->field_name);
+				goto new_clustered_failed;
 			}
 
-			ut_d(dict_table_check_for_dup_indexes(prebuilt->table,
-							      TRUE));
-			mem_heap_free(heap);
-			trx_general_rollback_for_mysql(trx, NULL);
-			row_mysql_unlock_data_dictionary(trx);
-			trx_free_for_mysql(trx);
-			trx_commit_for_mysql(prebuilt->trx);
-			DBUG_RETURN(error);
+			dict_mem_table_add_col(
+				ctx->new_table, ctx->heap,
+				field->field_name,
+				col_type,
+				dtype_form_prtype(field_type, charset_no),
+				col_len);
 		}
 
-		trx->table_id = indexed_table->id;
+		if (add_fts_doc_id) {
+			fts_add_doc_id_column(ctx->new_table, ctx->heap);
+			ctx->new_table->fts->doc_col = fts_doc_id_col;
+			ut_ad(fts_doc_id_col == altered_table->s->stored_fields);
+		} else if (ctx->new_table->fts) {
+			ctx->new_table->fts->doc_col = fts_doc_id_col;
+		}
+
+		error = row_create_table_for_mysql(
+			ctx->new_table, ctx->trx, false);
+
+		switch (error) {
+			dict_table_t*	temp_table;
+		case DB_SUCCESS:
+			/* We need to bump up the table ref count and
+			before we can use it we need to open the
+			table. The new_table must be in the data
+			dictionary cache, because we are still holding
+			the dict_sys->mutex. */
+			ut_ad(mutex_own(&dict_sys->mutex));
+			temp_table = dict_table_open_on_name(
+				ctx->new_table->name, TRUE, FALSE,
+				DICT_ERR_IGNORE_NONE);
+			ut_a(ctx->new_table == temp_table);
+			/* n_ref_count must be 1, because purge cannot
+			be executing on this very table as we are
+			holding dict_operation_lock X-latch. */
+			DBUG_ASSERT(ctx->new_table->n_ref_count == 1);
+			break;
+		case DB_TABLESPACE_EXISTS:
+			my_error(ER_TABLESPACE_EXISTS, MYF(0),
+				 new_table_name);
+			goto new_clustered_failed;
+		case DB_DUPLICATE_KEY:
+			my_error(HA_ERR_TABLE_EXIST, MYF(0),
+				 altered_table->s->table_name.str);
+			goto new_clustered_failed;
+		default:
+			my_error_innodb(error, table_name, flags);
+		new_clustered_failed:
+			DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx);
+			trx_rollback_to_savepoint(ctx->trx, NULL);
+
+			ut_ad(user_table->n_ref_count == 1);
+
+			online_retry_drop_indexes_with_trx(
+				user_table, ctx->trx);
+			goto err_exit;
+		}
+
+		if (ha_alter_info->handler_flags
+		    & Alter_inplace_info::ADD_COLUMN) {
+			add_cols = dtuple_create(
+				ctx->heap,
+				dict_table_get_n_cols(ctx->new_table));
+
+			dict_table_copy_types(add_cols, ctx->new_table);
+		} else {
+			add_cols = NULL;
+		}
+
+		ctx->col_map = innobase_build_col_map(
+			ha_alter_info, altered_table, old_table,
+			ctx->new_table, user_table,
+			add_cols, ctx->heap);
+		ctx->add_cols = add_cols;
+	} else {
+		DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info));
+
+		if (!ctx->new_table->fts
+		    && innobase_fulltext_exist(altered_table)) {
+			ctx->new_table->fts = fts_create(
+				ctx->new_table);
+			ctx->new_table->fts->doc_col = fts_doc_id_col;
+		}
 	}
 
+	/* Assign table_id, so that no table id of
+	fts_create_index_tables() will be written to the undo logs. */
+	DBUG_ASSERT(ctx->new_table->id != 0);
+	ctx->trx->table_id = ctx->new_table->id;
+
 	/* Create the indexes in SYS_INDEXES and load into dictionary. */
 
-	for (num_created = 0; num_created < num_of_idx; num_created++) {
+	for (ulint a = 0; a < ctx->num_to_add_index; a++) {
+
+		ctx->add_index[a] = row_merge_create_index(
+			ctx->trx, ctx->new_table,
+			&index_defs[a]);
+
+		add_key_nums[a] = index_defs[a].key_number;
+
+		if (!ctx->add_index[a]) {
+			error = ctx->trx->error_state;
+			DBUG_ASSERT(error != DB_SUCCESS);
+			goto error_handling;
+		}
+
+		if (ctx->add_index[a]->type & DICT_FTS) {
+			DBUG_ASSERT(num_fts_index);
+			DBUG_ASSERT(!fts_index);
+			DBUG_ASSERT(ctx->add_index[a]->type == DICT_FTS);
+			fts_index = ctx->add_index[a];
+		}
+
+		/* If only online ALTER TABLE operations have been
+		requested, allocate a modification log. If the table
+		will be locked anyway, the modification
+		log is unnecessary. When rebuilding the table
+		(new_clustered), we will allocate the log for the
+		clustered index of the old table, later. */
+		if (new_clustered
+		    || !ctx->online
+		    || user_table->ibd_file_missing
+		    || dict_table_is_discarded(user_table)) {
+			/* No need to allocate a modification log. */
+			ut_ad(!ctx->add_index[a]->online_log);
+		} else if (ctx->add_index[a]->type & DICT_FTS) {
+			/* Fulltext indexes are not covered
+			by a modification log. */
+		} else {
+			DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter",
+					error = DB_OUT_OF_MEMORY;
+					goto error_handling;);
+			rw_lock_x_lock(&ctx->add_index[a]->lock);
+			bool ok = row_log_allocate(ctx->add_index[a],
+						   NULL, true, NULL, NULL);
+			rw_lock_x_unlock(&ctx->add_index[a]->lock);
+
+			if (!ok) {
+				error = DB_OUT_OF_MEMORY;
+				goto error_handling;
+			}
+		}
+	}
+
+	ut_ad(new_clustered == ctx->need_rebuild());
+
+	DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter",
+			error = DB_OUT_OF_MEMORY;
+			goto error_handling;);
+
+	if (new_clustered && ctx->online) {
+		/* Allocate a log for online table rebuild. */
+		dict_index_t* clust_index = dict_table_get_first_index(
+			user_table);
 
-		index[num_created] = row_merge_create_index(
-			trx, indexed_table, &index_defs[num_created]);
+		rw_lock_x_lock(&clust_index->lock);
+		bool ok = row_log_allocate(
+			clust_index, ctx->new_table,
+			!(ha_alter_info->handler_flags
+			  & Alter_inplace_info::ADD_PK_INDEX),
+			ctx->add_cols, ctx->col_map);
+		rw_lock_x_unlock(&clust_index->lock);
 
-		if (!index[num_created]) {
-			error = trx->error_state;
+		if (!ok) {
+			error = DB_OUT_OF_MEMORY;
 			goto error_handling;
 		}
 	}
 
-	ut_ad(error == DB_SUCCESS);
+	if (ctx->online) {
+		/* Assign a consistent read view for
+		row_merge_read_clustered_index(). */
+		trx_assign_read_view(ctx->prebuilt->trx);
+	}
 
-	/* Commit the data dictionary transaction in order to release
-	the table locks on the system tables.  This means that if
-	MySQL crashes while creating a new primary key inside
-	row_merge_build_indexes(), indexed_table will not be dropped
-	by trx_rollback_active().  It will have to be recovered or
-	dropped by the database administrator. */
-	trx_commit_for_mysql(trx);
+	if (fts_index) {
+		/* Ensure that the dictionary operation mode will
+		not change while creating the auxiliary tables. */
+		trx_dict_op_t	op = trx_get_dict_operation(ctx->trx);
 
-	row_mysql_unlock_data_dictionary(trx);
-	dict_locked = FALSE;
+#ifdef UNIV_DEBUG
+		switch (op) {
+		case TRX_DICT_OP_NONE:
+			break;
+		case TRX_DICT_OP_TABLE:
+		case TRX_DICT_OP_INDEX:
+			goto op_ok;
+		}
+		ut_error;
+op_ok:
+#endif /* UNIV_DEBUG */
+		ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH);
+		ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
-	ut_a(trx->n_active_thrs == 0);
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+		DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
 
-	if (UNIV_UNLIKELY(new_primary)) {
-		/* A primary key is to be built.  Acquire an exclusive
-		table lock also on the table that is being created. */
-		ut_ad(indexed_table != prebuilt->table);
+		/* This function will commit the transaction and reset
+		the trx_t::dict_operation flag on success. */
 
-		error = row_merge_lock_table(prebuilt->trx, indexed_table,
-					     LOCK_X);
+		error = fts_create_index_tables(ctx->trx, fts_index);
 
-		if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table",
+				error = DB_LOCK_WAIT_TIMEOUT;
+				goto error_handling;);
 
+		if (error != DB_SUCCESS) {
 			goto error_handling;
 		}
+
+		trx_start_for_ddl(ctx->trx, op);
+
+		if (!ctx->new_table->fts
+		    || ib_vector_size(ctx->new_table->fts->indexes) == 0) {
+			error = fts_create_common_tables(
+				ctx->trx, ctx->new_table,
+				user_table->name, TRUE);
+
+			DBUG_EXECUTE_IF(
+				"innodb_test_fail_after_fts_common_table",
+				error = DB_LOCK_WAIT_TIMEOUT;);
+
+			if (error != DB_SUCCESS) {
+				goto error_handling;
+			}
+
+			ctx->new_table->fts->fts_status
+				|= TABLE_DICT_LOCKED;
+
+			error = innobase_fts_load_stopword(
+				ctx->new_table, ctx->trx,
+				ctx->prebuilt->trx->mysql_thd)
+				? DB_SUCCESS : DB_ERROR;
+			ctx->new_table->fts->fts_status
+				&= ~TABLE_DICT_LOCKED;
+
+			if (error != DB_SUCCESS) {
+				goto error_handling;
+			}
+		}
+
+		ut_ad(trx_get_dict_operation(ctx->trx) == op);
 	}
 
-	/* Read the clustered index of the table and build indexes
-	based on this information using temporary files and merge sort. */
-	error = row_merge_build_indexes(prebuilt->trx,
-					prebuilt->table, indexed_table,
-					index, num_of_idx, table);
+	DBUG_ASSERT(error == DB_SUCCESS);
+
+	/* Commit the data dictionary transaction in order to release
+	the table locks on the system tables.  This means that if
+	MySQL crashes while creating a new primary key inside
+	row_merge_build_indexes(), ctx->new_table will not be dropped
+	by trx_rollback_active().  It will have to be recovered or
+	dropped by the database administrator. */
+	trx_commit_for_mysql(ctx->trx);
+
+	row_mysql_unlock_data_dictionary(ctx->trx);
+	dict_locked = false;
+
+	ut_a(ctx->trx->lock.n_active_thrs == 0);
 
 	DBUG_EXECUTE_IF("crash_innodb_add_index_after", DBUG_SUICIDE(););
 
@@ -896,507 +3125,2434 @@ error_handling:
 		ut_a(!dict_locked);
 
 		ut_d(mutex_enter(&dict_sys->mutex));
-		ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+		ut_d(dict_table_check_for_dup_indexes(
+			     user_table, CHECK_PARTIAL_OK));
 		ut_d(mutex_exit(&dict_sys->mutex));
-                *add = new ha_innobase_add_index(table, key_info, num_of_keys,
-                                                 indexed_table);
+		DBUG_RETURN(false);
+	case DB_TABLESPACE_EXISTS:
+		my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)");
 		break;
-
-	case DB_TOO_BIG_RECORD:
-		my_error(HA_ERR_TO_BIG_ROW, MYF(0));
-		goto error;
-	case DB_PRIMARY_KEY_IS_NULL:
-		my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0));
-		/* fall through */
 	case DB_DUPLICATE_KEY:
-error:
-		prebuilt->trx->error_info = NULL;
-		/* fall through */
+		my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES");
+		break;
 	default:
-		trx->error_state = DB_SUCCESS;
+		my_error_innodb(error, table_name, user_table->flags);
+	}
 
-		if (new_primary) {
-			if (indexed_table != prebuilt->table) {
-				row_merge_drop_table(trx, indexed_table);
+error_handled:
+
+	ctx->prebuilt->trx->error_info = NULL;
+	ctx->trx->error_state = DB_SUCCESS;
+
+	if (!dict_locked) {
+		row_mysql_lock_data_dictionary(ctx->trx);
+	}
+
+	if (new_clustered) {
+		if (ctx->need_rebuild()) {
+
+			if (DICT_TF2_FLAG_IS_SET(
+				    ctx->new_table, DICT_TF2_FTS)) {
+				innobase_drop_fts_index_table(
+					ctx->new_table, ctx->trx);
 			}
-		} else {
-			if (!dict_locked) {
-				row_mysql_lock_data_dictionary(trx);
-				dict_locked = TRUE;
+
+			dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+			/* Nobody should have initialized the stats of the
+			newly created table yet. When this is the case, we
+			know that it has not been added for background stats
+			gathering. */
+			ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+			row_merge_drop_table(ctx->trx, ctx->new_table);
+
+			/* Free the log for online table rebuild, if
+			one was allocated. */
+
+			dict_index_t* clust_index = dict_table_get_first_index(
+				user_table);
+
+			rw_lock_x_lock(&clust_index->lock);
+
+			if (clust_index->online_log) {
+				ut_ad(ctx->online);
+				row_log_abort_sec(clust_index);
+				clust_index->online_status
+					= ONLINE_INDEX_COMPLETE;
 			}
 
-			row_merge_drop_indexes(trx, indexed_table,
-					       index, num_created);
+			rw_lock_x_unlock(&clust_index->lock);
 		}
-	}
 
-	trx_commit_for_mysql(trx);
-	if (prebuilt->trx) {
-		trx_commit_for_mysql(prebuilt->trx);
+		trx_commit_for_mysql(ctx->trx);
+		/* n_ref_count must be 1, because purge cannot
+		be executing on this very table as we are
+		holding dict_operation_lock X-latch. */
+		DBUG_ASSERT(user_table->n_ref_count == 1 || ctx->online);
+
+		online_retry_drop_indexes_with_trx(user_table, ctx->trx);
+	} else {
+		ut_ad(!ctx->need_rebuild());
+		row_merge_drop_indexes(ctx->trx, user_table, TRUE);
+		trx_commit_for_mysql(ctx->trx);
 	}
 
-	if (dict_locked) {
-		row_mysql_unlock_data_dictionary(trx);
+	ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE));
+	ut_ad(!user_table->drop_aborted);
+
+err_exit:
+	/* Clear the to_be_dropped flag in the data dictionary cache. */
+	for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+		DBUG_ASSERT(*ctx->drop_index[i]->name != TEMP_INDEX_PREFIX);
+		DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped);
+		ctx->drop_index[i]->to_be_dropped = 0;
 	}
 
-	trx_free_for_mysql(trx);
-	mem_heap_free(heap);
+	row_mysql_unlock_data_dictionary(ctx->trx);
+
+	trx_free_for_mysql(ctx->trx);
+	trx_commit_for_mysql(ctx->prebuilt->trx);
+
+	delete ctx;
+	ha_alter_info->handler_ctx = NULL;
 
 	/* There might be work for utility threads.*/
 	srv_active_wake_master_thread();
 
-	DBUG_RETURN(convert_error_code_to_mysql(error, prebuilt->table->flags,
-						user_thd));
+	DBUG_RETURN(true);
 }
 
-/*******************************************************************//**
-Finalize or undo add_index().
-@return	0 or error number */
+/* Check whether dropping an index would leave a FOREIGN KEY constraint that
+references it, or is defined on it, without any equivalent index to rely on.
+@return true (and trx->error_info = index) if the index can't be dropped */
+static __attribute__((nonnull(1,2,3,5), warn_unused_result))
+bool
+innobase_check_foreign_key_index(
+/*=============================*/
+	Alter_inplace_info*	ha_alter_info,	/*!< in: Structure describing
+						changes to be done by ALTER
+						TABLE (incl. keys to add) */
+	dict_index_t*		index,		/*!< in: index to check */
+	dict_table_t*		indexed_table,	/*!< in: table that owns the
+						foreign keys */
+	const char**		col_names,	/*!< in: column names, or NULL
+						for indexed_table->col_names */
+	trx_t*			trx,		/*!< in/out: transaction */
+	dict_foreign_t**	drop_fk,	/*!< in: Foreign key constraints
+						to drop (NULL if none) */
+	ulint			n_drop_fk)	/*!< in: Number of foreign keys
+						to drop */
+{
+	dict_foreign_t*	foreign;
+
+	/* Is the index referenced by a constraint of another table? */
+	foreign = dict_table_get_referenced_constraint(indexed_table, index);
+
+	ut_ad(!foreign || indexed_table
+	      == foreign->referenced_table);
+
+	if (foreign
+	    && !dict_foreign_find_index(
+		    indexed_table, col_names,
+		    foreign->referenced_col_names,
+		    foreign->n_fields, index,
+		    /*check_charsets=*/TRUE,
+		    /*check_null=*/FALSE)
+	    && !innobase_find_equiv_index(
+		    foreign->referenced_col_names,
+		    foreign->n_fields,
+		    ha_alter_info->key_info_buffer,
+		    ha_alter_info->index_add_buffer,
+		    ha_alter_info->index_add_count)
+	    ) {
+		trx->error_info = index;
+		return(true);
+	}
+
+	/* Check if the index is used by a FOREIGN KEY of this table
+	(innobase_dropping_foreign() presumably exempts those in drop_fk) */
+	foreign = dict_table_get_foreign_constraint(
+		indexed_table, index);
+
+	ut_ad(!foreign || indexed_table
+	      == foreign->foreign_table);
+
+	if (foreign
+	    && !innobase_dropping_foreign(
+		    foreign, drop_fk, n_drop_fk)
+	    && !dict_foreign_find_index(
+		    indexed_table, col_names,
+		    foreign->foreign_col_names,
+		    foreign->n_fields, index,
+		    /*check_charsets=*/TRUE,
+		    /*check_null=*/FALSE)
+	    && !innobase_find_equiv_index(
+		    foreign->foreign_col_names,
+		    foreign->n_fields,
+		    ha_alter_info->key_info_buffer,
+		    ha_alter_info->index_add_buffer,
+		    ha_alter_info->index_add_count)
+	    ) {
+		trx->error_info = index;
+		return(true);
+	}
+
+	return(false);
+}
+
+/** Allows InnoDB to update internal structures with concurrent
+writes blocked (provided that check_if_supported_inplace_alter()
+did not return HA_ALTER_INPLACE_NO_LOCK).
+This will be invoked before inplace_alter_table().
+
+@param altered_table	TABLE object for new version of table.
+@param ha_alter_info	Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true		Failure
+@retval false		Success
+*/
 UNIV_INTERN
-int
-ha_innobase::final_add_index(
-/*=========================*/
-	handler_add_index*	add_arg,/*!< in: context from add_index() */
-	bool			commit)	/*!< in: true=commit, false=rollback */
+bool
+ha_innobase::prepare_inplace_alter_table(
+/*=====================================*/
+	TABLE*			altered_table,
+	Alter_inplace_info*	ha_alter_info)
 {
-	ha_innobase_add_index*	add;
-	trx_t*			trx;
-	int			err	= 0;
+	dict_index_t**	drop_index;	/*!< Index to be dropped */
+	ulint		n_drop_index;	/*!< Number of indexes to drop */
+	dict_foreign_t**drop_fk;	/*!< Foreign key constraints to drop */
+	ulint		n_drop_fk;	/*!< Number of foreign keys to drop */
+	dict_foreign_t**add_fk = NULL;	/*!< Foreign key constraints to add */
+	ulint		n_add_fk;	/*!< Number of foreign keys to add */
+	dict_table_t*	indexed_table;	/*!< Table where indexes are created */
+	mem_heap_t*     heap;
+	const char**	col_names;
+	int		error;
+	ulint		flags;
+	ulint		flags2;
+	ulint		max_col_len;
+	ulint		add_autoinc_col_no	= ULINT_UNDEFINED;
+	ulonglong	autoinc_col_max_value	= 0;
+	ulint		fts_doc_col_no		= ULINT_UNDEFINED;
+	bool		add_fts_doc_id		= false;
+	bool		add_fts_doc_id_idx	= false;
+
+	DBUG_ENTER("prepare_inplace_alter_table");
+	DBUG_ASSERT(!ha_alter_info->handler_ctx);
+	DBUG_ASSERT(ha_alter_info->create_info);
+	DBUG_ASSERT(!srv_read_only_mode);
 
-	DBUG_ENTER("ha_innobase::final_add_index");
+	if (UNIV_UNLIKELY(prebuilt->trx->fake_changes)) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
 
-	ut_ad(add_arg);
-	add = static_cast<class ha_innobase_add_index*>(add_arg);
+	MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE);
 
-	/* Create a background transaction for the operations on
-	the data dictionary tables. */
-	trx = innobase_trx_allocate(user_thd);
-	trx_start_if_not_started(trx);
+#ifdef UNIV_DEBUG
+	for (dict_index_t* index = dict_table_get_first_index(prebuilt->table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+		ut_ad(!index->to_be_dropped);
+	}
+#endif /* UNIV_DEBUG */
 
-	/* Flag this transaction as a dictionary operation, so that
-	the data dictionary will be locked in crash recovery. */
-	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	ut_d(mutex_enter(&dict_sys->mutex));
+	ut_d(dict_table_check_for_dup_indexes(
+		     prebuilt->table, CHECK_ABORTED_OK));
+	ut_d(mutex_exit(&dict_sys->mutex));
 
-	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
-	or lock waits can happen in it during an index create operation. */
-	row_mysql_lock_data_dictionary(trx);
+	if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+		/* Nothing to do */
+		goto func_exit;
+	}
 
-	if (add->indexed_table != prebuilt->table) {
-		ulint	error;
-
-		/* We copied the table (new_primary). */
-		if (commit) {
-			mem_heap_t*	heap;
-			char*		tmp_name;
-
-			heap = mem_heap_create(1024);
-
-			/* A new primary key was defined for the table
-			and there was no error at this point. We can
-			now rename the old table as a temporary table,
-			rename the new temporary table as the old
-			table and drop the old table. */
-			tmp_name = innobase_create_temporary_tablename(
-				heap, '2', prebuilt->table->name);
-
-			error = row_merge_rename_tables(
-				prebuilt->table, add->indexed_table,
-				tmp_name, trx);
-
-			switch (error) {
-			case DB_TABLESPACE_ALREADY_EXISTS:
-			case DB_DUPLICATE_KEY:
-				innobase_convert_tablename(tmp_name);
-				my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name);
-				err = HA_ERR_TABLE_EXIST;
-				break;
-			default:
-				err = convert_error_code_to_mysql(
-					error, prebuilt->table->flags,
-					user_thd);
-				break;
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::CHANGE_CREATE_OPTION) {
+		if (const char* invalid_opt = create_options_are_invalid(
+			    user_thd, altered_table,
+			    ha_alter_info->create_info,
+			    prebuilt->table->space != 0)) {
+			my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+				 table_type(), invalid_opt);
+			goto err_exit_no_heap;
+		}
+	}
+
+	/* Check if any index name is reserved. */
+	if (innobase_index_name_is_reserved(
+		    user_thd,
+		    ha_alter_info->key_info_buffer,
+		    ha_alter_info->key_count)) {
+err_exit_no_heap:
+		DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0);
+		if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+			online_retry_drop_indexes(prebuilt->table, user_thd);
+		}
+		DBUG_RETURN(true);
+	}
+
+	indexed_table = prebuilt->table;
+
+	/* Check that index keys are sensible */
+	error = innobase_check_index_keys(ha_alter_info, indexed_table);
+
+	if (error) {
+		goto err_exit_no_heap;
+	}
+
+	/* Prohibit renaming a column to something that the table
+	already contains. */
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::ALTER_COLUMN_NAME) {
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+
+		for (Field** fp = table->field; *fp; fp++) {
+			if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+				continue;
+			}
+
+			const char* name = 0;
+
+			cf_it.rewind();
+			while (Create_field* cf = cf_it++) {
+				if (cf->field == *fp) {
+					name = cf->field_name;
+					goto check_if_ok_to_rename;
+				}
 			}
 
-			mem_heap_free(heap);
+			ut_error;
+check_if_ok_to_rename:
+			/* Prohibit renaming a column from FTS_DOC_ID
+			if full-text indexes exist. */
+			if (!my_strcasecmp(system_charset_info,
+					   (*fp)->field_name,
+					   FTS_DOC_ID_COL_NAME)
+			    && innobase_fulltext_exist(altered_table)) {
+				my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN,
+					 MYF(0), name);
+				goto err_exit_no_heap;
+			}
+
+			/* Prohibit renaming a column to an internal column. */
+			const char*	s = prebuilt->table->col_names;
+			unsigned j;
+			/* Skip user columns.
+			MySQL should have checked these already.
+			We want to allow renaming of c1 to c2, c2 to c1. */
+			for (j = 0; j < table->s->fields; j++) {
+				s += strlen(s) + 1;
+			}
+
+			for (; j < prebuilt->table->n_def; j++) {
+				if (!my_strcasecmp(
+					    system_charset_info, name, s)) {
+					my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+						 s);
+					goto err_exit_no_heap;
+				}
+
+				s += strlen(s) + 1;
+			}
+		}
+	}
+
+	if (!innobase_table_flags(altered_table,
+				  ha_alter_info->create_info,
+				  user_thd,
+				  srv_file_per_table
+				  || indexed_table->space != 0,
+				  &flags, &flags2)) {
+		goto err_exit_no_heap;
+	}
+
+	max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
+
+	/* Check each index's column length to make sure they do not
+	exceed limit */
+	for (ulint i = 0; i < ha_alter_info->index_add_count; i++) {
+		const KEY* key = &ha_alter_info->key_info_buffer[
+			ha_alter_info->index_add_buffer[i]];
+
+		if (key->flags & HA_FULLTEXT) {
+			/* The column length does not matter for
+			fulltext search indexes. But, UNIQUE
+			fulltext indexes are not supported. */
+			DBUG_ASSERT(!(key->flags & HA_NOSAME));
+			DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+				      & ~(HA_FULLTEXT
+					  | HA_PACK_KEY
+					  | HA_BINARY_PACK_KEY)));
+			continue;
+		}
+
+		if (innobase_check_column_length(max_col_len, key)) {
+			my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+				 max_col_len);
+			goto err_exit_no_heap;
+		}
+	}
+
+	/* Check existing index definitions for too-long column
+	prefixes as well, in case max_col_len shrunk. */
+	for (const dict_index_t* index
+		     = dict_table_get_first_index(indexed_table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+		if (index->type & DICT_FTS) {
+			DBUG_ASSERT(index->type == DICT_FTS
+				    || (index->type & DICT_CORRUPT));
+			continue;
 		}
 
-		if (!commit || err) {
-			error = row_merge_drop_table(trx, add->indexed_table);
-			trx_commit_for_mysql(prebuilt->trx);
+		for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+			const dict_field_t* field
+				= dict_index_get_nth_field(index, i);
+			if (field->prefix_len > max_col_len) {
+				my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+					 max_col_len);
+				goto err_exit_no_heap;
+			}
+		}
+	}
+
+	n_drop_index = 0;
+	n_drop_fk = 0;
+
+	if (ha_alter_info->handler_flags
+	    & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD)) {
+		heap = mem_heap_create(1024);
+
+		if (ha_alter_info->handler_flags
+		    & Alter_inplace_info::ALTER_COLUMN_NAME) {
+			col_names = innobase_get_col_names(
+				ha_alter_info, altered_table, indexed_table,
+				heap);
 		} else {
-			dict_table_t*	old_table = prebuilt->table;
-			trx_commit_for_mysql(prebuilt->trx);
-			row_prebuilt_free(prebuilt, TRUE);
-			error = row_merge_drop_table(trx, old_table);
-			add->indexed_table->n_mysql_handles_opened++;
-			prebuilt = row_create_prebuilt(add->indexed_table,
-				0 /* XXX Do we know the mysql_row_len here?
-				Before the addition of this parameter to
-				row_create_prebuilt() the mysql_row_len
-				member was left 0 (from zalloc) in the
-				prebuilt object. */);
-		}
-
-		err = convert_error_code_to_mysql(
-			error, prebuilt->table->flags, user_thd);
+			col_names = NULL;
+		}
+	} else {
+		heap = NULL;
+		col_names = NULL;
+	}
+
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::DROP_FOREIGN_KEY) {
+		DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0);
+
+		drop_fk = static_cast<dict_foreign_t**>(
+			mem_heap_alloc(
+				heap,
+				ha_alter_info->alter_info->drop_list.elements
+				* sizeof(dict_foreign_t*)));
+
+		List_iterator<Alter_drop> drop_it(
+			ha_alter_info->alter_info->drop_list);
+
+		while (Alter_drop* drop = drop_it++) {
+			if (drop->type != Alter_drop::FOREIGN_KEY) {
+				continue;
+			}
+
+			for (dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+				     prebuilt->table->foreign_list);
+			     foreign != NULL;
+			     foreign = UT_LIST_GET_NEXT(
+				     foreign_list, foreign)) {
+				const char* fid = strchr(foreign->id, '/');
+
+				DBUG_ASSERT(fid);
+				/* If no database/ prefix was present in
+				the FOREIGN KEY constraint name, compare
+				to the full constraint name. */
+				fid = fid ? fid + 1 : foreign->id;
+
+				if (!my_strcasecmp(system_charset_info,
+						   fid, drop->name)) {
+					drop_fk[n_drop_fk++] = foreign;
+					goto found_fk;
+				}
+			}
+
+			my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0),
+				 drop->name);
+			goto err_exit;
+found_fk:
+			continue;
+		}
+
+		DBUG_ASSERT(n_drop_fk > 0);
+		DBUG_ASSERT(n_drop_fk
+			    == ha_alter_info->alter_info->drop_list.elements);
 	} else {
-		/* We created secondary indexes (!new_primary). */
+		drop_fk = NULL;
+	}
+
+	if (ha_alter_info->index_drop_count) {
+		dict_index_t*	drop_primary = NULL;
+
+		DBUG_ASSERT(ha_alter_info->handler_flags
+			    & (Alter_inplace_info::DROP_INDEX
+			       | Alter_inplace_info::DROP_UNIQUE_INDEX
+			       | Alter_inplace_info::DROP_PK_INDEX));
+		/* Check which indexes to drop. */
+		drop_index = static_cast<dict_index_t**>(
+			mem_heap_alloc(
+				heap, (ha_alter_info->index_drop_count + 1)
+				* sizeof *drop_index));
+
+		for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+			const KEY*	key
+				= ha_alter_info->index_drop_buffer[i];
+			dict_index_t*	index
+				= dict_table_get_index_on_name_and_min_id(
+					indexed_table, key->name);
+
+			if (!index) {
+				push_warning_printf(
+					user_thd,
+					Sql_condition::WARN_LEVEL_WARN,
+					HA_ERR_WRONG_INDEX,
+					"InnoDB could not find key "
+					"with name %s", key->name);
+			} else {
+				ut_ad(!index->to_be_dropped);
+				if (!dict_index_is_clust(index)) {
+					drop_index[n_drop_index++] = index;
+				} else {
+					drop_primary = index;
+				}
+			}
+		}
+
+		/* If all FULLTEXT indexes were removed, drop an
+		internal FTS_DOC_ID_INDEX as well, unless it exists in
+		the table. */
+
+		if (innobase_fulltext_exist(table)
+		    && !innobase_fulltext_exist(altered_table)
+		    && !DICT_TF2_FLAG_IS_SET(
+			indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			dict_index_t*	fts_doc_index
+				= dict_table_get_index_on_name(
+					indexed_table, FTS_DOC_ID_INDEX_NAME);
+
+			// Add some fault tolerance for non-debug builds.
+			if (fts_doc_index == NULL) {
+				goto check_if_can_drop_indexes;
+			}
+
+			DBUG_ASSERT(!fts_doc_index->to_be_dropped);
+
+			for (uint i = 0; i < table->s->keys; i++) {
+				if (!my_strcasecmp(
+					    system_charset_info,
+					    FTS_DOC_ID_INDEX_NAME,
+					    table->key_info[i].name)) {
+					/* The index exists in the MySQL
+					data dictionary. Do not drop it,
+					even though it is no longer needed
+					by InnoDB fulltext search. */
+					goto check_if_can_drop_indexes;
+				}
+			}
+
+			drop_index[n_drop_index++] = fts_doc_index;
+		}
+
+check_if_can_drop_indexes:
+		/* Check if the indexes can be dropped. */
+
+		/* Prevent a race condition between DROP INDEX and
+		CREATE TABLE adding FOREIGN KEY constraints. */
+		row_mysql_lock_data_dictionary(prebuilt->trx);
 
-		if (commit) {
-			err = convert_error_code_to_mysql(
-				row_merge_rename_indexes(trx, prebuilt->table),
-				prebuilt->table->flags, user_thd);
+		if (!n_drop_index) {
+			drop_index = NULL;
+		} else {
+			/* Flag all indexes that are to be dropped. */
+			for (ulint i = 0; i < n_drop_index; i++) {
+				ut_ad(!drop_index[i]->to_be_dropped);
+				drop_index[i]->to_be_dropped = 1;
+			}
 		}
 
-		if (!commit || err) {
-			dict_index_t*	index;
-			dict_index_t*	next_index;
+		if (prebuilt->trx->check_foreigns) {
+			for (uint i = 0; i < n_drop_index; i++) {
+			     dict_index_t*	index = drop_index[i];
+
+				if (innobase_check_foreign_key_index(
+					ha_alter_info, index,
+					indexed_table, col_names,
+					prebuilt->trx, drop_fk, n_drop_fk)) {
+					row_mysql_unlock_data_dictionary(
+						prebuilt->trx);
+					prebuilt->trx->error_info = index;
+					print_error(HA_ERR_DROP_INDEX_FK,
+						    MYF(0));
+					goto err_exit;
+				}
+			}
 
-			for (index = dict_table_get_first_index(
-				     prebuilt->table);
-			     index; index = next_index) {
+			/* If a primary index is dropped, need to check
+			any depending foreign constraints get affected */
+			if (drop_primary
+			    && innobase_check_foreign_key_index(
+				ha_alter_info, drop_primary,
+				indexed_table, col_names,
+				prebuilt->trx, drop_fk, n_drop_fk)) {
+				row_mysql_unlock_data_dictionary(prebuilt->trx);
+				print_error(HA_ERR_DROP_INDEX_FK, MYF(0));
+				goto err_exit;
+			}
+		}
 
-				next_index = dict_table_get_next_index(index);
+		row_mysql_unlock_data_dictionary(prebuilt->trx);
+	} else {
+		drop_index = NULL;
+	}
 
-				if (*index->name == TEMP_INDEX_PREFIX) {
-					row_merge_drop_index(
-						index, prebuilt->table, trx);
+	n_add_fk = 0;
+
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::ADD_FOREIGN_KEY) {
+		ut_ad(!prebuilt->trx->check_foreigns);
+
+		add_fk = static_cast<dict_foreign_t**>(
+			mem_heap_zalloc(
+				heap,
+				ha_alter_info->alter_info->key_list.elements
+				* sizeof(dict_foreign_t*)));
+
+		if (!innobase_get_foreign_key_info(
+			    ha_alter_info, table_share,
+			    prebuilt->table, col_names,
+			    drop_index, n_drop_index,
+			    add_fk, &n_add_fk, prebuilt->trx)) {
+err_exit:
+			if (n_drop_index) {
+				row_mysql_lock_data_dictionary(prebuilt->trx);
+
+				/* Clear the to_be_dropped flags, which might
+				have been set at this point. */
+				for (ulint i = 0; i < n_drop_index; i++) {
+					DBUG_ASSERT(*drop_index[i]->name
+						    != TEMP_INDEX_PREFIX);
+					drop_index[i]->to_be_dropped = 0;
 				}
+
+				row_mysql_unlock_data_dictionary(prebuilt->trx);
+			}
+
+			if (heap) {
+				mem_heap_free(heap);
 			}
+
+			goto err_exit_no_heap;
 		}
 	}
 
-	/* If index is successfully built, we will need to rebuild index
-	translation table. Set valid index entry count in the translation
-	table to zero. */
-	if (err == 0 && commit) {
-		share->idx_trans_tbl.index_count = 0;
+	if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
+	    || (ha_alter_info->handler_flags
+		== Alter_inplace_info::CHANGE_CREATE_OPTION
+		&& !innobase_need_rebuild(ha_alter_info))) {
+
+		if (heap) {
+			ha_alter_info->handler_ctx
+				= new ha_innobase_inplace_ctx(
+					prebuilt,
+					drop_index, n_drop_index,
+					drop_fk, n_drop_fk,
+					add_fk, n_add_fk,
+					ha_alter_info->online,
+					heap, indexed_table,
+					col_names, ULINT_UNDEFINED, 0, 0);
+		}
+
+func_exit:
+		DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0);
+		if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+			online_retry_drop_indexes(prebuilt->table, user_thd);
+		}
+		DBUG_RETURN(false);
 	}
 
-	trx_commit_for_mysql(trx);
-	if (prebuilt->trx) {
-		trx_commit_for_mysql(prebuilt->trx);
+	/* If we are to build a full-text search index, check whether
+	the table already has a DOC ID column.  If not, we will need to
+	add a Doc ID hidden column and rebuild the primary index */
+	if (innobase_fulltext_exist(altered_table)) {
+		ulint	doc_col_no;
+
+		if (!innobase_fts_check_doc_id_col(
+			    prebuilt->table, altered_table, &fts_doc_col_no)) {
+			fts_doc_col_no = altered_table->s->stored_fields;
+			add_fts_doc_id = true;
+			add_fts_doc_id_idx = true;
+
+			push_warning_printf(
+				user_thd,
+				Sql_condition::WARN_LEVEL_WARN,
+				HA_ERR_WRONG_INDEX,
+				"InnoDB rebuilding table to add column "
+				FTS_DOC_ID_COL_NAME);
+		} else if (fts_doc_col_no == ULINT_UNDEFINED) {
+			goto err_exit;
+		}
+
+		switch (innobase_fts_check_doc_id_index(
+				prebuilt->table, altered_table, &doc_col_no)) {
+		case FTS_NOT_EXIST_DOC_ID_INDEX:
+			add_fts_doc_id_idx = true;
+			break;
+		case FTS_INCORRECT_DOC_ID_INDEX:
+			my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+				 FTS_DOC_ID_INDEX_NAME);
+			goto err_exit;
+		case FTS_EXIST_DOC_ID_INDEX:
+			DBUG_ASSERT(doc_col_no == fts_doc_col_no
+				    || doc_col_no == ULINT_UNDEFINED
+				    || (ha_alter_info->handler_flags
+					& (Alter_inplace_info::ALTER_COLUMN_ORDER
+					   | Alter_inplace_info::DROP_COLUMN
+					   | Alter_inplace_info::ADD_COLUMN)));
+		}
 	}
 
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
-	row_mysql_unlock_data_dictionary(trx);
+	/* See if an AUTO_INCREMENT column was added. */
+	uint i = 0, innodb_idx= 0;
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	while (const Create_field* new_field = cf_it++) {
+		const Field*	field;
+                if (!new_field->stored_in_db) {
+                  i++;
+                  continue;
+                }
+
+		DBUG_ASSERT(i < altered_table->s->fields);
+		DBUG_ASSERT(innodb_idx < altered_table->s->stored_fields);
+
+		for (uint old_i = 0; table->field[old_i]; old_i++) {
+                        if (!table->field[old_i]->stored_in_db)
+                          continue;
+			if (new_field->field == table->field[old_i]) {
+				goto found_col;
+			}
+		}
 
-	trx_free_for_mysql(trx);
+		/* This is an added column. */
+		DBUG_ASSERT(!new_field->field);
+		DBUG_ASSERT(ha_alter_info->handler_flags
+			    & Alter_inplace_info::ADD_COLUMN);
 
-	/* There might be work for utility threads.*/
-	srv_active_wake_master_thread();
+		field = altered_table->field[i];
 
-	delete add;
-	DBUG_RETURN(err);
+		DBUG_ASSERT((MTYP_TYPENR(field->unireg_check)
+			     == Field::NEXT_NUMBER)
+			    == !!(field->flags & AUTO_INCREMENT_FLAG));
+
+		if (field->flags & AUTO_INCREMENT_FLAG) {
+			if (add_autoinc_col_no != ULINT_UNDEFINED) {
+				/* This should have been blocked earlier. */
+				ut_ad(0);
+				my_error(ER_WRONG_AUTO_KEY, MYF(0));
+				goto err_exit;
+			}
+			add_autoinc_col_no = innodb_idx;
+
+			autoinc_col_max_value = innobase_get_int_col_max_value(
+				field);
+		}
+found_col:
+		i++;
+                innodb_idx++;
+	}
+
+	DBUG_ASSERT(heap);
+	DBUG_ASSERT(user_thd == prebuilt->trx->mysql_thd);
+	DBUG_ASSERT(!ha_alter_info->handler_ctx);
+
+	ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx(
+		prebuilt,
+		drop_index, n_drop_index,
+		drop_fk, n_drop_fk, add_fk, n_add_fk,
+		ha_alter_info->online,
+		heap, prebuilt->table, col_names,
+		add_autoinc_col_no,
+		ha_alter_info->create_info->auto_increment_value,
+		autoinc_col_max_value);
+
+	DBUG_RETURN(prepare_inplace_alter_table_dict(
+			    ha_alter_info, altered_table, table,
+			    table_share->table_name.str,
+			    flags, flags2,
+			    fts_doc_col_no, add_fts_doc_id,
+			    add_fts_doc_id_idx));
 }
 
-/*******************************************************************//**
-Prepare to drop some indexes of a table.
-@return	0 or error number */
+/** Alter the table structure in-place with operations
+specified using Alter_inplace_info.
+The level of concurrency allowed during this operation depends
+on the return value from check_if_supported_inplace_alter().
+
+@param altered_table	TABLE object for new version of table.
+@param ha_alter_info	Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true		Failure
+@retval false		Success
+*/
 UNIV_INTERN
-int
-ha_innobase::prepare_drop_index(
-/*============================*/
-	TABLE*	table,		/*!< in: Table where indexes are dropped */
-	uint*	key_num,	/*!< in: Key nums to be dropped */
-	uint	num_of_keys)	/*!< in: Number of keys to be dropped */
+bool
+ha_innobase::inplace_alter_table(
+/*=============================*/
+	TABLE*			altered_table,
+	Alter_inplace_info*	ha_alter_info)
 {
-	trx_t*		trx;
-	int		err = 0;
-	uint 		n_key;
-
-	DBUG_ENTER("ha_innobase::prepare_drop_index");
-	ut_ad(table);
-	ut_ad(key_num);
-	ut_ad(num_of_keys);
-	if (srv_created_new_raw || srv_force_recovery) {
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	dberr_t	error;
+
+	DBUG_ENTER("inplace_alter_table");
+	DBUG_ASSERT(!srv_read_only_mode);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	DEBUG_SYNC(user_thd, "innodb_inplace_alter_table_enter");
+
+	if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)) {
+ok_exit:
+		DEBUG_SYNC(user_thd, "innodb_after_inplace_alter_table");
+		DBUG_RETURN(false);
 	}
 
-	update_thd();
+	if (ha_alter_info->handler_flags
+	    == Alter_inplace_info::CHANGE_CREATE_OPTION
+	    && !innobase_need_rebuild(ha_alter_info)) {
+		goto ok_exit;
+	}
 
-	trx_search_latch_release_if_reserved(prebuilt->trx);
-	trx = prebuilt->trx;
+	ha_innobase_inplace_ctx*	ctx
+		= static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
 
-	if (UNIV_UNLIKELY(trx->fake_changes)) {
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	DBUG_ASSERT(ctx);
+	DBUG_ASSERT(ctx->trx);
+	DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+	if (prebuilt->table->ibd_file_missing
+	    || dict_table_is_discarded(prebuilt->table)) {
+		goto all_done;
+	}
+
+	/* Read the clustered index of the table and build
+	indexes based on this information using temporary
+	files and merge sort. */
+	DBUG_EXECUTE_IF("innodb_OOM_inplace_alter",
+			error = DB_OUT_OF_MEMORY; goto oom;);
+	error = row_merge_build_indexes(
+		prebuilt->trx,
+		prebuilt->table, ctx->new_table,
+		ctx->online,
+		ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index,
+		altered_table, ctx->add_cols, ctx->col_map,
+		ctx->add_autoinc, ctx->sequence);
+#ifndef DBUG_OFF
+oom:
+#endif /* !DBUG_OFF */
+	if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) {
+		DEBUG_SYNC_C("row_log_table_apply1_before");
+		error = row_log_table_apply(
+			ctx->thr, prebuilt->table, altered_table);
 	}
 
-	/* Test and mark all the indexes to be dropped */
+	DEBUG_SYNC_C("inplace_after_index_build");
 
-	row_mysql_lock_data_dictionary(trx);
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+	DBUG_EXECUTE_IF("create_index_fail",
+			error = DB_DUPLICATE_KEY;);
 
-	/* Check that none of the indexes have previously been flagged
-	for deletion. */
-	{
-		const dict_index_t*	index
-			= dict_table_get_first_index(prebuilt->table);
-		do {
-			ut_a(!index->to_be_dropped);
-			index = dict_table_get_next_index(index);
-		} while (index);
+	/* After an error, remove all those index definitions
+	from the dictionary which were defined. */
+
+	switch (error) {
+		KEY*	dup_key;
+	all_done:
+	case DB_SUCCESS:
+		ut_d(mutex_enter(&dict_sys->mutex));
+		ut_d(dict_table_check_for_dup_indexes(
+			     prebuilt->table, CHECK_PARTIAL_OK));
+		ut_d(mutex_exit(&dict_sys->mutex));
+		/* prebuilt->table->n_ref_count can be anything here,
+		given that we hold at most a shared lock on the table. */
+		goto ok_exit;
+	case DB_DUPLICATE_KEY:
+		if (prebuilt->trx->error_key_num == ULINT_UNDEFINED
+		    || ha_alter_info->key_count == 0) {
+			/* This should be the hidden index on
+			FTS_DOC_ID, or there is no PRIMARY KEY in the
+			table. Either way, we should be seeing and
+			reporting a bogus duplicate key error. */
+			dup_key = NULL;
+		} else {
+			DBUG_ASSERT(prebuilt->trx->error_key_num
+				    < ha_alter_info->key_count);
+			dup_key = &ha_alter_info->key_info_buffer[
+				prebuilt->trx->error_key_num];
+		}
+		print_keydup_error(altered_table, dup_key, MYF(0));
+		break;
+	case DB_ONLINE_LOG_TOO_BIG:
+		DBUG_ASSERT(ctx->online);
+		my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+			 (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+			 ? FTS_DOC_ID_INDEX_NAME
+			 : ha_alter_info->key_info_buffer[
+				 prebuilt->trx->error_key_num].name);
+		break;
+	case DB_INDEX_CORRUPT:
+		my_error(ER_INDEX_CORRUPT, MYF(0),
+			 (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+			 ? FTS_DOC_ID_INDEX_NAME
+			 : ha_alter_info->key_info_buffer[
+				 prebuilt->trx->error_key_num].name);
+		break;
+	default:
+		my_error_innodb(error,
+				table_share->table_name.str,
+				prebuilt->table->flags);
 	}
 
-	for (n_key = 0; n_key < num_of_keys; n_key++) {
-		const KEY*	key;
-		dict_index_t*	index;
+	/* prebuilt->table->n_ref_count can be anything here, given
+	that we hold at most a shared lock on the table. */
+	prebuilt->trx->error_info = NULL;
+	ctx->trx->error_state = DB_SUCCESS;
 
-		key = table->key_info + key_num[n_key];
-		index = dict_table_get_index_on_name_and_min_id(
-			prebuilt->table, key->name);
+	DBUG_RETURN(true);
+}
+
+/** Free the modification log for online table rebuild.
+@param table	table that was being rebuilt online */
+static
+void
+innobase_online_rebuild_log_free(
+/*=============================*/
+	dict_table_t*	table)
+{
+	dict_index_t* clust_index = dict_table_get_first_index(table);
 
-		if (!index) {
-			sql_print_error("InnoDB could not find key n:o %u "
-					"with name %s for table %s",
-					key_num[n_key],
-					key ? key->name : "NULL",
-					prebuilt->table->name);
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
-			err = HA_ERR_KEY_NOT_FOUND;
-			goto func_exit;
+	rw_lock_x_lock(&clust_index->lock);
+
+	if (clust_index->online_log) {
+		ut_ad(dict_index_get_online_status(clust_index)
+		      == ONLINE_INDEX_CREATION);
+		clust_index->online_status = ONLINE_INDEX_COMPLETE;
+		row_log_free(clust_index->online_log);
+		DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted");
+	}
+
+	DBUG_ASSERT(dict_index_get_online_status(clust_index)
+		    == ONLINE_INDEX_COMPLETE);
+	rw_lock_x_unlock(&clust_index->lock);
+}
+
+/** Rollback a secondary index creation, drop the indexes with
+temporary index prefix
+@param user_table	InnoDB table
+@param table		the TABLE
+@param locked		TRUE=table locked, FALSE=may need to do a lazy drop
+@param trx		the transaction
+*/
+static __attribute__((nonnull))
+void
+innobase_rollback_sec_index(
+/*========================*/
+	dict_table_t*		user_table,
+	const TABLE*		table,
+	ibool			locked,
+	trx_t*			trx)
+{
+	row_merge_drop_indexes(trx, user_table, locked);
+
+	/* Free the table->fts only if there is no FTS_DOC_ID
+	in the table */
+	if (user_table->fts
+	    && !DICT_TF2_FLAG_IS_SET(user_table,
+				     DICT_TF2_FTS_HAS_DOC_ID)
+	    && !innobase_fulltext_exist(table)) {
+		fts_free(user_table);
+	}
+}
+
+/** Roll back the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the storage engine. Note that the
+allowed level of concurrency during this operation will be the same as
+for inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were blocked
+during prepare, but might not be during commit).
+
+@param ha_alter_info	Data used during in-place alter.
+@param table		the TABLE
+@param prebuilt		the prebuilt struct
+@retval true		Failure
+@retval false		Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+rollback_inplace_alter_table(
+/*=========================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		table,
+	row_prebuilt_t*		prebuilt)
+{
+	bool	fail	= false;
+
+	ha_innobase_inplace_ctx*	ctx
+		= static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	DBUG_ENTER("rollback_inplace_alter_table");
+
+	if (!ctx || !ctx->trx) {
+		/* If we have not started a transaction yet,
+		(almost) nothing has been or needs to be done. */
+		goto func_exit;
+	}
+
+	row_mysql_lock_data_dictionary(ctx->trx);
+
+	if (ctx->need_rebuild()) {
+		dberr_t	err;
+		ulint	flags	= ctx->new_table->flags;
+
+		/* DML threads can access ctx->new_table via the
+		online rebuild log. Free it first. */
+		innobase_online_rebuild_log_free(prebuilt->table);
+
+		/* Since the FTS index specific auxiliary tables have
+		not yet been registered with "table->fts" by fts_add_index(),
+		we will need to explicitly delete them here */
+		if (DICT_TF2_FLAG_IS_SET(ctx->new_table, DICT_TF2_FTS)) {
+
+			err = innobase_drop_fts_index_table(
+				ctx->new_table, ctx->trx);
+
+			if (err != DB_SUCCESS) {
+				my_error_innodb(
+					err, table->s->table_name.str,
+					flags);
+				fail = true;
+			}
 		}
 
-		/* Refuse to drop the clustered index.  It would be
-		better to automatically generate a clustered index,
-		but mysql_alter_table() will call this method only
-		after ha_innobase::add_index(). */
+		/* Drop the table. */
+		dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+		/* Nobody should have initialized the stats of the
+		newly created table yet. When this is the case, we
+		know that it has not been added for background stats
+		gathering. */
+		ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+		err = row_merge_drop_table(ctx->trx, ctx->new_table);
+
+		switch (err) {
+		case DB_SUCCESS:
+			break;
+		default:
+			my_error_innodb(err, table->s->table_name.str,
+					flags);
+			fail = true;
+		}
+	} else {
+		DBUG_ASSERT(!(ha_alter_info->handler_flags
+			      & Alter_inplace_info::ADD_PK_INDEX));
+		DBUG_ASSERT(ctx->new_table == prebuilt->table);
+
+		trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+
+		innobase_rollback_sec_index(
+			prebuilt->table, table, FALSE, ctx->trx);
+	}
 
-		if (dict_index_is_clust(index)) {
-			my_error(ER_REQUIRES_PRIMARY_KEY, MYF(0));
-			err = -1;
-			goto func_exit;
+	trx_commit_for_mysql(ctx->trx);
+	row_mysql_unlock_data_dictionary(ctx->trx);
+	trx_free_for_mysql(ctx->trx);
+
+func_exit:
+#ifndef DBUG_OFF
+	dict_index_t* clust_index = dict_table_get_first_index(
+		prebuilt->table);
+	DBUG_ASSERT(!clust_index->online_log);
+	DBUG_ASSERT(dict_index_get_online_status(clust_index)
+		    == ONLINE_INDEX_COMPLETE);
+#endif /* !DBUG_OFF */
+
+	if (ctx) {
+		DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+		if (ctx->num_to_add_fk) {
+			for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+				dict_foreign_free(ctx->add_fk[i]);
+			}
 		}
 
-		rw_lock_x_lock(dict_index_get_lock(index));
-		index->to_be_dropped = TRUE;
-		rw_lock_x_unlock(dict_index_get_lock(index));
+		if (ctx->num_to_drop_index) {
+			row_mysql_lock_data_dictionary(prebuilt->trx);
+
+			/* Clear the to_be_dropped flags
+			in the data dictionary cache.
+			The flags may already have been cleared,
+			in case an error was detected in
+			commit_inplace_alter_table(). */
+			for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+				dict_index_t*	index = ctx->drop_index[i];
+				DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+
+				index->to_be_dropped = 0;
+			}
+
+			row_mysql_unlock_data_dictionary(prebuilt->trx);
+		}
 	}
 
-	/* If FOREIGN_KEY_CHECKS = 1 you may not drop an index defined
-	for a foreign key constraint because InnoDB requires that both
-	tables contain indexes for the constraint. Such index can
-	be dropped only if FOREIGN_KEY_CHECKS is set to 0.
-	Note that CREATE INDEX id ON table does a CREATE INDEX and
-	DROP INDEX, and we can ignore here foreign keys because a
-	new index for the foreign key has already been created.
+	trx_commit_for_mysql(prebuilt->trx);
+	srv_active_wake_master_thread();
+	MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+	DBUG_RETURN(fail);
+}
 
-	We check for the foreign key constraints after marking the
-	candidate indexes for deletion, because when we check for an
-	equivalent foreign index we don't want to select an index that
-	is later deleted. */
+/** Drop a FOREIGN KEY constraint from the data dictionary tables.
+@param trx		data dictionary transaction
+@param table_name	Table name in MySQL
+@param foreign_id	Foreign key constraint identifier
+@retval true		Failure
+@retval false		Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_drop_foreign_try(
+/*======================*/
+	trx_t*			trx,
+	const char*		table_name,
+	const char*		foreign_id)
+{
+	DBUG_ENTER("innobase_drop_foreign_try");
+
+	DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Drop the constraint from the data dictionary. */
+	static const char sql[] =
+		"PROCEDURE DROP_FOREIGN_PROC () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_FOREIGN WHERE ID=:id;\n"
+		"DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n"
+		"END;\n";
+
+	dberr_t		error;
+	pars_info_t*	info;
+
+	info = pars_info_create();
+	pars_info_add_str_literal(info, "id", foreign_id);
+
+	trx->op_info = "dropping foreign key constraint from dictionary";
+	error = que_eval_sql(info, sql, FALSE, trx);
+	trx->op_info = "";
+
+	DBUG_EXECUTE_IF("ib_drop_foreign_error",
+			error = DB_OUT_OF_FILE_SPACE;);
+
+	if (error != DB_SUCCESS) {
+		my_error_innodb(error, table_name, 0);
+		trx->error_state = DB_SUCCESS;
+		DBUG_RETURN(true);
+	}
 
-	if (trx->check_foreigns
-	    && thd_sql_command(user_thd) != SQLCOM_CREATE_INDEX) {
-		dict_index_t*	index;
+	DBUG_RETURN(false);
+}
+
+/** Rename a column in the data dictionary tables.
+@param user_table	InnoDB table that was being altered
+@param trx		data dictionary transaction
+@param table_name	Table name in MySQL
+@param nth_col		0-based index of the column
+@param from		old column name
+@param to		new column name
+@param new_clustered	whether the table has been rebuilt
+@retval true		Failure
+@retval false		Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_rename_column_try(
+/*=======================*/
+	const dict_table_t*	user_table,
+	trx_t*			trx,
+	const char*		table_name,
+	ulint			nth_col,
+	const char*		from,
+	const char*		to,
+	bool			new_clustered)
+{
+	pars_info_t*	info;
+	dberr_t		error;
 
-		for (index = dict_table_get_first_index(prebuilt->table);
-		     index;
-		     index = dict_table_get_next_index(index)) {
-			dict_foreign_t*	foreign;
+	DBUG_ENTER("innobase_rename_column_try");
 
-			if (!index->to_be_dropped) {
+	DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
+	if (new_clustered) {
+		goto rename_foreign;
+	}
+
+	info = pars_info_create();
+
+	pars_info_add_ull_literal(info, "tableid", user_table->id);
+	pars_info_add_int4_literal(info, "nth", nth_col);
+	pars_info_add_str_literal(info, "old", from);
+	pars_info_add_str_literal(info, "new", to);
+
+	trx->op_info = "renaming column in SYS_COLUMNS";
+
+	error = que_eval_sql(
+		info,
+		"PROCEDURE RENAME_SYS_COLUMNS_PROC () IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_COLUMNS SET NAME=:new\n"
+		"WHERE TABLE_ID=:tableid AND NAME=:old\n"
+		"AND POS=:nth;\n"
+		"END;\n",
+		FALSE, trx);
+
+	DBUG_EXECUTE_IF("ib_rename_column_error",
+			error = DB_OUT_OF_FILE_SPACE;);
+
+	if (error != DB_SUCCESS) {
+err_exit:
+		my_error_innodb(error, table_name, 0);
+		trx->error_state = DB_SUCCESS;
+		trx->op_info = "";
+		DBUG_RETURN(true);
+	}
+
+	trx->op_info = "renaming column in SYS_FIELDS";
+
+	for (const dict_index_t* index = dict_table_get_first_index(
+		     user_table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+			if (strcmp(dict_index_get_nth_field(index, i)->name,
+				   from)) {
 				continue;
 			}
 
-			/* Check if the index is referenced. */
-			foreign = dict_table_get_referenced_constraint(
-				prebuilt->table, index);
+			info = pars_info_create();
 
-			if (foreign) {
-index_needed:
-				trx_set_detailed_error(
-					trx,
-					"Index needed in foreign key "
-					"constraint");
+			pars_info_add_ull_literal(info, "indexid", index->id);
+			pars_info_add_int4_literal(info, "nth", i);
+			pars_info_add_str_literal(info, "old", from);
+			pars_info_add_str_literal(info, "new", to);
 
-				trx->error_info = index;
+			error = que_eval_sql(
+				info,
+				"PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n"
+				"BEGIN\n"
 
-				err = HA_ERR_DROP_INDEX_FK;
-				break;
-			} else {
-				/* Check if this index references some
-				other table */
-				foreign = dict_table_get_foreign_constraint(
-					prebuilt->table, index);
+				"UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+				"WHERE INDEX_ID=:indexid AND COL_NAME=:old\n"
+				"AND POS=:nth;\n"
 
-				if (foreign) {
-					ut_a(foreign->foreign_index == index);
+				/* Try again, in case there is a prefix_len
+				encoded in SYS_FIELDS.POS */
 
-					/* Search for an equivalent index that
-					the foreign key constraint could use
-					if this index were to be deleted. */
-					if (!dict_foreign_find_equiv_index(
-						foreign)) {
+				"UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+				"WHERE INDEX_ID=:indexid AND COL_NAME=:old\n"
+				"AND POS>=65536*:nth AND POS<65536*(:nth+1);\n"
 
-						goto index_needed;
-					}
-				}
+				"END;\n",
+				FALSE, trx);
+
+			if (error != DB_SUCCESS) {
+				goto err_exit;
 			}
 		}
-	} else if (thd_sql_command(user_thd) == SQLCOM_CREATE_INDEX) {
-		/* This is a drop of a foreign key constraint index that
-		was created by MySQL when the constraint was added.  MySQL
-		does this when the user creates an index explicitly which
-		can be used in place of the automatically generated index. */
+	}
 
-		dict_index_t*	index;
+rename_foreign:
+	trx->op_info = "renaming column in SYS_FOREIGN_COLS";
 
-		for (index = dict_table_get_first_index(prebuilt->table);
-		     index;
-		     index = dict_table_get_next_index(index)) {
-			dict_foreign_t*	foreign;
+	for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+		     user_table->foreign_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+		for (unsigned i = 0; i < foreign->n_fields; i++) {
+			if (strcmp(foreign->foreign_col_names[i], from)) {
+				continue;
+			}
 
-			if (!index->to_be_dropped) {
+			info = pars_info_create();
+
+			pars_info_add_str_literal(info, "id", foreign->id);
+			pars_info_add_int4_literal(info, "nth", i);
+			pars_info_add_str_literal(info, "old", from);
+			pars_info_add_str_literal(info, "new", to);
+
+			error = que_eval_sql(
+				info,
+				"PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n"
+				"BEGIN\n"
+				"UPDATE SYS_FOREIGN_COLS\n"
+				"SET FOR_COL_NAME=:new\n"
+				"WHERE ID=:id AND POS=:nth\n"
+				"AND FOR_COL_NAME=:old;\n"
+				"END;\n",
+				FALSE, trx);
+
+			if (error != DB_SUCCESS) {
+				goto err_exit;
+			}
+		}
+	}
 
+	for (const dict_foreign_t* foreign = UT_LIST_GET_FIRST(
+		     user_table->referenced_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+		for (unsigned i = 0; i < foreign->n_fields; i++) {
+			if (strcmp(foreign->referenced_col_names[i], from)) {
 				continue;
 			}
 
-			/* Check if this index references some other table */
-			foreign = dict_table_get_foreign_constraint(
-				prebuilt->table, index);
+			info = pars_info_create();
+
+			pars_info_add_str_literal(info, "id", foreign->id);
+			pars_info_add_int4_literal(info, "nth", i);
+			pars_info_add_str_literal(info, "old", from);
+			pars_info_add_str_literal(info, "new", to);
+
+			error = que_eval_sql(
+				info,
+				"PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n"
+				"BEGIN\n"
+				"UPDATE SYS_FOREIGN_COLS\n"
+				"SET REF_COL_NAME=:new\n"
+				"WHERE ID=:id AND POS=:nth\n"
+				"AND REF_COL_NAME=:old;\n"
+				"END;\n",
+				FALSE, trx);
+
+			if (error != DB_SUCCESS) {
+				goto err_exit;
+			}
+		}
+	}
 
-			if (foreign == NULL) {
+	trx->op_info = "";
+	DBUG_RETURN(false);
+}
 
-				continue;
+/** Rename columns in the data dictionary tables.
+@param ha_alter_info	Data used during in-place alter.
+@param ctx		In-place ALTER TABLE context
+@param table		the TABLE
+@param trx		data dictionary transaction
+@param table_name	Table name in MySQL
+@retval true		Failure
+@retval false		Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_rename_columns_try(
+/*========================*/
+	Alter_inplace_info*	ha_alter_info,
+	ha_innobase_inplace_ctx*ctx,
+	const TABLE*		table,
+	trx_t*			trx,
+	const char*		table_name)
+{
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	uint i = 0;
+
+	DBUG_ASSERT(ctx);
+	DBUG_ASSERT(ha_alter_info->handler_flags
+		    & Alter_inplace_info::ALTER_COLUMN_NAME);
+
+	for (Field** fp = table->field; *fp; fp++, i++) {
+		if (!((*fp)->flags & FIELD_IS_RENAMED) || !((*fp)->stored_in_db)) {
+			continue;
+		}
+
+		cf_it.rewind();
+		while (Create_field* cf = cf_it++) {
+			if (cf->field == *fp) {
+				if (innobase_rename_column_try(
+					    ctx->old_table, trx, table_name, i,
+					    cf->field->field_name,
+					    cf->field_name,
+					    ctx->need_rebuild())) {
+					return(true);
+				}
+				goto processed_field;
 			}
+		}
+
+		ut_error;
+processed_field:
+		continue;
+	}
 
-			ut_a(foreign->foreign_index == index);
+	return(false);
+}
 
-			/* Search for an equivalent index that the
-			foreign key constraint could use if this index
-			were to be deleted. */
+/** Rename columns in the data dictionary cache
+as part of commit_cache_norebuild().
+@param ha_alter_info	Data used during in-place alter.
+@param table		the TABLE
+@param user_table	InnoDB table that was being altered */
+static __attribute__((nonnull))
+void
+innobase_rename_columns_cache(
+/*==========================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		table,
+	dict_table_t*		user_table)
+{
+	if (!(ha_alter_info->handler_flags
+	      & Alter_inplace_info::ALTER_COLUMN_NAME)) {
+		return;
+	}
 
-			if (!dict_foreign_find_equiv_index(foreign)) {
-				trx_set_detailed_error(
-					trx,
-					"Index needed in foreign key "
-					"constraint");
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	uint i = 0;
 
-				trx->error_info = foreign->foreign_index;
+	for (Field** fp = table->field; *fp; fp++, i++) {
+		if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+			continue;
+		}
 
-				err = HA_ERR_DROP_INDEX_FK;
-				break;
+		cf_it.rewind();
+		while (Create_field* cf = cf_it++) {
+			if (cf->field == *fp) {
+				dict_mem_table_col_rename(user_table, i,
+							  cf->field->field_name,
+							  cf->field_name);
+				goto processed_field;
 			}
 		}
+
+		ut_error;
+processed_field:
+		continue;
+	}
+}
+
+/** Get the auto-increment value of the table on commit.
+@param ha_alter_info	Data used during in-place alter
+@param ctx		In-place ALTER TABLE context
+@param altered_table	MySQL table that is being altered
+@param old_table	MySQL table as it is before the ALTER operation
+@return the next auto-increment value (0 if not present) */
+static __attribute__((nonnull, warn_unused_result))
+ulonglong
+commit_get_autoinc(
+/*===============*/
+	Alter_inplace_info*	ha_alter_info,
+	ha_innobase_inplace_ctx*ctx,
+	const TABLE*		altered_table,
+	const TABLE*		old_table)
+{
+	ulonglong		max_autoinc;
+
+	DBUG_ENTER("commit_get_autoinc");
+
+	if (!altered_table->found_next_number_field) {
+		/* There is no AUTO_INCREMENT column in the table
+		after the ALTER operation. */
+		max_autoinc = 0;
+	} else if (ctx->add_autoinc != ULINT_UNDEFINED) {
+		/* An AUTO_INCREMENT column was added. Get the last
+		value from the sequence, which may be based on a
+		supplied AUTO_INCREMENT value. */
+		max_autoinc = ctx->sequence.last();
+	} else if ((ha_alter_info->handler_flags
+		    & Alter_inplace_info::CHANGE_CREATE_OPTION)
+		   && (ha_alter_info->create_info->used_fields
+		       & HA_CREATE_USED_AUTO)) {
+		/* An AUTO_INCREMENT value was supplied, but the table
+		was not rebuilt. Get the user-supplied value or the
+		last value from the sequence. */
+		ut_ad(old_table->found_next_number_field);
+
+		max_autoinc = ha_alter_info->create_info->auto_increment_value;
+
+		dict_table_autoinc_lock(ctx->old_table);
+		if (max_autoinc < ctx->old_table->autoinc) {
+			max_autoinc = ctx->old_table->autoinc;
+		}
+		dict_table_autoinc_unlock(ctx->old_table);
+	} else {
+		/* An AUTO_INCREMENT value was not specified.
+		Read the old counter value from the table. */
+		ut_ad(old_table->found_next_number_field);
+		dict_table_autoinc_lock(ctx->old_table);
+		max_autoinc = ctx->old_table->autoinc;
+		dict_table_autoinc_unlock(ctx->old_table);
 	}
 
-func_exit:
-	if (err) {
-		/* Undo our changes since there was some sort of error. */
-		dict_index_t*	index
-			= dict_table_get_first_index(prebuilt->table);
+	DBUG_RETURN(max_autoinc);
+}
 
-		do {
-			rw_lock_x_lock(dict_index_get_lock(index));
-			index->to_be_dropped = FALSE;
-			rw_lock_x_unlock(dict_index_get_lock(index));
-			index = dict_table_get_next_index(index);
-		} while (index);
+/** Add or drop foreign key constraints to the data dictionary tables,
+but do not touch the data dictionary cache.
+@param ctx		In-place ALTER TABLE context, which lists the
+constraints to add (add_fk) and to drop (drop_fk)
+@param trx		Data dictionary transaction
+@param table_name	Table name in MySQL
+@retval true		Failure
+@retval false		Success
+*/
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_update_foreign_try(
+/*========================*/
+	ha_innobase_inplace_ctx*ctx,
+	trx_t*			trx,
+	const char*		table_name)
+{
+	ulint	foreign_id;
+	ulint	i;
+
+	DBUG_ENTER("innobase_update_foreign_try");
+	DBUG_ASSERT(ctx);
+
+	foreign_id = dict_table_get_highest_foreign_id(ctx->new_table);
+
+	foreign_id++;
+
+	for (i = 0; i < ctx->num_to_add_fk; i++) {
+		dict_foreign_t*		fk = ctx->add_fk[i];
+
+		ut_ad(fk->foreign_table == ctx->new_table
+		      || fk->foreign_table == ctx->old_table);
+
+		dberr_t error = dict_create_add_foreign_id(
+			&foreign_id, ctx->old_table->name, fk);
+
+		if (error != DB_SUCCESS) {
+			my_error(ER_TOO_LONG_IDENT, MYF(0),
+				 fk->id);
+			DBUG_RETURN(true);
+		}
+
+		if (!fk->foreign_index) {
+			fk->foreign_index = dict_foreign_find_index(
+				ctx->new_table, ctx->col_names,
+				fk->foreign_col_names,
+				fk->n_fields, fk->referenced_index, TRUE,
+				fk->type
+				& (DICT_FOREIGN_ON_DELETE_SET_NULL
+				   | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+			if (!fk->foreign_index) {
+				my_error(ER_FK_INCORRECT_OPTION,
+					 MYF(0), table_name, fk->id);
+				DBUG_RETURN(true);
+			}
+		}
+
+		/* The fk->foreign_col_names[] uses renamed column
+		names, while the columns in ctx->old_table have not
+		been renamed yet. */
+		error = dict_create_add_foreign_to_dictionary(
+			ctx->old_table->name, fk, trx);
+
+		DBUG_EXECUTE_IF(
+			"innodb_test_cannot_add_fk_system",
+			error = DB_ERROR;);
+
+		if (error != DB_SUCCESS) {
+			my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0),
+				 fk->id);
+			DBUG_RETURN(true);
+		}
 	}
 
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
-	row_mysql_unlock_data_dictionary(trx);
+	for (i = 0; i < ctx->num_to_drop_fk; i++) {
+		dict_foreign_t* fk = ctx->drop_fk[i];
+
+		DBUG_ASSERT(fk->foreign_table == ctx->old_table);
 
-	DBUG_RETURN(err);
+		if (innobase_drop_foreign_try(trx, table_name, fk->id)) {
+			DBUG_RETURN(true);
+		}
+	}
+
+	DBUG_RETURN(false);
 }
 
-/*******************************************************************//**
-Drop the indexes that were passed to a successful prepare_drop_index().
-@return	0 or error number */
-UNIV_INTERN
-int
-ha_innobase::final_drop_index(
+/** Update the foreign key constraint definitions in the data dictionary cache
+after the changes to data dictionary tables were committed.
+@param ctx	In-place ALTER TABLE context
+@return		InnoDB error code (should always be DB_SUCCESS) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+innobase_update_foreign_cache(
 /*==========================*/
-	TABLE*	table)		/*!< in: Table where indexes are dropped */
+	ha_innobase_inplace_ctx*	ctx)
 {
-	dict_index_t*	index;		/*!< Index to be dropped */
-	trx_t*		trx;		/*!< Transaction */
-	int		err;
+	dict_table_t*	user_table;
 
-	DBUG_ENTER("ha_innobase::final_drop_index");
-	ut_ad(table);
+	DBUG_ENTER("innobase_update_foreign_cache");
 
-	if (srv_created_new_raw || srv_force_recovery) {
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	user_table = ctx->old_table;
+
+	/* Discard the added foreign keys, because we will
+	load them from the data dictionary. */
+	for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+		dict_foreign_t*	fk = ctx->add_fk[i];
+		dict_foreign_free(fk);
 	}
 
-	update_thd();
+	if (ctx->need_rebuild()) {
+		/* The rebuilt table is already using the renamed
+		column names. No need to pass col_names or to drop
+		constraints from the data dictionary cache. */
+		DBUG_ASSERT(!ctx->col_names);
+		DBUG_ASSERT(UT_LIST_GET_LEN(user_table->foreign_list) == 0);
+		DBUG_ASSERT(UT_LIST_GET_LEN(user_table->referenced_list) == 0);
+		user_table = ctx->new_table;
+	} else {
+		/* Drop the foreign key constraints if the
+		table was not rebuilt. If the table is rebuilt,
+		there would not be any foreign key constraints for
+		it yet in the data dictionary cache. */
+		for (ulint i = 0; i < ctx->num_to_drop_fk; i++) {
+			dict_foreign_t* fk = ctx->drop_fk[i];
+			dict_foreign_remove_from_cache(fk);
+		}
+	}
 
-	trx_search_latch_release_if_reserved(prebuilt->trx);
-	trx_start_if_not_started(prebuilt->trx);
+	/* Load the old or added foreign keys from the data dictionary
+	and prevent the table from being evicted from the data
+	dictionary cache (work around the lack of WL#6049). */
+	DBUG_RETURN(dict_load_foreigns(user_table->name,
+				       ctx->col_names, false, true,
+				       DICT_ERR_IGNORE_NONE));
+}
 
-	/* Create a background transaction for the operations on
-	the data dictionary tables. */
-	trx = innobase_trx_allocate(user_thd);
-	if (UNIV_UNLIKELY(trx->fake_changes)) {
-		trx_general_rollback_for_mysql(trx, NULL);
-		trx_free_for_mysql(trx);
-		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when rebuilding the table.
+@param ha_alter_info	Data used during in-place alter
+@param ctx		In-place ALTER TABLE context
+@param altered_table	MySQL table that is being altered
+@param old_table	MySQL table as it is before the ALTER operation
+@param trx		Data dictionary transaction
+@param table_name	Table name in MySQL
+@retval true		Failure
+@retval false		Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_try_rebuild(
+/*===============*/
+	Alter_inplace_info*	ha_alter_info,
+	ha_innobase_inplace_ctx*ctx,
+	TABLE*			altered_table,
+	const TABLE*		old_table,
+	trx_t*			trx,
+	const char*		table_name)
+{
+	dict_table_t*	rebuilt_table	= ctx->new_table;
+	dict_table_t*	user_table	= ctx->old_table;
+
+	DBUG_ENTER("commit_try_rebuild");
+	DBUG_ASSERT(ctx->need_rebuild());
+	DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+	DBUG_ASSERT(!(ha_alter_info->handler_flags
+		      & Alter_inplace_info::DROP_FOREIGN_KEY)
+		    || ctx->num_to_drop_fk > 0);
+	DBUG_ASSERT(ctx->num_to_drop_fk
+		    == ha_alter_info->alter_info->drop_list.elements);
+
+	for (dict_index_t* index = dict_table_get_first_index(rebuilt_table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+		DBUG_ASSERT(dict_index_get_online_status(index)
+			    == ONLINE_INDEX_COMPLETE);
+		DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+		if (dict_index_is_corrupted(index)) {
+			my_error(ER_INDEX_CORRUPT, MYF(0),
+				 index->name);
+			DBUG_RETURN(true);
+		}
 	}
 
-	trx_start_if_not_started(trx);
+	if (innobase_update_foreign_try(ctx, trx, table_name)) {
+		DBUG_RETURN(true);
+	}
 
-	/* Flag this transaction as a dictionary operation, so that
-	the data dictionary will be locked in crash recovery. */
-	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	dberr_t	error;
 
-	/* Lock the table exclusively, to ensure that no active
-	transaction depends on an index that is being dropped. */
-	err = convert_error_code_to_mysql(
-		row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X),
-		prebuilt->table->flags, user_thd);
+	/* Clear the to_be_dropped flag in the data dictionary cache
+	of user_table. */
+	for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+		dict_index_t*	index = ctx->drop_index[i];
+		DBUG_ASSERT(index->table == user_table);
+		DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+		DBUG_ASSERT(index->to_be_dropped);
+		index->to_be_dropped = 0;
+	}
 
-	row_mysql_lock_data_dictionary(trx);
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+	/* We copied the table. Any indexes that were requested to be
+	dropped were not created in the copy of the table. Apply any
+	last bit of the rebuild log and then rename the tables. */
+
+	if (ctx->online) {
+		DEBUG_SYNC_C("row_log_table_apply2_before");
+		error = row_log_table_apply(
+			ctx->thr, user_table, altered_table);
+		ulint	err_key = thr_get_trx(ctx->thr)->error_key_num;
+
+		switch (error) {
+			KEY*	dup_key;
+		case DB_SUCCESS:
+			break;
+		case DB_DUPLICATE_KEY:
+			if (err_key == ULINT_UNDEFINED) {
+				/* This should be the hidden index on
+				FTS_DOC_ID. */
+				dup_key = NULL;
+			} else {
+				DBUG_ASSERT(err_key <
+					    ha_alter_info->key_count);
+				dup_key = &ha_alter_info
+					->key_info_buffer[err_key];
+			}
+			print_keydup_error(altered_table, dup_key, MYF(0));
+			DBUG_RETURN(true);
+		case DB_ONLINE_LOG_TOO_BIG:
+			my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+				 ha_alter_info->key_info_buffer[0].name);
+			DBUG_RETURN(true);
+		case DB_INDEX_CORRUPT:
+			my_error(ER_INDEX_CORRUPT, MYF(0),
+				 (err_key == ULINT_UNDEFINED)
+				 ? FTS_DOC_ID_INDEX_NAME
+				 : ha_alter_info->key_info_buffer[err_key]
+				 .name);
+			DBUG_RETURN(true);
+		default:
+			my_error_innodb(error, table_name, user_table->flags);
+			DBUG_RETURN(true);
+		}
+	}
 
-	if (UNIV_UNLIKELY(err)) {
+	if ((ha_alter_info->handler_flags
+	     & Alter_inplace_info::ALTER_COLUMN_NAME)
+	    && innobase_rename_columns_try(ha_alter_info, ctx, old_table,
+					   trx, table_name)) {
+		DBUG_RETURN(true);
+	}
 
-		/* Unmark the indexes to be dropped. */
-		for (index = dict_table_get_first_index(prebuilt->table);
-		     index; index = dict_table_get_next_index(index)) {
+	DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", DBUG_SUICIDE(););
+
+	/* The new table must inherit the flag from the
+	"parent" table. */
+	if (dict_table_is_discarded(user_table)) {
+		rebuilt_table->ibd_file_missing = true;
+		rebuilt_table->flags2 |= DICT_TF2_DISCARDED;
+	}
+
+	/* We can now rename the old table as a temporary table,
+	rename the new temporary table as the old table and drop the
+	old table. First, we only do this in the data dictionary
+	tables. The actual renaming will be performed in
+	commit_cache_rebuild(), once the data dictionary transaction
+	has been successfully committed. */
+
+	error = row_merge_rename_tables_dict(
+		user_table, rebuilt_table, ctx->tmp_name, trx);
+
+	/* We must be still holding a table handle. */
+	DBUG_ASSERT(user_table->n_ref_count >= 1);
+
+	DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", DBUG_SUICIDE(););
+	DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;);
+
+	if (user_table->n_ref_count > 1) {
+		/* This should only occur when an innodb_memcached
+		connection with innodb_api_enable_mdl=off was started
+		before commit_inplace_alter_table() locked the data
+		dictionary. We must roll back the ALTER TABLE, because
+		we cannot drop a table while it is being used. */
+
+		/* Normally, n_ref_count must be 1, because purge
+		cannot be executing on this very table as we are
+		holding dict_operation_lock X-latch. */
 
+		error = DB_LOCK_WAIT_TIMEOUT;
+	}
+
+	switch (error) {
+	case DB_SUCCESS:
+		DBUG_RETURN(false);
+	case DB_TABLESPACE_EXISTS:
+		ut_a(rebuilt_table->n_ref_count == 1);
+		my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name);
+		DBUG_RETURN(true);
+	case DB_DUPLICATE_KEY:
+		ut_a(rebuilt_table->n_ref_count == 1);
+		my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name);
+		DBUG_RETURN(true);
+	default:
+		my_error_innodb(error, table_name, user_table->flags);
+		DBUG_RETURN(true);
+	}
+}
+
+/** Apply the changes made during commit_try_rebuild(),
+to the data dictionary cache and the file system.
+@param ctx	In-place ALTER TABLE context */
+inline __attribute__((nonnull))
+void
+commit_cache_rebuild(
+/*=================*/
+	ha_innobase_inplace_ctx*	ctx)
+{
+	dberr_t		error;
+
+	DBUG_ENTER("commit_cache_rebuild");
+	DBUG_ASSERT(ctx->need_rebuild());
+	DBUG_ASSERT(dict_table_is_discarded(ctx->old_table)
+		    == dict_table_is_discarded(ctx->new_table));
+
+	const char* old_name = mem_heap_strdup(
+		ctx->heap, ctx->old_table->name);
+
+	/* We already committed and redo logged the renames,
+	so this must succeed. */
+	error = dict_table_rename_in_cache(
+		ctx->old_table, ctx->tmp_name, FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	DEBUG_SYNC_C("commit_cache_rebuild_middle");
+
+	error = dict_table_rename_in_cache(
+		ctx->new_table, old_name, FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	DBUG_VOID_RETURN;
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when not rebuilding the table.
+@param ha_alter_info	Data used during in-place alter
+@param ctx		In-place ALTER TABLE context
+@param old_table	MySQL table as it is before the ALTER operation
+@param trx		Data dictionary transaction
+@param table_name	Table name in MySQL
+@retval true		Failure
+@retval false		Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_try_norebuild(
+/*=================*/
+	Alter_inplace_info*	ha_alter_info,
+	ha_innobase_inplace_ctx*ctx,
+	const TABLE*		old_table,
+	trx_t*			trx,
+	const char*		table_name)
+{
+	DBUG_ENTER("commit_try_norebuild");
+	DBUG_ASSERT(!ctx->need_rebuild());
+	DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+	DBUG_ASSERT(!(ha_alter_info->handler_flags
+		      & Alter_inplace_info::DROP_FOREIGN_KEY)
+		    || ctx->num_to_drop_fk > 0);
+	DBUG_ASSERT(ctx->num_to_drop_fk
+		    == ha_alter_info->alter_info->drop_list.elements);
+
+	for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+		dict_index_t*	index = ctx->add_index[i];
+		DBUG_ASSERT(dict_index_get_online_status(index)
+			    == ONLINE_INDEX_COMPLETE);
+		DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX);
+		if (dict_index_is_corrupted(index)) {
+			/* Report a duplicate key
+			error for the index that was
+			flagged corrupted, most likely
+			because a duplicate value was
+			inserted (directly or by
+			rollback) after
+			ha_innobase::inplace_alter_table()
+			completed.
+			TODO: report this as a corruption
+			with a detailed reason once
+			WL#6379 has been implemented. */
+			my_error(ER_DUP_UNKNOWN_IN_INDEX,
+				 MYF(0), index->name + 1);
+			DBUG_RETURN(true);
+		}
+	}
+
+	if (innobase_update_foreign_try(ctx, trx, table_name)) {
+		DBUG_RETURN(true);
+	}
+
+	dberr_t	error;
+
+	/* We altered the table in place. */
+	/* Lose the TEMP_INDEX_PREFIX. */
+	for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+		dict_index_t*	index = ctx->add_index[i];
+		DBUG_ASSERT(dict_index_get_online_status(index)
+			    == ONLINE_INDEX_COMPLETE);
+		DBUG_ASSERT(*index->name
+			    == TEMP_INDEX_PREFIX);
+		error = row_merge_rename_index_to_add(
+			trx, ctx->new_table->id, index->id);
+		if (error != DB_SUCCESS) {
+			sql_print_error(
+				"InnoDB: rename index to add: %lu\n",
+				(ulong) error);
+			DBUG_ASSERT(0);
+			my_error(ER_INTERNAL_ERROR, MYF(0),
+				 "rename index to add");
+			DBUG_RETURN(true);
+		}
+	}
+
+	/* Drop any indexes that were requested to be dropped.
+	Rename them to TEMP_INDEX_PREFIX in the data
+	dictionary first. We do not bother to rename
+	index->name in the dictionary cache, because the index
+	is about to be freed after row_merge_drop_indexes_dict(). */
+
+	for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+		dict_index_t*	index = ctx->drop_index[i];
+		DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+		DBUG_ASSERT(index->table == ctx->new_table);
+		DBUG_ASSERT(index->to_be_dropped);
+
+		error = row_merge_rename_index_to_drop(
+			trx, index->table->id, index->id);
+		if (error != DB_SUCCESS) {
+			sql_print_error(
+				"InnoDB: rename index to drop: %lu\n",
+				(ulong) error);
+			DBUG_ASSERT(0);
+			my_error(ER_INTERNAL_ERROR, MYF(0),
+				 "rename index to drop");
+			DBUG_RETURN(true);
+		}
+	}
+
+	if (!(ha_alter_info->handler_flags
+	      & Alter_inplace_info::ALTER_COLUMN_NAME)) {
+		DBUG_RETURN(false);
+	}
+
+	DBUG_RETURN(innobase_rename_columns_try(ha_alter_info, ctx,
+						old_table, trx, table_name));
+}
+
+/** Commit the changes to the data dictionary cache
+after a successful commit_try_norebuild() call.
+@param ctx		In-place ALTER TABLE context
+@param table		the TABLE before the ALTER
+@param trx		Data dictionary transaction object
+(will be started and committed)
+@return whether all replacements were found for dropped indexes */
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_cache_norebuild(
+/*===================*/
+	ha_innobase_inplace_ctx*ctx,
+	const TABLE*		table,
+	trx_t*			trx)
+{
+	DBUG_ENTER("commit_cache_norebuild");
+
+	bool	found = true;
+
+	DBUG_ASSERT(!ctx->need_rebuild());
+
+	for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+		dict_index_t*	index = ctx->add_index[i];
+		DBUG_ASSERT(dict_index_get_online_status(index)
+			    == ONLINE_INDEX_COMPLETE);
+		DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX);
+		index->name++;
+	}
+
+	if (ctx->num_to_drop_index) {
+		/* Really drop the indexes that were dropped.
+		The transaction had to be committed first
+		(after renaming the indexes), so that in the
+		event of a crash, crash recovery will drop the
+		indexes, because it drops all indexes whose
+		names start with TEMP_INDEX_PREFIX. Once we
+		have started dropping an index tree, there is
+		no way to roll it back. */
+
+		for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+			dict_index_t*	index = ctx->drop_index[i];
+			DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+			DBUG_ASSERT(index->table == ctx->new_table);
+			DBUG_ASSERT(index->to_be_dropped);
+
+			/* Replace the indexes in foreign key
+			constraints if needed. */
+
+			if (!dict_foreign_replace_index(
+				    index->table, ctx->col_names, index)) {
+				found = false;
+			}
+
+			/* Mark the index dropped
+			in the data dictionary cache. */
 			rw_lock_x_lock(dict_index_get_lock(index));
-			index->to_be_dropped = FALSE;
+			index->page = FIL_NULL;
 			rw_lock_x_unlock(dict_index_get_lock(index));
 		}
 
-		goto func_exit;
+		trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+		row_merge_drop_indexes_dict(trx, ctx->new_table->id);
+
+		for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+			dict_index_t*	index = ctx->drop_index[i];
+			DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+			DBUG_ASSERT(index->table == ctx->new_table);
+
+			if (index->type & DICT_FTS) {
+				DBUG_ASSERT(index->type == DICT_FTS
+					    || (index->type
+						& DICT_CORRUPT));
+				DBUG_ASSERT(index->table->fts);
+				fts_drop_index(index->table, index, trx);
+			}
+
+			dict_index_remove_from_cache(index->table, index);
+		}
+
+		trx_commit_for_mysql(trx);
 	}
 
-	/* Drop indexes marked to be dropped */
+	DBUG_RETURN(found);
+}
 
-	index = dict_table_get_first_index(prebuilt->table);
+/** Adjust the persistent statistics after non-rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param ha_alter_info	Data used during in-place alter
+@param ctx		In-place ALTER TABLE context
+@param altered_table	MySQL table that is being altered
+@param table_name	Table name in MySQL
+@param thd		MySQL connection
+*/
+static
+void
+alter_stats_norebuild(
+/*==================*/
+	Alter_inplace_info*		ha_alter_info,
+	ha_innobase_inplace_ctx*	ctx,
+	TABLE*				altered_table,
+	const char*			table_name,
+	THD*				thd)
+{
+	ulint	i;
 
-	while (index) {
-		dict_index_t*	next_index;
+	DBUG_ENTER("alter_stats_norebuild");
+	DBUG_ASSERT(!ctx->need_rebuild());
 
-		next_index = dict_table_get_next_index(index);
+	if (!dict_stats_is_persistent_enabled(ctx->new_table)) {
+		DBUG_VOID_RETURN;
+	}
 
-		if (index->to_be_dropped) {
+	/* TODO: This will not drop the (unused) statistics for
+	FTS_DOC_ID_INDEX if it was a hidden index, dropped together
+	with the last remaining FULLTEXT index. */
+	for (i = 0; i < ha_alter_info->index_drop_count; i++) {
+		const KEY* key = ha_alter_info->index_drop_buffer[i];
 
-			row_merge_drop_index(index, prebuilt->table, trx);
+		if (key->flags & HA_FULLTEXT) {
+			/* There are no index cardinality
+			statistics for FULLTEXT indexes. */
+			continue;
 		}
 
-		index = next_index;
+		char	errstr[1024];
+
+		if (dict_stats_drop_index(
+			    ctx->new_table->name, key->name,
+			    errstr, sizeof errstr) != DB_SUCCESS) {
+			push_warning(thd,
+				     Sql_condition::WARN_LEVEL_WARN,
+				     ER_LOCK_WAIT_TIMEOUT, errstr);
+		}
 	}
 
-	/* Check that all flagged indexes were dropped. */
-	for (index = dict_table_get_first_index(prebuilt->table);
-	     index; index = dict_table_get_next_index(index)) {
-		ut_a(!index->to_be_dropped);
+	for (i = 0; i < ctx->num_to_add_index; i++) {
+		dict_index_t*	index = ctx->add_index[i];
+		DBUG_ASSERT(index->table == ctx->new_table);
+
+		if (!(index->type & DICT_FTS)) {
+			dict_stats_init(ctx->new_table);
+			dict_stats_update_for_index(index);
+		}
 	}
 
-	/* We will need to rebuild index translation table. Set
-	valid index entry count in the translation table to zero */
-	share->idx_trans_tbl.index_count = 0;
+	DBUG_VOID_RETURN;
+}
 
-func_exit:
-	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
-	trx_commit_for_mysql(trx);
-	trx_commit_for_mysql(prebuilt->trx);
-	row_mysql_unlock_data_dictionary(trx);
+/** Adjust the persistent statistics after rebuilding ALTER TABLE.
+Recalculate the persistent statistics of the rebuilt table, unless the table
+is discarded or persistent statistics are not enabled for it.
+@param table		InnoDB table that was rebuilt by ALTER TABLE
+@param table_name	Table name in MySQL
+@param thd		MySQL connection
+*/
+static
+void
+alter_stats_rebuild(
+/*================*/
+	dict_table_t*	table,
+	const char*	table_name,
+	THD*		thd)
+{
+	DBUG_ENTER("alter_stats_rebuild");
+
+	if (dict_table_is_discarded(table)
+	    || !dict_stats_is_persistent_enabled(table)) {
+		DBUG_VOID_RETURN;
+	}
+
+	dberr_t	ret;
+
+	ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+
+	if (ret != DB_SUCCESS) {
+		push_warning_printf(
+			thd,
+			Sql_condition::WARN_LEVEL_WARN,
+			ER_ALTER_INFO,
+			"Error updating stats for table '%s' "
+			"after table rebuild: %s",
+			table_name, ut_strerr(ret));
+	}
+
+	DBUG_VOID_RETURN;
+}
+
+#ifndef DBUG_OFF
+# define DBUG_INJECT_CRASH(prefix, count)			\
+do {								\
+	char buf[32];						\
+	ut_snprintf(buf, sizeof buf, prefix "_%u", count);	\
+	DBUG_EXECUTE_IF(buf, DBUG_SUICIDE(););			\
+} while (0)
+#else
+# define DBUG_INJECT_CRASH(prefix, count)
+#endif
+
+/** Commit or rollback the changes made during
+prepare_inplace_alter_table() and inplace_alter_table() inside
+the storage engine. Note that the allowed level of concurrency
+during this operation will be the same as for
+inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were
+blocked during prepare, but might not be during commit).
+@param altered_table	TABLE object for new version of table.
+@param ha_alter_info	Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@param commit		true => Commit, false => Rollback.
+@retval true		Failure
+@retval false		Success
+*/
+UNIV_INTERN
+bool
+ha_innobase::commit_inplace_alter_table(
+/*====================================*/
+	TABLE*			altered_table,
+	Alter_inplace_info*	ha_alter_info,
+	bool			commit)
+{
+	ha_innobase_inplace_ctx*	ctx0
+		= static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+#ifndef DBUG_OFF
+	uint				crash_inject_count	= 1;
+	uint				crash_fail_inject_count	= 1;
+	uint				failure_inject_count	= 1;
+#endif
+
+	DBUG_ENTER("commit_inplace_alter_table");
+	DBUG_ASSERT(!srv_read_only_mode);
+	DBUG_ASSERT(!ctx0 || ctx0->prebuilt == prebuilt);
+	DBUG_ASSERT(!ctx0 || ctx0->old_table == prebuilt->table);
+
+	DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter");
+
+	DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait");
+
+	if (!commit) {
+		/* A rollback is being requested. So far we may at
+		most have created some indexes. If any indexes were to
+		be dropped, they would actually be dropped in this
+		method if commit=true. */
+		DBUG_RETURN(rollback_inplace_alter_table(
+				    ha_alter_info, table, prebuilt));
+	}
+
+	if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+		DBUG_ASSERT(!ctx0);
+		MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+		DBUG_RETURN(false);
+	}
+
+	DBUG_ASSERT(ctx0);
+
+	inplace_alter_handler_ctx**	ctx_array;
+	inplace_alter_handler_ctx*	ctx_single[2];
+
+	ctx_single[0] = ctx0;
+	ctx_single[1] = NULL;
+	ctx_array = ctx_single;
+
+	DBUG_ASSERT(ctx0 == ctx_array[0]);
+	ut_ad(prebuilt->table == ctx0->old_table);
+
+	/* Free the ctx->trx of other partitions, if any. We will only
+	use the ctx0->trx here. Others may have been allocated in
+	the prepare stage. */
+
+	for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx;
+	     pctx++) {
+		ha_innobase_inplace_ctx*	ctx
+			= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+		if (ctx->trx) {
+			trx_free_for_mysql(ctx->trx);
+			ctx->trx = NULL;
+		}
+	}
+
+	trx_start_if_not_started_xa(prebuilt->trx);
+
+	for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
+		ha_innobase_inplace_ctx*	ctx
+			= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+		DBUG_ASSERT(ctx->prebuilt->trx == prebuilt->trx);
+
+		/* Exclusively lock the table, to ensure that no other
+		transaction is holding locks on the table while we
+		change the table definition. The MySQL meta-data lock
+		should normally guarantee that no conflicting locks
+		exist. However, FOREIGN KEY constraints checks and any
+		transactions collected during crash recovery could be
+		holding InnoDB locks only, not MySQL locks. */
+
+		dberr_t error = row_merge_lock_table(
+			prebuilt->trx, ctx->old_table, LOCK_X);
+
+		if (error != DB_SUCCESS) {
+			my_error_innodb(
+				error, table_share->table_name.str, 0);
+			DBUG_RETURN(true);
+		}
+	}
+
+	DEBUG_SYNC(user_thd, "innodb_alter_commit_after_lock_table");
+
+	const bool	new_clustered	= ctx0->need_rebuild();
+	trx_t*		trx		= ctx0->trx;
+	bool		fail		= false;
+
+	if (new_clustered) {
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+			DBUG_ASSERT(ctx->need_rebuild());
+
+			if (ctx->old_table->fts) {
+				ut_ad(!ctx->old_table->fts->add_wq);
+				fts_optimize_remove_table(
+					ctx->old_table);
+			}
+
+			if (ctx->new_table->fts) {
+				ut_ad(!ctx->new_table->fts->add_wq);
+				fts_optimize_remove_table(
+					ctx->new_table);
+			}
+		}
+	}
+
+	if (!trx) {
+		DBUG_ASSERT(!new_clustered);
+		trx = innobase_trx_allocate(user_thd);
+	}
+
+	trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
+	or lock waits can happen in it during the data dictionary operation. */
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Prevent the background statistics collection from accessing
+	the tables. */
+	for (;;) {
+		bool	retry = false;
+
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+			DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+			if (new_clustered
+			    && !dict_stats_stop_bg(ctx->old_table)) {
+				retry = true;
+			}
+
+			if (!dict_stats_stop_bg(ctx->new_table)) {
+				retry = true;
+			}
+		}
+
+		if (!retry) {
+			break;
+		}
+
+		DICT_STATS_BG_YIELD(trx);
+	}
+
+	/* Apply the changes to the data dictionary tables, for all
+	partitions. */
+
+	for (inplace_alter_handler_ctx** pctx = ctx_array;
+	     *pctx && !fail; pctx++) {
+		ha_innobase_inplace_ctx*	ctx
+			= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+		DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+		ctx->max_autoinc = commit_get_autoinc(
+			ha_alter_info, ctx, altered_table, table);
+
+		if (ctx->need_rebuild()) {
+			ctx->tmp_name = dict_mem_create_temporary_tablename(
+				ctx->heap, ctx->new_table->name,
+				ctx->new_table->id);
+
+			fail = commit_try_rebuild(
+				ha_alter_info, ctx, altered_table, table,
+				trx, table_share->table_name.str);
+		} else {
+			fail = commit_try_norebuild(
+				ha_alter_info, ctx, table, trx,
+				table_share->table_name.str);
+		}
+		DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+				  crash_inject_count++);
+#ifndef DBUG_OFF
+		{
+			/* Generate a dynamic dbug text. */
+			char buf[32];
+			ut_snprintf(buf, sizeof buf, "ib_commit_inplace_fail_%u",
+				    failure_inject_count++);
+			DBUG_EXECUTE_IF(buf,
+					my_error(ER_INTERNAL_ERROR, MYF(0),
+						 "Injected error!");
+					fail = true;
+			);
+		}
+#endif
+	}
+
+	/* Commit or roll back the changes to the data dictionary. */
+
+	if (fail) {
+		trx_rollback_for_mysql(trx);
+	} else if (!new_clustered) {
+		trx_commit_for_mysql(trx);
+	} else {
+		mtr_t	mtr;
+		mtr_start(&mtr);
+
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+			DBUG_ASSERT(ctx->need_rebuild());
+			/* Generate the redo log for the file
+			operations that will be performed in
+			commit_cache_rebuild(). */
+			fil_mtr_rename_log(ctx->old_table->space,
+					   ctx->old_table->name,
+					   ctx->new_table->space,
+					   ctx->new_table->name,
+					   ctx->tmp_name, &mtr);
+			DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+					  crash_inject_count++);
+		}
+
+		/* Test what happens on crash if the redo logs
+		are flushed to disk here. The log records
+		about the rename should not be committed, and
+		the data dictionary transaction should be
+		rolled back, restoring the old table. */
+		DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit",
+				log_buffer_flush_to_disk();
+				DBUG_SUICIDE(););
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+		ut_ad(!trx->fts_trx);
+		ut_ad(trx->insert_undo || trx->update_undo);
+
+		/* The following call commits the
+		mini-transaction, making the data dictionary
+		transaction committed at mtr.end_lsn. The
+		transaction becomes 'durable' by the time when
+		log_buffer_flush_to_disk() returns. In the
+		logical sense the commit in the file-based
+		data structures happens here. */
+		trx_commit_low(trx, &mtr);
+
+		/* If server crashes here, the dictionary in
+		InnoDB and MySQL will differ.  The .ibd files
+		and the .frm files must be swapped manually by
+		the administrator. No loss of data. */
+		DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
+				log_buffer_flush_to_disk();
+				DBUG_SUICIDE(););
+	}
 
 	/* Flush the log to reduce probability that the .frm files and
 	the InnoDB data dictionary get out-of-sync if the user runs
@@ -1404,12 +5560,367 @@ func_exit:
 
 	log_buffer_flush_to_disk();
 
-	trx_free_for_mysql(trx);
+	/* At this point, the changes to the persistent storage have
+	been committed or rolled back. What remains to be done is to
+	update the in-memory structures, close some handles, release
+	temporary files, and (unless we rolled back) update persistent
+	statistics. */
+	dberr_t	error		= DB_SUCCESS;
+
+	for (inplace_alter_handler_ctx** pctx = ctx_array;
+	     *pctx; pctx++) {
+		ha_innobase_inplace_ctx*	ctx
+			= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+		DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+		if (new_clustered) {
+			innobase_online_rebuild_log_free(ctx->old_table);
+		}
+
+		if (fail) {
+			if (new_clustered) {
+				dict_table_close(ctx->new_table,
+						 TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+				/* Nobody should have initialized the
+				stats of the newly created table
+				yet. When this is the case, we know
+				that it has not been added for
+				background stats gathering. */
+				ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+				trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+				row_merge_drop_table(trx, ctx->new_table);
+				trx_commit_for_mysql(trx);
+				ctx->new_table = NULL;
+			} else {
+				/* We failed, but did not rebuild the table.
+				Roll back any ADD INDEX, or get rid of garbage
+				ADD INDEX that was left over from a previous
+				ALTER TABLE statement. */
+				trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+				innobase_rollback_sec_index(
+					ctx->new_table, table, TRUE, trx);
+				trx_commit_for_mysql(trx);
+			}
+			DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+					  crash_fail_inject_count++);
+
+			continue;
+		}
+
+		innobase_copy_frm_flags_from_table_share(
+			ctx->new_table, altered_table->s);
+
+		if (new_clustered) {
+			/* We will reload and refresh the
+			in-memory foreign key constraint
+			metadata. This is a rename operation
+			in preparing for dropping the old
+			table. Set the table to_be_dropped bit
+			here, so to make sure DML foreign key
+			constraint check does not use the
+			stale dict_foreign_t. This is done
+			because WL#6049 (FK MDL) has not been
+			implemented yet. */
+			ctx->old_table->to_be_dropped = true;
+
+			/* Rename the tablespace files. */
+			commit_cache_rebuild(ctx);
+
+			error = innobase_update_foreign_cache(ctx);
+			if (error != DB_SUCCESS) {
+				goto foreign_fail;
+			}
+		} else {
+			error = innobase_update_foreign_cache(ctx);
+
+			if (error != DB_SUCCESS) {
+foreign_fail:
+				/* The data dictionary cache
+				should be corrupted now.  The
+				best solution should be to
+				kill and restart the server,
+				but the *.frm file has not
+				been replaced yet. */
+				my_error(ER_CANNOT_ADD_FOREIGN,
+					 MYF(0));
+				sql_print_error(
+					"InnoDB: dict_load_foreigns()"
+					" returned %u for %s",
+					(unsigned) error,
+					thd_query_string(user_thd)
+					->str);
+				ut_ad(0);
+			} else {
+				if (!commit_cache_norebuild(
+					    ctx, table, trx)) {
+					ut_a(!prebuilt->trx->check_foreigns);
+				}
+
+				innobase_rename_columns_cache(
+					ha_alter_info, table,
+					ctx->new_table);
+			}
+		}
+		DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+				  crash_inject_count++);
+	}
+
+	/* Invalidate the index translation table. In partitioned
+	tables, there is one TABLE_SHARE (and also only one TABLE)
+	covering all partitions. */
+	share->idx_trans_tbl.index_count = 0;
+
+	if (trx == ctx0->trx) {
+		ctx0->trx = NULL;
+	}
 
 	/* Tell the InnoDB server that there might be work for
 	utility threads: */
 
 	srv_active_wake_master_thread();
 
-	DBUG_RETURN(err);
+	if (fail) {
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>
+				(*pctx);
+			DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+			ut_d(dict_table_check_for_dup_indexes(
+				     ctx->old_table,
+				     CHECK_ABORTED_OK));
+			ut_a(fts_check_cached_index(ctx->old_table));
+			DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+					  crash_fail_inject_count++);
+		}
+
+		row_mysql_unlock_data_dictionary(trx);
+		trx_free_for_mysql(trx);
+		DBUG_RETURN(true);
+	}
+
+	/* Release the table locks. */
+	trx_commit_for_mysql(prebuilt->trx);
+
+	DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", DBUG_SUICIDE(););
+
+	for (inplace_alter_handler_ctx** pctx = ctx_array;
+	     *pctx; pctx++) {
+		ha_innobase_inplace_ctx*	ctx
+			= static_cast<ha_innobase_inplace_ctx*>
+			(*pctx);
+		DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+		if (altered_table->found_next_number_field) {
+			dict_table_t* t = ctx->new_table;
+
+			dict_table_autoinc_lock(t);
+			dict_table_autoinc_initialize(t, ctx->max_autoinc);
+			dict_table_autoinc_unlock(t);
+		}
+
+		bool	add_fts	= false;
+
+		/* Publish the created fulltext index, if any.
+		Note that a fulltext index can be created without
+		creating the clustered index, if there already exists
+		a suitable FTS_DOC_ID column. If not, one will be
+		created, implying new_clustered */
+		for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+			dict_index_t*	index = ctx->add_index[i];
+
+			if (index->type & DICT_FTS) {
+				DBUG_ASSERT(index->type == DICT_FTS);
+				fts_add_index(index, ctx->new_table);
+				add_fts = true;
+			}
+		}
+
+		ut_d(dict_table_check_for_dup_indexes(
+			     ctx->new_table, CHECK_ALL_COMPLETE));
+
+		if (add_fts) {
+			fts_optimize_add_table(ctx->new_table);
+		}
+
+		ut_d(dict_table_check_for_dup_indexes(
+			     ctx->new_table, CHECK_ABORTED_OK));
+		ut_a(fts_check_cached_index(ctx->new_table));
+
+		if (new_clustered) {
+			/* Since the table has been rebuilt, we remove
+			all persistent statistics corresponding to the
+			old copy of the table (which was renamed to
+			ctx->tmp_name). */
+
+			char	errstr[1024];
+
+			DBUG_ASSERT(0 == strcmp(ctx->old_table->name,
+						ctx->tmp_name));
+
+			if (dict_stats_drop_table(
+				    ctx->new_table->name,
+				    errstr, sizeof(errstr))
+			    != DB_SUCCESS) {
+				push_warning_printf(
+					user_thd,
+					Sql_condition::WARN_LEVEL_WARN,
+					ER_ALTER_INFO,
+					"Deleting persistent statistics"
+					" for rebuilt table '%s' in"
+					" InnoDB failed: %s",
+					table->s->table_name.str,
+					errstr);
+			}
+
+			DBUG_EXECUTE_IF("ib_ddl_crash_before_commit",
+					DBUG_SUICIDE(););
+
+			trx_t* const	user_trx = prebuilt->trx;
+
+			row_prebuilt_free(ctx->prebuilt, TRUE);
+
+			/* Drop the copy of the old table, which was
+			renamed to ctx->tmp_name at the atomic DDL
+			transaction commit.  If the system crashes
+			before this is completed, some orphan tables
+			with ctx->tmp_name may be recovered. */
+			trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+			row_merge_drop_table(trx, ctx->old_table);
+			trx_commit_for_mysql(trx);
+
+			/* Rebuild the prebuilt object. */
+			ctx->prebuilt = row_create_prebuilt(
+				ctx->new_table, altered_table->s->reclength);
+			trx_start_if_not_started(user_trx);
+			user_trx->will_lock++;
+			prebuilt->trx = user_trx;
+		}
+		DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+				  crash_inject_count++);
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_mysql(trx);
+
+	/* TODO: The following code could be executed
+	while allowing concurrent access to the table
+	(MDL downgrade). */
+
+	if (new_clustered) {
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>
+				(*pctx);
+			DBUG_ASSERT(ctx->need_rebuild());
+
+			alter_stats_rebuild(
+				ctx->new_table, table->s->table_name.str,
+				user_thd);
+			DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+					  crash_inject_count++);
+		}
+	} else {
+		for (inplace_alter_handler_ctx** pctx = ctx_array;
+		     *pctx; pctx++) {
+			ha_innobase_inplace_ctx*	ctx
+				= static_cast<ha_innobase_inplace_ctx*>
+				(*pctx);
+			DBUG_ASSERT(!ctx->need_rebuild());
+
+			alter_stats_norebuild(
+				ha_alter_info, ctx, altered_table,
+				table->s->table_name.str, user_thd);
+			DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+					  crash_inject_count++);
+		}
+	}
+
+	/* TODO: Also perform DROP TABLE and DROP INDEX after
+	the MDL downgrade. */
+
+#ifndef DBUG_OFF
+	dict_index_t* clust_index = dict_table_get_first_index(
+		prebuilt->table);
+	DBUG_ASSERT(!clust_index->online_log);
+	DBUG_ASSERT(dict_index_get_online_status(clust_index)
+		    == ONLINE_INDEX_COMPLETE);
+
+	for (dict_index_t* index = dict_table_get_first_index(
+		     prebuilt->table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+		DBUG_ASSERT(!index->to_be_dropped);
+	}
+#endif /* DBUG_OFF */
+
+	MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+	DBUG_RETURN(false);
+}
+
+/**
+Set up a sequence of auto-increment values. The sequence starts out
+exhausted (m_eof) when no session is given or max_value is not
+positive; otherwise the session's auto-increment offset and increment
+are fetched with thd_get_autoinc() and used to pick the first value.
+@param thd - the session
+@param start_value - the lower bound
+@param max_value - the upper bound (inclusive) */
+UNIV_INTERN
+ib_sequence_t::ib_sequence_t(
+	THD*		thd,
+	ulonglong	start_value,
+	ulonglong	max_value)
+	:
+	m_max_value(max_value),
+	m_increment(0),
+	m_offset(0),
+	m_next_value(start_value),
+	m_eof(false)
+{
+	/* Nothing can be generated without a session or with an
+	empty range: mark the sequence as finished right away. */
+	if (thd == 0 || !(m_max_value > 0)) {
+		m_eof = true;
+		return;
+	}
+
+	thd_get_autoinc(thd, &m_offset, &m_increment);
+
+	const bool	custom_step = (m_increment > 1 || m_offset > 1);
+
+	if (custom_step) {
+		/* With an explicit offset or increment the exact
+		first value has to be computed. */
+		m_next_value = innobase_next_autoinc(
+			start_value, 1,
+			m_increment, m_offset, m_max_value);
+	} else if (start_value == 0) {
+		/* Zero is never handed out as the next value. */
+		m_next_value = 1;
+	}
+}
+
+/**
+Postfix increment. Hands out the current value and advances the
+sequence with innobase_next_autoinc(). The sequence is flagged as
+exhausted (m_eof) when it could not advance and sits at m_max_value.
+@return the next value to insert */
+UNIV_INTERN
+ulonglong
+ib_sequence_t::operator++(int) UNIV_NOTHROW
+{
+	ut_ad(!m_eof);
+	ut_ad(m_max_value > 0);
+
+	const ulonglong	handed_out = m_next_value;
+
+	m_next_value = innobase_next_autoinc(
+		handed_out, 1, m_increment, m_offset, m_max_value);
+
+	const bool	stuck_at_max = (m_next_value == m_max_value
+					&& handed_out == m_next_value);
+
+	if (stuck_at_max) {
+		m_eof = true;
+	}
+
+	return(handed_out);
 }
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index 71808bcd0e6..22fae62038f 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2013, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -44,40 +44,38 @@ Created July 18, 2007 Vasil Dimov
 #include <sql_plugin.h>
 #include <innodb_priv.h>
 
-extern "C" {
-#include "btr0pcur.h"	/* for file sys_tables related info. */
+#include "btr0pcur.h"
 #include "btr0types.h"
-#include "dict0load.h" /* for file sys_tables related info. */
+#include "dict0dict.h"
+#include "dict0load.h"
 #include "buf0buddy.h"
 #include "buf0buf.h"
 #include "ibuf0ibuf.h"
 #include "dict0mem.h"
 #include "dict0types.h"
-#include "srv0srv.h" /* for srv_max_changed_pages */
-#include "dict0boot.h"
 #include "ha_prototypes.h"
 #include "srv0start.h"
+#include "srv0srv.h"
 #include "trx0i_s.h"
-#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
-#include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */
-#include "trx0rseg.h"
-#include "trx0undo.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "fut0fut.h"
+#include "pars0pars.h"
+#include "fts0types.h"
+#include "fts0opt.h"
+#include "fts0priv.h"
 #include "log0online.h"
 #include "btr0btr.h"
 #include "page0zip.h"
-#include "log0log.h"
-}
 
 /** structure associates a name string with a file page type and/or buffer
 page state. */
-struct buffer_page_desc_str_struct{
+struct buf_page_desc_t{
 	const char*	type_str;	/*!< String explain the page
 					type/state */
 	ulint		type_value;	/*!< Page type or page state */
 };
 
-typedef struct buffer_page_desc_str_struct	buf_page_desc_str_t;
-
 /** Change buffer B-tree page */
 #define	I_S_PAGE_TYPE_IBUF		(FIL_PAGE_TYPE_LAST + 1)
 
@@ -90,7 +88,7 @@ in i_s_page_type[] array */
 #define I_S_PAGE_TYPE_INDEX		1
 
 /** Name string for File Page Types */
-static buf_page_desc_str_t	i_s_page_type[] = {
+static buf_page_desc_t	i_s_page_type[] = {
 	{"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED},
 	{"INDEX", FIL_PAGE_INDEX},
 	{"UNDO_LOG", FIL_PAGE_UNDO_LOG},
@@ -116,7 +114,7 @@ static buf_page_desc_str_t	i_s_page_type[] = {
 /** This structure defines information we will fetch from pages
 currently cached in the buffer pool. It will be used to populate
 table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */
-struct buffer_page_info_struct{
+struct buf_page_info_t{
 	ulint		block_id;	/*!< Buffer Pool block ID */
 	unsigned	space_id:32;	/*!< Tablespace ID */
 	unsigned	page_num:32;	/*!< Page number/offset */
@@ -138,9 +136,9 @@ struct buffer_page_info_struct{
 					/*!< Compressed page size */
 	unsigned	page_state:BUF_PAGE_STATE_BITS; /*!< Page state */
 	unsigned	page_type:4;	/*!< Page type */
-	unsigned	num_recs;
+	unsigned	num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
 					/*!< Number of records on Page */
-	unsigned	data_size;
+	unsigned	data_size:UNIV_PAGE_SIZE_SHIFT_MAX;
 					/*!< Sum of the sizes of the records */
 	lsn_t		newest_mod;	/*!< Log sequence number of
 					the youngest modification */
@@ -149,12 +147,9 @@ struct buffer_page_info_struct{
 	index_id_t	index_id;	/*!< Index ID if a index page */
 };
 
-typedef struct buffer_page_info_struct	buf_page_info_t;
-
 /** maximum number of buffer page info we would cache. */
 #define MAX_BUF_INFO_CACHED		10000
 
-
 #define OK(expr)		\
 	if ((expr) != 0) {	\
 		DBUG_RETURN(1);	\
@@ -163,7 +158,7 @@ typedef struct buffer_page_info_struct	buf_page_info_t;
 #define RETURN_IF_INNODB_NOT_STARTED(plugin_name)			\
 do {									\
 	if (!srv_was_started) {						\
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,	\
 				    ER_CANT_FIND_SYSTEM_REC,		\
 				    "InnoDB: SELECTing from "		\
 				    "INFORMATION_SCHEMA.%s but "	\
@@ -173,7 +168,7 @@ do {									\
 	}								\
 } while (0)
 
-#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && 	\
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 &&	\
 	!defined __INTEL_COMPILER && !defined __clang__
 #define STRUCT_FLD(name, value)	name: value
 #else
@@ -231,7 +226,7 @@ trx_i_s_common_fill_table(
 /*======================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond);	/*!< in: condition (not used) */
+	Item*		);	/*!< in: condition (not used) */
 
 /*******************************************************************//**
 Unbind a dynamic INFORMATION_SCHEMA table.
@@ -255,16 +250,20 @@ field_store_time_t(
 	MYSQL_TIME	my_time;
 	struct tm	tm_time;
 
+	if (time) {
 #if 0
-	/* use this if you are sure that `variables' and `time_zone'
-	are always initialized */
-	thd->variables.time_zone->gmt_sec_to_TIME(
-		&my_time, (my_time_t) time);
+		/* use this if you are sure that `variables' and `time_zone'
+		are always initialized */
+		thd->variables.time_zone->gmt_sec_to_TIME(
+			&my_time, (my_time_t) time);
 #else
-	localtime_r(&time, &tm_time);
-	localtime_to_TIME(&my_time, &tm_time);
-	my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+		localtime_r(&time, &tm_time);
+		localtime_to_TIME(&my_time, &tm_time);
+		my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
 #endif
+	} else {
+		memset(&my_time, 0, sizeof(my_time));
+	}
 
 	return(field->store_time(&my_time));
 }
@@ -297,6 +296,43 @@ field_store_string(
 }
 
 /*******************************************************************//**
+Store the name of an index in a MYSQL_TYPE_VARCHAR field.
+Handles the names of incomplete secondary indexes: when the name starts
+with TEMP_INDEX_PREFIX, that first byte is replaced with '?' in a local
+copy before storing. The copy is limited to the size of the local
+buffer, so an over-long name is truncated instead of overrunning it.
+The field is always marked NOT NULL afterwards.
+@return	0 on success */
+static
+int
+field_store_index_name(
+/*===================*/
+	Field*		field,		/*!< in/out: target field for
+					storage */
+	const char*	index_name)	/*!< in: NUL-terminated utf-8
+					index name, possibly starting with
+					TEMP_INDEX_PREFIX */
+{
+	int	ret;
+
+	ut_ad(index_name != NULL);
+	ut_ad(field->real_type() == MYSQL_TYPE_VARCHAR);
+
+	/* Since TEMP_INDEX_PREFIX is not a valid UTF8, we need to convert
+	it to something else. */
+	if (index_name[0] == TEMP_INDEX_PREFIX) {
+		char	buf[NAME_LEN + 1];
+		size_t	len = strlen(index_name);
+
+		/* len >= 1 here because the first byte is the prefix.
+		Never write past buf, whatever the caller passed in. */
+		if (len > sizeof(buf) - 1) {
+			len = sizeof(buf) - 1;
+		}
+
+		buf[0] = '?';
+		memcpy(buf + 1, index_name + 1, len - 1);
+		buf[len] = '\0';
+
+		ret = field->store(buf, len, system_charset_info);
+	} else {
+		ret = field->store(index_name, strlen(index_name),
+				   system_charset_info);
+	}
+
+	field->set_notnull();
+
+	return(ret);
+}
+
+/*******************************************************************//**
 Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
 If the value is ULINT_UNDEFINED then the field it set to NULL.
 @return	0 on success */
@@ -523,6 +559,24 @@ static ST_FIELD_INFO	innodb_trx_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
+#define IDX_TRX_READ_ONLY		22
+	{STRUCT_FLD(field_name,		"trx_is_read_only"),
+	 STRUCT_FLD(field_length,	1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_AUTOCOMMIT_NON_LOCKING	23
+	{STRUCT_FLD(field_name,		"trx_autocommit_non_locking"),
+	 STRUCT_FLD(field_length,	1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
 	END_OF_ST_FIELD_INFO
 };
 
@@ -669,6 +723,15 @@ fill_innodb_trx_from_cache(
 		OK(fields[IDX_TRX_ADAPTIVE_HASH_TIMEOUT]->store(
 			   (longlong) row->trx_search_latch_timeout, true));
 
+		/* trx_is_read_only*/
+		OK(fields[IDX_TRX_READ_ONLY]->store(
+				(long) row->trx_is_read_only, true));
+
+		/* trx_is_autocommit_non_locking */
+		OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store(
+				(long) row->trx_is_autocommit_non_locking,
+				true));
+
 		OK(schema_table_store_record(thd, table));
 	}
 
@@ -909,16 +972,9 @@ fill_innodb_locks_from_cache(
 
 		/* lock_index */
 		if (row->lock_index != NULL) {
-
-			bufend = innobase_convert_name(buf, sizeof(buf),
-						       row->lock_index,
-						       strlen(row->lock_index),
-						       thd, FALSE);
-			OK(fields[IDX_LOCK_INDEX]->store(buf, bufend - buf,
-							 system_charset_info));
-			fields[IDX_LOCK_INDEX]->set_notnull();
+			OK(field_store_index_name(fields[IDX_LOCK_INDEX],
+						  row->lock_index));
 		} else {
-
 			fields[IDX_LOCK_INDEX]->set_null();
 		}
 
@@ -1205,7 +1261,7 @@ trx_i_s_common_fill_table(
 /*======================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (not used) */
+	Item*		)	/*!< in: condition (not used) */
 {
 	const char*		table_name;
 	int			ret;
@@ -1365,10 +1421,10 @@ i_s_cmp_fill_low(
 /*=============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond,	/*!< in: condition (ignored) */
+	Item*		,	/*!< in: condition (ignored) */
 	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
 {
-	TABLE*	table	= (TABLE *) tables->table;
+	TABLE*	table	= (TABLE*) tables->table;
 	int	status	= 0;
 
 	DBUG_ENTER("i_s_cmp_fill_low");
@@ -1381,17 +1437,17 @@ i_s_cmp_fill_low(
 
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	for (uint i = 0; i < PAGE_ZIP_NUM_SSIZE - 1; i++) {
+	for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
 		page_zip_stat_t*	zip_stat = &page_zip_stat[i];
 
-		table->field[0]->store(PAGE_ZIP_MIN_SIZE << i);
+		table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
 
 		/* The cumulated counts are not protected by any
-		mutex.  Thus, some operation in page0zip.c could
+		mutex.  Thus, some operation in page0zip.cc could
 		increment a counter between the time we read it and
 		clear it.  We could introduce mutex protection, but it
 		could cause a measureable performance hit in
-		page0zip.c. */
+		page0zip.cc. */
 		table->field[1]->store(zip_stat->compressed);
 		table->field[2]->store(zip_stat->compressed_ok);
 		table->field[3]->store(
@@ -1422,7 +1478,7 @@ i_s_cmp_fill(
 /*=========*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmp_fill_low(thd, tables, cond, FALSE));
 }
@@ -1436,7 +1492,7 @@ i_s_cmp_reset_fill(
 /*===============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmp_fill_low(thd, tables, cond, TRUE));
 }
@@ -1572,6 +1628,354 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmp_reset =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
+/* Fields of the dynamic tables
+information_schema.innodb_cmp_per_index and
+information_schema.innodb_cmp_per_index_reset.
+Each IDX_* macro is the position in this array of the column defined
+right after it; i_s_cmp_per_index_fill_low() uses these macros to
+address the table's fields when it stores a row. */
+static ST_FIELD_INFO	i_s_cmp_per_index_fields_info[] =
+{
+#define IDX_DATABASE_NAME	0
+	{STRUCT_FLD(field_name,		"database_name"),
+	 STRUCT_FLD(field_length,	192),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TABLE_NAME		1
+	{STRUCT_FLD(field_name,		"table_name"),
+	 STRUCT_FLD(field_length,	192),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_INDEX_NAME		2
+	{STRUCT_FLD(field_name,		"index_name"),
+	 STRUCT_FLD(field_length,	192),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_COMPRESS_OPS	3
+	{STRUCT_FLD(field_name,		"compress_ops"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_COMPRESS_OPS_OK	4
+	{STRUCT_FLD(field_name,		"compress_ops_ok"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* The fill function stores compressed_usec / 1000000 here. */
+#define IDX_COMPRESS_TIME	5
+	{STRUCT_FLD(field_name,		"compress_time"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_UNCOMPRESS_OPS	6
+	{STRUCT_FLD(field_name,		"uncompress_ops"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* The fill function stores decompressed_usec / 1000000 here. */
+#define IDX_UNCOMPRESS_TIME	7
+	{STRUCT_FLD(field_name,		"uncompress_time"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table
+information_schema.innodb_cmp_per_index or
+information_schema.innodb_cmp_per_index_reset.
+One row is stored for every entry of a private copy of
+page_zip_stat_per_index. When check_global_access() returns non-zero,
+no rows are stored and 0 is returned. The return values of the
+individual field stores are not checked; only a failure of
+schema_table_store_record() makes the function return 1.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill_low(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		,	/*!< in: condition (ignored) */
+	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
+{
+	TABLE*	table = tables->table;
+	Field**	fields = table->field;
+	int	status = 0;
+
+	DBUG_ENTER("i_s_cmp_per_index_fill_low");
+
+	/* deny access to users without the PROCESS privilege; this is
+	reported as success with an empty result, not as an error */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Create a snapshot of the stats so we do not bump into lock
+	order violations with dict_sys->mutex below. */
+	mutex_enter(&page_zip_stat_per_index_mutex);
+	page_zip_stat_per_index_t		snap (page_zip_stat_per_index);
+	mutex_exit(&page_zip_stat_per_index_mutex);
+
+	/* dict_sys->mutex is held while the snapshot is walked, because
+	each entry is resolved with dict_index_find_on_id_low(). */
+	mutex_enter(&dict_sys->mutex);
+
+	page_zip_stat_per_index_t::iterator	iter;
+	ulint					i;
+
+	/* iter->first is the index id, iter->second the counters. */
+	for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) {
+
+		char		name[192];
+		dict_index_t*	index = dict_index_find_on_id_low(iter->first);
+
+		if (index != NULL) {
+			char	db_utf8[MAX_DB_UTF8_LEN];
+			char	table_utf8[MAX_TABLE_UTF8_LEN];
+
+			/* Split index->table_name into the database
+			and table parts that are reported separately. */
+			dict_fs2utf8(index->table_name,
+				     db_utf8, sizeof(db_utf8),
+				     table_utf8, sizeof(table_utf8));
+
+			field_store_string(fields[IDX_DATABASE_NAME], db_utf8);
+			field_store_string(fields[IDX_TABLE_NAME], table_utf8);
+			field_store_index_name(fields[IDX_INDEX_NAME],
+					       index->name);
+		} else {
+			/* index not found: still report the counters,
+			with "unknown" names and the raw index id in
+			place of the index name */
+			ut_snprintf(name, sizeof(name),
+				    "index_id:" IB_ID_FMT, iter->first);
+			field_store_string(fields[IDX_DATABASE_NAME],
+					   "unknown");
+			field_store_string(fields[IDX_TABLE_NAME],
+					   "unknown");
+			field_store_string(fields[IDX_INDEX_NAME],
+					   name);
+		}
+
+		fields[IDX_COMPRESS_OPS]->store(
+			iter->second.compressed);
+
+		fields[IDX_COMPRESS_OPS_OK]->store(
+			iter->second.compressed_ok);
+
+		/* The *_usec counters are divided by 1000000 before
+		being stored (presumably microseconds to whole seconds;
+		confirm against the counter definition). */
+		fields[IDX_COMPRESS_TIME]->store(
+			(long) (iter->second.compressed_usec / 1000000));
+
+		fields[IDX_UNCOMPRESS_OPS]->store(
+			iter->second.decompressed);
+
+		fields[IDX_UNCOMPRESS_TIME]->store(
+			(long) (iter->second.decompressed_usec / 1000000));
+
+		if (schema_table_store_record(thd, table)) {
+			status = 1;
+			break;
+		}
+
+		/* Release and reacquire the dict mutex to allow other
+		threads to proceed. This could eventually result in the
+		contents of INFORMATION_SCHEMA.innodb_cmp_per_index being
+		inconsistent, but it is an acceptable compromise.
+		Note that the condition also holds for i == 0, i.e.
+		right after the first row has been stored. */
+		if (i % 1000 == 0) {
+			mutex_exit(&dict_sys->mutex);
+			mutex_enter(&dict_sys->mutex);
+		}
+	}
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* The reset is performed even when storing a row failed above.
+	NOTE(review): counts added to the live statistics after the
+	snapshot was taken are presumably cleared here without ever
+	having been reported - confirm whether that is acceptable. */
+	if (reset) {
+		page_zip_reset_stat_per_index();
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index.
+Delegates to i_s_cmp_per_index_fill_low() without asking for the
+cumulated counts to be reset.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill(
+/*===================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		cond)	/*!< in: condition (ignored) */
+{
+	const ibool	reset_counts = FALSE;
+
+	return(i_s_cmp_per_index_fill_low(
+		       thd, tables, cond, reset_counts));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index_reset.
+Delegates to i_s_cmp_per_index_fill_low() and asks for the cumulated
+counts to be reset.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_reset_fill(
+/*=========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		cond)	/*!< in: condition (ignored) */
+{
+	const ibool	reset_counts = TRUE;
+
+	return(i_s_cmp_per_index_fill_low(
+		       thd, tables, cond, reset_counts));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index.
+Installs the column definitions and the fill callback in the schema
+table object.
+@return	0 on success */
+static
+int
+i_s_cmp_per_index_init(
+/*===================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	/* The trace label names this function, so that debug traces
+	are not attributed to i_s_cmp_init(). */
+	DBUG_ENTER("i_s_cmp_per_index_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_cmp_per_index_fields_info;
+	schema->fill_table = i_s_cmp_per_index_fill;
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index_reset.
+Installs the column definitions and the resetting fill callback in the
+schema table object.
+@return	0 on success */
+static
+int
+i_s_cmp_per_index_reset_init(
+/*=========================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	/* The trace label names this function, so that debug traces
+	are not attributed to i_s_cmp_reset_init(). */
+	DBUG_ENTER("i_s_cmp_per_index_reset_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_cmp_per_index_fields_info;
+	schema->fill_table = i_s_cmp_per_index_reset_fill;
+
+	DBUG_RETURN(0);
+}
+
+/* Plugin descriptor of the INFORMATION_SCHEMA table
+INNODB_CMP_PER_INDEX; the table is bound by i_s_cmp_per_index_init().
+NOTE(review): other I_S plugin descriptors in this file (for example
+i_s_innodb_cmp_reset) are declared as struct st_maria_plugin. Confirm
+that st_mysql_plugin is intended here, given that version_info and
+maturity are set below. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_per_index =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_CMP_PER_INDEX"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index)"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_cmp_per_index_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* version string and maturity level of the plugin */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Plugin descriptor of the INFORMATION_SCHEMA table
+INNODB_CMP_PER_INDEX_RESET; the table is bound by
+i_s_cmp_per_index_reset_init().
+NOTE(review): other I_S plugin descriptors in this file (for example
+i_s_innodb_cmp_reset) are declared as struct st_maria_plugin. Confirm
+that st_mysql_plugin is intended here, given that version_info and
+maturity are set below. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_per_index_reset =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_CMP_PER_INDEX_RESET"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index);"
+		   " reset cumulated counts"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_cmp_per_index_reset_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* version string and maturity level of the plugin */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
 /* Fields of the dynamic table information_schema.innodb_cmpmem. */
 static ST_FIELD_INFO	i_s_cmpmem_fields_info[] =
 {
@@ -1621,7 +2025,7 @@ static ST_FIELD_INFO	i_s_cmpmem_fields_info[] =
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	0),
 	 STRUCT_FLD(old_name,		"Total Duration of Relocations,"
-		    			" in Seconds"),
+					" in Seconds"),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
 	END_OF_ST_FIELD_INFO
@@ -1637,11 +2041,11 @@ i_s_cmpmem_fill_low(
 /*================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond,	/*!< in: condition (ignored) */
+	Item*		,	/*!< in: condition (ignored) */
 	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
 {
 	int		status = 0;
-	TABLE*	table	= (TABLE *) tables->table;
+	TABLE*	table	= (TABLE*) tables->table;
 
 	DBUG_ENTER("i_s_cmpmem_fill_low");
 
@@ -1673,13 +2077,14 @@ i_s_cmpmem_fill_low(
 			table->field[3]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES)
 				? UT_LIST_GET_LEN(buf_pool->zip_free[x])
 				: 0);
-			table->field[4]->store((longlong)
-			buddy_stat->relocated, true);
+			table->field[4]->store(
+				(longlong) buddy_stat->relocated, true);
 			table->field[5]->store(
 				(ulong) (buddy_stat->relocated_usec / 1000000));
 
 			if (reset) {
-				/* This is protected by buf_pool->mutex. */
+				/* This is protected by
+				buf_pool->zip_free_mutex. */
 				buddy_stat->relocated = 0;
 				buddy_stat->relocated_usec = 0;
 			}
@@ -1709,7 +2114,7 @@ i_s_cmpmem_fill(
 /*============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE));
 }
@@ -1723,7 +2128,7 @@ i_s_cmpmem_reset_fill(
 /*==================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE));
 }
@@ -1859,6 +2264,1645 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem_reset =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics.
+Each METRIC_* macro defined next to an entry is that column's index in
+this array; i_s_metrics_fill() uses the macros to address the Field
+array of the table being filled. Columns flagged MY_I_S_MAYBE_NULL are
+the ones i_s_metrics_fill() may set to NULL. */
+static ST_FIELD_INFO	innodb_metrics_fields_info[] =
+{
+#define	METRIC_NAME		0
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_SUBSYS		1
+	{STRUCT_FLD(field_name,		"SUBSYSTEM"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_VALUE_START	2
+	{STRUCT_FLD(field_name,		"COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MAX_VALUE_START	3
+	{STRUCT_FLD(field_name,		"MAX_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MIN_VALUE_START	4
+	{STRUCT_FLD(field_name,		"MIN_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_AVG_VALUE_START	5
+	{STRUCT_FLD(field_name,		"AVG_COUNT"),
+	 STRUCT_FLD(field_length,	MAX_FLOAT_STR_LENGTH),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_VALUE_RESET	6
+	{STRUCT_FLD(field_name,		"COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MAX_VALUE_RESET	7
+	{STRUCT_FLD(field_name,		"MAX_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MIN_VALUE_RESET	8
+	{STRUCT_FLD(field_name,		"MIN_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_AVG_VALUE_RESET	9
+	{STRUCT_FLD(field_name,		"AVG_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MAX_FLOAT_STR_LENGTH),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_START_TIME	10
+	{STRUCT_FLD(field_name,		"TIME_ENABLED"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_STOP_TIME	11
+	{STRUCT_FLD(field_name,		"TIME_DISABLED"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_TIME_ELAPSED	12
+	{STRUCT_FLD(field_name,		"TIME_ELAPSED"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_RESET_TIME	13
+	{STRUCT_FLD(field_name,		"TIME_RESET"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_STATUS		14
+	{STRUCT_FLD(field_name,		"STATUS"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_TYPE		15
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_DESC		16
+	{STRUCT_FLD(field_name,		"COMMENT"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Fill the information schema metrics table: emit one row per monitor
+counter, skipping the entries flagged MONITOR_MODULE or MONITOR_HIDDEN.
+The same Field objects are reused from one row to the next, so every
+nullable column is explicitly marked either NULL or NOT NULL for each
+row; otherwise a NULL flag set for an earlier counter would carry over.
+@return	0 on success */
+static
+int
+i_s_metrics_fill(
+/*=============*/
+	THD*		thd,		/*!< in: thread */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	int		count;
+	Field**		fields;
+	double		time_diff = 0;
+	monitor_info_t*	monitor_info;
+	mon_type_t	min_val;
+	mon_type_t	max_val;
+
+	DBUG_ENTER("i_s_metrics_fill");
+	fields = table_to_fill->field;
+
+	for (count = 0; count < NUM_MONITOR; count++) {
+		monitor_info = srv_mon_get_info((monitor_id_t) count);
+
+		/* A good place to sanity check the Monitor ID */
+		ut_a(count == monitor_info->monitor_id);
+
+		/* If the item refers to a Module or is marked hidden,
+		there is nothing to fill, continue. */
+		if ((monitor_info->monitor_type & MONITOR_MODULE)
+		    || (monitor_info->monitor_type & MONITOR_HIDDEN)) {
+			continue;
+		}
+
+		/* If this is an existing "status variable", and
+		its corresponding counter is still on, we need
+		to calculate the result from its corresponding
+		counter. */
+		if (monitor_info->monitor_type & MONITOR_EXISTING
+		    && MONITOR_IS_ON(count)) {
+			srv_mon_process_existing_counter((monitor_id_t) count,
+							 MONITOR_GET_VALUE);
+		}
+
+		/* Fill in counter's basic information */
+		OK(field_store_string(fields[METRIC_NAME],
+				      monitor_info->monitor_name));
+
+		OK(field_store_string(fields[METRIC_SUBSYS],
+				      monitor_info->monitor_module));
+
+		OK(field_store_string(fields[METRIC_DESC],
+				      monitor_info->monitor_desc));
+
+		/* Fill in counter values */
+		OK(fields[METRIC_VALUE_RESET]->store(
+			MONITOR_VALUE(count), FALSE));
+
+		OK(fields[METRIC_VALUE_START]->store(
+			MONITOR_VALUE_SINCE_START(count), FALSE));
+
+		/* If the max value is MAX_RESERVED, counter max
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MAX_VALUE(count) == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_RESET]->store(
+				MONITOR_MAX_VALUE(count), FALSE));
+			fields[METRIC_MAX_VALUE_RESET]->set_notnull();
+		}
+
+		/* If the min value is MIN_RESERVED, counter min
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_RESET]->store(
+				MONITOR_MIN_VALUE(count), FALSE));
+			fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+		}
+
+		/* Calculate the max value since counter started */
+		max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+		if (max_val == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_START]->store(
+				max_val, FALSE));
+			fields[METRIC_MAX_VALUE_START]->set_notnull();
+		}
+
+		/* Calculate the min value since counter started */
+		min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+		if (min_val == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_START]->store(
+				min_val, FALSE));
+
+			fields[METRIC_MIN_VALUE_START]->set_notnull();
+		}
+
+		/* If monitor has been enabled (no matter it is disabled
+		or not now), fill METRIC_START_TIME and METRIC_TIME_ELAPSED
+		field */
+		if (MONITOR_FIELD(count, mon_start_time)) {
+			OK(field_store_time_t(fields[METRIC_START_TIME],
+				(time_t)MONITOR_FIELD(count, mon_start_time)));
+			fields[METRIC_START_TIME]->set_notnull();
+
+			/* If monitor is enabled, the TIME_ELAPSED is the
+			time difference between current and time when monitor
+			is enabled. Otherwise, it is the time difference
+			between time when monitor is enabled and time
+			when it is disabled */
+			if (MONITOR_IS_ON(count)) {
+				time_diff = difftime(time(NULL),
+					MONITOR_FIELD(count, mon_start_time));
+			} else {
+				time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_start_time));
+			}
+
+			OK(fields[METRIC_TIME_ELAPSED]->store(
+				time_diff));
+			fields[METRIC_TIME_ELAPSED]->set_notnull();
+		} else {
+			fields[METRIC_START_TIME]->set_null();
+			fields[METRIC_TIME_ELAPSED]->set_null();
+			time_diff = 0;
+		}
+
+		/* Unless MONITOR_NO_AVERAGE is marked, we will need
+		to calculate the average value. If this is a monitor set
+		owner marked by MONITOR_SET_OWNER, divide
+		the value by another counter (number of calls) designated
+		by monitor_info->monitor_related_id.
+		Otherwise average the counter value by the time between the
+		time that the counter is enabled and time it is disabled
+		or time it is sampled. */
+		if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+		    && (monitor_info->monitor_type & MONITOR_SET_OWNER)
+		    && monitor_info->monitor_related_id) {
+			mon_type_t	value_start
+				 = MONITOR_VALUE_SINCE_START(
+					monitor_info->monitor_related_id);
+
+			if (value_start) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					MONITOR_VALUE_SINCE_START(count)
+					/ value_start, FALSE));
+
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_VALUE(monitor_info->monitor_related_id)) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					MONITOR_VALUE(count)
+					/ MONITOR_VALUE(
+					monitor_info->monitor_related_id),
+					FALSE));
+
+				/* Clear a NULL flag that may have been left
+				behind by a previously filled row; every
+				other store in this function does the same. */
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+			   && !(monitor_info->monitor_type
+				& MONITOR_DISPLAY_CURRENT)) {
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					(double) MONITOR_VALUE_SINCE_START(
+						count) / time_diff));
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				/* calculate the time difference since last
+				reset */
+				if (MONITOR_IS_ON(count)) {
+					time_diff = difftime(
+						time(NULL), MONITOR_FIELD(
+							count, mon_reset_time));
+				} else {
+					time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_reset_time));
+				}
+			} else {
+				time_diff = 0;
+			}
+
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					(double) MONITOR_VALUE(count)
+					/ time_diff));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else {
+			fields[METRIC_AVG_VALUE_START]->set_null();
+			fields[METRIC_AVG_VALUE_RESET]->set_null();
+		}
+
+
+		if (MONITOR_IS_ON(count)) {
+			/* If monitor is on, the stop time will set to NULL */
+			fields[METRIC_STOP_TIME]->set_null();
+
+			/* Display latest Monitor Reset Time only if Monitor
+			counter is on. */
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				OK(field_store_time_t(
+					fields[METRIC_RESET_TIME],
+					(time_t)MONITOR_FIELD(
+						count, mon_reset_time)));
+				fields[METRIC_RESET_TIME]->set_notnull();
+			} else {
+				fields[METRIC_RESET_TIME]->set_null();
+			}
+
+			/* Display the monitor status as "enabled" */
+			OK(field_store_string(fields[METRIC_STATUS],
+					      "enabled"));
+		} else {
+			if (MONITOR_FIELD(count, mon_stop_time)) {
+				OK(field_store_time_t(fields[METRIC_STOP_TIME],
+				(time_t)MONITOR_FIELD(count, mon_stop_time)));
+				fields[METRIC_STOP_TIME]->set_notnull();
+			} else {
+				fields[METRIC_STOP_TIME]->set_null();
+			}
+
+			fields[METRIC_RESET_TIME]->set_null();
+
+			OK(field_store_string(fields[METRIC_STATUS],
+					      "disabled"));
+		}
+
+		/* The TYPE column reports the first matching flag, tested
+		in this fixed order. */
+		if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "value"));
+		} else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "status_counter"));
+		} else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "set_owner"));
+		} else if (monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "set_member"));
+		} else {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "counter"));
+		}
+
+		OK(schema_table_store_record(thd, table_to_fill));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+fill_table callback of INFORMATION_SCHEMA.innodb_metrics. Rows are
+produced only when check_global_access() grants PROCESS_ACL to the
+caller; otherwise the table is left empty. The status reported by
+i_s_metrics_fill() is not propagated.
+@return	always 0 */
+static
+int
+i_s_metrics_fill_table(
+/*===================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	DBUG_ENTER("i_s_metrics_fill_table");
+
+	/* check_global_access() returns non-zero when access is denied */
+	if (!check_global_access(thd, PROCESS_ACL, true)) {
+		(void) i_s_metrics_fill(thd, tables->table);
+	}
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Plugin init hook for INFORMATION_SCHEMA.innodb_metrics: attaches the
+column definitions and the fill callback to the schema table object.
+@return	0 on success */
+static
+int
+innodb_metrics_init(
+/*================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("innodb_metrics_init");
+
+	ST_SCHEMA_TABLE*	schema_tbl = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema_tbl->fill_table = i_s_metrics_fill_table;
+	schema_tbl->fields_info = innodb_metrics_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_metrics =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value); this descriptor
+	registers an INFORMATION_SCHEMA table plugin */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_METRICS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB Metrics Info"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded; it binds
+	innodb_metrics_fields_info and i_s_metrics_fill_table */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_metrics_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var*; none exported by this plugin */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var**; none exported by this plugin */
+	STRUCT_FLD(system_vars, NULL),
+
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),	/* version as a string */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),	/* declared stable */
+};
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword.
+The table has a single string column, addressed as STOPWORD_VALUE.
+NOTE(review): the column width is TRX_ID_MAX_LEN + 1, a constant whose
+name suggests it was sized for transaction ids; confirm that it is wide
+enough for every entry of the default stopword list. */
+static ST_FIELD_INFO	i_s_stopword_fields_info[] =
+{
+#define STOPWORD_VALUE	0
+	{STRUCT_FLD(field_name,		"value"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+fill_table callback of information_schema.innodb_ft_default_stopword:
+stores one row for each entry of the fts_default_stopword array, up to
+(not including) its terminating null entry.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_stopword_fill(
+/*==============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	TABLE*	table = (TABLE*) tables->table;
+	Field**	fields = table->field;
+
+	DBUG_ENTER("i_s_stopword_fill");
+
+	/* Walk the server default stopword list */
+	for (ulint idx = 0; fts_default_stopword[idx]; idx++) {
+		OK(field_store_string(fields[STOPWORD_VALUE],
+				      fts_default_stopword[idx]));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin init hook for information_schema.innodb_ft_default_stopword:
+attaches the column definitions and the fill callback to the schema
+table object.
+@return	0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_stopword_init");
+
+	ST_SCHEMA_TABLE*	schema_tbl = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema_tbl->fill_table = i_s_stopword_fill;
+	schema_tbl->fields_info = i_s_stopword_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_default_stopword =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor; this must be the
+	INFORMATION_SCHEMA descriptor shared by the other plugins in this
+	file, not the column definition array (which is bound to the
+	schema table by i_s_stopword_init() instead) */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DEFAULT_STOPWORD"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_stopword_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic tables INFORMATION_SCHEMA.INNODB_FT_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED. Both tables share this
+single unsigned LONGLONG column, addressed as I_S_FTS_DOC_ID. */
+static ST_FIELD_INFO	i_s_fts_doc_fields_info[] =
+{
+#define	I_S_FTS_DOC_ID			0
+	{STRUCT_FLD(field_name,		"DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED with the doc ids fetched
+from the "DELETED" or "BEING_DELETED" auxiliary table of the table
+named by fts_internal_tbl_name. The result is left empty when the
+caller lacks PROCESS_ACL, when no table name is set, or when the table
+cannot be opened. Everything acquired here (doc id list, background
+transaction, table handle) is released on every exit path.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: BEING_DELETED table */
+{
+	Field**			fields;
+	TABLE*			table = (TABLE*) tables->table;
+	trx_t*			trx;
+	fts_table_t		fts_table;
+	fts_doc_ids_t*		deleted;
+	dict_table_t*		user_table;
+	int			ret = 0;
+
+	DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+	/* deny access to users without the PROCESS privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	/* Allocate the doc id list only once the table is known to be
+	open, so that the early return above has nothing to free. */
+	deleted = fts_doc_ids_create();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table,
+			   (being_deleted) ? "BEING_DELETED" : "DELETED",
+			   FTS_COMMON_TABLE, user_table);
+
+	fts_table_fetch_doc_ids(trx, &fts_table, deleted);
+
+	fields = table->field;
+
+	for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) {
+		doc_id_t	doc_id;
+
+		doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j);
+
+		/* Do not return from inside the loop: stop at the first
+		failure and fall through to the cleanup below. */
+		if (fields[I_S_FTS_DOC_ID]->store((longlong) doc_id, true)
+		    || schema_table_store_record(thd, table)) {
+			ret = 1;
+			break;
+		}
+	}
+
+	trx_free_for_background(trx);
+
+	fts_doc_ids_free(deleted);
+
+	dict_table_close(user_table, FALSE, FALSE);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+fill_table callback of INFORMATION_SCHEMA.INNODB_FT_DELETED; delegates
+to i_s_fts_deleted_generic_fill() with being_deleted = FALSE.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_fill(
+/*=================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_deleted_fill");
+
+	const int	status = i_s_fts_deleted_generic_fill(
+		thd, tables, FALSE);
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Plugin init hook for INFORMATION_SCHEMA.INNODB_FT_DELETED: attaches
+the column definitions and the fill callback to the schema table object.
+@return	0 on success */
+static
+int
+i_s_fts_deleted_init(
+/*=================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_deleted_init");
+
+	ST_SCHEMA_TABLE*	schema_tbl = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema_tbl->fill_table = i_s_fts_deleted_fill;
+	schema_tbl->fields_info = i_s_fts_doc_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor; this must be the
+	INFORMATION_SCHEMA descriptor shared by the other plugins in this
+	file, not the column definition array (which is bound to the
+	schema table by i_s_fts_deleted_init() instead) */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+fill_table callback of INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED;
+delegates to i_s_fts_deleted_generic_fill() with being_deleted = TRUE.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_being_deleted_fill(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_being_deleted_fill");
+
+	const int	status = i_s_fts_deleted_generic_fill(
+		thd, tables, TRUE);
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED:
+attaches the shared doc id column definitions and the BEING_DELETED
+fill callback to the schema table object.
+@return	0 on success */
+static
+int
+i_s_fts_being_deleted_init(
+/*=======================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	/* The debug trace tag must name this function, not its
+	INNODB_FT_DELETED sibling. */
+	DBUG_ENTER("i_s_fts_being_deleted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_being_deleted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_being_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor; this must be the
+	INFORMATION_SCHEMA descriptor shared by the other plugins in this
+	file, not the column definition array (which is bound to the
+	schema table by i_s_fts_being_deleted_init() instead) */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_BEING_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS BEING DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_being_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. Each I_S_FTS_* macro defined
+next to an entry is that column's index in this array. All numeric
+columns are declared as unsigned LONGLONG. */
+static ST_FIELD_INFO	i_s_fts_index_fields_info[] =
+{
+#define	I_S_FTS_WORD			0
+	{STRUCT_FLD(field_name,		"WORD"),
+	 STRUCT_FLD(field_length,	FTS_MAX_WORD_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_FIRST_DOC_ID		1
+	{STRUCT_FLD(field_name,		"FIRST_DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_LAST_DOC_ID		2
+	{STRUCT_FLD(field_name,		"LAST_DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_DOC_COUNT		3
+	{STRUCT_FLD(field_name,		"DOC_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_ILIST_DOC_ID		4
+	{STRUCT_FLD(field_name,		"DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_ILIST_DOC_POS		5
+	{STRUCT_FLD(field_name,		"POSITION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED for one FTS index on the table.
+One row is stored for every (word, doc id, position) triple decoded
+from the ilists of the cached words.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+	fts_index_cache_t*	index_cache,	/*!< in: FTS index cache */
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	const ib_rbt_node_t*	rbt_node;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	fields = table->field;
+
+	/* Go through each word in the index cache */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+
+		fts_tokenizer_word_t* word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+
+			/* Doc ids are delta-encoded within one node's
+			ilist, and the first entry of every node is encoded
+			relative to zero. The running doc id must therefore
+			restart for each node rather than once per word,
+			or the ids of later nodes would be inflated. */
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				/* first value of an entry: doc id delta */
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info; the position list of
+				a document ends at a zero byte */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						reinterpret_cast<const char*>
+						(word->text.f_str)));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						node->doc_count));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						pos));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				/* step over the terminating zero byte */
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE with the
+cached words of every FTS index of the table named by
+fts_internal_tbl_name. Nothing is reported when the caller lacks the
+PROCESS privilege, when no table name is set, or when the table cannot
+be opened or carries no FTS cache.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*		user_table;
+	fts_cache_t*		cache;
+	int			ret = 0;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	/* Never dereference a missing FTS descriptor or cache: release
+	the table and report an empty result instead of crashing. */
+	if (!user_table->fts || !user_table->fts->cache) {
+		dict_table_close(user_table, FALSE, FALSE);
+
+		DBUG_RETURN(0);
+	}
+
+	cache = user_table->fts->cache;
+
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*      index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*> (
+			ib_vector_get(cache->indexes, i));
+
+		ret = i_s_fts_index_cache_fill_one_index(
+			index_cache, thd, tables);
+
+		/* Stop at the first index that could not be stored; the
+		table is still closed below. */
+		if (ret != 0) {
+			break;
+		}
+	}
+
+	dict_table_close(user_table, FALSE, FALSE);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE:
+register its column definitions and its fill callback on the schema
+table object handed in by the caller.
+@return	0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_fts_index_cache_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	/* The two assignments are independent of each other */
+	schema_table->fill_table = i_s_fts_index_cache_fill;
+	schema_table->fields_info = i_s_fts_index_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* Plugin descriptor of INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE. The table
+is bound by i_s_fts_index_cache_init() and released by i_s_common_deinit().
+NOTE(review): i_s_innodb_sys_tables in this file is declared as
+st_maria_plugin while this descriptor is st_mysql_plugin; confirm that the
+trailing version_info/maturity entries match the members of the declared
+struct type. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_index_cache =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	/* NOTE(review): this points at the column-definition array;
+	confirm that it is the descriptor the server expects here. */
+	STRUCT_FLD(info, &i_s_fts_index_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INDEX_CACHE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX CACHED"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_index_cache_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* version string and maturity level of the plugin */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Go through a FTS index auxiliary table, fetch its rows and fill
+FTS word cache structure.
+The rows are read with an internal SQL cursor on a background
+transaction; every fetched row is handed to
+fts_optimize_index_fetch_node(), which receives "words" as its argument.
+A DB_LOCK_WAIT_TIMEOUT makes the read start over, any other error ends it.
+@return	DB_SUCCESS on success, otherwise error code */
+static
+ulint
+i_s_fts_index_table_fill_selected(
+/*==============================*/
+	dict_index_t*		index,		/*!< in: FTS index */
+	ib_vector_t*		words,		/*!< in/out: vector to hold
+						fetched words */
+	ulint			selected)	/*!< in: selected FTS index */
+{
+	pars_info_t*		info;
+	fts_table_t		fts_table;
+	trx_t*			trx;
+	que_t*			graph;
+	ulint			error;
+	fts_fetch_t		fetch;
+
+	info = pars_info_create();
+
+	fetch.read_arg = words;
+	fetch.read_record = fts_optimize_index_fetch_node;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "fetching FTS index nodes";
+
+	pars_info_bind_function(info, "my_func", fetch.read_record, &fetch);
+
+	/* Address the auxiliary table whose suffix belongs to "selected" */
+	FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected),
+			     FTS_INDEX_TABLE, index);
+
+	graph = fts_parse_sql(
+		&fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT word, doc_count, first_doc_id, last_doc_id, "
+		"ilist\n"
+		" FROM %s;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: "
+					"lock wait timeout reading "
+					"FTS index.  Retrying!\n");
+
+				/* Clear the error before evaluating the
+				graph again */
+				trx->error_state = DB_SUCCESS;
+			} else {
+				/* "error" is a ulint; cast it to the type
+				that the %lu conversion expects, the two
+				need not have the same width */
+				fprintf(stderr, "  InnoDB: Error: %lu "
+				"while reading FTS index.\n",
+				(unsigned long) error);
+				break;
+			}
+		}
+	}
+
+	/* The query graph is freed while holding dict_sys->mutex */
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*******************************************************************//**
+Store the first n_words entries of a fetched word vector as rows of
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE, one row for every
+(word, doc id, position) triple decoded from the ilist of each node.
+The function owns no memory, so leaving it early on a failed store
+cannot leak anything.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_store_words(
+/*============================*/
+	ib_vector_t*		words,		/*!< in: fetched words */
+	ulint			n_words,	/*!< in: number of words
+						to report */
+	THD*			thd,		/*!< in: thread */
+	TABLE*			table)		/*!< in/out: table to fill */
+{
+	Field**			fields = table->field;
+
+	DBUG_ENTER("i_s_fts_index_table_store_words");
+
+	/* Go through each fetched word */
+	for (ulint i = 0; i < n_words; i++) {
+		fts_word_t*	word;
+
+		word = (fts_word_t*) ib_vector_get(words, i);
+
+		/* Terminate the text so that it can be stored as a
+		C string below */
+		word->text.f_str[word->text.f_len] = 0;
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint j = 0; j < ib_vector_size(word->nodes); j++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, j));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				/* First value of an entry: doc id delta */
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info; the position list of
+				a document ends at a zero byte */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						reinterpret_cast<const char*>
+						(word->text.f_str)));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						node->doc_count));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						pos));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				/* Step over the zero byte that closed the
+				position list */
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Go through a FTS index and its auxiliary tables, fetch rows in each table
+and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+The fetched words live in a private heap that is released on every path,
+including the one where storing a row fails. At most 500000 words are
+reported.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_index(
+/*===============================*/
+	dict_index_t*		index,		/*!< in: FTS index */
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	ib_vector_t*		words;
+	mem_heap_t*		heap;
+	ulint			num_row_fill;
+	int			ret;
+
+	DBUG_ENTER("i_s_fts_index_table_fill_one_index");
+	DBUG_ASSERT(!dict_index_is_online_ddl(index));
+
+	heap = mem_heap_create(1024);
+
+	words = ib_vector_create(ib_heap_allocator_create(heap),
+				 sizeof(fts_word_t), 256);
+
+	/* Iterate through each auxiliary table as described in
+	fts_index_selector; the status of each fetch is deliberately not
+	checked, whatever was fetched is reported */
+	for (ulint selected = 0; fts_index_selector[selected].value;
+	     selected++) {
+		i_s_fts_index_table_fill_selected(index, words, selected);
+	}
+
+	num_row_fill = ut_min(ib_vector_size(words), 500000);
+
+	ret = i_s_fts_index_table_store_words(words, num_row_fill, thd, table);
+
+	mem_heap_free(heap);
+
+	DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE with the
+contents of every FTS index (index->type & DICT_FTS) of the table named
+by fts_internal_tbl_name. Nothing is reported when the caller lacks the
+PROCESS privilege, when no table name is set, or when the table cannot
+be opened.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*		user_table;
+	dict_index_t*		index;
+	int			ret = 0;
+
+	DBUG_ENTER("i_s_fts_index_table_fill");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	/* Stop at the first index that could not be stored instead of
+	discarding the failure; the table is still closed below. */
+	for (index = dict_table_get_first_index(user_table);
+	     index && ret == 0; index = dict_table_get_next_index(index)) {
+		if (index->type & DICT_FTS) {
+			ret = i_s_fts_index_table_fill_one_index(
+				index, thd, tables);
+		}
+	}
+
+	dict_table_close(user_table, FALSE, FALSE);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE:
+register its column definitions and its fill callback on the schema
+table object handed in by the caller.
+@return	0 on success */
+static
+int
+i_s_fts_index_table_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_fts_index_table_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	/* The two assignments are independent of each other */
+	schema_table->fill_table = i_s_fts_index_table_fill;
+	schema_table->fields_info = i_s_fts_index_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* Plugin descriptor of INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. The table
+is bound by i_s_fts_index_table_init() and released by i_s_common_deinit().
+NOTE(review): i_s_innodb_sys_tables in this file is declared as
+st_maria_plugin while this descriptor is st_mysql_plugin; confirm that the
+trailing version_info/maturity entries match the members of the declared
+struct type. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_index_table =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	/* NOTE(review): this points at the column-definition array;
+	confirm that it is the descriptor the server expects here. */
+	STRUCT_FLD(info, &i_s_fts_index_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INDEX_TABLE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_index_table_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* version string and maturity level of the plugin */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG.
+Each FTS_CONFIG_* macro is the position of the column defined right after
+it; i_s_fts_config_fill() uses these positions to index TABLE::field. */
+static ST_FIELD_INFO	i_s_fts_config_fields_info[] =
+{
+	/* Name of the configuration entry */
+#define	FTS_CONFIG_KEY			0
+	{STRUCT_FLD(field_name,		"KEY"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* Value read for that entry, stored as a string */
+#define	FTS_CONFIG_VALUE		1
+	{STRUCT_FLD(field_name,		"VALUE"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* End-of-list marker */
+	END_OF_ST_FIELD_INFO
+};
+
+/* Configuration keys reported by INFORMATION_SCHEMA.INNODB_FT_CONFIG, one
+row per key. The list is NULL-terminated: i_s_fts_config_fill() walks it
+until it reaches the NULL entry. Note that FTS_TOTAL_WORD_COUNT, which
+i_s_fts_config_fill() compares the keys against, is not part of this list. */
+static const char* fts_config_key[] = {
+	FTS_OPTIMIZE_LIMIT_IN_SECS,
+	FTS_SYNCED_DOC_ID,
+	FTS_STOPWORD_TABLE_NAME,
+	FTS_USE_STOPWORD,
+        NULL
+};
+
+/*******************************************************************//**
+Read the value of every key listed in fts_config_key[] from the FTS
+CONFIG table described by fts_table and store one (KEY, VALUE) row per
+key. The function owns none of its arguments, so leaving it early on a
+failed store cannot leak the transaction or the table reference.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_config_store_keys(
+/*======================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE*		table,		/*!< in/out: table to fill */
+	trx_t*		trx,		/*!< in: transaction used for
+					reading */
+	fts_table_t*	fts_table,	/*!< in: FTS CONFIG table */
+	dict_index_t*	index)		/*!< in: first FTS index of the
+					user table, or NULL */
+{
+	Field**		fields = table->field;
+	unsigned char	str[FTS_MAX_CONFIG_VALUE_LEN + 1];
+
+	DBUG_ENTER("i_s_fts_config_store_keys");
+
+	for (ulint i = 0; fts_config_key[i]; i++) {
+		fts_string_t	value;
+		char*		key_name;
+		ulint		allocated = FALSE;
+
+		value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+
+		value.f_str = str;
+
+		/* FTS_TOTAL_WORD_COUNT is looked up under a name built
+		for the index; every other key is looked up as is */
+		if (index
+		    && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) {
+			key_name = fts_config_create_index_param_name(
+				fts_config_key[i], index);
+			allocated = TRUE;
+		} else {
+			key_name = (char*) fts_config_key[i];
+		}
+
+		fts_config_get_value(trx, fts_table, key_name, &value);
+
+		/* Release the generated name before any store below can
+		return early */
+		if (allocated) {
+			ut_free(key_name);
+		}
+
+		OK(field_store_string(
+			fields[FTS_CONFIG_KEY], fts_config_key[i]));
+
+		OK(field_store_string(
+			fields[FTS_CONFIG_VALUE], (const char*) value.f_str));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG for the table
+named by fts_internal_tbl_name. Nothing is reported when the caller lacks
+the PROCESS privilege, when no table name is set, or when the table
+cannot be opened or has no FTS descriptor. The transaction and the table
+reference are released on every path.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_config_fill(
+/*================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	trx_t*			trx;
+	fts_table_t		fts_table;
+	dict_table_t*		user_table;
+	dict_index_t*		index = NULL;
+	int			ret;
+
+	DBUG_ENTER("i_s_fts_config_fill");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	/* Never dereference a missing FTS descriptor: release the table
+	and report an empty result instead of crashing. */
+	if (!user_table->fts) {
+		dict_table_close(user_table, FALSE, FALSE);
+
+		DBUG_RETURN(0);
+	}
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS CONFIG TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table);
+
+	if (!ib_vector_is_empty(user_table->fts->indexes)) {
+		index = (dict_index_t*) ib_vector_getp_const(
+				user_table->fts->indexes, 0);
+		DBUG_ASSERT(!dict_index_is_online_ddl(index));
+	}
+
+	ret = i_s_fts_config_store_keys(thd, table, trx, &fts_table, index);
+
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+
+	dict_table_close(user_table, FALSE, FALSE);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG:
+register its column definitions and its fill callback on the schema
+table object handed in by the caller.
+@return	0 on success */
+static
+int
+i_s_fts_config_init(
+/*=================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_fts_config_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	/* The two assignments are independent of each other */
+	schema_table->fill_table = i_s_fts_config_fill;
+	schema_table->fields_info = i_s_fts_config_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* Plugin descriptor of INFORMATION_SCHEMA.INNODB_FT_CONFIG. The table is
+bound by i_s_fts_config_init() and released by i_s_common_deinit().
+NOTE(review): i_s_innodb_sys_tables in this file is declared as
+st_maria_plugin while this descriptor is st_mysql_plugin; confirm that the
+trailing version_info/maturity entries match the members of the declared
+struct type. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_config =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	/* NOTE(review): this points at the column-definition array;
+	confirm that it is the descriptor the server expects here. */
+	STRUCT_FLD(info, &i_s_fts_config_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_CONFIG"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS CONFIG TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_config_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* version string and maturity level of the plugin */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
 /* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
 static ST_FIELD_INFO	i_s_innodb_buffer_stats_fields_info[] =
 {
@@ -2575,9 +4619,8 @@ i_s_innodb_buffer_page_fill(
 	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
 	const buf_page_info_t*	info_array,	/*!< in: array cached page
 						info */
-	ulint			num_page,	/*!< in: number of page info
-						 cached */
-	mem_heap_t*		heap)		/*!< in: temp heap memory */
+	ulint			num_page)	/*!< in: number of page info
+						cached */
 {
 	TABLE*			table;
 	Field**			fields;
@@ -2591,15 +4634,13 @@ i_s_innodb_buffer_page_fill(
 	/* Iterate through the cached array and fill the I_S table rows */
 	for (ulint i = 0; i < num_page; i++) {
 		const buf_page_info_t*	page_info;
-		const char*		table_name;
-		const char*		index_name;
+		char			table_name[MAX_FULL_NAME_LEN + 1];
+		const char*		table_name_end = NULL;
 		const char*		state_str;
 		enum buf_page_state	state;
 
 		page_info = info_array + i;
 
-		table_name = NULL;
-		index_name = NULL;
 		state_str = NULL;
 
 		OK(fields[IDX_BUFFER_POOL_ID]->store(page_info->pool_id));
@@ -2637,6 +4678,10 @@ i_s_innodb_buffer_page_fill(
 		OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store(
 			page_info->access_time));
 
+		fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null();
+
+		fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null();
+
 		/* If this is an index page, fetch the index name
 		and table name */
 		if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
@@ -2646,32 +4691,28 @@ i_s_innodb_buffer_page_fill(
 			index = dict_index_get_if_in_cache_low(
 				page_info->index_id);
 
-			/* Copy the index/table name under mutex. We
-			do not want to hold the InnoDB mutex while
-			filling the IS table */
 			if (index) {
-				const char*	name_ptr = index->name;
-
-				if (name_ptr[0] == TEMP_INDEX_PREFIX) {
-					name_ptr++;
-				}
-
-				index_name = mem_heap_strdup(heap, name_ptr);
-
-				table_name = mem_heap_strdup(heap,
-							     index->table_name);
 
+				table_name_end = innobase_convert_name(
+					table_name, sizeof(table_name),
+					index->table_name,
+					strlen(index->table_name),
+					thd, TRUE);
+
+				OK(fields[IDX_BUFFER_PAGE_TABLE_NAME]->store(
+					table_name,
+					table_name_end - table_name,
+					system_charset_info));
+				fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_notnull();
+
+				OK(field_store_index_name(
+					fields[IDX_BUFFER_PAGE_INDEX_NAME],
+					index->name));
 			}
 
 			mutex_exit(&dict_sys->mutex);
 		}
 
-		OK(field_store_string(
-			fields[IDX_BUFFER_PAGE_TABLE_NAME], table_name));
-
-		OK(field_store_string(
-			fields[IDX_BUFFER_PAGE_INDEX_NAME], index_name));
-
 		OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store(
 			page_info->num_recs));
 
@@ -2680,7 +4721,7 @@ i_s_innodb_buffer_page_fill(
 
 		OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store(
 			page_info->zip_ssize
-			? (PAGE_ZIP_MIN_SIZE >> 1) << page_info->zip_ssize
+			? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
 			: 0));
 
 #if BUF_PAGE_STATE_BITS > 3
@@ -2692,7 +4733,7 @@ i_s_innodb_buffer_page_fill(
 		/* First three states are for compression pages and
 		are not states we would get as we scan pages through
 		buffer blocks */
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_ZIP_PAGE:
 		case BUF_BLOCK_ZIP_DIRTY:
 			state_str = NULL;
@@ -2822,12 +4863,16 @@ i_s_innodb_buffer_page_get_info(
 					out: structure filled with scanned
 					info */
 {
+	ib_mutex_t*	mutex = buf_page_get_mutex(bpage);
+
 	ut_ad(pool_id < MAX_BUFFER_POOLS);
 
 	page_info->pool_id = pool_id;
 
 	page_info->block_id = pos;
 
+	mutex_enter(mutex);
+
 	page_info->page_state = buf_page_get_state(bpage);
 
 	/* Only fetch information for buffers that map to a tablespace,
@@ -2866,6 +4911,7 @@ i_s_innodb_buffer_page_get_info(
 			break;
 		case BUF_IO_READ:
 			page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+			mutex_exit(mutex);
 			return;
 		}
 
@@ -2886,6 +4932,8 @@ i_s_innodb_buffer_page_get_info(
 	} else {
 		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
 	}
+
+	mutex_exit(mutex);
 }
 
 /*******************************************************************//**
@@ -2920,7 +4968,6 @@ i_s_innodb_fill_buffer_pool(
 		ulint			chunk_size;
 		ulint			num_to_process = 0;
 		ulint			block_id = 0;
-		mutex_t*		block_mutex;
 
 		/* Get buffer block of the nth chunk */
 		block = buf_get_nth_chunk_block(buf_pool, n, &chunk_size);
@@ -2940,24 +4987,25 @@ i_s_innodb_fill_buffer_pool(
 			info_buffer = (buf_page_info_t*) mem_heap_zalloc(
 				heap, mem_size);
 
+			/* Obtain appropriate mutexes. Since this is diagnostic
+			buffer pool info printout, we are not required to
+			preserve the overall consistency, so we can
+			release mutex periodically */
+
 			/* GO through each block in the chunk */
 			for (n_blocks = num_to_process; n_blocks--; block++) {
-				block_mutex = buf_page_get_mutex_enter(&block->page);
 				i_s_innodb_buffer_page_get_info(
 					&block->page, pool_id, block_id,
 					info_buffer + num_page);
-				if (block_mutex)
-					mutex_exit(block_mutex);
 				block_id++;
 				num_page++;
 			}
 
-
 			/* Fill in information schema table with information
 			just collected from the buffer chunk scan */
 			status = i_s_innodb_buffer_page_fill(
 				thd, tables, info_buffer,
-				num_page, heap);
+				num_page);
 
 			/* If something goes wrong, break and return */
 			if (status) {
@@ -3298,13 +5346,11 @@ i_s_innodb_buf_page_lru_fill(
 	/* Iterate through the cached array and fill the I_S table rows */
 	for (ulint i = 0; i < num_page; i++) {
 		const buf_page_info_t*	page_info;
-		const char*		table_name;
-		const char*		index_name;
+		char			table_name[MAX_FULL_NAME_LEN + 1];
+		const char*		table_name_end = NULL;
 		const char*		state_str;
 		enum buf_page_state	state;
 
-		table_name = NULL;
-		index_name = NULL;
 		state_str = NULL;
 
 		page_info = info_array + i;
@@ -3344,6 +5390,10 @@ i_s_innodb_buf_page_lru_fill(
 		OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store(
 			page_info->access_time));
 
+		fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null();
+
+		fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null();
+
 		/* If this is an index page, fetch the index name
 		and table name */
 		if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
@@ -3353,30 +5403,28 @@ i_s_innodb_buf_page_lru_fill(
 			index = dict_index_get_if_in_cache_low(
 				page_info->index_id);
 
-			/* Copy the index/table name under mutex. We
-			do not want to hold the InnoDB mutex while
-			filling the IS table */
 			if (index) {
-				const char*	name_ptr = index->name;
-
-				if (name_ptr[0] == TEMP_INDEX_PREFIX) {
-					name_ptr++;
-				}
-
-				index_name = mem_heap_strdup(heap, name_ptr);
 
-				table_name = mem_heap_strdup(heap,
-							     index->table_name);
+				table_name_end = innobase_convert_name(
+					table_name, sizeof(table_name),
+					index->table_name,
+					strlen(index->table_name),
+					thd, TRUE);
+
+				OK(fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->store(
+					table_name,
+					table_name_end - table_name,
+					system_charset_info));
+				fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_notnull();
+
+				OK(field_store_index_name(
+					fields[IDX_BUF_LRU_PAGE_INDEX_NAME],
+					index->name));
 			}
 
 			mutex_exit(&dict_sys->mutex);
 		}
 
-		OK(field_store_string(
-			fields[IDX_BUF_LRU_PAGE_TABLE_NAME], table_name));
-
-		OK(field_store_string(
-			fields[IDX_BUF_LRU_PAGE_INDEX_NAME], index_name));
 		OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store(
 			page_info->num_recs));
 
@@ -3400,7 +5448,7 @@ i_s_innodb_buf_page_lru_fill(
 			state_str = "NO";
 			break;
 		/* We should not see following states */
-		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_POOL_WATCH:
 		case BUF_BLOCK_READY_FOR_USE:
 		case BUF_BLOCK_NOT_USED:
 		case BUF_BLOCK_MEMORY:
@@ -3464,12 +5512,11 @@ i_s_innodb_fill_buffer_lru(
 	ulint			lru_pos = 0;
 	const buf_page_t*	bpage;
 	ulint			lru_len;
-	mutex_t*		block_mutex;
 
 	DBUG_ENTER("i_s_innodb_fill_buffer_lru");
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* Obtain buf_pool mutex before allocate info_buffer, since
+	/* Obtain buf_pool->LRU_list_mutex before allocate info_buffer, since
 	UT_LIST_GET_LEN(buf_pool->LRU) could change */
 	mutex_enter(&buf_pool->LRU_list_mutex);
 
@@ -3491,14 +5538,12 @@ i_s_innodb_fill_buffer_lru(
 	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
 
 	while (bpage != NULL) {
-		block_mutex = buf_page_get_mutex_enter(bpage);
 		/* Use the same function that collect buffer info for
 		INNODB_BUFFER_PAGE to get buffer page info */
 		i_s_innodb_buffer_page_get_info(bpage, pool_id, lru_pos,
 						(info_buffer + lru_pos));
 
 		bpage = UT_LIST_GET_PREV(LRU, bpage);
-		mutex_exit(block_mutex);
 
 		lru_pos++;
 	}
@@ -3643,10 +5688,11 @@ i_s_common_deinit(
 	DBUG_RETURN(0);
 }
 
+/**  SYS_TABLES  ***************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */
-static ST_FIELD_INFO    innodb_sys_tables_fields_info[] =
+static ST_FIELD_INFO	innodb_sys_tables_fields_info[] =
 {
-#define SYS_TABLE_ID		0
+#define SYS_TABLES_ID			0
 	{STRUCT_FLD(field_name,		"TABLE_ID"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3655,25 +5701,16 @@ static ST_FIELD_INFO    innodb_sys_tables_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLE_SCHEMA	1
-	{STRUCT_FLD(field_name,		"SCHEMA"),
-	 STRUCT_FLD(field_length,	NAME_LEN + 1),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-#define SYS_TABLE_NAME		2
+#define SYS_TABLES_NAME			1
 	{STRUCT_FLD(field_name,		"NAME"),
-	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_length,	MAX_FULL_NAME_LEN + 1),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	0),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLE_FLAG		3
+#define SYS_TABLES_FLAG			2
 	{STRUCT_FLD(field_name,		"FLAG"),
 	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
@@ -3682,7 +5719,7 @@ static ST_FIELD_INFO    innodb_sys_tables_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLE_NUM_COLUMN	4
+#define SYS_TABLES_NUM_COLUMN		3
 	{STRUCT_FLD(field_name,		"N_COLS"),
 	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
@@ -3691,7 +5728,7 @@ static ST_FIELD_INFO    innodb_sys_tables_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLE_SPACE		5
+#define SYS_TABLES_SPACE		4
 	{STRUCT_FLD(field_name,		"SPACE"),
 	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
@@ -3700,6 +5737,33 @@ static ST_FIELD_INFO    innodb_sys_tables_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
+#define SYS_TABLES_FILE_FORMAT		5
+	{STRUCT_FLD(field_name,		"FILE_FORMAT"),
+	 STRUCT_FLD(field_length,	10),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_ROW_FORMAT		6
+	{STRUCT_FLD(field_name,		"ROW_FORMAT"),
+	 STRUCT_FLD(field_length,	12),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_ZIP_PAGE_SIZE	7
+	{STRUCT_FLD(field_name,		"ZIP_PAGE_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
 	END_OF_ST_FIELD_INFO
 };
 
@@ -3713,36 +5777,45 @@ i_s_dict_fill_sys_tables(
 /*=====================*/
 	THD*		thd,		/*!< in: thread */
 	dict_table_t*	table,		/*!< in: table */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
-	char		buf[NAME_LEN * 2 + 2];
-	char*		ptr;
+	ulint	compact		= DICT_TF_GET_COMPACT(table->flags);
+	ulint	atomic_blobs	= DICT_TF_HAS_ATOMIC_BLOBS(table->flags);
+	ulint	zip_size	= dict_tf_get_zip_size(table->flags);
+	const char* file_format;
+	const char* row_format;
+
+	file_format = trx_sys_file_format_id_to_name(atomic_blobs);
+	if (!compact) {
+		row_format = "Redundant";
+	} else if (!atomic_blobs) {
+		row_format = "Compact";
+	} else if DICT_TF_GET_ZIP_SSIZE(table->flags) {
+		row_format = "Compressed";
+	} else {
+		row_format = "Dynamic";
+	}
 
 	DBUG_ENTER("i_s_dict_fill_sys_tables");
 
 	fields = table_to_fill->field;
 
-	OK(fields[SYS_TABLE_ID]->store(longlong(table->id), TRUE));
+	OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE));
 
-	strncpy(buf, table->name, NAME_LEN * 2 + 2);
-	ptr = strchr(buf, '/');
-	if (ptr) {
-		*ptr = '\0';
-		++ptr;
+	OK(field_store_string(fields[SYS_TABLES_NAME], table->name));
 
-		OK(field_store_string(fields[SYS_TABLE_SCHEMA], buf));
-		OK(field_store_string(fields[SYS_TABLE_NAME], ptr));
-	} else {
-		fields[SYS_TABLE_SCHEMA]->set_null();
-		OK(field_store_string(fields[SYS_TABLE_NAME], buf));
-	}
+	OK(fields[SYS_TABLES_FLAG]->store(table->flags));
+
+	OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols));
 
-	OK(fields[SYS_TABLE_FLAG]->store(table->flags));
+	OK(fields[SYS_TABLES_SPACE]->store(table->space));
 
-	OK(fields[SYS_TABLE_NUM_COLUMN]->store(table->n_cols));
+	OK(field_store_string(fields[SYS_TABLES_FILE_FORMAT], file_format));
 
-	OK(fields[SYS_TABLE_SPACE]->store(table->space));
+	OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format));
+
+	OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size));
 
 	OK(schema_table_store_record(thd, table_to_fill));
 
@@ -3756,28 +5829,26 @@ static
 int
 i_s_sys_tables_fill_table(
 /*======================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_tables_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-                DBUG_RETURN(0);
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&(dict_sys->mutex));
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&(dict_sys->mutex));
+	mtr_start(&mtr);
 
 	rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
 
@@ -3787,23 +5858,24 @@ i_s_sys_tables_fill_table(
 
 		/* Create and populate a dict_table_t structure with
 		information from SYS_TABLES row */
-		err_msg = dict_process_sys_tables_rec(
-			heap, rec, &table_rec, DICT_TABLE_LOAD_FROM_RECORD);
+		err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+			heap, rec, &table_rec,
+			DICT_TABLE_LOAD_FROM_RECORD, &mtr);
 
-		mtr_commit(&mtr);
 		mutex_exit(&dict_sys->mutex);
 
 		if (!err_msg) {
 			i_s_dict_fill_sys_tables(thd, table_rec, tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
-		/* Since dict_process_sys_tables_rec() is called with
-		DICT_TABLE_LOAD_FROM_RECORD, the table_rec is created in
-		dict_process_sys_tables_rec(), we will need to free it */
+		/* Since dict_process_sys_tables_rec_and_mtr_commit()
+		is called with DICT_TABLE_LOAD_FROM_RECORD, the table_rec
+		is created in dict_process_sys_tables_rec(), we will
+		need to free it */
 		if (table_rec) {
 			dict_mem_table_free(table_rec);
 		}
@@ -3830,18 +5902,18 @@ static
 int
 innodb_sys_tables_init(
 /*===================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*	p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_tables_init");
+	DBUG_ENTER("innodb_sys_tables_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_tables_fields_info;
-        schema->fill_table = i_s_sys_tables_fill_table;
+	schema->fields_info = innodb_sys_tables_fields_info;
+	schema->fill_table = i_s_sys_tables_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tables =
@@ -3860,7 +5932,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tables =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -3891,8 +5963,9 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tables =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
+/**  SYS_TABLESTATS  ***********************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */
-static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
+static ST_FIELD_INFO	innodb_sys_tablestats_fields_info[] =
 {
 #define SYS_TABLESTATS_ID		0
 	{STRUCT_FLD(field_name,		"TABLE_ID"),
@@ -3903,16 +5976,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_SCHEMA		1
-	{STRUCT_FLD(field_name,		"SCHEMA"),
-	 STRUCT_FLD(field_length,	NAME_LEN + 1),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-#define SYS_TABLESTATS_NAME		2
+#define SYS_TABLESTATS_NAME		1
 	{STRUCT_FLD(field_name,		"NAME"),
 	 STRUCT_FLD(field_length,	NAME_LEN + 1),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
@@ -3921,7 +5985,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_INIT		3
+#define SYS_TABLESTATS_INIT		2
 	{STRUCT_FLD(field_name,		"STATS_INITIALIZED"),
 	 STRUCT_FLD(field_length,	NAME_LEN + 1),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
@@ -3930,7 +5994,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_NROW		4
+#define SYS_TABLESTATS_NROW		3
 	{STRUCT_FLD(field_name,		"NUM_ROWS"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3939,7 +6003,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_CLUST_SIZE	5
+#define SYS_TABLESTATS_CLUST_SIZE	4
 	{STRUCT_FLD(field_name,		"CLUST_INDEX_SIZE"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3948,7 +6012,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_INDEX_SIZE	6
+#define SYS_TABLESTATS_INDEX_SIZE	5
 	{STRUCT_FLD(field_name,		"OTHER_INDEX_SIZE"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3957,7 +6021,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_MODIFIED		7
+#define SYS_TABLESTATS_MODIFIED		6
 	{STRUCT_FLD(field_name,		"MODIFIED_COUNTER"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3966,7 +6030,7 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_AUTONINC		8
+#define SYS_TABLESTATS_AUTONINC		7
 	{STRUCT_FLD(field_name,		"AUTOINC"),
 	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
@@ -3975,8 +6039,8 @@ static ST_FIELD_INFO    innodb_sys_tablestats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_TABLESTATS_MYSQL_OPEN_HANDLE	9
-	{STRUCT_FLD(field_name,		"MYSQL_HANDLES_OPENED"),
+#define SYS_TABLESTATS_TABLE_REF_COUNT	8
+	{STRUCT_FLD(field_name,		"REF_COUNT"),
 	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
@@ -3997,11 +6061,9 @@ i_s_dict_fill_sys_tablestats(
 /*=========================*/
 	THD*		thd,		/*!< in: thread */
 	dict_table_t*	table,		/*!< in: table */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
-	char		buf[NAME_LEN * 2 + 2];
-	char*		ptr;
 
 	DBUG_ENTER("i_s_dict_fill_sys_tablestats");
 
@@ -4009,47 +6071,50 @@ i_s_dict_fill_sys_tablestats(
 
 	OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
 
-	strncpy(buf, table->name, NAME_LEN * 2 + 2);
-	ptr = strchr(buf, '/');
-	if (ptr) {
-		*ptr = '\0';
-		++ptr;
+	OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name));
 
-		OK(field_store_string(fields[SYS_TABLESTATS_SCHEMA], buf));
-		OK(field_store_string(fields[SYS_TABLESTATS_NAME], ptr));
-	} else {
-		fields[SYS_TABLESTATS_SCHEMA]->set_null();
-		OK(field_store_string(fields[SYS_TABLESTATS_NAME], buf));
-	}
+	dict_table_stats_lock(table, RW_S_LATCH);
 
 	if (table->stat_initialized) {
 		OK(field_store_string(fields[SYS_TABLESTATS_INIT],
 				      "Initialized"));
+
+		OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows,
+						      TRUE));
+
+		OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
+				table->stat_clustered_index_size));
+
+		OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
+				table->stat_sum_of_other_index_sizes));
+
+		OK(fields[SYS_TABLESTATS_MODIFIED]->store(
+				(ulint) table->stat_modified_counter));
 	} else {
 		OK(field_store_string(fields[SYS_TABLESTATS_INIT],
 				      "Uninitialized"));
-	}
 
-	OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, TRUE));
+		OK(fields[SYS_TABLESTATS_NROW]->store(0, TRUE));
 
-	OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
-		table->stat_clustered_index_size));
+		OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0));
 
-	OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
-		table->stat_sum_of_other_index_sizes));
+		OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0));
 
-	OK(fields[SYS_TABLESTATS_MODIFIED]->store(
-		table->stat_modified_counter));
+		OK(fields[SYS_TABLESTATS_MODIFIED]->store(0));
+	}
+
+	dict_table_stats_unlock(table, RW_S_LATCH);
 
 	OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, TRUE));
 
-	OK(fields[SYS_TABLESTATS_MYSQL_OPEN_HANDLE]->store(
-		table->n_mysql_handles_opened));
+	OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store(
+		table->n_ref_count));
 
 	OK(schema_table_store_record(thd, table_to_fill));
 
 	DBUG_RETURN(0);
 }
+
 /*******************************************************************//**
 Function to go through each record in SYS_TABLES table, and fill the
 information_schema.innodb_sys_tablestats table with table statistics
@@ -4059,28 +6124,26 @@ static
 int
 i_s_sys_tables_fill_table_stats(
 /*============================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_tables_fill_table_stats");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-                DBUG_RETURN(0);
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
 
@@ -4090,18 +6153,18 @@ i_s_sys_tables_fill_table_stats(
 
 		/* Fetch the dict_table_t structure corresponding to
 		this SYS_TABLES record */
-		err_msg = dict_process_sys_tables_rec(
-			heap, rec, &table_rec, DICT_TABLE_LOAD_FROM_CACHE);
+		err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+			heap, rec, &table_rec,
+			DICT_TABLE_LOAD_FROM_CACHE, &mtr);
 
-		mtr_commit(&mtr);
 		mutex_exit(&dict_sys->mutex);
 
 		if (!err_msg) {
 			i_s_dict_fill_sys_tablestats(thd, table_rec,
 						     tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -4127,18 +6190,18 @@ static
 int
 innodb_sys_tablestats_init(
 /*=======================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*	p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_tablestats_init");
+	DBUG_ENTER("innodb_sys_tablestats_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_tablestats_fields_info;
-        schema->fill_table = i_s_sys_tables_fill_table_stats;
+	schema->fields_info = innodb_sys_tablestats_fields_info;
+	schema->fill_table = i_s_sys_tables_fill_table_stats;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablestats =
@@ -4157,7 +6220,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablestats =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -4188,8 +6251,9 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablestats =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
+/**  SYS_INDEXES  **************************************************/
 /* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */
-static ST_FIELD_INFO    innodb_sysindex_fields_info[] =
+static ST_FIELD_INFO	innodb_sysindex_fields_info[] =
 {
 #define SYS_INDEX_ID		0
 	{STRUCT_FLD(field_name,		"INDEX_ID"),
@@ -4269,7 +6333,7 @@ i_s_dict_fill_sys_indexes(
 	table_id_t	table_id,	/*!< in: table id */
 	dict_index_t*	index,		/*!< in: populated dict_index_t
 					struct with index info */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
 
@@ -4277,9 +6341,9 @@ i_s_dict_fill_sys_indexes(
 
 	fields = table_to_fill->field;
 
-	OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE));
+	OK(field_store_index_name(fields[SYS_INDEX_NAME], index->name));
 
-	OK(field_store_string(fields[SYS_INDEX_NAME], index->name));
+	OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE));
 
 	OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), TRUE));
 
@@ -4287,7 +6351,12 @@ i_s_dict_fill_sys_indexes(
 
 	OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
 
-	OK(fields[SYS_INDEX_PAGE_NO]->store(index->page));
+	/* FIL_NULL is ULINT32_UNDEFINED */
+	if (index->page == FIL_NULL) {
+		OK(fields[SYS_INDEX_PAGE_NO]->store(-1));
+	} else {
+		OK(fields[SYS_INDEX_PAGE_NO]->store(index->page));
+	}
 
 	OK(fields[SYS_INDEX_SPACE]->store(index->space));
 
@@ -4303,35 +6372,33 @@ static
 int
 i_s_sys_indexes_fill_table(
 /*=======================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t		pcur;
+	btr_pcur_t		pcur;
 	const rec_t*		rec;
 	mem_heap_t*		heap;
 	mtr_t			mtr;
 
 	DBUG_ENTER("i_s_sys_indexes_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-                DBUG_RETURN(0);
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	/* Start scan the SYS_INDEXES table */
 	rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
 
 	/* Process each record in the table */
 	while (rec) {
-		const char*	err_msg;;
+		const char*	err_msg;
 		table_id_t	table_id;
 		dict_index_t	index_rec;
 
@@ -4347,8 +6414,8 @@ i_s_sys_indexes_fill_table(
 			i_s_dict_fill_sys_indexes(thd, table_id, &index_rec,
 						 tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -4373,18 +6440,18 @@ static
 int
 innodb_sys_indexes_init(
 /*====================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*	p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_index_init");
+	DBUG_ENTER("innodb_sys_indexes_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sysindex_fields_info;
-        schema->fill_table = i_s_sys_indexes_fill_table;
+	schema->fields_info = innodb_sysindex_fields_info;
+	schema->fill_table = i_s_sys_indexes_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_indexes =
@@ -4403,7 +6470,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_indexes =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -4434,8 +6501,9 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_indexes =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
-/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_COLUMNS */
-static ST_FIELD_INFO    innodb_sys_columns_fields_info[] =
+/**  SYS_COLUMNS  **************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */
+static ST_FIELD_INFO	innodb_sys_columns_fields_info[] =
 {
 #define SYS_COLUMN_TABLE_ID		0
 	{STRUCT_FLD(field_name,		"TABLE_ID"),
@@ -4507,7 +6575,7 @@ i_s_dict_fill_sys_columns(
 	const char*	col_name,	/*!< in: column name */
 	dict_col_t*	column,		/*!< in: dict_col_t struct holding
 					more column information */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
 
@@ -4539,29 +6607,27 @@ static
 int
 i_s_sys_columns_fill_table(
 /*=======================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	const char*	col_name;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_columns_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-                DBUG_RETURN(0);
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS);
 
@@ -4583,8 +6649,8 @@ i_s_sys_columns_fill_table(
 						 &column_rec,
 						 tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -4609,18 +6675,18 @@ static
 int
 innodb_sys_columns_init(
 /*====================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*	p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_columns_init");
+	DBUG_ENTER("innodb_sys_columns_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_columns_fields_info;
-        schema->fill_table = i_s_sys_columns_fill_table;
+	schema->fields_info = innodb_sys_columns_fields_info;
+	schema->fill_table = i_s_sys_columns_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_columns =
@@ -4639,7 +6705,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_columns =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -4669,8 +6735,10 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_columns =
 
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
-/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_fields */
-static ST_FIELD_INFO    innodb_sys_fields_fields_info[] =
+
+/**  SYS_FIELDS  ***************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */
+static ST_FIELD_INFO	innodb_sys_fields_fields_info[] =
 {
 #define SYS_FIELD_INDEX_ID	0
 	{STRUCT_FLD(field_name,		"INDEX_ID"),
@@ -4714,7 +6782,7 @@ i_s_dict_fill_sys_fields(
 	index_id_t	index_id,	/*!< in: index id for the field */
 	dict_field_t*	field,		/*!< in: table */
 	ulint		pos,		/*!< in: Field position */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
 
@@ -4741,29 +6809,28 @@ static
 int
 i_s_sys_fields_fill_table(
 /*======================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	index_id_t	last_id;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_fields_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
 
-                DBUG_RETURN(0);
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	/* will save last index id so that we know whether we move to
 	the next index. This is used to calculate prefix length */
@@ -4790,8 +6857,8 @@ i_s_sys_fields_fill_table(
 						 pos, tables->table);
 			last_id = index_id;
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -4816,18 +6883,18 @@ static
 int
 innodb_sys_fields_init(
 /*===================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*   p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_field_init");
+	DBUG_ENTER("innodb_sys_field_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_fields_fields_info;
-        schema->fill_table = i_s_sys_fields_fill_table;
+	schema->fields_info = innodb_sys_fields_fields_info;
+	schema->fill_table = i_s_sys_fields_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_fields =
@@ -4846,7 +6913,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_fields =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -4877,8 +6944,9 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_fields =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
-/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign */
-static ST_FIELD_INFO    innodb_sys_foreign_fields_info[] =
+/**  SYS_FOREIGN        ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */
+static ST_FIELD_INFO	innodb_sys_foreign_fields_info[] =
 {
 #define SYS_FOREIGN_ID		0
 	{STRUCT_FLD(field_name,		"ID"),
@@ -4938,7 +7006,7 @@ i_s_dict_fill_sys_foreign(
 /*======================*/
 	THD*		thd,		/*!< in: thread */
 	dict_foreign_t*	foreign,	/*!< in: table */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
 
@@ -4962,6 +7030,7 @@ i_s_dict_fill_sys_foreign(
 
 	DBUG_RETURN(0);
 }
+
 /*******************************************************************//**
 Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop
 through each record in SYS_FOREIGN, and extract the foreign key
@@ -4971,28 +7040,27 @@ static
 int
 i_s_sys_foreign_fill_table(
 /*=======================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_foreign_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
 
-                DBUG_RETURN(0);
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN);
 
@@ -5011,8 +7079,8 @@ i_s_sys_foreign_fill_table(
 			i_s_dict_fill_sys_foreign(thd, &foreign_rec,
 						 tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -5030,6 +7098,7 @@ i_s_sys_foreign_fill_table(
 
 	DBUG_RETURN(0);
 }
+
 /*******************************************************************//**
 Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign
 @return 0 on success */
@@ -5037,18 +7106,18 @@ static
 int
 innodb_sys_foreign_init(
 /*====================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*   p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_foreign_init");
+	DBUG_ENTER("innodb_sys_foreign_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_foreign_fields_info;
-        schema->fill_table = i_s_sys_foreign_fill_table;
+	schema->fields_info = innodb_sys_foreign_fields_info;
+	schema->fill_table = i_s_sys_foreign_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign =
@@ -5067,7 +7136,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -5097,8 +7166,10 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign =
 
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
-/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols */
-static ST_FIELD_INFO    innodb_sys_foreign_cols_fields_info[] =
+
+/**  SYS_FOREIGN_COLS   ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */
+static ST_FIELD_INFO	innodb_sys_foreign_cols_fields_info[] =
 {
 #define SYS_FOREIGN_COL_ID		0
 	{STRUCT_FLD(field_name,		"ID"),
@@ -5153,7 +7224,7 @@ i_s_dict_fill_sys_foreign_cols(
 	const char*	ref_col_name,	/*!< in: referenced column
 					name */
 	ulint		pos,		/*!< in: column position */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
 	Field**		fields;
 
@@ -5182,27 +7253,26 @@ static
 int
 i_s_sys_foreign_cols_fill_table(
 /*============================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
 	DBUG_ENTER("i_s_sys_foreign_cols_fill_table");
-
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-                DBUG_RETURN(0);
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
 	rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS);
 
@@ -5225,8 +7295,8 @@ i_s_sys_foreign_cols_fill_table(
 				thd, name, for_col_name, ref_col_name, pos,
 				tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -5251,18 +7321,18 @@ static
 int
 innodb_sys_foreign_cols_init(
 /*========================*/
-        void*   p)      /*!< in/out: table schema object */
+	void*	p)	/*!< in/out: table schema object */
 {
-        ST_SCHEMA_TABLE*        schema;
+	ST_SCHEMA_TABLE*	schema;
 
-        DBUG_ENTER("innodb_sys_foreign_cols_init");
+	DBUG_ENTER("innodb_sys_foreign_cols_init");
 
-        schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-        schema->fields_info = innodb_sys_foreign_cols_fields_info;
-        schema->fill_table = i_s_sys_foreign_cols_fill_table;
+	schema->fields_info = innodb_sys_foreign_cols_fields_info;
+	schema->fill_table = i_s_sys_foreign_cols_fill_table;
 
-        DBUG_RETURN(0);
+	DBUG_RETURN(0);
 }
 
 UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign_cols =
@@ -5281,7 +7351,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign_cols =
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
@@ -5312,20 +7382,30 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign_cols =
         INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
 };
 
-/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_stats */
-static ST_FIELD_INFO	innodb_sys_stats_fields_info[] =
+/**  SYS_TABLESPACES    ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */
+static ST_FIELD_INFO	innodb_sys_tablespaces_fields_info[] =
 {
-#define SYS_STATS_INDEX_ID	0
-	{STRUCT_FLD(field_name,		"INDEX_ID"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+#define SYS_TABLESPACES_SPACE		0
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_STATS_KEY_COLS	1
-	{STRUCT_FLD(field_name,		"KEY_COLS"),
+#define SYS_TABLESPACES_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	MAX_FULL_NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_FLAGS		2
+	{STRUCT_FLD(field_name,		"FLAG"),
 	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
@@ -5333,115 +7413,152 @@ static ST_FIELD_INFO	innodb_sys_stats_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_STATS_DIFF_VALS	2
-	{STRUCT_FLD(field_name,		"DIFF_VALS"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+#define SYS_TABLESPACES_FILE_FORMAT	3
+	{STRUCT_FLD(field_name,		"FILE_FORMAT"),
+	 STRUCT_FLD(field_length,	10),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_ROW_FORMAT	4
+	{STRUCT_FLD(field_name,		"ROW_FORMAT"),
+	 STRUCT_FLD(field_length,	22),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_PAGE_SIZE	5
+	{STRUCT_FLD(field_name,		"PAGE_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-#define SYS_STATS_NON_NULL_VALS	3
-	{STRUCT_FLD(field_name,		"NON_NULL_VALS"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+#define SYS_TABLESPACES_ZIP_PAGE_SIZE	6
+	{STRUCT_FLD(field_name,		"ZIP_PAGE_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
 	END_OF_ST_FIELD_INFO
+
 };
+
 /**********************************************************************//**
-Function to fill information_schema.innodb_sys_stats
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information
+collected by scanning SYS_TABLESPACES table.
 @return 0 on success */
 static
 int
-i_s_dict_fill_sys_stats(
-/*====================*/
+i_s_dict_fill_sys_tablespaces(
+/*==========================*/
 	THD*		thd,		/*!< in: thread */
-	index_id_t	index_id,	/*!< in: INDEX_ID */
-	ulint		key_cols,	/*!< in: KEY_COLS */
-	ib_uint64_t	diff_vals,	/*!< in: DIFF_VALS */
-	ib_uint64_t	non_null_vals,	/*!< in: NON_NULL_VALS */
-	TABLE*		table_to_fill)  /*!< in/out: fill this table */
+	ulint		space,		/*!< in: space ID */
+	const char*	name,		/*!< in: tablespace name */
+	ulint		flags,		/*!< in: tablespace flags */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
-	Field**		fields;
-
-	DBUG_ENTER("i_s_dict_fill_sys_stats");
+	Field**	fields;
+	ulint	atomic_blobs	= FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+	ulint	page_size	= fsp_flags_get_page_size(flags);;
+	ulint	zip_size	= fsp_flags_get_zip_size(flags);
+	const char* file_format;
+	const char* row_format;
+
+	DBUG_ENTER("i_s_dict_fill_sys_tablespaces");
+
+	file_format = trx_sys_file_format_id_to_name(atomic_blobs);
+	if (!atomic_blobs) {
+		row_format = "Compact or Redundant";
+	} else if DICT_TF_GET_ZIP_SSIZE(flags) {
+		row_format = "Compressed";
+	} else {
+		row_format = "Dynamic";
+	}
 
 	fields = table_to_fill->field;
 
-	OK(fields[SYS_STATS_INDEX_ID]->store(longlong(index_id), TRUE));
+	OK(fields[SYS_TABLESPACES_SPACE]->store(space));
 
-	OK(fields[SYS_STATS_KEY_COLS]->store(key_cols));
+	OK(field_store_string(fields[SYS_TABLESPACES_NAME], name));
 
-	OK(fields[SYS_STATS_DIFF_VALS]->store(longlong(diff_vals), TRUE));
+	OK(fields[SYS_TABLESPACES_FLAGS]->store(flags));
 
-	if (non_null_vals == ((ib_uint64_t)(-1))) {
-		fields[SYS_STATS_NON_NULL_VALS]->set_null();
-	} else {
-		OK(fields[SYS_STATS_NON_NULL_VALS]->store(longlong(non_null_vals), TRUE));
-		fields[SYS_STATS_NON_NULL_VALS]->set_notnull();
-	}
+	OK(field_store_string(fields[SYS_TABLESPACES_FILE_FORMAT],
+			      file_format));
+
+	OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT],
+			      row_format));
+
+	OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(page_size));
+
+	OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(zip_size));
 
 	OK(schema_table_store_record(thd, table_to_fill));
 
 	DBUG_RETURN(0);
 }
 /*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.innodb_sys_stats table.
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+Loop through each record in SYS_TABLESPACES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
 @return 0 on success */
 static
 int
-i_s_sys_stats_fill_table(
-/*=====================*/
-	THD*		thd,    /*!< in: thread */
-	TABLE_LIST*	tables, /*!< in/out: tables to fill */
-	COND*		cond)   /*!< in: condition (not used) */
+i_s_sys_tablespaces_fill_table(
+/*===========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-        btr_pcur_t	pcur;
+	btr_pcur_t	pcur;
 	const rec_t*	rec;
 	mem_heap_t*	heap;
 	mtr_t		mtr;
 
-	DBUG_ENTER("i_s_sys_stats_fill_table");
-
+	DBUG_ENTER("i_s_sys_tablespaces_fill_table");
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-                DBUG_RETURN(0);
+	/* deny access to users without the PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
+		DBUG_RETURN(0);
 	}
 
-        heap = mem_heap_create(1000);
-        mutex_enter(&dict_sys->mutex);
-        mtr_start(&mtr);
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
-	rec = dict_startscan_system(&pcur, &mtr, SYS_STATS);
+	rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
 
 	while (rec) {
 		const char*	err_msg;
-		index_id_t	index_id;
-		ulint		key_cols;
-		ib_uint64_t	diff_vals;
-		ib_uint64_t	non_null_vals;
+		ulint		space;
+		const char*	name;
+		ulint		flags;
 
-		/* Extract necessary information from a SYS_FOREIGN_COLS row */
-		err_msg = dict_process_sys_stats_rec(
-			heap, rec, &index_id, &key_cols, &diff_vals, &non_null_vals);
+		/* Extract necessary information from a SYS_TABLESPACES row */
+		err_msg = dict_process_sys_tablespaces(
+			heap, rec, &space, &name, &flags);
 
 		mtr_commit(&mtr);
 		mutex_exit(&dict_sys->mutex);
 
 		if (!err_msg) {
-			i_s_dict_fill_sys_stats(
-				thd, index_id, key_cols, diff_vals, non_null_vals,
+			i_s_dict_fill_sys_tablespaces(
+				thd, space, name, flags,
 				tables->table);
 		} else {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-					    ER_CANT_FIND_SYSTEM_REC,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
 					    err_msg);
 		}
 
@@ -5460,188 +7577,27 @@ i_s_sys_stats_fill_table(
 	DBUG_RETURN(0);
 }
 /*******************************************************************//**
-Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_stats
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES
 @return 0 on success */
 static
 int
-innodb_sys_stats_init(
+innodb_sys_tablespaces_init(
 /*========================*/
-        void*   p)      /*!< in/out: table schema object */
-{
-        ST_SCHEMA_TABLE*        schema;
-
-        DBUG_ENTER("innodb_sys_stats_init");
-
-        schema = (ST_SCHEMA_TABLE*) p;
-
-        schema->fields_info = innodb_sys_stats_fields_info;
-        schema->fill_table = i_s_sys_stats_fill_table;
-
-        DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_stats =
-{
-	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
-	/* int */
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-
-	/* pointer to type-specific plugin descriptor */
-	/* void* */
-	STRUCT_FLD(info, &i_s_info),
-
-	/* plugin name */
-	/* const char* */
-	STRUCT_FLD(name, "INNODB_SYS_STATS"),
-
-	/* plugin author (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(author, "Percona"),
-
-	/* general descriptive text (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(descr, "XtraDB SYS_STATS table"),
-
-	/* the plugin license (PLUGIN_LICENSE_XXX) */
-	/* int */
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-
-	/* the function to invoke when plugin is loaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(init, innodb_sys_stats_init),
-
-	/* the function to invoke when plugin is unloaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(deinit, i_s_common_deinit),
-
-	/* plugin version (for SHOW PLUGINS) */
-	/* unsigned int */
-	STRUCT_FLD(version, INNODB_VERSION_SHORT),
-
-	/* struct st_mysql_show_var* */
-	STRUCT_FLD(status_vars, NULL),
-
-	/* struct st_mysql_sys_var** */
-	STRUCT_FLD(system_vars, NULL),
-
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-/***********************************************************************
-*/
-static ST_FIELD_INFO	i_s_innodb_rseg_fields_info[] =
-{
-	{STRUCT_FLD(field_name,		"rseg_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"space_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"zip_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"page_no"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"max_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"curr_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-static
-int
-i_s_innodb_rseg_fill(
-/*=================*/
-	THD*		thd,	/* in: thread */
-	TABLE_LIST*	tables,	/* in/out: tables to fill */
-	COND*		cond)	/* in: condition (ignored) */
+	void*	p)	/*!< in/out: table schema object */
 {
-	TABLE*	table	= (TABLE *) tables->table;
-	int	status	= 0;
-	trx_rseg_t*	rseg;
-
-	DBUG_ENTER("i_s_innodb_rseg_fill");
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-		DBUG_RETURN(0);
-	}
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-
-	while (rseg) {
-		table->field[0]->store(rseg->id);
-		table->field[1]->store(rseg->space);
-		table->field[2]->store(rseg->zip_size);
-		table->field[3]->store(rseg->page_no);
-		table->field[4]->store(rseg->max_size);
-		table->field[5]->store(rseg->curr_size);
-
-		if (schema_table_store_record(thd, table)) {
-			status = 1;
-			break;
-		}
-
-		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-	}
+	ST_SCHEMA_TABLE*	schema;
 
-	DBUG_RETURN(status);
-}
+	DBUG_ENTER("innodb_sys_tablespaces_init");
 
-static
-int
-i_s_innodb_rseg_init(
-/*=================*/
-			/* out: 0 on success */
-	void*	p)	/* in/out: table schema object */
-{
-	DBUG_ENTER("i_s_innodb_rseg_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_innodb_rseg_fields_info;
-	schema->fill_table = i_s_innodb_rseg_fill;
+	schema->fields_info = innodb_sys_tablespaces_fields_info;
+	schema->fill_table = i_s_sys_tablespaces_fill_table;
 
 	DBUG_RETURN(0);
 }
 
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg =
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_sys_tablespaces =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -5653,15 +7609,15 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg =
 
 	/* plugin name */
 	/* const char* */
-	STRUCT_FLD(name, "INNODB_RSEG"),
+	STRUCT_FLD(name, "INNODB_SYS_TABLESPACES"),
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(descr, "InnoDB rollback segment information"),
+	STRUCT_FLD(descr, "InnoDB SYS_TABLESPACES"),
 
 	/* the plugin license (PLUGIN_LICENSE_XXX) */
 	/* int */
@@ -5669,7 +7625,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg =
 
 	/* the function to invoke when plugin is loaded */
 	/* int (*)(void*); */
-	STRUCT_FLD(init, i_s_innodb_rseg_init),
+	STRUCT_FLD(init, innodb_sys_tablespaces_init),
 
 	/* the function to invoke when plugin is unloaded */
 	/* int (*)(void*); */
@@ -5677,7 +7633,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg =
 
 	/* plugin version (for SHOW PLUGINS) */
 	/* unsigned int */
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
 
 	/* struct st_mysql_show_var* */
 	STRUCT_FLD(status_vars, NULL),
@@ -5685,1170 +7641,151 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
-/***********************************************************************
-*/
-static ST_FIELD_INFO	i_s_innodb_table_stats_info[] =
+/**  SYS_DATAFILES  ************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */
+static ST_FIELD_INFO	innodb_sys_datafiles_fields_info[] =
 {
-	{STRUCT_FLD(field_name,		"table_schema"),
-	 STRUCT_FLD(field_length,	NAME_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"table_name"),
-	 STRUCT_FLD(field_length,	NAME_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"rows"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"clust_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"other_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"modified"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-static ST_FIELD_INFO	i_s_innodb_index_stats_info[] =
-{
-	{STRUCT_FLD(field_name,		"table_schema"),
-	 STRUCT_FLD(field_length,	NAME_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"table_name"),
-	 STRUCT_FLD(field_length,	NAME_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"index_name"),
-	 STRUCT_FLD(field_length,	NAME_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"fields"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+#define SYS_DATAFILES_SPACE		0
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-	{STRUCT_FLD(field_name,		"rows_per_key"),
-	 STRUCT_FLD(field_length,	256),
+#define SYS_DATAFILES_PATH		1
+	{STRUCT_FLD(field_name,		"PATH"),
+	 STRUCT_FLD(field_length,	OS_FILE_MAX_PATH),
 	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	0),
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
-	{STRUCT_FLD(field_name,		"index_total_pages"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"index_leaf_pages"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
 	END_OF_ST_FIELD_INFO
 };
 
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
+collected by scanning SYS_DATAFILES table.
+@return 0 on success */
 static
 int
-i_s_innodb_table_stats_fill(
-/*========================*/
-	THD*		thd,
-	TABLE_LIST*	tables,
-	COND*		cond)
-{
-	TABLE*	i_s_table	= (TABLE *) tables->table;
-	int	status	= 0;
-	dict_table_t*	table;
-
-	DBUG_ENTER("i_s_innodb_table_stats_fill");
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-		DBUG_RETURN(0);
-	}
-
-	mutex_enter(&(dict_sys->mutex));
-
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
-
-	while (table) {
-		char	buf[NAME_LEN * 2 + 2];
-		char*	ptr;
-
-		if (table->stat_clustered_index_size == 0) {
-			table = UT_LIST_GET_NEXT(table_LRU, table);
-			continue;
-		}
-
-		buf[NAME_LEN * 2 + 1] = 0;
-		strncpy(buf, table->name, NAME_LEN * 2 + 1);
-		ptr = strchr(buf, '/');
-		if (ptr) {
-			*ptr = '\0';
-			++ptr;
-		} else {
-			ptr = buf;
-		}
-
-		field_store_string(i_s_table->field[0], buf);
-		field_store_string(i_s_table->field[1], ptr);
-		i_s_table->field[2]->store(table->stat_n_rows, 1);
-		i_s_table->field[3]->store(table->stat_clustered_index_size);
-		i_s_table->field[4]->store(table->stat_sum_of_other_index_sizes);
-		i_s_table->field[5]->store(table->stat_modified_counter);
-
-		if (schema_table_store_record(thd, i_s_table)) {
-			status = 1;
-			break;
-		}
-
-		table = UT_LIST_GET_NEXT(table_LRU, table);
-	}
-
-	mutex_exit(&(dict_sys->mutex));
-
-	DBUG_RETURN(status);
-}
-
-static
-int
-i_s_innodb_index_stats_fill(
+i_s_dict_fill_sys_datafiles(
 /*========================*/
-	THD*		thd,
-	TABLE_LIST*	tables,
-	COND*		cond)
+	THD*		thd,		/*!< in: thread */
+	ulint		space,		/*!< in: space ID */
+	const char*	path,		/*!< in: absolute path */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
 {
-	TABLE*	i_s_table	= (TABLE *) tables->table;
-	int	status	= 0;
-	dict_table_t*	table;
-	dict_index_t*	index;
-
-	DBUG_ENTER("i_s_innodb_index_stats_fill");
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-		DBUG_RETURN(0);
-	}
-
-	mutex_enter(&(dict_sys->mutex));
-
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
-
-	while (table) {
-		if (table->stat_clustered_index_size == 0) {
-			table = UT_LIST_GET_NEXT(table_LRU, table);
-			continue;
-		}
-
-		ib_int64_t	n_rows = table->stat_n_rows;
-
-		if (n_rows < 0) {
-			n_rows = 0;
-		}
-
-		index = dict_table_get_first_index(table);
-
-		while (index) {
-			char	buff[256+1];
-			char	row_per_keys[256+1];
-			char	buf[NAME_LEN * 2 + 2];
-			char*	ptr;
-			ulint	i;
-
-			buf[NAME_LEN * 2 + 1] = 0;
-			strncpy(buf, table->name, NAME_LEN * 2 + 1);
-			ptr = strchr(buf, '/');
-			if (ptr) {
-				*ptr = '\0';
-				++ptr;
-			} else {
-				ptr = buf;
-			}
-
-			field_store_string(i_s_table->field[0], buf);
-			field_store_string(i_s_table->field[1], ptr);
-			field_store_string(i_s_table->field[2], index->name);
-			i_s_table->field[3]->store(index->n_uniq);
-
-			row_per_keys[0] = '\0';
-
-			/* It is remained optimistic operation still for now */
-			//dict_index_stat_mutex_enter(index);
-			if (index->stat_n_diff_key_vals) {
-				for (i = 1; i <= index->n_uniq; i++) {
-					ib_int64_t	rec_per_key;
-					if (index->stat_n_diff_key_vals[i]) {
-						rec_per_key = n_rows / index->stat_n_diff_key_vals[i];
-					} else {
-						rec_per_key = n_rows;
-					}
-					ut_snprintf(buff, 256, (i == index->n_uniq)?"%llu":"%llu, ",
-						 rec_per_key);
-					strncat(row_per_keys, buff, 256 - strlen(row_per_keys));
-				}
-			}
-			//dict_index_stat_mutex_exit(index);
-
-			field_store_string(i_s_table->field[4], row_per_keys);
-
-			i_s_table->field[5]->store(index->stat_index_size);
-			i_s_table->field[6]->store(index->stat_n_leaf_pages);
-
-			if (schema_table_store_record(thd, i_s_table)) {
-				status = 1;
-				break;
-			}
-
-			index = dict_table_get_next_index(index);
-		}
-
-		if (status == 1) {
-			break;
-		}
-
-		table = UT_LIST_GET_NEXT(table_LRU, table);
-	}
-
-	mutex_exit(&(dict_sys->mutex));
-
-	DBUG_RETURN(status);
-}
+	Field**		fields;
 
-static
-int
-i_s_innodb_table_stats_init(
-/*========================*/
-	void*   p)
-{
-	DBUG_ENTER("i_s_innodb_table_stats_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+	DBUG_ENTER("i_s_dict_fill_sys_datafiles");
 
-	schema->fields_info = i_s_innodb_table_stats_info;
-	schema->fill_table = i_s_innodb_table_stats_fill;
+	fields = table_to_fill->field;
 
-	DBUG_RETURN(0);
-}
+	OK(field_store_ulint(fields[SYS_DATAFILES_SPACE], space));
 
-static
-int
-i_s_innodb_index_stats_init(
-/*========================*/
-	void*	p)
-{
-	DBUG_ENTER("i_s_innodb_index_stats_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+	OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
 
-	schema->fields_info = i_s_innodb_index_stats_info;
-	schema->fill_table = i_s_innodb_index_stats_fill;
+	OK(schema_table_store_record(thd, table_to_fill));
 
 	DBUG_RETURN(0);
 }
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_table_stats =
-{
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-	STRUCT_FLD(info, &i_s_info),
-	STRUCT_FLD(name, "INNODB_TABLE_STATS"),
-	STRUCT_FLD(author, "Percona"),
-	STRUCT_FLD(descr, "InnoDB table statistics in memory"),
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-	STRUCT_FLD(init, i_s_innodb_table_stats_init),
-	STRUCT_FLD(deinit, i_s_common_deinit),
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-	STRUCT_FLD(status_vars, NULL),
-	STRUCT_FLD(system_vars, NULL),
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_index_stats =
-{
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-	STRUCT_FLD(info, &i_s_info),
-	STRUCT_FLD(name, "INNODB_INDEX_STATS"),
-	STRUCT_FLD(author, "Percona"),
-	STRUCT_FLD(descr, "InnoDB index statistics in memory"),
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-	STRUCT_FLD(init, i_s_innodb_index_stats_init),
-	STRUCT_FLD(deinit, i_s_common_deinit),
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-	STRUCT_FLD(status_vars, NULL),
-	STRUCT_FLD(system_vars, NULL),
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-/***********************************************************************
-*/
-static ST_FIELD_INFO	i_s_innodb_admin_command_info[] =
-{
-	{STRUCT_FLD(field_name,		"result_message"),
-	 STRUCT_FLD(field_length,	1024),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-#ifndef INNODB_COMPATIBILITY_HOOKS
-#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
-#endif
-
-extern "C" {
-char **thd_query(MYSQL_THD thd);
-}
-
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+Loop through each record in SYS_DATAFILES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+@return 0 on success */
 static
 int
-i_s_innodb_admin_command_fill(
-/*==========================*/
-	THD*		thd,
-	TABLE_LIST*	tables,
-	COND*		cond)
+i_s_sys_datafiles_fill_table(
+/*=========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
 {
-	TABLE*	i_s_table	= (TABLE *) tables->table;
-	char**	query_str;
-	char*	ptr;
-	char	quote	= '\0';
-	const char*	command_head = "XTRA_";
-
-	DBUG_ENTER("i_s_innodb_admin_command_fill");
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
 
+	DBUG_ENTER("i_s_sys_datafiles_fill_table");
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
+	/* deny access to users without the PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL, true)) {
 		DBUG_RETURN(0);
 	}
 
-	if(thd_sql_command(thd) != SQLCOM_SELECT) {
-		field_store_string(i_s_table->field[0],
-			"SELECT command is only accepted.");
-		goto end_func;
-	}
-
-	query_str = thd_query(thd);
-	ptr = *query_str;
-	
-	for (; *ptr; ptr++) {
-		if (*ptr == quote) {
-			quote = '\0';
-		} else if (quote) {
-		} else if (*ptr == '`' || *ptr == '"') {
-			quote = *ptr;
-		} else {
-			long	i;
-			for (i = 0; command_head[i]; i++) {
-				if (toupper((int)(unsigned char)(ptr[i]))
-				    != toupper((int)(unsigned char)
-				      (command_head[i]))) {
-					goto nomatch;
-				}
-			}
-			break;
-nomatch:
-			;
-		}
-	}
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
 
-	if (!*ptr) {
-		field_store_string(i_s_table->field[0],
-			"No XTRA_* command in the SQL statement."
-			" Please add /*!XTRA_xxxx*/ to the SQL.");
-		goto end_func;
-	}
+	rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES);
 
-	if (!strncasecmp("XTRA_HELLO", ptr, 10)) {
-		/* This is example command XTRA_HELLO */
+	while (rec) {
+		const char*	err_msg;
+		ulint		space;
+		const char*	path;
 
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: administration command test for XtraDB"
-				" 'XTRA_HELLO' was detected.\n");
+		/* Extract necessary information from a SYS_DATAFILES row */
+		err_msg = dict_process_sys_datafiles(
+			heap, rec, &space, &path);
 
-		field_store_string(i_s_table->field[0],
-			"Hello!");
-		goto end_func;
-	}
-	else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Administrative command 'XTRA_LRU_DUMP'"
-				" was detected.\n");
-
-		if (buf_LRU_file_dump()) {
-			field_store_string(i_s_table->field[0],
-				"XTRA_LRU_DUMP was succeeded.");
-		} else {
-			field_store_string(i_s_table->field[0],
-				"XTRA_LRU_DUMP was failed.");
-		}
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
 
-		goto end_func;
-	}
-	else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Administrative command 'XTRA_LRU_RESTORE'"
-				" was detected.\n");
-
-		if (buf_LRU_file_restore()) {
-			field_store_string(i_s_table->field[0],
-				"XTRA_LRU_RESTORE was succeeded.");
+		if (!err_msg) {
+			i_s_dict_fill_sys_datafiles(
+				thd, space, path, tables->table);
 		} else {
-			field_store_string(i_s_table->field[0],
-				"XTRA_LRU_RESTORE was failed.");
-		}
-
-		goto end_func;
-	}
-
-	field_store_string(i_s_table->field[0],
-		"Undefined XTRA_* command.");
-	goto end_func;
-
-end_func:
-	if (schema_table_store_record(thd, i_s_table)) {
-		DBUG_RETURN(1);
-	} else {
-		DBUG_RETURN(0);
-	}
-}
-
-static
-int
-i_s_innodb_admin_command_init(
-/*==========================*/
-	void*	p)
-{
-	DBUG_ENTER("i_s_innodb_admin_command_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-
-	schema->fields_info = i_s_innodb_admin_command_info;
-	schema->fill_table = i_s_innodb_admin_command_fill;
-
-	DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_admin_command =
-{
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-	STRUCT_FLD(info, &i_s_info),
-	STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
-	STRUCT_FLD(author, "Percona"),
-	STRUCT_FLD(descr, "XtraDB specific command acceptor"),
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-	STRUCT_FLD(init, i_s_innodb_admin_command_init),
-	STRUCT_FLD(deinit, i_s_common_deinit),
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-	STRUCT_FLD(status_vars, NULL),
-	STRUCT_FLD(system_vars, NULL),
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-/***********************************************************************
-*/
-static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_fields_info[] =
-{
-	{STRUCT_FLD(field_name,		"page_type"),
-	 STRUCT_FLD(field_length,	64),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"space_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"page_no"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"lru_position"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"fix_count"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"flush_type"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_index_fields_info[] =
-{
-	{STRUCT_FLD(field_name,		"index_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"space_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"page_no"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"n_recs"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"data_size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"hashed"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"access_time"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"modified"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"dirty"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"old"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"lru_position"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"fix_count"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"flush_type"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_blob_fields_info[] =
-{
-	{STRUCT_FLD(field_name,		"space_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"page_no"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"compressed"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"part_len"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"next_page_no"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"lru_position"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"fix_count"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	{STRUCT_FLD(field_name,		"flush_type"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-
-/***********************************************************************
-Fill the dynamic table information_schema.innodb_buffer_pool_pages. */
-static
-int
-i_s_innodb_buffer_pool_pages_fill(
-/*================*/
-				/* out: 0 on success, 1 on failure */
-	THD*		thd,	/* in: thread */
-	TABLE_LIST*	tables,	/* in/out: tables to fill */
-	COND*		cond)	/* in: condition (ignored) */
-{
-	TABLE*	table	= (TABLE *) tables->table;
-	int	status	= 0;
-	ulint	i;
-
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_fill");
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-		DBUG_RETURN(0);
-	}
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		ulint		n_block;
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-
-		for (n_block = 0; n_block < buf_pool->curr_size; n_block++) {
-			buf_block_t*	block = buf_page_from_array(buf_pool, n_block);
-			const buf_frame_t*	frame = block->frame;
-
-			char page_type[64];
-
-			switch(fil_page_get_type(frame))
-			{
-				case FIL_PAGE_INDEX:
-					strcpy(page_type, "index");
-					break;
-				case FIL_PAGE_UNDO_LOG:
-					strcpy(page_type, "undo_log");
-					break;
-				case FIL_PAGE_INODE:
-					strcpy(page_type, "inode");
-					break;
-				case FIL_PAGE_IBUF_FREE_LIST:
-					strcpy(page_type, "ibuf_free_list");
-					break;
-				case FIL_PAGE_TYPE_ALLOCATED:
-					strcpy(page_type, "allocated");
-					break;
-				case FIL_PAGE_IBUF_BITMAP:
-					strcpy(page_type, "bitmap");
-					break;
-				case FIL_PAGE_TYPE_SYS:
-					strcpy(page_type, "sys");
-					break;
-				case FIL_PAGE_TYPE_TRX_SYS:
-					strcpy(page_type, "trx_sys");
-					break;
-				case FIL_PAGE_TYPE_FSP_HDR:
-					strcpy(page_type, "fsp_hdr");
-					break;
-				case FIL_PAGE_TYPE_XDES:
-					strcpy(page_type, "xdes");
-					break;
-				case FIL_PAGE_TYPE_BLOB:
-					strcpy(page_type, "blob");
-					break;
-				case FIL_PAGE_TYPE_ZBLOB:
-					strcpy(page_type, "zblob");
-					break;
-				case FIL_PAGE_TYPE_ZBLOB2:
-					strcpy(page_type, "zblob2");
-					break;
-				default:
-					sprintf(page_type, "unknown (type=%li)", fil_page_get_type(frame));
-			}
-
-			field_store_string(table->field[0], page_type);
-			table->field[1]->store(block->page.space);
-			table->field[2]->store(block->page.offset);
-			table->field[3]->store(0);
-			table->field[4]->store(block->page.buf_fix_count);
-			table->field[5]->store(block->page.flush_type);
-
-			if (schema_table_store_record(thd, table)) {
-				status = 1;
-				break;
-			}
-
-		}      
-
-		buf_pool_mutex_exit(buf_pool);
-	}
-
-	DBUG_RETURN(status);
-}
-
-/***********************************************************************
-Fill the dynamic table information_schema.innodb_buffer_pool_pages_index. */
-static
-int
-i_s_innodb_buffer_pool_pages_index_fill(
-/*================*/
-				/* out: 0 on success, 1 on failure */
-	THD*		thd,	/* in: thread */
-	TABLE_LIST*	tables,	/* in/out: tables to fill */
-	COND*		cond)	/* in: condition (ignored) */
-{
-	TABLE*	table	= (TABLE *) tables->table;
-	int	status	= 0;
-	ulint	i;
-	index_id_t	index_id;
-
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_fill");
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-		DBUG_RETURN(0);
-	}
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		ulint		n_block;
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-	
-		for (n_block = 0; n_block < buf_pool->curr_size; n_block++) {
-			buf_block_t*	block = buf_page_from_array(buf_pool, n_block);
-			const buf_frame_t* frame = block->frame;
-
-			if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
-				index_id = btr_page_get_index_id(frame);
-				table->field[0]->store(index_id, TRUE);
-				table->field[1]->store(block->page.space, TRUE);
-				table->field[2]->store(block->page.offset, TRUE);
-				table->field[3]->store(page_get_n_recs(frame), TRUE);
-				table->field[4]->store(page_get_data_size(frame), TRUE);
-				table->field[5]->store(block->index != NULL, TRUE);
-				table->field[6]->store(block->page.access_time, TRUE);
-				table->field[7]->store(block->page.newest_modification != 0, TRUE);
-				table->field[8]->store(block->page.oldest_modification != 0, TRUE);
-				table->field[9]->store(block->page.old, TRUE);
-				table->field[10]->store(0, TRUE);
-				table->field[11]->store(block->page.buf_fix_count, TRUE);
-				table->field[12]->store(block->page.flush_type, TRUE);
-
-				if (schema_table_store_record(thd, table)) {
-					status = 1;
-					break;
-				}
-			}      
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
 		}
 
-		buf_pool_mutex_exit(buf_pool);
-	}
-
-	DBUG_RETURN(status);
-}
-
-/***********************************************************************
-Fill the dynamic table information_schema.innodb_buffer_pool_pages_index. */
-static
-int
-i_s_innodb_buffer_pool_pages_blob_fill(
-/*================*/
-				/* out: 0 on success, 1 on failure */
-	THD*		thd,	/* in: thread */
-	TABLE_LIST*	tables,	/* in/out: tables to fill */
-	COND*		cond)	/* in: condition (ignored) */
-{
-	TABLE*	table	= (TABLE *) tables->table;
-	int	status	= 0;
-	ulint	i;
-
-	ulint		part_len;
-	ulint		next_page_no;
-
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_fill");
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-
-		DBUG_RETURN(0);
-	}
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		ulint		n_block;
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		buf_pool_mutex_enter(buf_pool);
-	
-		for (n_block = 0; n_block < buf_pool->curr_size; n_block++) {
-			buf_block_t*	block = buf_page_from_array(buf_pool, n_block);
-			page_zip_des_t*	block_page_zip = buf_block_get_page_zip(block);
-			const buf_frame_t* frame = block->frame;
-
-			if (fil_page_get_type(frame) == FIL_PAGE_TYPE_BLOB) {
-
-				if (UNIV_LIKELY_NULL(block_page_zip)) {
-					part_len = 0; /* hmm, can't figure it out */
-
-					next_page_no = mach_read_from_4(
-							buf_block_get_frame(block)
-							+ FIL_PAGE_NEXT);        
-				} else {
-					part_len = mach_read_from_4(
-							buf_block_get_frame(block)
-							+ FIL_PAGE_DATA
-							+ 0 /*BTR_BLOB_HDR_PART_LEN*/);
-
-					next_page_no = mach_read_from_4(
-							buf_block_get_frame(block)
-							+ FIL_PAGE_DATA
-							+ 4 /*BTR_BLOB_HDR_NEXT_PAGE_NO*/);
-				}
-
-				table->field[0]->store(block->page.space);
-				table->field[1]->store(block->page.offset);
-				table->field[2]->store(block_page_zip != NULL);
-				table->field[3]->store(part_len);
-
-				if(next_page_no == FIL_NULL)
-				{
-					table->field[4]->store(0);
-				} else {
-					table->field[4]->store(block->page.offset);
-				}
-
-				table->field[5]->store(0);
-				table->field[6]->store(block->page.buf_fix_count);
-				table->field[7]->store(block->page.flush_type);
-
-				if (schema_table_store_record(thd, table)) {
-					status = 1;
-					break;
-				}
-
-			}
-		}      
+		mem_heap_empty(heap);
 
-		buf_pool_mutex_exit(buf_pool);
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
 	}
 
-	DBUG_RETURN(status);
-}
-
-/***********************************************************************
-Bind the dynamic table information_schema.innodb_buffer_pool_pages. */
-static
-int
-i_s_innodb_buffer_pool_pages_init(
-/*=========*/
-			/* out: 0 on success */
-	void*	p)	/* in/out: table schema object */
-{
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-
-	schema->fields_info = i_s_innodb_buffer_pool_pages_fields_info;
-	schema->fill_table = i_s_innodb_buffer_pool_pages_fill;
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
 
 	DBUG_RETURN(0);
 }
-
-/***********************************************************************
-Bind the dynamic table information_schema.innodb_buffer_pool_pages. */
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES
+@return 0 on success */
 static
 int
-i_s_innodb_buffer_pool_pages_index_init(
-/*=========*/
-			/* out: 0 on success */
-	void*	p)	/* in/out: table schema object */
+innodb_sys_datafiles_init(
+/*======================*/
+	void*	p)	/*!< in/out: table schema object */
 {
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-
-	schema->fields_info = i_s_innodb_buffer_pool_pages_index_fields_info;
-	schema->fill_table = i_s_innodb_buffer_pool_pages_index_fill;
+	ST_SCHEMA_TABLE*	schema;
 
-	DBUG_RETURN(0);
-}
+	DBUG_ENTER("innodb_sys_datafiles_init");
 
-/***********************************************************************
-Bind the dynamic table information_schema.innodb_buffer_pool_pages. */
-static
-int
-i_s_innodb_buffer_pool_pages_blob_init(
-/*=========*/
-			/* out: 0 on success */
-	void*	p)	/* in/out: table schema object */
-{
-	DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+	schema = (ST_SCHEMA_TABLE*) p;
 
-	schema->fields_info = i_s_innodb_buffer_pool_pages_blob_fields_info;
-	schema->fill_table = i_s_innodb_buffer_pool_pages_blob_fill;
+	schema->fields_info = innodb_sys_datafiles_fields_info;
+	schema->fill_table = i_s_sys_datafiles_fill_table;
 
 	DBUG_RETURN(0);
 }
 
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages =
-{
-	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
-	/* int */
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-
-	/* pointer to type-specific plugin descriptor */
-	/* void* */
-	STRUCT_FLD(info, &i_s_info),
-
-	/* plugin name */
-	/* const char* */
-	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"),
-
-	/* plugin author (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(author, "Percona"),
-
-	/* general descriptive text (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(descr, "InnoDB buffer pool pages"),
-
-	/* the plugin license (PLUGIN_LICENSE_XXX) */
-	/* int */
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-
-	/* the function to invoke when plugin is loaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init),
-
-	/* the function to invoke when plugin is unloaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(deinit, i_s_common_deinit),
-
-	/* plugin version (for SHOW PLUGINS) */
-	/* unsigned int */
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-
-	/* struct st_mysql_show_var* */
-	STRUCT_FLD(status_vars, NULL),
-
-	/* struct st_mysql_sys_var** */
-	STRUCT_FLD(system_vars, NULL),
-
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages_index =
-{
-	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
-	/* int */
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-
-	/* pointer to type-specific plugin descriptor */
-	/* void* */
-	STRUCT_FLD(info, &i_s_info),
-
-	/* plugin name */
-	/* const char* */
-	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"),
-
-	/* plugin author (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(author, "Percona"),
-
-	/* general descriptive text (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(descr, "InnoDB buffer pool index pages"),
-
-	/* the plugin license (PLUGIN_LICENSE_XXX) */
-	/* int */
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-
-	/* the function to invoke when plugin is loaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init),
-
-	/* the function to invoke when plugin is unloaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(deinit, i_s_common_deinit),
-
-	/* plugin version (for SHOW PLUGINS) */
-	/* unsigned int */
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-
-	/* struct st_mysql_show_var* */
-	STRUCT_FLD(status_vars, NULL),
-
-	/* struct st_mysql_sys_var** */
-	STRUCT_FLD(system_vars, NULL),
-
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages_blob =
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_sys_datafiles =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -6860,15 +7797,15 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages_blob =
 
 	/* plugin name */
 	/* const char* */
-	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"),
+	STRUCT_FLD(name, "INNODB_SYS_DATAFILES"),
 
 	/* plugin author (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(author, "Percona"),
+	STRUCT_FLD(author, plugin_author),
 
 	/* general descriptive text (for SHOW PLUGINS) */
 	/* const char* */
-	STRUCT_FLD(descr, "InnoDB buffer pool blob pages"),
+	STRUCT_FLD(descr, "InnoDB SYS_DATAFILES"),
 
 	/* the plugin license (PLUGIN_LICENSE_XXX) */
 	/* int */
@@ -6876,7 +7813,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages_blob =
 
 	/* the function to invoke when plugin is loaded */
 	/* int (*)(void*); */
-	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init),
+	STRUCT_FLD(init, innodb_sys_datafiles_init),
 
 	/* the function to invoke when plugin is unloaded */
 	/* int (*)(void*); */
@@ -6884,245 +7821,7 @@ UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_pool_pages_blob =
 
 	/* plugin version (for SHOW PLUGINS) */
 	/* unsigned int */
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
-
-	/* struct st_mysql_show_var* */
-	STRUCT_FLD(status_vars, NULL),
-
-	/* struct st_mysql_sys_var** */
-	STRUCT_FLD(system_vars, NULL),
-
-        INNODB_VERSION_STR, MariaDB_PLUGIN_MATURITY_STABLE
-};
-
-
-static ST_FIELD_INFO	i_s_innodb_undo_logs_fields_info[] =
-{
-#define IDX_USEG_TRX_ID 0
-	{STRUCT_FLD(field_name,		"trx_id"),
-	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-#define IDX_USEG_RSEG_ID 1
-	{STRUCT_FLD(field_name,		"rseg_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-#define IDX_USEG_USEG_ID 2
-	{STRUCT_FLD(field_name,		"useg_id"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-#define IDX_USEG_TYPE 3
-#define USEG_TYPE_MAX_LEN 256
-	{STRUCT_FLD(field_name,		"type"),
-	 STRUCT_FLD(field_length,	USEG_TYPE_MAX_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-	 
- #define IDX_USEG_STATE 4
- #define USEG_STATE_MAX_LEN 256
-	{STRUCT_FLD(field_name,		"state"),
-	 STRUCT_FLD(field_length,	USEG_STATE_MAX_LEN),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	0),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-	 
-#define IDX_USEG_SIZE 5
-	{STRUCT_FLD(field_name,		"size"),
-	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
-	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
-	 STRUCT_FLD(value,		0),
-	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
-	 STRUCT_FLD(old_name,		""),
-	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
-
-	END_OF_ST_FIELD_INFO
-};
-static
-int
-i_s_innodb_undo_logs_fill_store(
-/*=================*/
-	THD*		thd,	/* in: thread */
-	TABLE*		table,	/* in/out: table to fill */
-	trx_undo_t*	useg)	/* in: useg to fill from */
-{
-	char		trx_id[TRX_ID_MAX_LEN + 1];
-
-	DBUG_ENTER("i_s_innodb_undo_logs_fill_store");
-
-	switch (useg->type) {
-	case TRX_UNDO_INSERT:
-		OK(field_store_string(table->field[IDX_USEG_TYPE], "INSERT"));		
-		break;
-	case TRX_UNDO_UPDATE:
-		OK(field_store_string(table->field[IDX_USEG_TYPE], "UPDATE"));		
-		break;
-	default:
-		OK(field_store_string(table->field[IDX_USEG_TYPE], "UNKNOWN"));		
-		break;
-	}
-
-	ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, useg->trx_id);
-
-	switch (useg->state) {
-	case TRX_UNDO_ACTIVE:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], trx_id));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "ACTIVE"));		
-		break;
-	case TRX_UNDO_CACHED:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], NULL));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "CACHED"));		
-		break;
-	case TRX_UNDO_TO_FREE:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], NULL));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "TO_FREE"));		
-		break;
-	case TRX_UNDO_TO_PURGE:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], NULL));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "TO_PURGE"));		
-		break;
-	case TRX_UNDO_PREPARED:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], trx_id));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "PREPARED"));		
-		break;
-	default:
-		OK(field_store_string(table->field[IDX_USEG_TRX_ID], trx_id));
-		OK(field_store_string(table->field[IDX_USEG_STATE], "UNKNOWN"));		
-		break;
-	}
-
-	table->field[IDX_USEG_RSEG_ID]->store(useg->rseg->id);
-	table->field[IDX_USEG_USEG_ID]->store(useg->id);
-	table->field[IDX_USEG_SIZE]->store(useg->size);
-	if (schema_table_store_record(thd, table)) {
-		DBUG_RETURN(1);
-	}
-	DBUG_RETURN(0);
-}
-static
-int
-i_s_innodb_undo_logs_fill(
-/*=================*/
-	THD*		thd,	/* in: thread */
-	TABLE_LIST*	tables,	/* in/out: tables to fill */
-	COND*		cond)	/* in: condition (ignored) */
-{
-	TABLE*	table	= (TABLE *) tables->table;
-	int	status	= 0;
-	trx_rseg_t*	rseg;
-	trx_undo_t*	useg;
-
-	DBUG_ENTER("i_s_innodb_undo_logs_fill");
-
-	/* deny access to non-superusers */
-        if (check_global_access(thd, PROCESS_ACL, true)) {
-		DBUG_RETURN(0);
-	}
-
-	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
-
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-	while (rseg && status == 0) {
-		mutex_enter(&(rseg->mutex));
-		useg = UT_LIST_GET_FIRST(rseg->update_undo_list);
-		while (useg && status == 0) {
-			status = i_s_innodb_undo_logs_fill_store(thd, table, useg);
-			useg = UT_LIST_GET_NEXT(undo_list, useg);
-		}
-
-		useg = UT_LIST_GET_FIRST(rseg->update_undo_cached);
-		while (useg && status == 0) {
-			status = i_s_innodb_undo_logs_fill_store(thd, table, useg);
-			useg = UT_LIST_GET_NEXT(undo_list, useg);
-		}
-
-		useg = UT_LIST_GET_FIRST(rseg->insert_undo_list);
-		while (useg && status == 0) {
-			status = i_s_innodb_undo_logs_fill_store(thd, table, useg);
-			useg = UT_LIST_GET_NEXT(undo_list, useg);
-		}
-
-		useg = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
-		while (useg && status == 0) {
-			status = i_s_innodb_undo_logs_fill_store(thd, table, useg);
-			useg = UT_LIST_GET_NEXT(undo_list, useg);
-		}
-		mutex_exit(&(rseg->mutex));
-		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-	}
-
-	DBUG_RETURN(status);
-}
-
-static
-int
-i_s_innodb_undo_logs_init(
-/*=================*/
-			/* out: 0 on success */
-	void*	p)	/* in/out: table schema object */
-{
-	DBUG_ENTER("i_s_innodb_undo_logs_init");
-	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
-
-	schema->fields_info = i_s_innodb_undo_logs_fields_info;
-	schema->fill_table = i_s_innodb_undo_logs_fill;
-
-	DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin	i_s_innodb_undo_logs =
-{
-	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
-	/* int */
-	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
-
-	/* pointer to type-specific plugin descriptor */
-	/* void* */
-	STRUCT_FLD(info, &i_s_info),
-
-	/* plugin name */
-	/* const char* */
-	STRUCT_FLD(name, "INNODB_UNDO_LOGS"),
-
-	/* plugin author (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(author, "Percona"),
-
-	/* general descriptive text (for SHOW PLUGINS) */
-	/* const char* */
-	STRUCT_FLD(descr, "InnoDB rollback undo segment information"),
-
-	/* the plugin license (PLUGIN_LICENSE_XXX) */
-	/* int */
-	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
-
-	/* the function to invoke when plugin is loaded */
-	/* int (*)(void*); */
-	STRUCT_FLD(init, i_s_innodb_undo_logs_init),
-
-	/* the function to invoke when plugin is unloaded */
-	/* int (*)(void*); */	STRUCT_FLD(deinit, i_s_common_deinit),
-
-	/* plugin version (for SHOW PLUGINS) */
-	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
 
 	/* struct st_mysql_show_var* */
 	STRUCT_FLD(status_vars, NULL),
@@ -7224,7 +7923,7 @@ void
 limit_lsn_range_from_condition(
 /*===========================*/
 	TABLE*		table,		/*!<in: table */
-	COND*		cond,		/*!<in: condition */
+	Item*		cond,		/*!<in: condition */
 	ib_uint64_t*	start_lsn,	/*!<in/out: minumum LSN */
 	ib_uint64_t*	end_lsn)	/*!<in/out: maximum LSN */
 {
@@ -7304,7 +8003,7 @@ limit_lsn_range_from_condition(
 			tmp_result = right->val_int();
 			if (((func_type == Item_func::LE_FUNC)
 			     || (func_type == Item_func::GE_FUNC))
-			    && (tmp_result != IB_ULONGLONG_MAX)) {
+			    && (tmp_result != IB_UINT64_MAX)) {
 
 				tmp_result++;
 			}
@@ -7324,7 +8023,7 @@ limit_lsn_range_from_condition(
 			}
 			if (((func_type == Item_func::LT_FUNC)
 			     || (func_type == Item_func::GT_FUNC))
-			    && (tmp_result != IB_ULONGLONG_MAX)) {
+			    && (tmp_result != IB_UINT64_MAX)) {
 
 				tmp_result++;
 			}
@@ -7348,13 +8047,13 @@ i_s_innodb_changed_pages_fill(
 /*==========================*/
 	THD*		thd,	/*!<in: thread */
 	TABLE_LIST*	tables,	/*!<in/out: tables to fill */
-	COND*		cond)	/*!<in: condition */
+	Item*		cond)	/*!<in: condition */
 {
 	TABLE*			table = (TABLE *) tables->table;
 	log_bitmap_iterator_t	i;
 	ib_uint64_t		output_rows_num = 0UL;
-	ib_uint64_t		max_lsn = IB_ULONGLONG_MAX;
-	ib_uint64_t		min_lsn = 0ULL;
+	lsn_t			max_lsn = LSN_MAX;
+	lsn_t			min_lsn = 0ULL;
 	int			ret = 0;
 
 	DBUG_ENTER("i_s_innodb_changed_pages_fill");
diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h
index 4d3913a544c..8d33e9bce7d 100644
--- a/storage/xtradb/handler/i_s.h
+++ b/storage/xtradb/handler/i_s.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,32 +28,37 @@ Created July 18, 2007 Vasil Dimov
 
 const char plugin_author[] = "Oracle Corporation";
 
+#define st_mysql_plugin st_maria_plugin
+
 extern struct st_maria_plugin	i_s_innodb_trx;
-extern struct st_maria_plugin	i_s_innodb_locks;
-extern struct st_maria_plugin	i_s_innodb_lock_waits;
-extern struct st_maria_plugin	i_s_innodb_cmp;
-extern struct st_maria_plugin	i_s_innodb_cmp_reset;
-extern struct st_maria_plugin	i_s_innodb_cmpmem;
-extern struct st_maria_plugin	i_s_innodb_cmpmem_reset;
-extern struct st_maria_plugin	i_s_innodb_sys_tables;
-extern struct st_maria_plugin	i_s_innodb_sys_tablestats;
-extern struct st_maria_plugin	i_s_innodb_sys_indexes;
-extern struct st_maria_plugin   i_s_innodb_sys_columns;
-extern struct st_maria_plugin   i_s_innodb_sys_fields;
-extern struct st_maria_plugin   i_s_innodb_sys_foreign;
-extern struct st_maria_plugin   i_s_innodb_sys_foreign_cols;
-extern struct st_maria_plugin	i_s_innodb_rseg;
-extern struct st_maria_plugin	i_s_innodb_undo_logs;
-extern struct st_maria_plugin	i_s_innodb_sys_stats;
-extern struct st_maria_plugin	i_s_innodb_table_stats;
-extern struct st_maria_plugin	i_s_innodb_index_stats;
-extern struct st_maria_plugin	i_s_innodb_admin_command;
-extern struct st_maria_plugin	i_s_innodb_buffer_pool_pages;
-extern struct st_maria_plugin	i_s_innodb_buffer_pool_pages_index;
-extern struct st_maria_plugin	i_s_innodb_buffer_pool_pages_blob;
-extern struct st_maria_plugin	i_s_innodb_changed_pages;
-extern struct st_maria_plugin	i_s_innodb_buffer_page;
-extern struct st_maria_plugin	i_s_innodb_buffer_page_lru;
-extern struct st_maria_plugin	i_s_innodb_buffer_stats;
+extern struct st_mysql_plugin	i_s_innodb_trx;
+extern struct st_mysql_plugin	i_s_innodb_locks;
+extern struct st_mysql_plugin	i_s_innodb_lock_waits;
+extern struct st_mysql_plugin	i_s_innodb_cmp;
+extern struct st_mysql_plugin	i_s_innodb_cmp_reset;
+extern struct st_mysql_plugin	i_s_innodb_cmp_per_index;
+extern struct st_mysql_plugin	i_s_innodb_cmp_per_index_reset;
+extern struct st_mysql_plugin	i_s_innodb_cmpmem;
+extern struct st_mysql_plugin	i_s_innodb_cmpmem_reset;
+extern struct st_mysql_plugin   i_s_innodb_metrics;
+extern struct st_mysql_plugin	i_s_innodb_ft_default_stopword;
+extern struct st_mysql_plugin	i_s_innodb_ft_deleted;
+extern struct st_mysql_plugin	i_s_innodb_ft_being_deleted;
+extern struct st_mysql_plugin	i_s_innodb_ft_index_cache;
+extern struct st_mysql_plugin	i_s_innodb_ft_index_table;
+extern struct st_mysql_plugin	i_s_innodb_ft_config;
+extern struct st_mysql_plugin	i_s_innodb_buffer_page;
+extern struct st_mysql_plugin	i_s_innodb_buffer_page_lru;
+extern struct st_mysql_plugin	i_s_innodb_buffer_stats;
+extern struct st_mysql_plugin	i_s_innodb_sys_tables;
+extern struct st_mysql_plugin	i_s_innodb_sys_tablestats;
+extern struct st_mysql_plugin	i_s_innodb_sys_indexes;
+extern struct st_mysql_plugin	i_s_innodb_sys_columns;
+extern struct st_mysql_plugin	i_s_innodb_sys_fields;
+extern struct st_mysql_plugin	i_s_innodb_sys_foreign;
+extern struct st_mysql_plugin	i_s_innodb_sys_foreign_cols;
+extern struct st_mysql_plugin	i_s_innodb_sys_tablespaces;
+extern struct st_mysql_plugin	i_s_innodb_sys_datafiles;
+extern struct st_mysql_plugin	i_s_innodb_changed_pages;
 
 #endif /* i_s_h */
diff --git a/storage/xtradb/handler/xtradb_i_s.cc b/storage/xtradb/handler/xtradb_i_s.cc
new file mode 100644
index 00000000000..77050c0b7f8
--- /dev/null
+++ b/storage/xtradb/handler/xtradb_i_s.cc
@@ -0,0 +1,603 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2010-2012, Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include <mysqld_error.h>
+#include <sql_acl.h>				// PROCESS_ACL
+
+#include <m_ctype.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "i_s.h"
+#include <sql_plugin.h>
+#include <innodb_priv.h>
+
+#include <read0i_s.h>
+#include <trx0i_s.h>
+#include "srv0start.h"	/* for srv_was_started */
+#include <btr0sea.h> /* btr_search_sys */
+#include <log0recv.h> /* recv_sys */
+#include <fil0fil.h>
+
+/* for XTRADB_RSEG table */
+#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
+#include "trx0rseg.h" /* for trx_rseg_struct */
+#include "trx0sys.h" /* for trx_sys */
+
+#define PLUGIN_AUTHOR "Percona Inc."
+
+#define OK(expr)		\
+	if ((expr) != 0) {	\
+		DBUG_RETURN(1);	\
+	}
+
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name)			\
+do {	/* expects `thd` and a DBUG_ENTER frame in the calling function */	\
+	if (!srv_was_started) {	/* warn, then return 0 without storing rows */	\
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,	\
+				    ER_CANT_FIND_SYSTEM_REC,		\
+				    "InnoDB: SELECTing from "		\
+				    "INFORMATION_SCHEMA.%s but "	\
+				    "the InnoDB storage engine "	\
+				    "is not installed", plugin_name);	\
+		DBUG_RETURN(0);						\
+	}								\
+} while (0)	/* the caller supplies the trailing ';' */
+
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 &&	\
+	!defined __INTEL_COMPILER && !defined __clang__
+#define STRUCT_FLD(name, value)	name: value	/* labelled initializer: the member is named explicitly */
+#else	/* every other compiler: positional initializer, so member order is significant */
+#define STRUCT_FLD(name, value)	value
+#endif	/* STRUCT_FLD */
+
+#define END_OF_ST_FIELD_INFO /* last row of each ST_FIELD_INFO array in this file; field_name is NULL (presumably the end marker the server looks for) */ \
+	{STRUCT_FLD(field_name,		NULL), \
+	 STRUCT_FLD(field_length,	0), \
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_NULL), \
+	 STRUCT_FLD(value,		0), \
+	 STRUCT_FLD(field_flags,	0), \
+	 STRUCT_FLD(old_name,		""), \
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)}
+
+
+/*******************************************************************//**
+Stores a ulint value in a MYSQL_TYPE_LONGLONG field. ULINT_UNDEFINED is
+treated as "no value": the field is set to NULL and nothing is stored.
+@return	0 when the field was set to NULL, otherwise the result of
+Field::store() (0 on success) */
+static
+int
+field_store_ulint(
+/*==============*/
+	Field*	field,	/*!< in/out: target field for storage */
+	ulint	n)	/*!< in: value to store */
+{
+	if (n == ULINT_UNDEFINED) {
+		/* Nothing to store; this still counts as success. */
+		field->set_null();
+
+		return(0);
+	}
+
+	const int	status = field->store(n);
+
+	field->set_notnull();
+
+	return(status);
+}
+
+/*******************************************************************//**
+Stores a NUL-terminated string in a MYSQL_TYPE_STRING field using
+system_charset_info. A NULL pointer sets the field to NULL instead.
+@return	0 when the field was set to NULL, otherwise the result of
+Field::store() (0 on success) */
+static
+int
+field_store_string(
+/*===============*/
+	Field*		field,	/*!< in/out: target field for storage */
+	const char*	str)	/*!< in: NUL-terminated utf-8 string,
+				or NULL */
+{
+	if (str == NULL) {
+		/* No string supplied; this still counts as success. */
+		field->set_null();
+
+		return(0);
+	}
+
+	const int	status = field->store(str, strlen(str),
+					      system_charset_info);
+
+	field->set_notnull();
+
+	return(status);
+}
+
+/*******************************************************************//**
+Deinit callback used by the plugin descriptors in this file. There is
+nothing to release, so the argument is not touched.
+@return	always 0 */
+static
+int
+i_s_common_deinit(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_common_deinit");
+	DBUG_RETURN(0);	/* nothing to tear down */
+}
+
+static ST_FIELD_INFO xtradb_read_view_fields_info[] =	/* columns of INFORMATION_SCHEMA.XTRADB_READ_VIEW */
+{
+#define READ_VIEW_UNDO_NUMBER		0	/* index into table->field[]; filled from read_view.undo_no */
+	{STRUCT_FLD(field_name,		"READ_VIEW_UNDO_NUMBER"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define READ_VIEW_LOW_LIMIT_NUMBER	1	/* filled from read_view.low_limit_no; note the column name says TRX_NUMBER */
+	{STRUCT_FLD(field_name,		"READ_VIEW_LOW_LIMIT_TRX_NUMBER"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define READ_VIEW_UPPER_LIMIT_ID	2	/* filled from read_view.up_limit_id, formatted with TRX_ID_FMT */
+	{STRUCT_FLD(field_name,		"READ_VIEW_UPPER_LIMIT_TRX_ID"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define READ_VIEW_LOW_LIMIT_ID		3	/* filled from read_view.low_limit_id, formatted with TRX_ID_FMT */
+	{STRUCT_FLD(field_name,		"READ_VIEW_LOW_LIMIT_TRX_ID"),
+
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO	/* must stay last */
+};
+
+/*******************************************************************//**
+Fills INFORMATION_SCHEMA.XTRADB_READ_VIEW with at most one row, built from
+the structure filled in by read_fill_i_s_xtradb_read_view().
+@return	0 on success (including the cases where no row is produced),
+1 if a field or the record could not be stored */
+static int xtradb_read_view_fill_table(THD* thd, TABLE_LIST* tables, Item*)
+{
+	DBUG_ENTER("xtradb_read_view_fill_table");
+
+	/* Callers without the PROCESS privilege get an empty result. */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	TABLE* const	i_s_table = tables->table;
+	Field** const	cols = i_s_table->field;
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	i_s_xtradb_read_view_t	view;
+
+	if (read_fill_i_s_xtradb_read_view(&view) == NULL) {
+		/* Nothing to report. */
+		DBUG_RETURN(0);
+	}
+
+	OK(field_store_ulint(cols[READ_VIEW_UNDO_NUMBER], view.undo_no));
+
+	/* The three transaction ids are exposed as strings; one scratch
+	buffer is reused for all of them. */
+	char	id_text[TRX_ID_MAX_LEN + 1];
+
+	ut_snprintf(id_text, sizeof(id_text), TRX_ID_FMT, view.low_limit_no);
+	OK(field_store_string(cols[READ_VIEW_LOW_LIMIT_NUMBER], id_text));
+
+	ut_snprintf(id_text, sizeof(id_text), TRX_ID_FMT, view.up_limit_id);
+	OK(field_store_string(cols[READ_VIEW_UPPER_LIMIT_ID], id_text));
+
+	ut_snprintf(id_text, sizeof(id_text), TRX_ID_FMT, view.low_limit_id);
+	OK(field_store_string(cols[READ_VIEW_LOW_LIMIT_ID], id_text));
+
+	OK(schema_table_store_record(thd, i_s_table));
+
+	DBUG_RETURN(0);
+}
+
+
+/*******************************************************************//**
+Init callback of the XTRADB_READ_VIEW plugin: installs the column
+definitions and the fill function on the schema table object.
+@return	always 0 */
+static int xtradb_read_view_init(void* p)
+{
+	DBUG_ENTER("xtradb_read_view_init");
+
+	ST_SCHEMA_TABLE* const	i_s_schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_schema->fill_table = xtradb_read_view_fill_table;
+	i_s_schema->fields_info = xtradb_read_view_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+static struct st_mysql_information_schema i_s_info =
+{	/* shared by all three plugin descriptors in this file through their `info` member */
+	MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_xtradb_read_view =
+{	/* plugin descriptor for INFORMATION_SCHEMA.XTRADB_READ_VIEW; keep the member order, STRUCT_FLD may be positional */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_READ_VIEW"),
+	STRUCT_FLD(author, PLUGIN_AUTHOR),
+	STRUCT_FLD(descr, "InnoDB Read View information"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, xtradb_read_view_init),	/* installs fields_info and fill_table */
+	STRUCT_FLD(deinit, i_s_common_deinit),	/* no-op, returns 0 */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+static ST_FIELD_INFO xtradb_internal_hash_tables_fields_info[] =	/* columns of INFORMATION_SCHEMA.XTRADB_INTERNAL_HASH_TABLES */
+{
+#define INT_HASH_TABLES_NAME		0	/* index into table->field[]; row label such as "Adaptive hash index" */
+	{STRUCT_FLD(field_name,		"INTERNAL_HASH_TABLE_NAME"),
+	 STRUCT_FLD(field_length,	100),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define INT_HASH_TABLES_TOTAL		1	/* the fill function stores constant + variable here */
+	{STRUCT_FLD(field_name,		"TOTAL_MEMORY"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define INT_HASH_TABLES_CONSTANT		2	/* for most rows: n_cells * sizeof(hash_cell_t) */
+	{STRUCT_FLD(field_name,		"CONSTANT_MEMORY"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define INT_HASH_TABLES_VARIABLE		3	/* memory reported on top of the cell array; 0 for the page hash row */
+	{STRUCT_FLD(field_name,		"VARIABLE_MEMORY"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO	/* must stay last */
+};
+
+/*******************************************************************//**
+Fills INFORMATION_SCHEMA.XTRADB_INTERNAL_HASH_TABLES with one row per
+internal hash structure: adaptive hash index, page hash of buffer pool 0,
+dictionary cache (only if dict_sys exists), file system, lock system and
+recovery system (only if recv_sys exists).
+
+All byte counts are kept in ulint, the type field_store_ulint() takes.
+Accumulating in ulong or casting to ulong would truncate the values on
+platforms where unsigned long is narrower than ulint (presumably 64-bit
+Windows -- confirm against univ.i).
+@return	0 on success (also when the caller lacks the PROCESS privilege or
+InnoDB is not started), 1 if a field or a record could not be stored */
+static int xtradb_internal_hash_tables_fill_table(THD* thd, TABLE_LIST* tables, Item*)
+{
+	const char*	table_name;
+	Field**		fields;
+	TABLE*		table;
+	ulint		btr_search_sys_constant;
+	ulint		btr_search_sys_variable;
+
+	DBUG_ENTER("xtradb_internal_hash_tables_fill_table");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	table_name = tables->schema_table_name;
+	table = tables->table;
+	fields = table->field;
+
+	RETURN_IF_INNODB_NOT_STARTED(table_name);
+
+	/* Calculate AHI constant and variable memory allocations */
+
+	btr_search_sys_constant = 0;
+	btr_search_sys_variable = 0;
+
+	ut_ad(btr_search_sys->hash_tables);
+
+	for (ulint i = 0; i < btr_search_index_num; i++) {
+		hash_table_t* ht = btr_search_sys->hash_tables[i];
+
+		ut_ad(ht);
+		ut_ad(ht->heap);
+
+		/* Multiple mutexes/heaps are currently never used for adaptive
+		hash index tables. */
+		ut_ad(!ht->n_sync_obj);
+		ut_ad(!ht->heaps);
+
+		btr_search_sys_variable += mem_heap_get_size(ht->heap);
+		btr_search_sys_constant += ht->n_cells * sizeof(hash_cell_t);
+	}
+
+	OK(field_store_string(fields[INT_HASH_TABLES_NAME],
+			      "Adaptive hash index"));
+	OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			     btr_search_sys_variable + btr_search_sys_constant));
+	OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			     btr_search_sys_constant));
+	OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE],
+			     btr_search_sys_variable));
+	OK(schema_table_store_record(thd, table));
+
+	{
+	  /* Only the cell array of the first buffer pool instance is
+	  reported; this row has no variable part. */
+	  const ulint page_hash_constant
+	    = buf_pool_from_array(0)->page_hash->n_cells
+	      * sizeof(hash_cell_t);
+
+	  OK(field_store_string(fields[INT_HASH_TABLES_NAME],
+				"Page hash (buffer pool 0 only)"));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			       page_hash_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			       page_hash_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], 0));
+	  OK(schema_table_store_record(thd, table));
+	}
+
+	if (dict_sys)
+	{
+	  const ulint dict_sys_constant
+	    = (dict_sys->table_hash->n_cells
+	       + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t);
+	  const ulint dict_sys_variable = dict_sys->size;
+
+	  OK(field_store_string(fields[INT_HASH_TABLES_NAME],
+				"Dictionary Cache"));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			       dict_sys_constant + dict_sys_variable));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			       dict_sys_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE],
+			       dict_sys_variable));
+	  OK(schema_table_store_record(thd, table));
+	}
+
+	{
+	  /* Sample each counter once so that TOTAL_MEMORY is exactly the
+	  sum of the other two columns. */
+	  const ulint fil_sys_constant
+	    = fil_system_hash_cells() * sizeof(hash_cell_t);
+	  const ulint fil_sys_variable = fil_system_hash_nodes();
+
+	  OK(field_store_string(fields[INT_HASH_TABLES_NAME],
+				"File system"));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			       fil_sys_constant + fil_sys_variable));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			       fil_sys_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE],
+			       fil_sys_variable));
+	  OK(schema_table_store_record(thd, table));
+	}
+
+	{
+	  ulint lock_sys_constant, lock_sys_variable;
+
+	  trx_i_s_get_lock_sys_memory_usage(&lock_sys_constant,
+					    &lock_sys_variable);
+
+	  OK(field_store_string(fields[INT_HASH_TABLES_NAME], "Lock System"));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			       lock_sys_constant + lock_sys_variable));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			       lock_sys_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE],
+			       lock_sys_variable));
+	  OK(schema_table_store_record(thd, table));
+	}
+
+	if (recv_sys)
+	{
+	  /* Both parts are reported as 0 while recv_sys has no address
+	  hash. */
+	  const bool  has_addr_hash = (recv_sys->addr_hash != NULL);
+	  const ulint recv_sys_constant = has_addr_hash
+	    ? recv_sys->addr_hash->n_cells * sizeof(hash_cell_t) : 0;
+	  const ulint recv_sys_variable = has_addr_hash
+	    ? mem_heap_get_size(recv_sys->heap) : 0;
+
+	  OK(field_store_string(fields[INT_HASH_TABLES_NAME], "Recovery System"));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL],
+			       recv_sys_constant + recv_sys_variable));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT],
+			       recv_sys_constant));
+	  OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE],
+			       recv_sys_variable));
+	  OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Init callback of the XTRADB_INTERNAL_HASH_TABLES plugin: installs the
+column definitions and the fill function on the schema table object.
+@return	always 0 */
+static int xtradb_internal_hash_tables_init(void* p)
+{
+	DBUG_ENTER("xtradb_internal_hash_tables_init");
+
+	ST_SCHEMA_TABLE* const	i_s_schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_schema->fill_table = xtradb_internal_hash_tables_fill_table;
+	i_s_schema->fields_info = xtradb_internal_hash_tables_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_xtradb_internal_hash_tables =
+{	/* plugin descriptor for INFORMATION_SCHEMA.XTRADB_INTERNAL_HASH_TABLES; keep the member order, STRUCT_FLD may be positional */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_INTERNAL_HASH_TABLES"),
+	STRUCT_FLD(author, PLUGIN_AUTHOR),
+	STRUCT_FLD(descr, "InnoDB internal hash tables information"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, xtradb_internal_hash_tables_init),	/* installs fields_info and fill_table */
+	STRUCT_FLD(deinit, i_s_common_deinit),	/* no-op, returns 0 */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+/***********************************************************************
+Column definitions for INFORMATION_SCHEMA.XTRADB_RSEG. The fill function
+i_s_xtradb_rseg_fill() addresses these columns by the numeric indexes
+0..5, so the order of the rows below must not change. */
+static ST_FIELD_INFO	i_s_xtradb_rseg_fields_info[] =
+{
+	{STRUCT_FLD(field_name,		"rseg_id"),	/* field[0]: rseg->id */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"space_id"),	/* field[1]: rseg->space */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"zip_size"),	/* field[2]: rseg->zip_size */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"page_no"),	/* field[3]: rseg->page_no */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"max_size"),	/* field[4]: rseg->max_size */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"curr_size"),	/* field[5]: rseg->curr_size */
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO	/* must stay last */
+};
+
+/***********************************************************************
+Fills INFORMATION_SCHEMA.XTRADB_RSEG with one row per rollback segment
+found in trx_sys->rseg_array. Empty (NULL) slots are skipped instead of
+being dereferenced.
+@return 0 on success (also when the caller lacks the PROCESS privilege or
+InnoDB is not started), 1 if a record could not be stored */
+static
+int
+i_s_xtradb_rseg_fill(
+/*=================*/
+	THD*		thd,	/* in: thread */
+	TABLE_LIST*	tables,	/* in/out: tables to fill */
+	Item*		)	/* in: condition (ignored) */
+{
+	TABLE*	table	= (TABLE *) tables->table;
+	int	status	= 0;
+	trx_rseg_t*	rseg;
+
+	DBUG_ENTER("i_s_xtradb_rseg_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	for (ulint i = 0; i < TRX_SYS_N_RSEGS; i++)
+	{
+	  rseg = trx_sys->rseg_array[i];
+
+	  /* Not every slot of the array has to hold a rollback segment;
+	  an unused slot is NULL and produces no row. */
+	  if (rseg == NULL) {
+	    continue;
+	  }
+
+	  /* Field indexes follow the order of i_s_xtradb_rseg_fields_info. */
+	  table->field[0]->store(rseg->id);
+	  table->field[1]->store(rseg->space);
+	  table->field[2]->store(rseg->zip_size);
+	  table->field[3]->store(rseg->page_no);
+	  table->field[4]->store(rseg->max_size);
+	  table->field[5]->store(rseg->curr_size);
+
+	  if (schema_table_store_record(thd, table)) {
+	    status = 1;
+	    break;
+	  }
+	}
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Init callback of the XTRADB_RSEG plugin: installs the column definitions
+and the fill function on the schema table object. */
+static
+int
+i_s_xtradb_rseg_init(
+/*=================*/
+			/* out: 0 on success */
+	void*	p)	/* in/out: table schema object */
+{
+	DBUG_ENTER("i_s_xtradb_rseg_init");
+
+	ST_SCHEMA_TABLE* const	rseg_schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	rseg_schema->fill_table = i_s_xtradb_rseg_fill;
+	rseg_schema->fields_info = i_s_xtradb_rseg_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_xtradb_rseg =
+{	/* plugin descriptor for INFORMATION_SCHEMA.XTRADB_RSEG; keep the member order, STRUCT_FLD may be positional */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_RSEG"),
+	STRUCT_FLD(author, PLUGIN_AUTHOR),
+	STRUCT_FLD(descr, "InnoDB rollback segment information"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_xtradb_rseg_init),	/* installs fields_info and fill_table */
+	STRUCT_FLD(deinit, i_s_common_deinit),	/* no-op, returns 0 */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
diff --git a/storage/xtradb/handler/xtradb_i_s.h b/storage/xtradb/handler/xtradb_i_s.h
new file mode 100644
index 00000000000..2f7552c565a
--- /dev/null
+++ b/storage/xtradb/handler/xtradb_i_s.h
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 2010-2012, Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef XTRADB_I_S_H
+#define XTRADB_I_S_H
+
+extern struct st_mysql_plugin	i_s_xtradb_read_view;
+extern struct st_mysql_plugin	i_s_xtradb_internal_hash_tables;
+extern struct st_mysql_plugin	i_s_xtradb_rseg;
+
+#endif /* XTRADB_I_S_H */
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.cc
index 65489d13285..876906fc9ce 100644
--- a/storage/xtradb/ibuf/ibuf0ibuf.c
+++ b/storage/xtradb/ibuf/ibuf0ibuf.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file ibuf/ibuf0ibuf.c
+@file ibuf/ibuf0ibuf.cc
 Insert buffer
 
 Created 7/19/1997 Heikki Tuuri
@@ -25,6 +25,10 @@ Created 7/19/1997 Heikki Tuuri
 
 #include "ibuf0ibuf.h"
 
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+UNIV_INTERN my_bool	srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
 /** Number of bits describing a single page */
 #define IBUF_BITS_PER_PAGE	4
 #if IBUF_BITS_PER_PAGE % 2
@@ -57,6 +61,7 @@ Created 7/19/1997 Heikki Tuuri
 #include "log0recv.h"
 #include "que0que.h"
 #include "srv0start.h" /* srv_shutdown_state */
+#include "ha_prototypes.h"
 #include "rem0cmp.h"
 
 /*	STRUCTURE OF AN INSERT BUFFER RECORD
@@ -185,9 +190,6 @@ level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
 it uses synchronous aio, it can access any pages, as long as it obeys the
 access order rules. */
 
-/** Buffer pool size per the maximum insert buffer size */
-#define IBUF_POOL_SIZE_PER_MAX_SIZE	2
-
 /** Table name for the insert buffer. */
 #define IBUF_TABLE_NAME		"SYS_IBUF_TABLE"
 
@@ -202,9 +204,6 @@ UNIV_INTERN uint	ibuf_debug;
 /** The insert buffer control structure */
 UNIV_INTERN ibuf_t*	ibuf			= NULL;
 
-/** Counter for ibuf_should_try() */
-UNIV_INTERN ulint	ibuf_flush_count	= 0;
-
 #ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	ibuf_mutex_key;
@@ -292,16 +291,16 @@ type, counter, and some flags. */
 
 
 /** The mutex used to block pessimistic inserts to ibuf trees */
-static mutex_t	ibuf_pessimistic_insert_mutex;
+static ib_mutex_t	ibuf_pessimistic_insert_mutex;
 
 /** The mutex protecting the insert buffer structs */
-static mutex_t	ibuf_mutex;
+static ib_mutex_t	ibuf_mutex;
 
 /** The mutex protecting the insert buffer bitmaps */
-static mutex_t	ibuf_bitmap_mutex;
+static ib_mutex_t	ibuf_bitmap_mutex;
 
 /** The area in pages from which contract looks for page numbers for merge */
-#define	IBUF_MERGE_AREA			8
+#define	IBUF_MERGE_AREA			8UL
 
 /** Inside the merge area, pages which have at most 1 per this number less
 buffered entries compared to maximum volume that can buffered for a single
@@ -419,7 +418,7 @@ ibuf_tree_root_get(
 
 	ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
 	ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
-	ut_ad(ibuf->empty == (page_get_n_recs(root) == 0));
+	ut_ad(ibuf->empty == page_is_empty(root));
 
 	return(root);
 }
@@ -554,20 +553,17 @@ ibuf_init_at_db_start(void)
 	dict_index_t*	index;
 	ulint		n_used;
 	page_t*		header_page;
-	ulint		error;
-
-	ibuf = mem_alloc(sizeof(ibuf_t));
-
-	memset(ibuf, 0, sizeof(*ibuf));
+	dberr_t		error;
 
-	/* Note that also a pessimistic delete can sometimes make a B-tree
-	grow in size, as the references on the upper levels of the tree can
-	change */
+	ibuf = static_cast<ibuf_t*>(mem_zalloc(sizeof(ibuf_t)));
 
-	ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
-		/ IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
-
-	srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
+	/* At startup we initialize ibuf to have a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+	buffer pool size. Once ibuf struct is initialized this
+	value is updated with the user supplied size by calling
+	ibuf_max_size_update(). */
+	ibuf->max_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE)
+			  * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
 
 	mutex_create(ibuf_pessimistic_insert_mutex_key,
 		     &ibuf_pessimistic_insert_mutex,
@@ -609,20 +605,19 @@ ibuf_init_at_db_start(void)
 	ibuf_size_update(root, &mtr);
 	mutex_exit(&ibuf_mutex);
 
-	ibuf->empty = (page_get_n_recs(root) == 0);
+	ibuf->empty = page_is_empty(root);
 	ibuf_mtr_commit(&mtr);
 
 	heap = mem_heap_create(450);
 
 	/* Use old-style record format for the insert buffer. */
-	table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0);
-	table->n_mysql_handles_opened = 1; /* for pin */
+	table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
 
 	table->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	mem_heap_free(heap);
 
 	index = dict_mem_index_create(
@@ -640,6 +635,24 @@ ibuf_init_at_db_start(void)
 
 	ibuf->index = dict_table_get_first_index(table);
 }
+
+/*********************************************************************//**
+Recomputes ibuf->max_size, in pages, as the given percentage of the
+current buffer pool size and publishes it under ibuf_mutex. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val)	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+{
+	const ulint	pool_pages = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+	const ulint	max_pages = (pool_pages * new_val) / 100;
+
+	mutex_enter(&ibuf_mutex);
+	ibuf->max_size = max_pages;
+	mutex_exit(&ibuf_mutex);
+}
+
+
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Initializes an ibuf bitmap page. */
@@ -1289,17 +1302,9 @@ ibuf_rec_get_page_no_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len == 1) {
-		/* This is of the >= 4.1.x record format */
-		ut_a(trx_sys_multiple_tablespace_format);
-
-		field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
-	} else {
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 
-		field = rec_get_nth_field_old(rec, 0, &len);
-	}
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
 
 	ut_a(len == 4);
 
@@ -1335,20 +1340,13 @@ ibuf_rec_get_space_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len == 1) {
-		/* This is of the >= 4.1.x record format */
-
-		ut_a(trx_sys_multiple_tablespace_format);
-		field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
-		ut_a(len == 4);
+	ut_a(len == 1);
 
-		return(mach_read_from_4(field));
-	}
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
 
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
+	ut_a(len == 4);
 
-	return(0);
+	return(mach_read_from_4(field));
 }
 
 #ifdef UNIV_DEBUG
@@ -1405,7 +1403,7 @@ ibuf_rec_get_info_func(
 		break;
 
 	case IBUF_REC_INFO_SIZE:
-		op_local = (ibuf_op_t)types[IBUF_REC_OFFSET_TYPE];
+		op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
 		comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
 		counter_local = mach_read_from_2(
 			types + IBUF_REC_OFFSET_COUNTER);
@@ -1526,7 +1524,7 @@ ibuf_add_ops(
 
 	for (i = 0; i < IBUF_OP_COUNT; i++) {
 #ifdef HAVE_ATOMIC_BUILTINS
-		(void) os_atomic_increment_ulint(&arr[i], ops[i]);
+		os_atomic_increment_ulint(&arr[i], ops[i]);
 #else /* HAVE_ATOMIC_BUILTINS */
 		arr[i] += ops[i];
 #endif /* HAVE_ATOMIC_BUILTINS */
@@ -1574,7 +1572,7 @@ ibuf_dummy_index_create(
 
 	table = dict_mem_table_create("IBUF_DUMMY",
 				      DICT_HDR_SPACE, n,
-				      comp ? DICT_TF_COMPACT : 0);
+				      comp ? DICT_TF_COMPACT : 0, 0);
 
 	index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
 				      DICT_HDR_SPACE, 0, n);
@@ -1618,58 +1616,6 @@ ibuf_dummy_index_free(
 	dict_mem_table_free(table);
 }
 
-/*********************************************************************//**
-Builds the entry to insert into a non-clustered index when we have the
-corresponding record in an ibuf index.
-
-NOTE that as we copy pointers to fields in ibuf_rec, the caller must
-hold a latch to the ibuf_rec page as long as the entry is used!
-
-@return own: entry to insert to a non-clustered index */
-UNIV_INLINE
-dtuple_t*
-ibuf_build_entry_pre_4_1_x(
-/*=======================*/
-	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
-	mem_heap_t*	heap,		/*!< in: heap where built */
-	dict_index_t**	pindex)		/*!< out, own: dummy index that
-					describes the entry */
-{
-	ulint		i;
-	ulint		len;
-	const byte*	types;
-	dtuple_t*	tuple;
-	ulint		n_fields;
-
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
-
-	n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
-	tuple = dtuple_create(heap, n_fields);
-	types = rec_get_nth_field_old(ibuf_rec, 1, &len);
-
-	ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-
-	for (i = 0; i < n_fields; i++) {
-		const byte*	data;
-		dfield_t*	field;
-
-		field = dtuple_get_nth_field(tuple, i);
-
-		data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
-
-		dfield_set_data(field, data, len);
-
-		dtype_read_for_order_and_null_size(
-			dfield_get_type(field),
-			types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-	}
-
-	*pindex = ibuf_dummy_index_create(n_fields, FALSE);
-
-	return(tuple);
-}
-
 #ifdef UNIV_DEBUG
 # define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
 	ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
@@ -1723,15 +1669,7 @@ ibuf_build_entry_from_ibuf_rec_func(
 
 	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len > 1) {
-		/* This a < 4.1.x format record */
-
-		return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex));
-	}
-
-	/* This a >= 4.1.x format record */
-
-	ut_a(trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 	ut_a(*data == 0);
 	ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
 
@@ -1787,8 +1725,6 @@ ibuf_rec_get_size(
 	const rec_t*	rec,			/*!< in: ibuf record */
 	const byte*	types,			/*!< in: fields */
 	ulint		n_fields,		/*!< in: number of fields */
-	ibool		pre_4_1,		/*!< in: TRUE=pre-4.1 format,
-						FALSE=newer */
 	ulint		comp)			/*!< in: 0=ROW_FORMAT=REDUNDANT,
 						nonzero=ROW_FORMAT=COMPACT */
 {
@@ -1797,13 +1733,8 @@ ibuf_rec_get_size(
 	ulint	types_offset;
 	ulint	size = 0;
 
-	if (pre_4_1) {
-		field_offset = 2;
-		types_offset = DATA_ORDER_NULL_TYPE_BUF_SIZE;
-	} else {
-		field_offset = IBUF_REC_FIELD_USER;
-		types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
-	}
+	field_offset = IBUF_REC_FIELD_USER;
+	types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
 
 	for (i = 0; i < n_fields; i++) {
 		ulint		len;
@@ -1813,10 +1744,6 @@ ibuf_rec_get_size(
 
 		if (len != UNIV_SQL_NULL) {
 			size += len;
-		} else if (pre_4_1) {
-			dtype_read_for_order_and_null_size(&dtype, types);
-
-			size += dtype_get_sql_null_size(&dtype, comp);
 		} else {
 			dtype_new_read_for_order_and_null_size(&dtype, types);
 
@@ -1854,8 +1781,9 @@ ibuf_rec_get_volume_func(
 	const byte*	types;
 	ulint		n_fields;
 	ulint		data_size;
-	ibool		pre_4_1;
 	ulint		comp;
+	ibuf_op_t	op;
+	ulint		info_len;
 
 	ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX)
 	      || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX));
@@ -1863,64 +1791,44 @@ ibuf_rec_get_volume_func(
 	ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
 
 	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
-	pre_4_1 = (len > 1);
-
-	if (pre_4_1) {
-		/* < 4.1.x format record */
-
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
-
-		n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
-
-		types = rec_get_nth_field_old(ibuf_rec, 1, &len);
-
-		ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-		comp = 0;
-	} else {
-		/* >= 4.1.x format record */
-		ibuf_op_t	op;
-		ulint		info_len;
-
-		ut_a(trx_sys_multiple_tablespace_format);
-		ut_a(*data == 0);
-
-		types = rec_get_nth_field_old(
-			ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+	ut_a(len == 1);
+	ut_a(*data == 0);
 
-		ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
+	types = rec_get_nth_field_old(
+		ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
 
-		if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
-			/* Delete-marking a record doesn't take any
-			additional space, and while deleting a record
-			actually frees up space, we have to play it safe and
-			pretend it takes no additional space (the record
-			might not exist, etc.).  */
+	ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
 
-			return(0);
-		} else if (comp) {
-			dtuple_t*	entry;
-			ulint		volume;
-			dict_index_t*	dummy_index;
-			mem_heap_t*	heap = mem_heap_create(500);
+	if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
+		/* Delete-marking a record doesn't take any
+		additional space, and while deleting a record
+		actually frees up space, we have to play it safe and
+		pretend it takes no additional space (the record
+		might not exist, etc.).  */
 
-			entry = ibuf_build_entry_from_ibuf_rec(
-				mtr, ibuf_rec, heap, &dummy_index);
+		return(0);
+	} else if (comp) {
+		dtuple_t*	entry;
+		ulint		volume;
+		dict_index_t*	dummy_index;
+		mem_heap_t*	heap = mem_heap_create(500);
 
-			volume = rec_get_converted_size(dummy_index, entry, 0);
+		entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
+			heap, &dummy_index);
 
-			ibuf_dummy_index_free(dummy_index);
-			mem_heap_free(heap);
+		volume = rec_get_converted_size(dummy_index, entry, 0);
 
-			return(volume + page_dir_calc_reserved_space(1));
-		}
+		ibuf_dummy_index_free(dummy_index);
+		mem_heap_free(heap);
 
-		types += info_len;
-		n_fields = rec_get_n_fields_old(ibuf_rec)
-			- IBUF_REC_FIELD_USER;
+		return(volume + page_dir_calc_reserved_space(1));
 	}
 
-	data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, pre_4_1, comp);
+	types += info_len;
+	n_fields = rec_get_n_fields_old(ibuf_rec)
+		- IBUF_REC_FIELD_USER;
+
+	data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
 
 	return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
 	       + page_dir_calc_reserved_space(1));
@@ -1978,7 +1886,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, space);
 
@@ -1988,7 +1896,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
 
-	buf = mem_heap_alloc(heap, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
 
 	/* We set the marker byte zero */
 
@@ -2000,7 +1908,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -2015,8 +1923,10 @@ ibuf_entry_build(
 		i = IBUF_REC_INFO_SIZE;
 	}
 
-	ti = type_info = mem_heap_alloc(heap, i + n_fields
-					* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+	ti = type_info = static_cast<byte*>(
+		mem_heap_alloc(
+			heap,
+			i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
 
 	switch (i) {
 	default:
@@ -2092,7 +2002,7 @@ ibuf_entry_build(
 
 /*********************************************************************//**
 Builds a search tuple used to search buffered inserts for an index page.
-This is for < 4.1.x format records
+This is for >= 4.1.x format records.
 @return	own: search tuple */
 static
 dtuple_t*
@@ -2106,52 +2016,13 @@ ibuf_search_tuple_build(
 	dfield_t*	field;
 	byte*		buf;
 
-	ut_a(space == 0);
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
-
-	tuple = dtuple_create(heap, 1);
-
-	/* Store the page number in tuple */
-
-	field = dtuple_get_nth_field(tuple, 0);
-
-	buf = mem_heap_alloc(heap, 4);
-
-	mach_write_to_4(buf, page_no);
-
-	dfield_set_data(field, buf, 4);
-
-	dtuple_set_types_binary(tuple, 1);
-
-	return(tuple);
-}
-
-/*********************************************************************//**
-Builds a search tuple used to search buffered inserts for an index page.
-This is for >= 4.1.x format records.
-@return	own: search tuple */
-static
-dtuple_t*
-ibuf_new_search_tuple_build(
-/*========================*/
-	ulint		space,	/*!< in: space id */
-	ulint		page_no,/*!< in: index page number */
-	mem_heap_t*	heap)	/*!< in: heap into which to build */
-{
-	dtuple_t*	tuple;
-	dfield_t*	field;
-	byte*		buf;
-
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
 
 	/* Store the space id in tuple */
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, space);
 
@@ -2161,7 +2032,7 @@ ibuf_new_search_tuple_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
 
-	buf = mem_heap_alloc(heap, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
 
 	mach_write_to_1(buf, 0);
 
@@ -2171,7 +2042,7 @@ ibuf_new_search_tuple_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -2239,7 +2110,7 @@ ibuf_add_free_page(void)
 	/* Acquire the fsp latch before the ibuf header, obeying the latching
 	order */
 	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	header_page = ibuf_header_page_get(&mtr);
 
@@ -2321,7 +2192,7 @@ ibuf_remove_free_page(void)
 	/* Acquire the fsp latch before the ibuf header, obeying the latching
 	order */
 	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	header_page = ibuf_header_page_get(&mtr);
 
@@ -2595,7 +2466,8 @@ ibuf_get_merge_page_nos_func(
 			smallest possible secondary index leaf page
 			(and that only after DROP INDEX). */
 			ut_ad(rec_page_no
-			      > IBUF_TREE_ROOT_PAGE_NO - (rec_space_id != 0));
+			      > (ulint) IBUF_TREE_ROOT_PAGE_NO
+			      - (rec_space_id != 0));
 		}
 
 #ifdef UNIV_IBUF_DEBUG
@@ -2660,6 +2532,73 @@ ibuf_get_merge_page_nos_func(
 	return(sum_volumes);
 }
 
+/*******************************************************************//**
+Get the user record at or after the current cursor position.
+@return	user record, or NULL if the cursor cannot be advanced */
+static	__attribute__((nonnull, warn_unused_result))
+const rec_t*
+ibuf_get_user_rec(
+/*===============*/
+	btr_pcur_t*	pcur,		/*!< in: the current cursor */
+	mtr_t*		mtr)		/*!< in: mini transaction */
+{
+	do {
+		const rec_t* rec = btr_pcur_get_rec(pcur);
+
+		if (page_rec_is_user_rec(rec)) {
+			return(rec);
+		}
+	} while (btr_pcur_move_to_next(pcur, mtr));
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Reads page numbers for a space id from an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static	__attribute__((nonnull, warn_unused_result))
+ulint
+ibuf_get_merge_pages(
+/*=================*/
+	btr_pcur_t*	pcur,	/*!< in/out: cursor */
+	ulint		space,	/*!< in: space for which to merge */
+	ulint		limit,	/*!< in: max page numbers to read */
+	ulint*		pages,	/*!< out: pages read */
+	ulint*		spaces,	/*!< out: spaces read */
+	ib_int64_t*	versions,/*!< out: space versions read */
+	ulint*		n_pages,/*!< out: number of pages read */
+	mtr_t*		mtr)	/*!< in: mini transaction */
+{
+	const rec_t*	rec;
+	ulint		volume = 0;
+	ib_int64_t	version = fil_space_get_version(space);
+
+	ut_a(space != ULINT_UNDEFINED);
+
+	*n_pages = 0;
+
+	while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
+	       && ibuf_rec_get_space(mtr, rec) == space
+	       && *n_pages < limit) {
+
+		ulint	page_no = ibuf_rec_get_page_no(mtr, rec);
+
+		if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
+			spaces[*n_pages] = space;
+			pages[*n_pages] = page_no;
+			versions[*n_pages] = version;
+			++*n_pages;
+		}
+
+		volume += ibuf_rec_get_volume(mtr, rec);
+
+		btr_pcur_move_to_next(pcur, mtr);
+	}
+
+	return(volume);
+}
+
 /*********************************************************************//**
 Contracts insert buffer trees by reading pages to the buffer pool.
 @return a lower limit for the combined size in bytes of entries which
@@ -2667,32 +2606,22 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
 static
 ulint
-ibuf_contract_ext(
-/*==============*/
-	ulint*	n_pages,/*!< out: number of pages to which merged */
-	ibool	sync)	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
+ibuf_merge_pages(
+/*=============*/
+	ulint*	n_pages,	/*!< out: number of pages to which merged */
+	bool	sync)		/*!< in: true if the caller wants to wait for
+				the issued read with the highest tablespace
+				address to complete */
 {
+	mtr_t		mtr;
 	btr_pcur_t	pcur;
+	ulint		sum_sizes;
 	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
 	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
 	ib_int64_t	space_versions[IBUF_MAX_N_PAGES_MERGED];
-	ulint		sum_sizes;
-	mtr_t		mtr;
 
 	*n_pages = 0;
 
-	/* We perform a dirty read of ibuf->empty, without latching
-	the insert buffer root page. We trust this dirty read except
-	when a slow shutdown is being executed. During a slow
-	shutdown, the insert buffer merge must be completed. */
-
-	if (UNIV_UNLIKELY(ibuf->empty)
-	    && UNIV_LIKELY(!srv_shutdown_state)) {
-		return(0);
-	}
-
 	ibuf_mtr_start(&mtr);
 
 	/* Open a cursor to a randomly chosen leaf of the tree, at a random
@@ -2702,7 +2631,7 @@ ibuf_contract_ext(
 
 	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
 
-	if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
+	if (page_is_empty(btr_pcur_get_page(&pcur))) {
 		/* If a B-tree page is empty, it must be the root page
 		and the whole B-tree must be empty. InnoDB does not
 		allow empty B-tree pages other than the root. */
@@ -2729,18 +2658,160 @@ ibuf_contract_ext(
 	ibuf_mtr_commit(&mtr);
 	btr_pcur_close(&pcur);
 
-	buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos,
-				  *n_pages);
+	buf_read_ibuf_merge_pages(
+		sync, space_ids, space_versions, page_nos, *n_pages);
 
 	return(sum_sizes + 1);
 }
 
 /*********************************************************************//**
+Get the table instance from the table id.
+@return table instance */
+static __attribute__((warn_unused_result))
+dict_table_t*
+ibuf_get_table(
+/*===========*/
+	table_id_t	table_id)	/*!< in: valid table id */
+{
+	rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__);
+
+	dict_table_t*	table = dict_table_open_on_id(
+		table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+	rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+	return(table);
+}
+
+/*********************************************************************//**
 Contracts insert buffer trees by reading pages to the buffer pool.
 @return a lower limit for the combined size in bytes of entries which
 will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
-UNIV_INTERN
+static
+ulint
+ibuf_merge_space(
+/*=============*/
+	ulint		space,	/*!< in: tablespace id to merge */
+	ulint*		n_pages)/*!< out: number of pages to which merged */
+{
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	mem_heap_t*	heap = mem_heap_create(512);
+	dtuple_t*	tuple = ibuf_search_tuple_build(space, 0, heap);
+
+	ibuf_mtr_start(&mtr);
+
+	/* Position the cursor on the first matching record. */
+
+	btr_pcur_open(
+		ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
+		&mtr);
+
+	mem_heap_free(heap);
+
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
+	ulint		sum_sizes = 0;
+	ulint		pages[IBUF_MAX_N_PAGES_MERGED];
+	ulint		spaces[IBUF_MAX_N_PAGES_MERGED];
+	ib_int64_t	versions[IBUF_MAX_N_PAGES_MERGED];
+
+	if (page_is_empty(btr_pcur_get_page(&pcur))) {
+		/* If a B-tree page is empty, it must be the root page
+		and the whole B-tree must be empty. InnoDB does not
+		allow empty B-tree pages other than the root. */
+		ut_ad(ibuf->empty);
+		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
+		      == IBUF_SPACE_ID);
+		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
+		      == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+	} else {
+
+		sum_sizes = ibuf_get_merge_pages(
+			&pcur, space, IBUF_MAX_N_PAGES_MERGED,
+			&pages[0], &spaces[0], &versions[0], n_pages,
+			&mtr);
+
+		++sum_sizes;
+	}
+
+	ibuf_mtr_commit(&mtr);
+
+	btr_pcur_close(&pcur);
+
+	if (sum_sizes > 0) {
+
+		ut_a(*n_pages > 0 || sum_sizes == 1);
+
+#ifdef UNIV_DEBUG
+		ut_ad(*n_pages <= UT_ARR_SIZE(pages));
+
+		for (ulint i = 0; i < *n_pages; ++i) {
+			ut_ad(spaces[i] == space);
+			ut_ad(i == 0 || versions[i] == versions[i - 1]);
+		}
+#endif /* UNIV_DEBUG */
+
+		buf_read_ibuf_merge_pages(
+			true, spaces, versions, pages, *n_pages);
+	}
+
+	return(sum_sizes);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static __attribute__((nonnull, warn_unused_result))
+ulint
+ibuf_merge(
+/*=======*/
+	table_id_t	table_id,	/*!< in: if merge should be
+					done only for a specific
+					table, for all tables this
+					should be 0 */
+	ulint*		n_pages,	/*!< out: number of pages to
+					which merged */
+	bool		sync)		/*!< in: true if the caller
+					wants to wait for the issued
+					read with the highest
+					tablespace address to complete */
+{
+	dict_table_t*	table;
+
+	*n_pages = 0;
+
+	/* We perform a dirty read of ibuf->empty, without latching
+	the insert buffer root page. We trust this dirty read except
+	when a slow shutdown is being executed. During a slow
+	shutdown, the insert buffer merge must be completed. */
+
+	if (ibuf->empty && !srv_shutdown_state) {
+		return(0);
+	} else if (table_id == 0) {
+		return(ibuf_merge_pages(n_pages, sync));
+	} else if ((table = ibuf_get_table(table_id)) == 0) {
+		/* Table has been dropped. */
+		return(0);
+	}
+
+	ulint	volume = ibuf_merge_space(table->space, n_pages);
+
+	dict_table_close(table, FALSE, FALSE);
+
+	return(volume);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
 ulint
 ibuf_contract(
 /*==========*/
@@ -2750,7 +2821,7 @@ ibuf_contract(
 {
 	ulint	n_pages;
 
-	return(ibuf_contract_ext(&n_pages, sync));
+	return(ibuf_merge(0, &n_pages, sync));
 }
 
 /*********************************************************************//**
@@ -2760,22 +2831,53 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
 UNIV_INTERN
 ulint
-ibuf_contract_for_n_pages(
-/*======================*/
-	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
-	ulint	n_pages)/*!< in: try to read at least this many pages to
-			the buffer pool and merge the ibuf contents to
-			them */
+ibuf_contract_in_background(
+/*========================*/
+	table_id_t	table_id,	/*!< in: if merge should be done only
+					for a specific table, for all tables
+					this should be 0 */
+	ibool		full)		/*!< in: TRUE if the caller wants to
+					do a full contract based on PCT_IO(100).
+					If FALSE then the size of contract
+					batch is determined based on the
+					current size of the ibuf tree. */
 {
 	ulint	sum_bytes	= 0;
 	ulint	sum_pages	= 0;
-	ulint	n_bytes;
 	ulint	n_pag2;
+	ulint	n_pages;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+	if (srv_ibuf_disable_background_merge && table_id == 0) {
+		return(0);
+	}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+	if (full) {
+		/* Caller has requested a full batch */
+		n_pages = PCT_IO(100);
+	} else {
+		/* By default we do a batch of 5% of the io_capacity */
+		n_pages = PCT_IO(5);
+
+		mutex_enter(&ibuf_mutex);
+
+		/* If ibuf->size is more than half of max_size,
+		contract more aggressively.
+		The +1 below avoids division by zero. */
+		if (ibuf->size > ibuf->max_size / 2) {
+			ulint diff = ibuf->size - ibuf->max_size / 2;
+			n_pages += PCT_IO((diff * 100)
+					   / (ibuf->max_size + 1));
+		}
+
+		mutex_exit(&ibuf_mutex);
+	}
 
 	while (sum_pages < n_pages) {
-		n_bytes = ibuf_contract_ext(&n_pag2, sync);
+		ulint	n_bytes;
+
+		n_bytes = ibuf_merge(table_id, &n_pag2, FALSE);
 
 		if (n_bytes == 0) {
 			return(sum_bytes);
@@ -2783,6 +2885,8 @@ ibuf_contract_for_n_pages(
 
 		sum_bytes += n_bytes;
 		sum_pages += n_pag2;
+
+		srv_inc_activity_count();
 	}
 
 	return(sum_bytes);
@@ -2812,11 +2916,9 @@ ibuf_contract_after_insert(
 	size = ibuf->size;
 	max_size = ibuf->max_size;
 
-	if (!srv_ibuf_active_contract) {
 	if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
 		return;
 	}
-	}
 
 	sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
 
@@ -2852,8 +2954,7 @@ ibuf_get_volume_buffered_hash(
 
 	len = ibuf_rec_get_size(
 		rec, types,
-		rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER,
-		FALSE, comp);
+		rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER, comp);
 	fold = ut_fold_binary(data, len);
 
 	hash += (fold / (CHAR_BIT * sizeof *hash)) % size;
@@ -2913,7 +3014,6 @@ ibuf_get_volume_buffered_count_func(
 	operations.  All pre-4.1 records should have been merged
 	when the database was started up. */
 	ut_a(len == 1);
-	ut_ad(trx_sys_multiple_tablespace_format);
 
 	if (rec_get_deleted_flag(rec, 0)) {
 		/* This record has been merged already,
@@ -2935,7 +3035,7 @@ ibuf_get_volume_buffered_count_func(
 		because deletes cannot be buffered if there are
 		old-style inserts buffered for the page. */
 
-		len = ibuf_rec_get_size(rec, types, n_fields, FALSE, 0);
+		len = ibuf_rec_get_size(rec, types, n_fields, 0);
 
 		return(len
 		       + rec_get_converted_extra_size(len, n_fields, 0)
@@ -3040,8 +3140,6 @@ ibuf_get_volume_buffered(
 	/* bitmap of buffered recs */
 	ulint		hash_bitmap[128 / sizeof(ulint)];
 
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
 	      || (pcur->latch_mode == BTR_MODIFY_TREE));
 
@@ -3222,7 +3320,7 @@ ibuf_update_max_tablespace_id(void)
 	ibuf_mtr_start(&mtr);
 
 	btr_pcur_open_at_index_side(
-		FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+		false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
 
 	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
 
@@ -3285,18 +3383,11 @@ ibuf_get_entry_counter_low_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (UNIV_UNLIKELY(len != 1)) {
-		/* pre-4.1 format */
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
-
-		return(ULINT_UNDEFINED);
-	}
-
-	ut_a(trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 
 	/* Check the tablespace identifier. */
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
 	ut_a(len == 4);
 
 	if (mach_read_from_4(field) != space) {
@@ -3391,8 +3482,8 @@ ibuf_get_entry_counter_func(
 Buffer an operation in the insert/delete buffer, instead of doing it
 directly to the disk page, if this is possible.
 @return	DB_SUCCESS, DB_STRONG_FAIL or other error */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 ibuf_insert_low(
 /*============*/
 	ulint		mode,	/*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
@@ -3414,7 +3505,9 @@ ibuf_insert_low(
 	btr_pcur_t	pcur;
 	btr_cur_t*	cursor;
 	dtuple_t*	ibuf_entry;
+	mem_heap_t*	offsets_heap	= NULL;
 	mem_heap_t*	heap;
+	ulint*		offsets		= NULL;
 	ulint		buffered;
 	lint		min_n_recs;
 	rec_t*		ins_rec;
@@ -3422,7 +3515,7 @@ ibuf_insert_low(
 	page_t*		bitmap_page;
 	buf_block_t*	block;
 	page_t*		root;
-	ulint		err;
+	dberr_t		err;
 	ibool		do_merge;
 	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
 	ib_int64_t	space_versions[IBUF_MAX_N_PAGES_MERGED];
@@ -3437,19 +3530,19 @@ ibuf_insert_low(
 	ut_ad(!no_counter || op == IBUF_OP_INSERT);
 	ut_a(op < IBUF_OP_COUNT);
 
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	ut_ad(!(thr_get_trx(thr)->fake_changes));
 
 	do_merge = FALSE;
 
 	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
-	reduce ibuf_mutex contention. ibuf->max_size remains constant
-	after ibuf_init_at_db_start(), but ibuf->size should be
-	protected by ibuf_mutex. Given that ibuf->size fits in a
-	machine word, this should be OK; at worst we are doing some
-	excessive ibuf_contract() or occasionally skipping a
-	ibuf_contract(). */
+	reduce ibuf_mutex contention. Given that ibuf->max_size and
+	ibuf->size fit in a machine word, this should be OK; at worst
+	we are doing some excessive ibuf_contract() or occasionally
+	skipping an ibuf_contract(). */
+	if (ibuf->max_size == 0) {
+		return(DB_STRONG_FAIL);
+	}
+
 	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
 		/* Insert buffer is now too big, contract it but do not try
 		to insert */
@@ -3464,7 +3557,7 @@ ibuf_insert_low(
 		return(DB_STRONG_FAIL);
 	}
 
-	heap = mem_heap_create(512);
+	heap = mem_heap_create(1024);
 
 	/* Build the entry which contains the space id and the page number
 	as the first fields and the type information for other fields, and
@@ -3568,7 +3661,8 @@ fail_exit:
 	if (buf_page_peek(space, page_no)
 	    || lock_rec_expl_exist_on_page(space, page_no)) {
 
-		goto bitmap_fail;
+		ibuf_mtr_commit(&bitmap_mtr);
+		goto fail_exit;
 	}
 
 	if (op == IBUF_OP_INSERT) {
@@ -3604,7 +3698,6 @@ fail_exit:
 		dfield_t*	field;
 
 		if (counter == ULINT_UNDEFINED) {
-bitmap_fail:
 			ibuf_mtr_commit(&bitmap_mtr);
 			goto fail_exit;
 		}
@@ -3634,9 +3727,11 @@ bitmap_fail:
 	cursor = btr_pcur_get_btr_cur(&pcur);
 
 	if (mode == BTR_MODIFY_PREV) {
-		err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
-						ibuf_entry, &ins_rec,
-						&dummy_big_rec, 0, thr, &mtr);
+		err = btr_cur_optimistic_insert(
+			BTR_NO_LOCKING_FLAG,
+			cursor, &offsets, &offsets_heap,
+			ibuf_entry, &ins_rec,
+			&dummy_big_rec, 0, thr, &mtr);
 		block = btr_cur_get_block(cursor);
 		ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
 
@@ -3649,7 +3744,7 @@ bitmap_fail:
 			ut_ad(page_get_page_no(root)
 			      == FSP_IBUF_TREE_ROOT_PAGE_NO);
 
-			ibuf->empty = (page_get_n_recs(root) == 0);
+			ibuf->empty = page_is_empty(root);
 		}
 	} else {
 		ut_ad(mode == BTR_MODIFY_TREE);
@@ -3663,25 +3758,31 @@ bitmap_fail:
 
 		err = btr_cur_optimistic_insert(
 			BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
-			cursor, ibuf_entry, &ins_rec,
+			cursor, &offsets, &offsets_heap,
+			ibuf_entry, &ins_rec,
 			&dummy_big_rec, 0, thr, &mtr);
 
 		if (err == DB_FAIL) {
 			err = btr_cur_pessimistic_insert(
 				BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
-				cursor, ibuf_entry, &ins_rec,
+				cursor, &offsets, &offsets_heap,
+				ibuf_entry, &ins_rec,
 				&dummy_big_rec, 0, thr, &mtr);
 		}
 
 		mutex_exit(&ibuf_pessimistic_insert_mutex);
 		ibuf_size_update(root, &mtr);
 		mutex_exit(&ibuf_mutex);
-		ibuf->empty = (page_get_n_recs(root) == 0);
+		ibuf->empty = page_is_empty(root);
 
 		block = btr_cur_get_block(cursor);
 		ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
 	}
 
+	if (offsets_heap) {
+		mem_heap_free(offsets_heap);
+	}
+
 	if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
 		/* Update the page max trx id field */
 		page_update_max_trx_id(block, NULL,
@@ -3714,7 +3815,7 @@ func_exit:
 #ifdef UNIV_IBUF_DEBUG
 		ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
 #endif
-		buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions,
+		buf_read_ibuf_merge_pages(false, space_ids, space_versions,
 					  page_nos, n_stored);
 	}
 
@@ -3738,14 +3839,17 @@ ibuf_insert(
 	ulint		page_no,/*!< in: page number where to insert */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint		err;
+	dberr_t		err;
 	ulint		entry_size;
 	ibool		no_counter;
 	/* Read the settable global variable ibuf_use only once in
 	this function, so that we will have a consistent view of it. */
 	ibuf_use_t	use		= ibuf_use;
+	DBUG_ENTER("ibuf_insert");
+
+	DBUG_PRINT("ibuf", ("op: %d, space: %ld, page_no: %ld",
+			    op, space, page_no));
 
-	ut_a(trx_sys_multiple_tablespace_format);
 	ut_ad(dtuple_check_typed(entry));
 	ut_ad(ut_is_2pow(zip_size));
 
@@ -3759,7 +3863,7 @@ ibuf_insert(
 		case IBUF_USE_NONE:
 		case IBUF_USE_DELETE:
 		case IBUF_USE_DELETE_MARK:
-			return(FALSE);
+			DBUG_RETURN(FALSE);
 		case IBUF_USE_INSERT:
 		case IBUF_USE_INSERT_DELETE_MARK:
 		case IBUF_USE_ALL:
@@ -3772,7 +3876,7 @@ ibuf_insert(
 		switch (use) {
 		case IBUF_USE_NONE:
 		case IBUF_USE_INSERT:
-			return(FALSE);
+			DBUG_RETURN(FALSE);
 		case IBUF_USE_DELETE_MARK:
 		case IBUF_USE_DELETE:
 		case IBUF_USE_INSERT_DELETE_MARK:
@@ -3788,7 +3892,7 @@ ibuf_insert(
 		case IBUF_USE_NONE:
 		case IBUF_USE_INSERT:
 		case IBUF_USE_INSERT_DELETE_MARK:
-			return(FALSE);
+			DBUG_RETURN(FALSE);
 		case IBUF_USE_DELETE_MARK:
 		case IBUF_USE_DELETE:
 		case IBUF_USE_ALL:
@@ -3820,14 +3924,8 @@ check_watch:
 
 	{
 		buf_page_t*	bpage;
-		ulint		fold = buf_page_address_fold(space, page_no);
 		buf_pool_t*	buf_pool = buf_pool_get(space, page_no);
-
-		//buf_pool_mutex_enter(buf_pool);
-		rw_lock_s_lock(&buf_pool->page_hash_latch);
-		bpage = buf_page_hash_get_low(buf_pool, space, page_no, fold);
-		//buf_pool_mutex_exit(buf_pool);
-		rw_lock_s_unlock(&buf_pool->page_hash_latch);
+		bpage = buf_page_hash_get(buf_pool, space, page_no);
 
 		if (UNIV_LIKELY_NULL(bpage)) {
 			/* A buffer pool watch has been set or the
@@ -3836,7 +3934,7 @@ check_watch:
 			is being buffered, have this request executed
 			directly on the page in the buffer pool after the
 			buffered entries for this page have been merged. */
-			return(FALSE);
+			DBUG_RETURN(FALSE);
 		}
 	}
 
@@ -3847,7 +3945,7 @@ skip_watch:
 	    >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
 	    / 2) {
 
-		return(FALSE);
+		DBUG_RETURN(FALSE);
 	}
 
 	err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
@@ -3864,28 +3962,29 @@ skip_watch:
 		/* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n",
 		page_no, index->name); */
 #endif
-		return(TRUE);
+		DBUG_RETURN(TRUE);
 
 	} else {
-		ut_a(err == DB_STRONG_FAIL);
+		ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD);
 
-		return(FALSE);
+		DBUG_RETURN(FALSE);
 	}
 }
 
 /********************************************************************//**
 During merge, inserts to an index page a secondary index entry extracted
-from the insert buffer.
+from the insert buffer.
 @return	newly inserted record */
-static
+static __attribute__((nonnull))
 rec_t*
 ibuf_insert_to_index_page_low(
 /*==========================*/
-				/* out: newly inserted record */
 	const dtuple_t*	entry,	/*!< in: buffered entry to insert */
 	buf_block_t*	block,	/*!< in/out: index page where the buffered
 				entry should be placed */
 	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
 	mtr_t*		mtr,	/*!< in/out: mtr */
 	page_cur_t*	page_cur)/*!< in/out: cursor positioned on the record
 				after which to insert the buffered entry */
@@ -3899,19 +3998,26 @@ ibuf_insert_to_index_page_low(
 	rec_t*		rec;
 	DBUG_ENTER("ibuf_insert_to_index_page_low");
 
-	rec = page_cur_tuple_insert(page_cur, entry, index, 0, mtr);
+	rec = page_cur_tuple_insert(page_cur, entry, index,
+				    offsets, &heap, 0, mtr);
 	if (rec != NULL) {
 		DBUG_RETURN(rec);
 	}
 
+	/* Page reorganization or recompression should already have
+	been attempted by page_cur_tuple_insert(). Besides, per
+	ibuf_index_page_calc_free_zip() the page should not have been
+	recompressed or reorganized. */
+	ut_ad(!buf_block_get_page_zip(block));
+
 	/* If the record did not fit, reorganize */
 
-	btr_page_reorganize(block, index, mtr);
-	page_cur_search(block, index, entry, PAGE_CUR_LE, page_cur);
+	btr_page_reorganize(page_cur, index, mtr);
 
 	/* This time the record must fit */
 
-	rec = page_cur_tuple_insert(page_cur, entry, index, 0, mtr);
+	rec = page_cur_tuple_insert(page_cur, entry, index,
+				    offsets, &heap, 0, mtr);
 	if (rec != NULL) {
 		DBUG_RETURN(rec);
 	}
@@ -3967,8 +4073,16 @@ ibuf_insert_to_index_page(
 	ulint		low_match;
 	page_t*		page		= buf_block_get_frame(block);
 	rec_t*		rec;
+	ulint*		offsets;
+	mem_heap_t*	heap;
+
 	DBUG_ENTER("ibuf_insert_to_index_page");
 
+	DBUG_PRINT("ibuf", ("page_no: %ld", buf_block_get_page_no(block)));
+	DBUG_PRINT("ibuf", ("index name: %s", index->name));
+	DBUG_PRINT("ibuf", ("online status: %d",
+			    dict_index_get_online_status(index)));
+
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(dtuple_check_typed(entry));
 	ut_ad(!buf_block_align(page)->index);
@@ -4018,10 +4132,14 @@ dump:
 	low_match = page_cur_search(block, index, entry,
 				    PAGE_CUR_LE, &page_cur);
 
+	heap = mem_heap_create(
+		sizeof(upd_t)
+		+ REC_OFFS_HEADER_SIZE * sizeof(*offsets)
+		+ dtuple_get_n_fields(entry)
+		* (sizeof(upd_field_t) + sizeof *offsets));
+
 	if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) {
-		mem_heap_t*	heap;
 		upd_t*		update;
-		ulint*		offsets;
 		page_zip_des_t*	page_zip;
 
 		rec = page_cur_get_rec(&page_cur);
@@ -4030,12 +4148,10 @@ dump:
 		row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */
 		ut_ad(rec_get_deleted_flag(rec, page_is_comp(page)));
 
-		heap = mem_heap_create(1024);
-
 		offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED,
 					  &heap);
 		update = row_upd_build_sec_rec_difference_binary(
-			index, entry, rec, NULL, heap);
+			rec, index, offsets, entry, heap);
 
 		page_zip = buf_block_get_page_zip(block);
 
@@ -4045,9 +4161,7 @@ dump:
 			Bug #56680 was fixed. */
 			btr_cur_set_deleted_flag_for_ibuf(
 				rec, page_zip, FALSE, mtr);
-updated_in_place:
-			mem_heap_free(heap);
-			DBUG_VOID_RETURN;
+			goto updated_in_place;
 		}
 
 		/* Copy the info bits. Clear the delete-mark. */
@@ -4060,15 +4174,19 @@ updated_in_place:
 		if (!row_upd_changes_field_size_or_external(index, offsets,
 							    update)
 		    && (!page_zip || btr_cur_update_alloc_zip(
-				page_zip, block, index,
-				rec_offs_size(offsets), FALSE, mtr, NULL))) {
+				page_zip, &page_cur, index, offsets,
+				rec_offs_size(offsets), false, mtr, NULL))) {
 			/* This is the easy case. Do something similar
 			to btr_cur_update_in_place(). */
+			rec = page_cur_get_rec(&page_cur);
 			row_upd_rec_in_place(rec, index, offsets,
 					     update, page_zip);
 			goto updated_in_place;
 		}
 
+		/* btr_cur_update_alloc_zip() may have changed this */
+		rec = page_cur_get_rec(&page_cur);
+
 		/* A collation may identify values that differ in
 		storage length.
 		Some examples (1 or 2 bytes):
@@ -4091,20 +4209,21 @@ updated_in_place:
 		lock_rec_store_on_page_infimum(block, rec);
 		page_cur_delete_rec(&page_cur, index, offsets, mtr);
 		page_cur_move_to_prev(&page_cur);
-
-		rec = ibuf_insert_to_index_page_low(entry, block, index, mtr,
+		rec = ibuf_insert_to_index_page_low(entry, block, index,
+				      		    &offsets, heap, mtr,
 						    &page_cur);
-		ut_ad(!cmp_dtuple_rec(entry, rec,
-				      rec_get_offsets(rec, index, NULL,
-						      ULINT_UNDEFINED,
-						      &heap)));
-		mem_heap_free(heap);
 
+		ut_ad(!cmp_dtuple_rec(entry, rec, offsets));
 		lock_rec_restore_from_page_infimum(block, rec, block);
 	} else {
-		ibuf_insert_to_index_page_low(entry, block, index, mtr,
+		offsets = NULL;
+		ibuf_insert_to_index_page_low(entry, block, index,
+					      &offsets, heap, mtr,
 					      &page_cur);
 	}
+updated_in_place:
+	mem_heap_free(heap);
+
 	DBUG_VOID_RETURN;
 }
 
@@ -4139,7 +4258,7 @@ ibuf_set_del_mark(
 		/* Delete mark the old index record. According to a
 		comment in row_upd_sec_index_entry(), it can already
 		have been delete marked if a lock wait occurred in
-		row_ins_index_entry() in a previous invocation of
+		row_ins_sec_index_entry() in a previous invocation of
 		row_upd_sec_index_entry(). */
 
 		if (UNIV_LIKELY
@@ -4206,7 +4325,7 @@ ibuf_delete(
 		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 		ulint*		offsets	= offsets_;
 		mem_heap_t*	heap = NULL;
-		ulint		max_ins_size;
+		ulint		max_ins_size = 0;
 
 		rec_offs_init(offsets_);
 
@@ -4317,7 +4436,7 @@ ibuf_restore_pos(
 		ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
 
 		fputs("InnoDB: Validating insert buffer tree:\n", stderr);
-		if (!btr_validate_index(ibuf->index, NULL)) {
+		if (!btr_validate_index(ibuf->index, 0)) {
 			ut_error;
 		}
 
@@ -4349,7 +4468,7 @@ ibuf_delete_rec(
 {
 	ibool		success;
 	page_t*		root;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(ibuf_inside(mtr));
 	ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
@@ -4367,15 +4486,16 @@ ibuf_delete_rec(
 		btr_cur_set_deleted_flag_for_ibuf(
 			btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
 		ibuf_mtr_commit(mtr);
-		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+		log_write_up_to(LSN_MAX, LOG_WAIT_ALL_GROUPS, TRUE);
 		DBUG_SUICIDE();
 	}
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
-	success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr);
+	success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
+					    0, mtr);
 
 	if (success) {
-		if (UNIV_UNLIKELY(!page_get_n_recs(btr_pcur_get_page(pcur)))) {
+		if (page_is_empty(btr_pcur_get_page(pcur))) {
 			/* If a B-tree page is empty, it must be the root page
 			and the whole B-tree must be empty. InnoDB does not
 			allow empty B-tree pages other than the root. */
@@ -4388,7 +4508,7 @@ ibuf_delete_rec(
 			/* ibuf->empty is protected by the root page latch.
 			Before the deletion, it had to be FALSE. */
 			ut_ad(!ibuf->empty);
-			ibuf->empty = TRUE;
+			ibuf->empty = true;
 		}
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -4429,7 +4549,7 @@ ibuf_delete_rec(
 
 	root = ibuf_tree_root_get(mtr);
 
-	btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur),
+	btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
 				   RB_NONE, mtr);
 	ut_a(err == DB_SUCCESS);
 
@@ -4439,7 +4559,7 @@ ibuf_delete_rec(
 	ibuf_size_update(root, mtr);
 	mutex_exit(&ibuf_mutex);
 
-	ibuf->empty = (page_get_n_recs(root) == 0);
+	ibuf->empty = page_is_empty(root);
 	ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
 
 func_exit:
@@ -4567,13 +4687,7 @@ ibuf_merge_or_delete_for_page(
 
 	heap = mem_heap_create(512);
 
-	if (UNIV_UNLIKELY(!trx_sys_multiple_tablespace_format)) {
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		search_tuple = ibuf_search_tuple_build(space, page_no, heap);
-	} else {
-		search_tuple = ibuf_new_search_tuple_build(space, page_no,
-							   heap);
-	}
+	search_tuple = ibuf_search_tuple_build(space, page_no, heap);
 
 	if (block) {
 		/* Move the ownership of the x-latch on the page to this OS
@@ -4845,7 +4959,7 @@ reset_bit:
 	mem_heap_free(heap);
 
 #ifdef HAVE_ATOMIC_BUILTINS
-	(void) os_atomic_increment_ulint(&ibuf->n_merges, 1);
+	os_atomic_increment_ulint(&ibuf->n_merges, 1);
 	ibuf_add_ops(ibuf->n_merged_ops, mops);
 	ibuf_add_ops(ibuf->n_discarded_ops, dops);
 #else /* HAVE_ATOMIC_BUILTINS */
@@ -4895,7 +5009,7 @@ ibuf_delete_for_discarded_space(
 	/* Use page number 0 to build the search tuple so that we get the
 	cursor positioned at the first entry for this space id */
 
-	search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
+	search_tuple = ibuf_search_tuple_build(space, 0, heap);
 
 	memset(dops, 0, sizeof(dops));
 loop:
@@ -4964,13 +5078,13 @@ leave_loop:
 
 /******************************************************************//**
 Looks if the insert buffer is empty.
-@return	TRUE if empty */
+@return	true if empty */
 UNIV_INTERN
-ibool
+bool
 ibuf_is_empty(void)
 /*===============*/
 {
-	ibool		is_empty;
+	bool		is_empty;
 	const page_t*	root;
 	mtr_t		mtr;
 
@@ -4980,7 +5094,7 @@ ibuf_is_empty(void)
 	root = ibuf_tree_root_get(&mtr);
 	mutex_exit(&ibuf_mutex);
 
-	is_empty = (page_get_n_recs(root) == 0);
+	is_empty = page_is_empty(root);
 	ut_a(is_empty == ibuf->empty);
 	ibuf_mtr_commit(&mtr);
 
@@ -5033,4 +5147,109 @@ ibuf_print(
 
 	mutex_exit(&ibuf_mutex);
 }
+
+/******************************************************************//**
+Checks the insert buffer bitmap pages of a tablespace on IMPORT TABLESPACE.
+@return DB_SUCCESS, DB_TABLE_NOT_FOUND, DB_INTERRUPTED or DB_CORRUPTION */
+UNIV_INTERN
+dberr_t
+ibuf_check_bitmap_on_import(
+/*========================*/
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		space_id)	/*!< in: tablespace identifier */
+{
+	ulint	zip_size;
+	ulint	page_size;
+	ulint	size;
+	ulint	page_no;
+
+	ut_ad(space_id);
+	ut_ad(trx->mysql_thd);
+
+	zip_size = fil_space_get_zip_size(space_id);
+
+	if (zip_size == ULINT_UNDEFINED) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	size = fil_space_get_size(space_id);
+
+	if (size == 0) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	mutex_enter(&ibuf_mutex);	/* released on every return path */
+
+	page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;
+	/* Fetch one bitmap page per pass, advancing page_no by page_size. */
+	for (page_no = 0; page_no < size; page_no += page_size) {
+		mtr_t	mtr;
+		page_t*	bitmap_page;
+		ulint	i;
+
+		if (trx_is_interrupted(trx)) {
+			mutex_exit(&ibuf_mutex);
+			return(DB_INTERRUPTED);
+		}
+
+		mtr_start(&mtr);
+
+		mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+		ibuf_enter(&mtr);
+
+		bitmap_page = ibuf_bitmap_get_map_page(
+			space_id, page_no, zip_size, &mtr);
+		/* Offsets up to FSP_IBUF_BITMAP_OFFSET are not checked. */
+		for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < page_size; i++) {
+			const ulint	offset = page_no + i;
+
+			if (ibuf_bitmap_page_get_bits(
+				    bitmap_page, offset, zip_size,
+				    IBUF_BITMAP_IBUF, &mtr)) {
+
+				mutex_exit(&ibuf_mutex);
+				ibuf_exit(&mtr);
+				mtr_commit(&mtr);
+
+				ib_errf(trx->mysql_thd,
+					IB_LOG_LEVEL_ERROR,
+					 ER_INNODB_INDEX_CORRUPT,
+					 "Space %u page %u"
+					 " is wrongly flagged to belong to the"
+					 " insert buffer",
+					 (unsigned) space_id,
+					 (unsigned) offset);
+
+				return(DB_CORRUPTION);
+			}
+
+			if (ibuf_bitmap_page_get_bits(
+				    bitmap_page, offset, zip_size,
+				    IBUF_BITMAP_BUFFERED, &mtr)) {
+
+				ib_errf(trx->mysql_thd,
+					IB_LOG_LEVEL_WARN,
+					ER_INNODB_INDEX_CORRUPT,
+					"Buffered changes"
+					" for space %u page %u are lost",
+					(unsigned) space_id,
+					(unsigned) offset);
+
+				/* Tolerate this error, so that
+				slightly corrupted tables can be
+				imported and dumped.  Clear the bit. */
+				ibuf_bitmap_page_set_bits(
+					bitmap_page, offset, zip_size,
+					IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+			}
+		}
+
+		ibuf_exit(&mtr);
+		mtr_commit(&mtr);
+	}
+
+	mutex_exit(&ibuf_mutex);
+	return(DB_SUCCESS);
+}
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/api0api.h b/storage/xtradb/include/api0api.h
new file mode 100644
index 00000000000..1d6aaab60bc
--- /dev/null
+++ b/storage/xtradb/include/api0api.h
@@ -0,0 +1,1284 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/api0api.h
+InnoDB Native API
+
+2008-08-01 Created by Sunny Bains.
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#ifndef api0api_h
+#define api0api_h
+
+#include "db0err.h"
+#include <stdio.h>
+
+#ifdef _MSC_VER
+#define strncasecmp		_strnicmp
+#define strcasecmp		_stricmp
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+#define UNIV_NO_IGNORE		__attribute__ ((warn_unused_result))
+#else
+#define UNIV_NO_IGNORE
+#endif /* __GNUC__ && __GNUC__ > 2 && !__INTEL_COMPILER */
+
+/* See comment about ib_bool_t as to why the two macros are unsigned long. */
+/** The boolean value of "true" used internally within InnoDB */
+#define IB_TRUE			0x1UL
+/** The boolean value of "false" used internally within InnoDB */
+#define IB_FALSE		0x0UL
+
+/* Basic types used by the InnoDB API. */
+/** All InnoDB error codes are represented by ib_err_t */
+typedef enum dberr_t		ib_err_t;
+/** Representation of a byte within InnoDB */
+typedef unsigned char		ib_byte_t;
+/** Representation of an unsigned long int within InnoDB */
+typedef unsigned long int	ib_ulint_t;
+
+/* We assume C99 support except when using VisualStudio. */
+#if !defined(_MSC_VER)
+#include <stdint.h>
+#endif /* !_MSC_VER */
+
+/* Integer types used by the API. Microsoft VS defines its own types
+and we use the Microsoft types when building with Visual Studio. */
+#if defined(_MSC_VER)
+/** A signed 8 bit integral type. */
+typedef __int8			ib_i8_t;
+#else
+/** A signed 8 bit integral type. */
+typedef int8_t                  ib_i8_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 8 bit integral type. */
+typedef unsigned __int8		ib_u8_t;
+#else
+/** An unsigned 8 bit integral type. */
+typedef uint8_t                 ib_u8_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 16 bit integral type. */
+typedef __int16			ib_i16_t;
+#else
+/** A signed 16 bit integral type. */
+typedef int16_t                 ib_i16_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 16 bit integral type. */
+typedef unsigned __int16	ib_u16_t;
+#else
+/** An unsigned 16 bit integral type. */
+typedef uint16_t                ib_u16_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 32 bit integral type. */
+typedef __int32			ib_i32_t;
+#else
+/** A signed 32 bit integral type. */
+typedef int32_t                 ib_i32_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 32 bit integral type. */
+typedef unsigned __int32	ib_u32_t;
+#else
+/** An unsigned 32 bit integral type. */
+typedef uint32_t                ib_u32_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 64 bit integral type. */
+typedef __int64			ib_i64_t;
+#else
+/** A signed 64 bit integral type. */
+typedef int64_t                 ib_i64_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 64 bit integral type. */
+typedef unsigned __int64	ib_u64_t;
+#else
+/** An unsigned 64 bit integral type. */
+typedef uint64_t                ib_u64_t;
+#endif
+
+typedef void*			ib_opaque_t;
+typedef ib_opaque_t		ib_charset_t;
+typedef ib_ulint_t		ib_bool_t;
+typedef ib_u64_t		ib_id_u64_t;
+
+/** @enum ib_cfg_type_t Possible types for a configuration variable. */
+typedef enum {
+	IB_CFG_IBOOL,			/*!< The configuration parameter is
+					of type ibool */
+
+	/* XXX Can we avoid having different types for ulint and ulong?
+	- On Win64 "unsigned long" is 32 bits
+	- ulong is always defined as "unsigned long"
+	- On Win64 ulint is defined as 64 bit integer
+	=> On Win64 ulint != ulong.
+	If we typecast all ulong and ulint variables to the smaller type
+	ulong, then we will cut the range of the ulint variables.
+	This is not a problem for most ulint variables because their max
+	allowed values do not exceed 2^32-1 (e.g. log_groups is ulint
+	but its max allowed value is 10). BUT buffer_pool_size and
+	log_file_size allow up to 2^64-1. */
+
+	IB_CFG_ULINT,			/*!< The configuration parameter is
+					of type ulint */
+
+	IB_CFG_ULONG,			/*!< The configuration parameter is
+					of type ulong */
+
+	IB_CFG_TEXT,			/*!< The configuration parameter is
+					of type char* */
+
+	IB_CFG_CB			/*!< The configuration parameter is
+					a callback parameter */
+} ib_cfg_type_t;
+
+/** @enum ib_col_type_t  column types that are supported. */
+typedef enum {
+	IB_VARCHAR =	1,		/*!< Character varying length. The
+					column is not padded. */
+
+	IB_CHAR =	2,		/*!< Fixed length character string. The
+					column is padded to the right. */
+
+	IB_BINARY =	3,		/*!< Fixed length binary, similar to
+					IB_CHAR but the column is not padded
+					to the right. */
+
+	IB_VARBINARY =	4,		/*!< Variable length binary */
+
+	IB_BLOB	=	5,		/*!< Binary large object, or
+					a TEXT type */
+
+	IB_INT =	6,		/*!< Integer: can be any size
+					from 1 - 8 bytes. If the size is
+					1, 2, 4 and 8 bytes then you can use
+					the typed read and write functions. For
+					other sizes you will need to use the
+					ib_col_get_value() function and do the
+					conversion yourself. */
+
+	IB_SYS =	8,		/*!< System column, this column can
+					be one of DATA_TRX_ID, DATA_ROLL_PTR
+					or DATA_ROW_ID. */
+
+	IB_FLOAT =	9,		/*!< C (float)  floating point value. */
+
+	IB_DOUBLE =	10,		/*!< C (double) floating point value. */
+
+	IB_DECIMAL =	11,		/*!< Decimal stored as an ASCII
+					string */
+
+	IB_VARCHAR_ANYCHARSET =	12,	/*!< Any charset, varying length */
+
+	IB_CHAR_ANYCHARSET =	13	/*!< Any charset, fixed length */
+
+} ib_col_type_t;
+
+/** @enum ib_tbl_fmt_t InnoDB table format types */
+typedef enum {
+	IB_TBL_REDUNDANT,		/*!< Redundant row format, the column
+					type and length is stored in the row.*/
+
+	IB_TBL_COMPACT,			/*!< Compact row format, the column
+					type is not stored in the row. The
+					length is stored in the row but the
+					storage format uses a compact format
+					to store the length of the column data
+					and record data storage format also
+					uses less storage. */
+
+	IB_TBL_DYNAMIC,			/*!< Compact row format. BLOB prefixes
+					are not stored in the clustered index */
+
+	IB_TBL_COMPRESSED		/*!< Similar to dynamic format but
+					with pages compressed */
+} ib_tbl_fmt_t;
+
+/** @enum ib_col_attr_t InnoDB column attributes */
+typedef enum {
+	IB_COL_NONE = 0,		/*!< No special attributes. */
+
+	IB_COL_NOT_NULL = 1,		/*!< Column data can't be NULL. */
+
+	IB_COL_UNSIGNED = 2,		/*!< Column is IB_INT and unsigned. */
+
+	IB_COL_NOT_USED = 4,		/*!< Future use, reserved. */
+
+	IB_COL_CUSTOM1 = 8,		/*!< Custom precision type, this is
+					a bit that is ignored by InnoDB and so
+					can be set and queried by users. */
+
+	IB_COL_CUSTOM2 = 16,		/*!< Custom precision type, this is
+					a bit that is ignored by InnoDB and so
+					can be set and queried by users. */
+
+	IB_COL_CUSTOM3 = 32		/*!< Custom precision type, this is
+					a bit that is ignored by InnoDB and so
+					can be set and queried by users. */
+} ib_col_attr_t;
+
+/* Note: must match lock0types.h */
+/** @enum ib_lck_mode_t InnoDB lock modes. */
+typedef enum {
+	IB_LOCK_IS = 0,			/*!< Intention shared, an intention
+					lock should be used to lock tables */
+
+	IB_LOCK_IX,			/*!< Intention exclusive, an intention
+					lock should be used to lock tables */
+
+	IB_LOCK_S,			/*!< Shared locks should be used to
+					lock rows */
+
+	IB_LOCK_X,			/*!< Exclusive locks should be used to
+					lock rows*/
+
+	IB_LOCK_TABLE_X,		/*!< exclusive table lock */
+
+	IB_LOCK_NONE,			/*!< This is used internally to note
+					consistent read */
+
+	IB_LOCK_NUM = IB_LOCK_NONE	/*!< number of lock modes */
+} ib_lck_mode_t;
+
+typedef enum {
+	IB_CLUSTERED = 1,	/*!< clustered index */
+	IB_UNIQUE = 2		/*!< unique index */
+} ib_index_type_t;
+
+/** @enum ib_srch_mode_t InnoDB cursor search modes for ib_cursor_moveto().
+Note: Values must match those found in page0cur.h */
+typedef enum {
+	IB_CUR_G = 1,			/*!< If search key is not found then
+					position the cursor on the row that
+					is greater than the search key */
+
+	IB_CUR_GE = 2,			/*!< If the search key not found then
+					position the cursor on the row that
+					is greater than or equal to the search
+					key */
+
+	IB_CUR_L = 3,			/*!< If search key is not found then
+					position the cursor on the row that
+					is less than the search key */
+
+	IB_CUR_LE = 4			/*!< If search key is not found then
+					position the cursor on the row that
+					is less than or equal to the search
+					key */
+} ib_srch_mode_t;
+
+/** @enum ib_match_mode_t Various match modes used by ib_cursor_moveto() */
+typedef enum {
+	IB_CLOSEST_MATCH,		/*!< Closest match possible */
+
+	IB_EXACT_MATCH,			/*!< Search using a complete key
+					value */
+
+	IB_EXACT_PREFIX			/*!< Search using a key prefix which
+					must match to rows: the prefix may
+					contain an incomplete field (the
+					last field in prefix may be just
+					a prefix of a fixed length column) */
+} ib_match_mode_t;
+
+/** @struct ib_col_meta_t InnoDB column meta data. */
+typedef struct {
+	ib_col_type_t	type;		/*!< Type of the column */
+
+	ib_col_attr_t	attr;		/*!< Column attributes */
+
+	ib_u32_t	type_len;	/*!< Length of type */
+
+	ib_u16_t	client_type;	/*!< 16 bits of data relevant only to
+					the client. InnoDB doesn't care */
+
+	ib_charset_t*	charset;	/*!< Column charset */
+} ib_col_meta_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_state_t The transaction state can be queried using the
+ib_trx_state() function. The InnoDB deadlock monitor can roll back a
+transaction and users should be prepared for this, especially where there
+is high contention. The way to determine the state of the transaction is to
+query its state and check. */
+typedef enum {
+	IB_TRX_NOT_STARTED,		/*!< Has not started yet, the
+					transaction has not been started yet.*/
+
+	IB_TRX_ACTIVE,			/*!< The transaction is currently
+					active and needs to be either
+					committed or rolled back. */
+
+	IB_TRX_COMMITTED_IN_MEMORY,	/*!< Not committed to disk yet */
+
+	IB_TRX_PREPARED			/*!< Support for 2PC/XA */
+} ib_trx_state_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_level_t Transaction isolation levels */
+typedef enum {
+	IB_TRX_READ_UNCOMMITTED = 0,	/*!< Dirty read: non-locking SELECTs are
+					performed so that we do not look at a
+					possible earlier version of a record;
+					thus they are not 'consistent' reads
+					under this isolation level; otherwise
+					like level 2 */
+
+	IB_TRX_READ_COMMITTED = 1,	/*!< Somewhat Oracle-like isolation,
+					except that in range UPDATE and DELETE
+					we must block phantom rows with
+					next-key locks; SELECT ... FOR UPDATE
+					and ...  LOCK IN SHARE MODE only lock
+					the index records, NOT the gaps before
+					them, and thus allow free inserting;
+					each consistent read reads its own
+					snapshot */
+
+	IB_TRX_REPEATABLE_READ = 2,	/*!< All consistent reads in the same
+					trx read the same snapshot; full
+					next-key locking used in locking reads
+					to block insertions into gaps */
+
+	IB_TRX_SERIALIZABLE = 3		/*!< All plain SELECTs are converted to
+					LOCK IN SHARE MODE reads */
+} ib_trx_level_t;
+
+/** Generic InnoDB callback prototype. */
+typedef void (*ib_cb_t)(void);
+
+#define IB_CFG_BINLOG_ENABLED	0x1
+#define IB_CFG_MDL_ENABLED	0x2
+#define IB_CFG_DISABLE_ROWLOCK	0x4
+
+/** The first argument to the InnoDB message logging function. By default
+it's set to stderr. You should treat ib_msg_stream_t as a void*, since
+it will probably change in the future. */
+typedef FILE* ib_msg_stream_t;
+
+/** All log messages are written to this function. It should have the same
+behavior as fprintf(3). */
+typedef int (*ib_msg_log_t)(ib_msg_stream_t, const char*, ...);
+
+/* Note: This is to make it easy for API users to have type
+checking for arguments to our functions. Making it ib_opaque_t
+by itself will result in pointer decay resulting in subverting
+of the compiler's type checking. */
+
+/** InnoDB tuple handle. This handle can refer to either a cluster index
+tuple or a secondary index tuple. There are two types of tuples for each
+type of index, making a total of four types of tuple handles. There
+is a tuple for reading the entire row contents and another for searching
+on the index key. */
+typedef struct ib_tuple_t* ib_tpl_t;
+
+/** InnoDB transaction handle, all database operations need to be covered
+by transactions. This handle represents a transaction. The handle can be
+created with ib_trx_begin(), you commit your changes with ib_trx_commit()
+and undo your changes using ib_trx_rollback(). If the InnoDB deadlock
+monitor rolls back the transaction then you need to free the transaction
+using the function ib_trx_release(). You can query the state of an InnoDB
+transaction by calling ib_trx_state(). */
+typedef struct trx_t* ib_trx_t;
+
+/** InnoDB cursor handle */
+typedef struct ib_cursor_t* ib_crsr_t;
+
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use the client code to compare them.
+
+@param col_meta		column meta data
+@param p1		key
+@param p1_len		key length
+@param p2		second key
+@param p2_len		second key length
+@return 1, 0, -1, if p1 is greater, equal, less than p2, respectively */
+
+typedef int (*ib_client_cmp_t)(
+	const ib_col_meta_t*	col_meta,
+	const ib_byte_t*	p1,
+	ib_ulint_t		p1_len,
+	const ib_byte_t*	p2,
+	ib_ulint_t		p2_len);
+
+/* This should be the same as univ.i */
+/** Represents SQL_NULL length */
+#define	IB_SQL_NULL		0xFFFFFFFF
+/** The number of system columns in a row. */
+#define IB_N_SYS_COLS		3
+
+/** The maximum length of a text column. */
+#define MAX_TEXT_LEN		4096
+
+/* MySQL uses 3 byte UTF-8 encoding. */
+/** The maximum length of a column name in a table schema. */
+#define IB_MAX_COL_NAME_LEN	(64 * 3)
+
+/** The maximum length of a table name (plus database name). */
+#define IB_MAX_TABLE_NAME_LEN	((64 * 3) * 2)
+
+/*****************************************************************//**
+Start a transaction that's been rolled back. This special function
+exists for the case when InnoDB's deadlock detector has rolled back
+a transaction. While the transaction has been rolled back the handle
+is still valid and can be reused by calling this function. If you
+don't want to reuse the transaction handle then you can free the handle
+by calling ib_trx_release().
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_start(
+/*=========*/
+	ib_trx_t	ib_trx,		/*!< in: transaction to restart */
+	ib_trx_level_t	ib_trx_level,	/*!< in: trx isolation level */
+	void*		thd);		/*!< in: THD */
+
+/*****************************************************************//**
+Begin a transaction. This will allocate a new transaction handle and
+put the transaction in the active state.
+@return	innobase txn handle */
+
+ib_trx_t
+ib_trx_begin(
+/*=========*/
+	ib_trx_level_t	ib_trx_level);	/*!< in: trx isolation level */
+
+/*****************************************************************//**
+Query the transaction's state. This function can be used to check for
+the state of the transaction in case it has been rolled back by the
+InnoDB deadlock detector. Note that when a transaction is selected as
+a victim for rollback, InnoDB will always return an appropriate error
+code indicating this. @see DB_DEADLOCK, @see DB_LOCK_TABLE_FULL and
+@see DB_LOCK_WAIT_TIMEOUT
+@return	transaction state */
+
+ib_trx_state_t
+ib_trx_state(
+/*=========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Release the resources of the transaction. If the transaction was
+selected as a victim by InnoDB and rolled back then use this function
+to free the transaction handle.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_release(
+/*===========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Commit a transaction. This function will release the schema latches too.
+It will also free the transaction handle.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_commit(
+/*==========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Rollback a transaction. This function will release the schema latches too.
+It will also free the transaction handle.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_rollback(
+/*============*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table_using_id(
+/*==========================*/
+	ib_id_u64_t	table_id,	/*!< in: table id of table to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out,own: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB index and return a cursor handle to it.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_id(
+/*==========================*/
+	ib_id_u64_t	index_id,	/*!< in: index id of index to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB secondary index cursor and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_name(
+/*============================*/
+	ib_crsr_t	ib_open_crsr,	/*!< in: open/active cursor */
+	const char*	index_name,	/*!< in: secondary index name */
+	ib_crsr_t*	ib_crsr,	/*!< out,own: InnoDB index cursor */
+	int*		idx_type,	/*!< out: index is cluster index */
+	ib_id_u64_t*	idx_id);	/*!< out: index id */
+
+/*****************************************************************//**
+Open an InnoDB table by name and return a cursor handle to it.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table(
+/*=================*/
+	const char*	name,		/*!< in: table name */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out,own: InnoDB cursor */
+
+/*****************************************************************//**
+Reset the cursor.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_reset(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+
+/*****************************************************************//**
+Set the cursor's trx to NULL. */
+
+void
+ib_cursor_clear_trx(
+/*================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close an InnoDB table and free the cursor.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close the table, decrement n_ref_count count.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close_table(
+/*==================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Update the cursor with a new transaction and also reset the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_new_trx(
+/*==============*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/*****************************************************************//**
+Commit the transaction in a cursor
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_commit_trx(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/********************************************************************//**
+Open a table using the table name, if found then increment table ref count.
+@return table instance if found */
+
+void*
+ib_open_table_by_name(
+/*==================*/
+	const char*	name);		/*!< in: table name to lookup */
+
+/*****************************************************************//**
+Insert a row to a table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_insert_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor instance */
+	const ib_tpl_t	ib_tpl);	/*!< in: tuple to insert */
+
+/*****************************************************************//**
+Update a row in a table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_update_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	const ib_tpl_t	ib_old_tpl,	/*!< in: Old tuple in table */
+	const ib_tpl_t	ib_new_tpl);	/*!< in: New tuple to update */
+
+/*****************************************************************//**
+Delete a row in a table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+	ib_crsr_t	ib_crsr);	/*!< in: cursor instance */
+
+/*****************************************************************//**
+Read current row.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_read_row(
+/*===============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl);	/*!< out: read cols into this tuple */
+
+/*****************************************************************//**
+Move cursor to the first record in the table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_first(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the last record in the table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_last(
+/*===========*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the next record in the table.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_next(
+/*===========*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Search for key.
+@return	DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_moveto(
+/*=============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl,		/*!< in: Key to search for */
+	ib_srch_mode_t	ib_srch_mode);	/*!< in: search mode */
+
+/*****************************************************************//**
+Set the match mode for ib_cursor_moveto(). */
+
+void
+ib_cursor_set_match_mode(
+/*=====================*/
+	ib_crsr_t	ib_crsr,	/*!< in: Cursor instance */
+	ib_match_mode_t	match_mode);	/*!< in: ib_cursor_moveto match mode */
+
+/*****************************************************************//**
+Set a column of the tuple. Make a copy using the tuple's heap.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_col_set_value(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	col_no,		/*!< in: column index in tuple */
+	const void*	src,		/*!< in: data value */
+	ib_ulint_t	len,		/*!< in: data value len */
+	ib_bool_t	need_cpy);	/*!< in: if need memcpy */
+
+
+/*****************************************************************//**
+Get the size of the data available in a column of the tuple.
+@return	bytes avail or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_get_len(
+/*===========*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Copy a column value from the tuple.
+@return	bytes copied or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_copy_value(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	void*		dst,		/*!< out: copied data value */
+	ib_ulint_t	len);		/*!< in: max data value len to copy */
+
+/*************************************************************//**
+Read a signed int 8 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i8(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i8_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 8 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u8(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u8_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 16 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i16(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i16_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 16 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u16(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u16_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 32 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i32(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i32_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 32 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u32(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u32_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 64 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i64(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i64_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 64 bit column from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u64(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u64_t*	ival);		/*!< out: integer value */
+
+/*****************************************************************//**
+Get a column value pointer from the tuple.
+@return	NULL or pointer to buffer */
+
+const void*
+ib_col_get_value(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i);		/*!< in: column number */
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple.
+@return	len of column data */
+
+ib_ulint_t
+ib_col_get_meta(
+/*============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_col_meta_t*	ib_col_meta);	/*!< out: column meta data */
+
+/*****************************************************************//**
+"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple.
+@return	new tuple, or NULL */
+
+ib_tpl_t
+ib_tuple_clear(
+/*============*/
+	ib_tpl_t	ib_tpl);	/*!< in: InnoDB tuple */
+
+/*****************************************************************//**
+Create a new cluster key search tuple and copy the contents of the
+secondary index key tuple columns that refer to the cluster index record
+to the cluster key. It does a deep copy of the column data.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_tuple_get_cluster_key(
+/*=====================*/
+	ib_crsr_t	ib_crsr,	/*!< in: secondary index cursor */
+	ib_tpl_t*	ib_dst_tpl,	/*!< out,own: destination tuple */
+	const ib_tpl_t	ib_src_tpl);	/*!< in: source tuple */
+
+/*****************************************************************//**
+Copy the contents of the source tuple to the destination tuple. The tuples
+must be of the same type and belong to the same table/index.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_tuple_copy(
+/*==========*/
+	ib_tpl_t	ib_dst_tpl,	/*!< in: destination tuple */
+	const ib_tpl_t	ib_src_tpl);	/*!< in: source tuple */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for index/table search.
+@return tuple for current index */
+
+ib_tpl_t
+ib_sec_search_tuple_create(
+/*=======================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for index/table search.
+@return	tuple for current index */
+
+ib_tpl_t
+ib_sec_read_tuple_create(
+/*=====================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for table key operations.
+@return	tuple for current table */
+
+ib_tpl_t
+ib_clust_search_tuple_create(
+/*=========================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple for table row operations.
+@return	tuple for current table */
+
+ib_tpl_t
+ib_clust_read_tuple_create(
+/*=======================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Return the number of user columns in the tuple definition.
+@return	number of user columns */
+
+ib_ulint_t
+ib_tuple_get_n_user_cols(
+/*=====================*/
+	const ib_tpl_t	ib_tpl);	/*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Return the number of columns in the tuple definition.
+@return	number of columns */
+
+ib_ulint_t
+ib_tuple_get_n_cols(
+/*================*/
+	const ib_tpl_t	ib_tpl);	/*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Destroy an InnoDB tuple. */
+
+void
+ib_tuple_delete(
+/*============*/
+	ib_tpl_t	ib_tpl);	/*!< in,own: Tuple instance to delete */
+
+/*****************************************************************//**
+Truncate a table. The cursor handle will be closed and set to NULL
+on success.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+	ib_crsr_t*	ib_crsr,	/*!< in/out: cursor for table
+					to truncate */
+	ib_id_u64_t*	table_id);	/*!< out: new table id */
+
+/*****************************************************************//**
+Get a table id.
+@return	DB_SUCCESS if found */
+
+ib_err_t
+ib_table_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: table to find */
+	ib_id_u64_t*	table_id);	/*!< out: table id if found */
+
+/*****************************************************************//**
+Get an index id.
+@return	DB_SUCCESS if found */
+
+ib_err_t
+ib_index_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: find index for this table */
+	const char*	index_name,	/*!< in: index to find */
+	ib_id_u64_t*	index_id);	/*!< out: index id if found */
+
+/*****************************************************************//**
+Check if cursor is positioned.
+@return	IB_TRUE if positioned */
+
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+	const ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode by a
+user transaction.
+@return TRUE if exclusive latch */
+
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+	const ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/*****************************************************************//**
+Lock an InnoDB cursor/table.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Lock an InnoDB table using the table id.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_lock(
+/*===========*/
+	ib_trx_t	ib_trx,		/*!< in/out: transaction */
+	ib_id_u64_t	table_id,	/*!< in: table id */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set the lock mode of the cursor.
+@return	DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_set_lock_mode(
+/*====================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set need to access clustered index record flag. */
+
+void
+ib_cursor_set_cluster_access(
+/*=========================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i8_t		val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i16(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i16_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i32_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i64_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u8_t		val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u16_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u32(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u32_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u64_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Inform the cursor that it's the start of an SQL statement. */
+
+void
+ib_cursor_stmt_begin(
+/*=================*/
+	ib_crsr_t	ib_crsr);	/*!< in: cursor */
+
+/*****************************************************************//**
+Write a double value to a column.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	int		col_no,		/*!< in: column number */
+	double		val);		/*!< in: value to write */
+
+/*************************************************************//**
+Read a double column value from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	double*		dval);		/*!< out: double value */
+
+/*****************************************************************//**
+Write a float value to a column.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	float		val);		/*!< in: value to write */
+
+/*************************************************************//**
+Read a float value from an InnoDB tuple.
+@return	DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	float*		fval);		/*!< out: float value */
+
+/*****************************************************************//**
+Get a column name from the cursor.
+@return name of the column */
+
+const char*
+ib_col_get_name(
+/*============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Get an index field name from the cursor.
+@return name of the field */
+
+const char*
+ib_get_idx_field_name(
+/*==================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Truncate a table.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_truncate(
+/*==============*/
+	const char*	table_name,	/*!< in: table name */
+	ib_id_u64_t*	table_id);	/*!< out: new table id */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return DB_SUCCESS or error number */
+
+ib_err_t
+ib_close_thd(
+/*=========*/
+	void*		thd);		/*!< in: handle to the MySQL
+					thread of the user whose resources
+					should be free'd */
+
+/*****************************************************************//**
+Get generic configure status
+@return configure status*/
+
+int
+ib_cfg_get_cfg();
+/*============*/
+
+/*****************************************************************//**
+Check whether the table name conforms to our requirements. Currently
+we only do a simple check for the presence of a '/'.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_table_name_check(
+/*================*/
+	const char*	name);		/*!< in: table name to check */
+
+/*****************************************************************//**
+Return isolation configuration set by "innodb_api_trx_level"
+@return trx isolation level*/
+
+ib_trx_state_t
+ib_cfg_trx_level();
+/*==============*/
+
+/*****************************************************************//**
+Return configure value for background commit interval (in seconds)
+@return background commit interval (in seconds) */
+
+ib_ulint_t
+ib_cfg_bk_commit_interval();
+/*=======================*/
+
+/*****************************************************************//**
+Get a trx start time.
+@return trx start_time */
+
+ib_u64_t
+ib_trx_get_start_time(
+/*==================*/
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+#endif /* api0api_h */
diff --git a/storage/xtradb/include/api0misc.h b/storage/xtradb/include/api0misc.h
new file mode 100644
index 00000000000..fcd748390d1
--- /dev/null
+++ b/storage/xtradb/include/api0misc.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/api0misc.h
+InnoDB Native API
+
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+2008 Created by Sunny Bains
+*******************************************************/
+
+#ifndef api0misc_h
+#define	api0misc_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "que0que.h"
+#include "trx0trx.h"
+
+/** Whether binlog is enabled for applications using InnoDB APIs */
+extern my_bool                  ib_binlog_enabled;
+
+/** Whether MySQL MDL is enabled for applications using InnoDB APIs */
+extern my_bool                  ib_mdl_enabled;
+
+/** Whether InnoDB row lock is disabled for applications using InnoDB APIs */
+extern my_bool                  ib_disable_row_lock;
+
+/** configure value for transaction isolation level */
+extern ulong			ib_trx_level_setting;
+
+/** configure value for background commit interval (in seconds) */
+extern ulong			ib_bk_commit_interval;
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine.
+@return	TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+	dberr_t*	new_err,	/*!< out: possible new error
+					encountered in lock wait, or if
+					no new error, the value of
+					trx->error_state at the entry of this
+					function */
+	trx_t*		trx,		/*!< in: transaction */
+	que_thr_t*	thr,		/*!< in: query thread */
+	trx_savept_t*	savept);	/*!< in: savepoint or NULL */
+
+/*************************************************************************
+Sets a lock on a table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode);		/*!< in: lock mode */
+
+#endif /* api0misc_h */
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
index fb06a774b82..a3f7cee2733 100644
--- a/storage/xtradb/include/btr0btr.h
+++ b/storage/xtradb/include/btr0btr.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -95,6 +96,17 @@ insert/delete buffer when the record is not in the buffer pool. */
 buffer when the record is not in the buffer pool. */
 #define BTR_DELETE		8192
 
+/** In the case of BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, the caller is
+already holding an S latch on the index tree */
+#define BTR_ALREADY_S_LATCHED	16384
+
+#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode)	\
+	((latch_mode) & ~(BTR_INSERT			\
+			  | BTR_DELETE_MARK		\
+			  | BTR_DELETE			\
+			  | BTR_ESTIMATE		\
+			  | BTR_IGNORE_SEC_UNIQUE	\
+			  | BTR_ALREADY_S_LATCHED))
 #endif /* UNIV_HOTBACKUP */
 
 /**************************************************************//**
@@ -121,7 +133,7 @@ btr_corruption_report(
 #ifdef UNIV_BLOB_DEBUG
 # include "ut0rbt.h"
 /** An index->blobs entry for keeping track of off-page column references */
-struct btr_blob_dbg_struct
+struct btr_blob_dbg_t
 {
 	unsigned	blob_page_no:32;	/*!< first BLOB page number */
 	unsigned	ref_page_no:32;		/*!< referring page number */
@@ -210,8 +222,32 @@ UNIV_INTERN
 page_t*
 btr_root_get(
 /*=========*/
+	const dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*			mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index tree */
+	__attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return	tree height (level of the root) */
+UNIV_INTERN
+ulint
+btr_height_get(
+/*===========*/
 	dict_index_t*	index,	/*!< in: index tree */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Gets a buffer page and declares its latching order level. */
 UNIV_INLINE
@@ -264,17 +300,6 @@ btr_block_get_func(
 @return the uncompressed page frame */
 # define btr_page_get(space,zip_size,page_no,mode,idx,mtr)		\
 	buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr))
-/**************************************************************//**
-Sets the index id field of a page. */
-UNIV_INLINE
-void
-btr_page_set_index_id(
-/*==================*/
-	page_t*		page,	/*!< in: page to be created */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	index_id_t	id,	/*!< in: index id */
-	mtr_t*		mtr);	/*!< in: mtr */
 #endif /* !UNIV_HOTBACKUP */
 /**************************************************************//**
 Gets the index id field of a page.
@@ -283,7 +308,8 @@ UNIV_INLINE
 index_id_t
 btr_page_get_index_id(
 /*==================*/
-	const page_t*	page);	/*!< in: index page */
+	const page_t*	page)	/*!< in: index page */
+	__attribute__((nonnull, pure, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /********************************************************//**
 Gets the node level field in an index page.
@@ -292,16 +318,9 @@ UNIV_INLINE
 ulint
 btr_page_get_level_low(
 /*===================*/
-	const page_t*	page);	/*!< in: index page */
-/********************************************************//**
-Gets the node level field in an index page.
-@return	level, leaf level == 0 */
-UNIV_INLINE
-ulint
-btr_page_get_level(
-/*===============*/
-	const page_t*	page,	/*!< in: index page */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+	const page_t*	page)	/*!< in: index page */
+	__attribute__((nonnull, pure, warn_unused_result));
+#define btr_page_get_level(page, mtr) btr_page_get_level_low(page)
 /********************************************************//**
 Gets the next index page number.
 @return	next page number */
@@ -310,18 +329,8 @@ ulint
 btr_page_get_next(
 /*==============*/
 	const page_t*	page,	/*!< in: index page */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************//**
-Sets the next index page field. */
-UNIV_INLINE
-void
-btr_page_set_next(
-/*==============*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	ulint		next,	/*!< in: next page number */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************//**
 Gets the previous index page number.
 @return	prev page number */
@@ -330,18 +339,8 @@ ulint
 btr_page_get_prev(
 /*==============*/
 	const page_t*	page,	/*!< in: index page */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
-/********************************************************//**
-Sets the previous index page field. */
-UNIV_INLINE
-void
-btr_page_set_prev(
-/*==============*/
-	page_t*		page,	/*!< in: index page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
-				part will be updated, or NULL */
-	ulint		prev,	/*!< in: previous page number */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Gets pointer to the previous user record in the tree. It is assumed
 that the caller has appropriate latches on the page and its neighbor.
@@ -351,8 +350,9 @@ rec_t*
 btr_get_prev_user_rec(
 /*==================*/
 	rec_t*	rec,	/*!< in: record on leaf level */
-	mtr_t*	mtr);	/*!< in: mtr holding a latch on the page, and if
+	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
 			needed, also to the previous page */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Gets pointer to the next user record in the tree. It is assumed
 that the caller has appropriate latches on the page and its neighbor.
@@ -362,8 +362,9 @@ rec_t*
 btr_get_next_user_rec(
 /*==================*/
 	rec_t*	rec,	/*!< in: record on leaf level */
-	mtr_t*	mtr);	/*!< in: mtr holding a latch on the page, and if
+	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
 			needed, also to the next page */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Releases the latch on a leaf page and bufferunfixes it. */
 UNIV_INLINE
@@ -373,7 +374,8 @@ btr_leaf_page_release(
 	buf_block_t*	block,		/*!< in: buffer block */
 	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
 					BTR_MODIFY_LEAF */
-	mtr_t*		mtr);		/*!< in: mtr */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull));
 /**************************************************************//**
 Gets the child node file address in a node pointer.
 NOTE: the offsets array must contain all offsets for the record since
@@ -386,19 +388,8 @@ ulint
 btr_node_ptr_get_child_page_no(
 /*===========================*/
 	const rec_t*	rec,	/*!< in: node pointer record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
-/**************************************************************//**
-Creates a new index page (not the root, and also not
-used in page reorganization).  @see btr_page_empty(). */
-UNIV_INTERN
-void
-btr_page_create(
-/*============*/
-	buf_block_t*	block,	/*!< in/out: page to be created */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	dict_index_t*	index,	/*!< in: index */
-	ulint		level,	/*!< in: the B-tree level of the page */
-	mtr_t*		mtr);	/*!< in: mtr */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /************************************************************//**
 Creates the root node for a new index tree.
 @return	page number of the created root, FIL_NULL if did not succeed */
@@ -412,7 +403,8 @@ btr_create(
 				or 0 for uncompressed pages */
 	index_id_t	index_id,/*!< in: index id */
 	dict_index_t*	index,	/*!< in: index */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	__attribute__((nonnull));
 /************************************************************//**
 Frees a B-tree except the root page, which MUST be freed after this
 by calling btr_free_root. */
@@ -434,7 +426,8 @@ btr_free_root(
 	ulint	zip_size,	/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint	root_page_no,	/*!< in: root page number */
-	mtr_t*	mtr);		/*!< in/out: mini-transaction */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
 /*************************************************************//**
 Makes tree one level higher by splitting the root, and inserts
 the tuple. It is assumed that mtr contains an x-latch on the tree.
@@ -446,38 +439,63 @@ UNIV_INTERN
 rec_t*
 btr_root_raise_and_insert(
 /*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
 				on the root page; when the function returns,
 				the cursor is positioned on the predecessor
 				of the inserted record */
+	ulint**		offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Reorganizes an index page.
-IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
-page of a non-clustered index, the caller must update the insert
-buffer free bits in the same mini-transaction in such a way that the
-modification will be redo-logged.
-@return	TRUE on success, FALSE on failure */
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_low(
+/*====================*/
+	bool		recovery,/*!< in: true if called in recovery:
+				locks should not be updated, i.e.,
+				there cannot exist locks on the
+				page, and a hash index should not be
+				dropped: it cannot exist */
+	ulint		z_level,/*!< in: compression level to be used
+				if dealing with compressed page */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
 UNIV_INTERN
-ibool
+bool
 btr_page_reorganize(
 /*================*/
-	buf_block_t*	block,	/*!< in: page to be reorganized */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr);	/*!< in: mtr */
-/*************************************************************//**
-Empties an index page.  @see btr_page_create(). */
-UNIV_INTERN
-void
-btr_page_empty(
-/*===========*/
-	buf_block_t*	block,	/*!< in: page to be emptied */
-	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
-	dict_index_t*	index,	/*!< in: index of the page */
-	ulint		level,	/*!< in: the B-tree level of the page */
-	mtr_t*		mtr);	/*!< in: mtr */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
 /*************************************************************//**
 Decides if the page should be split at the convergence point of
 inserts converging to left.
@@ -487,9 +505,10 @@ ibool
 btr_page_get_split_rec_to_left(
 /*===========================*/
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
-	rec_t**		split_rec);/*!< out: if split recommended,
+	rec_t**		split_rec)/*!< out: if split recommended,
 				the first record on upper half page,
 				or NULL if tuple should be first */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Decides if the page should be split at the convergence point of
 inserts converging to right.
@@ -499,9 +518,10 @@ ibool
 btr_page_get_split_rec_to_right(
 /*============================*/
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
-	rec_t**		split_rec);/*!< out: if split recommended,
+	rec_t**		split_rec)/*!< out: if split recommended,
 				the first record on upper half page,
 				or NULL if tuple should be first */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Splits an index page to halves and inserts the tuple. It is assumed
 that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
@@ -515,12 +535,17 @@ UNIV_INTERN
 rec_t*
 btr_page_split_and_insert(
 /*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
 				function returns, the cursor is positioned
 				on the predecessor of the inserted record */
+	ulint**		offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
 	const dtuple_t*	tuple,	/*!< in: tuple to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************//**
 Inserts a data tuple to a tree on a non-leaf level. It is assumed
 that mtr holds an x-latch on the tree. */
@@ -528,29 +553,17 @@ UNIV_INTERN
 void
 btr_insert_on_non_leaf_level_func(
 /*==============================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	dict_index_t*	index,	/*!< in: index */
 	ulint		level,	/*!< in: level, must be > 0 */
 	dtuple_t*	tuple,	/*!< in: the record to be inserted */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
-	mtr_t*		mtr);	/*!< in: mtr */
-# define btr_insert_on_non_leaf_level(i,l,t,m)				\
-	btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+# define btr_insert_on_non_leaf_level(f,i,l,t,m)			\
+	btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
 #endif /* !UNIV_HOTBACKUP */
-/**************************************************************//**
-Attaches the halves of an index page on the appropriate level in an
-index tree. */
-UNIV_INTERN
-void
-btr_attach_half_pages(
-/*==================*/
-	dict_index_t*	index,		/*!< in: the index tree */
-	buf_block_t*	block,		/*!< in/out: page to be split */
-	rec_t*		split_rec,	/*!< in: first record on upper
-					half page */
-	buf_block_t*	new_block,	/*!< in/out: the new half page */
-	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
-	mtr_t*		mtr);		/*!< in: mtr */
 /****************************************************************//**
 Sets a record as the predefined minimum record. */
 UNIV_INTERN
@@ -558,7 +571,8 @@ void
 btr_set_min_rec_mark(
 /*=================*/
 	rec_t*	rec,	/*!< in/out: record */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 #ifndef UNIV_HOTBACKUP
 /*************************************************************//**
 Deletes on the upper level the node pointer to a page. */
@@ -568,7 +582,8 @@ btr_node_ptr_delete(
 /*================*/
 	dict_index_t*	index,	/*!< in: index tree */
 	buf_block_t*	block,	/*!< in: page whose node pointer is deleted */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 #ifdef UNIV_DEBUG
 /************************************************************//**
 Checks that the node pointer to a page is appropriate.
@@ -579,7 +594,8 @@ btr_check_node_ptr(
 /*===============*/
 	dict_index_t*	index,	/*!< in: index tree */
 	buf_block_t*	block,	/*!< in: index page */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* UNIV_DEBUG */
 /*************************************************************//**
 Tries to merge the page first to the left immediate brother if such a
@@ -613,7 +629,8 @@ btr_discard_page(
 /*=============*/
 	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
 				the root page */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /****************************************************************//**
 Parses the redo log record for setting an index record as the predefined
@@ -627,7 +644,8 @@ btr_parse_set_min_rec_mark(
 	byte*	end_ptr,/*!< in: buffer end */
 	ulint	comp,	/*!< in: nonzero=compact page format */
 	page_t*	page,	/*!< in: page or NULL */
-	mtr_t*	mtr);	/*!< in: mtr or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+	__attribute__((nonnull(1,2), warn_unused_result));
 /***********************************************************//**
 Parses a redo log record of reorganizing a page.
 @return	end of log record or NULL */
@@ -638,8 +656,10 @@ btr_parse_page_reorganize(
 	byte*		ptr,	/*!< in: buffer */
 	byte*		end_ptr,/*!< in: buffer end */
 	dict_index_t*	index,	/*!< in: record descriptor */
+	bool		compressed,/*!< in: true if compressed page */
 	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
-	mtr_t*		mtr);	/*!< in: mtr or NULL */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+	__attribute__((nonnull(1,2,3), warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /**************************************************************//**
 Gets the number of pages in a B-tree.
@@ -685,7 +705,8 @@ btr_page_free(
 /*==========*/
 	dict_index_t*	index,	/*!< in: index tree */
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 /**************************************************************//**
 Frees a file page used in an index tree. Can be used also to BLOB
 external storage pages, because the page level 0 can be given as an
@@ -697,7 +718,8 @@ btr_page_free_low(
 	dict_index_t*	index,	/*!< in: index tree */
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	ulint		level,	/*!< in: page level */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 #ifdef UNIV_BTR_PRINT
 /*************************************************************//**
 Prints size info of a B-tree. */
@@ -705,7 +727,8 @@ UNIV_INTERN
 void
 btr_print_size(
 /*===========*/
-	dict_index_t*	index);	/*!< in: index tree */
+	dict_index_t*	index)	/*!< in: index tree */
+	__attribute__((nonnull));
 /**************************************************************//**
 Prints directories and other info of all nodes in the index. */
 UNIV_INTERN
@@ -713,8 +736,9 @@ void
 btr_print_index(
 /*============*/
 	dict_index_t*	index,	/*!< in: index */
-	ulint		width);	/*!< in: print this many entries from start
+	ulint		width)	/*!< in: print this many entries from start
 				and end */
+	__attribute__((nonnull));
 #endif /* UNIV_BTR_PRINT */
 /************************************************************//**
 Checks the size and number of fields in a record based on the definition of
@@ -726,18 +750,20 @@ btr_index_rec_validate(
 /*===================*/
 	const rec_t*		rec,		/*!< in: index record */
 	const dict_index_t*	index,		/*!< in: index */
-	ibool			dump_on_error);	/*!< in: TRUE if the function
+	ibool			dump_on_error)	/*!< in: TRUE if the function
 						should print hex dump of record
 						and page on error */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Checks the consistency of an index tree.
 @return	TRUE if ok */
 UNIV_INTERN
-ibool
+bool
 btr_validate_index(
 /*===============*/
-	dict_index_t*	index,	/*!< in: index */
-	trx_t*		trx);	/*!< in: transaction or NULL */
+	dict_index_t*	index,			/*!< in: index */
+	const trx_t*	trx)			/*!< in: transaction or NULL */
+	__attribute__((nonnull(1), warn_unused_result));
 
 #define BTR_N_LEAF_PAGES	1
 #define BTR_TOTAL_SIZE		2
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
index 21eaa9bd026..9cc611ee450 100644
--- a/storage/xtradb/include/btr0btr.ic
+++ b/storage/xtradb/include/btr0btr.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -83,7 +83,7 @@ btr_page_set_index_id(
 	index_id_t	id,	/*!< in: index id */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_INDEX_ID),
@@ -128,22 +128,6 @@ btr_page_get_level_low(
 }
 
 /********************************************************//**
-Gets the node level field in an index page.
-@return	level, leaf level == 0 */
-UNIV_INLINE
-ulint
-btr_page_get_level(
-/*===============*/
-	const page_t*	page,	/*!< in: index page */
-	mtr_t*		mtr __attribute__((unused)))
-				/*!< in: mini-transaction handle */
-{
-	ut_ad(page && mtr);
-
-	return(btr_page_get_level_low(page));
-}
-
-/********************************************************//**
 Sets the node level field in an index page. */
 UNIV_INLINE
 void
@@ -158,7 +142,7 @@ btr_page_set_level(
 	ut_ad(page && mtr);
 	ut_ad(level <= BTR_MAX_NODE_LEVEL);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_LEVEL),
@@ -201,7 +185,7 @@ btr_page_set_next(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_4(page + FIL_PAGE_NEXT, next);
 		page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
 	} else {
@@ -238,7 +222,7 @@ btr_page_set_prev(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_4(page + FIL_PAGE_PREV, prev);
 		page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
 	} else {
@@ -274,12 +258,13 @@ btr_node_ptr_get_child_page_no(
 
 	page_no = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(page_no == 0)) {
+	if (page_no == 0) {
 		fprintf(stderr,
 			"InnoDB: a nonsensical page number 0"
 			" in a node ptr record at offset %lu\n",
 			(ulong) page_offset(rec));
 		buf_page_print(page_align(rec), 0, 0);
+		ut_ad(0);
 	}
 
 	return(page_no);
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
index 97929d44159..cf7c1a24139 100644
--- a/storage/xtradb/include/btr0cur.h
+++ b/storage/xtradb/include/btr0cur.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -31,14 +31,26 @@ Created 10/16/1994 Heikki Tuuri
 #include "page0cur.h"
 #include "btr0types.h"
 
-/* Mode flags for btr_cur operations; these can be ORed */
-#define BTR_NO_UNDO_LOG_FLAG	1	/* do no undo logging */
-#define BTR_NO_LOCKING_FLAG	2	/* do no record lock checking */
-#define BTR_KEEP_SYS_FLAG	4	/* sys fields will be found from the
-					update vector or inserted entry */
-#define BTR_KEEP_POS_FLAG	8	/* btr_cur_pessimistic_update()
-					must keep cursor position when
-					moving columns to big_rec */
+/** Mode flags for btr_cur operations; these can be ORed */
+enum {
+	/** do no undo logging */
+	BTR_NO_UNDO_LOG_FLAG = 1,
+	/** do no record lock checking */
+	BTR_NO_LOCKING_FLAG = 2,
+	/** sys fields will be found in the update vector or inserted
+	entry */
+	BTR_KEEP_SYS_FLAG = 4,
+	/** btr_cur_pessimistic_update() must keep cursor position
+	when moving columns to big_rec */
+	BTR_KEEP_POS_FLAG = 8,
+	/** the caller is creating the index or wants to bypass the
+	index->info.online creation log */
+	BTR_CREATE_FLAG = 16,
+	/** the caller of btr_cur_optimistic_update() or
+	btr_cur_update_in_place() will take care of
+	updating IBUF_BITMAP_FREE */
+	BTR_KEEP_IBUF_BITMAP = 32
+};
 
 #ifndef UNIV_HOTBACKUP
 #include "que0types.h"
@@ -57,9 +69,6 @@ page_cur_t*
 btr_cur_get_page_cur(
 /*=================*/
 	const btr_cur_t*	cursor);/*!< in: tree cursor */
-#else /* UNIV_DEBUG */
-# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
-#endif /* UNIV_DEBUG */
 /*********************************************************//**
 Returns the buffer block on which the tree cursor is positioned.
 @return	pointer to buffer block */
@@ -67,7 +76,7 @@ UNIV_INLINE
 buf_block_t*
 btr_cur_get_block(
 /*==============*/
-	btr_cur_t*	cursor);/*!< in: tree cursor */
+	const btr_cur_t*	cursor);/*!< in: tree cursor */
 /*********************************************************//**
 Returns the record pointer of a tree cursor.
 @return	pointer to record */
@@ -75,7 +84,12 @@ UNIV_INLINE
 rec_t*
 btr_cur_get_rec(
 /*============*/
-	btr_cur_t*	cursor);/*!< in: tree cursor */
+	const btr_cur_t*	cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor)	(&(cursor)->page_cur)
+# define btr_cur_get_block(cursor)	((cursor)->page_cur.block)
+# define btr_cur_get_rec(cursor)	((cursor)->page_cur.rec)
+#endif /* UNIV_DEBUG */
 /*********************************************************//**
 Returns the compressed page on which the tree cursor is positioned.
 @return	pointer to compressed page, or NULL if the page is not compressed */
@@ -101,12 +115,9 @@ btr_cur_get_page(
 	btr_cur_t*	cursor);/*!< in: tree cursor */
 /*********************************************************//**
 Returns the index of a cursor.
+@param cursor	b-tree cursor
 @return	index */
-UNIV_INLINE
-dict_index_t*
-btr_cur_get_index(
-/*==============*/
-	btr_cur_t*	cursor);/*!< in: B-tree cursor */
+#define btr_cur_get_index(cursor) ((cursor)->index)
 /*********************************************************//**
 Positions a tree cursor at a given record. */
 UNIV_INLINE
@@ -165,16 +176,19 @@ UNIV_INTERN
 void
 btr_cur_open_at_index_side_func(
 /*============================*/
-	ibool		from_left,	/*!< in: TRUE if open to the low end,
-					FALSE if to the high end */
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
 	dict_index_t*	index,		/*!< in: index */
 	ulint		latch_mode,	/*!< in: latch mode */
-	btr_cur_t*	cursor,		/*!< in: cursor */
+	btr_cur_t*	cursor,		/*!< in/out: cursor */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf) */
 	const char*	file,		/*!< in: file name */
 	ulint		line,		/*!< in: line where called */
-	mtr_t*		mtr);		/*!< in: mtr */
-#define btr_cur_open_at_index_side(f,i,l,c,m)				\
-	btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m)
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+#define btr_cur_open_at_index_side(f,i,l,c,lv,m)			\
+	btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
 /**********************************************************************//**
 Positions a cursor at a randomly chosen position within a B-tree. */
 UNIV_INTERN
@@ -197,7 +211,7 @@ one record on the page, the insert will always succeed; this is to
 prevent trying to split a page with just one record.
 @return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_optimistic_insert(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags: if not
@@ -205,6 +219,8 @@ btr_cur_optimistic_insert(
 				specified */
 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
 				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	dtuple_t*	entry,	/*!< in/out: entry to insert */
 	rec_t**		rec,	/*!< out: pointer to inserted record if
 				succeed */
@@ -213,11 +229,13 @@ btr_cur_optimistic_insert(
 				NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	que_thr_t*	thr,	/*!< in: query thread or NULL */
-	mtr_t*		mtr);	/*!< in: mtr; if this function returns
-				DB_SUCCESS on a leaf page of a secondary
-				index in a compressed tablespace, the
-				mtr must be committed before latching
+	mtr_t*		mtr)	/*!< in/out: mini-transaction;
+				if this function returns DB_SUCCESS on
+				a leaf page of a secondary index in a
+				compressed tablespace, the caller must
+				mtr_commit(mtr) before latching
 				any further pages */
+	__attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
 /*************************************************************//**
 Performs an insert on a page of an index tree. It is assumed that mtr
 holds an x-latch on the tree and on the cursor page. If the insert is
@@ -225,7 +243,7 @@ made on the leaf level, to avoid deadlocks, mtr must also own x-latches
 to brothers of page, if those brothers exist.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_pessimistic_insert(
 /*=======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags: if not
@@ -236,6 +254,9 @@ btr_cur_pessimistic_insert(
 				insertion will certainly succeed */
 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
 				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
 	dtuple_t*	entry,	/*!< in/out: entry to insert */
 	rec_t**		rec,	/*!< out: pointer to inserted record if
 				succeed */
@@ -244,64 +265,105 @@ btr_cur_pessimistic_insert(
 				NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	que_thr_t*	thr,	/*!< in: query thread or NULL */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
 /*************************************************************//**
 See if there is enough place in the page modification log to log
 an update-in-place.
-@return	TRUE if enough place */
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval	true if enough place.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
 UNIV_INTERN
-ibool
-btr_cur_update_alloc_zip(
-/*=====================*/
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	buf_block_t*	block,	/*!< in/out: buffer page */
-	dict_index_t*	index,	/*!< in: the index corresponding to the block */
+	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
+	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+	ulint*		offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
 	ulint		length,	/*!< in: size needed */
-	ibool		create,	/*!< in: TRUE=delete-and-insert,
-				FALSE=update-in-place */
-	mtr_t*		mtr,	/*!< in: mini-transaction */
+	bool		create,	/*!< in: true=delete-and-insert,
+				false=update-in-place */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	trx_t*		trx)	/*!< in: NULL or transaction */
-    __attribute__((nonnull (1, 2, 3, 6), warn_unused_result));
+#ifdef UNIV_DEBUG
+	__attribute__((nonnull (1, 2, 3, 4, 7), warn_unused_result));
+#else
+	__attribute__((nonnull (1, 2, 3, 6), warn_unused_result));
+#endif
+
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr,trx) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr,trx)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr,trx) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr,trx)
+#endif /* UNIV_DEBUG */
 /*************************************************************//**
 Updates a record when the update causes no size changes in its fields.
-@return	DB_SUCCESS or error number */
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_update_in_place(
 /*====================*/
 	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
 				cursor stays valid and positioned on the
 				same record */
+	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
 	const upd_t*	update,	/*!< in: update vector */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr);	/*!< in: mtr; must be committed before
-				latching any further pages */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	__attribute__((warn_unused_result, nonnull));
 /*************************************************************//**
 Tries to update a record on a page in an index tree. It is assumed that mtr
 holds an x-latch on the page. The operation does not succeed if there is too
 little space on the page or if the update would result in too empty a page,
 so that tree compression is recommended.
-@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
-DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
-there is not enough space left on the compressed page */
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_optimistic_update(
 /*======================*/
 	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
 				cursor stays valid and positioned on the
 				same record */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
 	const upd_t*	update,	/*!< in: update vector; this must also
 				contain trx id and roll ptr fields */
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr);	/*!< in: mtr; must be committed before
-				latching any further pages */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	__attribute__((warn_unused_result, nonnull));
 /*************************************************************//**
 Performs an update of a record on a page of a tree. It is assumed
 that mtr holds an x-latch on the tree and on the cursor page. If the
@@ -309,7 +371,7 @@ update is made on the leaf level, to avoid deadlocks, mtr must also
 own x-latches to brothers of page, if those brothers exist.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_pessimistic_update(
 /*=======================*/
 	ulint		flags,	/*!< in: undo logging, locking, and rollback
@@ -317,7 +379,13 @@ btr_cur_pessimistic_update(
 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
 				cursor may become invalid if *big_rec == NULL
 				|| !(flags & BTR_KEEP_POS_FLAG) */
-	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	mem_heap_t*	entry_heap,
+				/*!< in/out: memory heap for allocating
+				big_rec and the index tuple */
 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
 				be stored externally by the caller, or NULL */
 	const upd_t*	update,	/*!< in: update vector; this is allowed also
@@ -326,8 +394,10 @@ btr_cur_pessimistic_update(
 	ulint		cmpl_info,/*!< in: compiler info on secondary index
 				updates */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr);	/*!< in: mtr; must be committed before
-				latching any further pages */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be committed
+				before latching any further pages */
+	__attribute__((warn_unused_result, nonnull));
 /***********************************************************//**
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -335,30 +405,29 @@ of the deleting transaction, and in the roll ptr field pointer to the
 undo log record created.
 @return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_del_mark_set_clust_rec(
 /*===========================*/
-	ulint		flags,	/*!< in: undo logging and locking flags */
 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
 	rec_t*		rec,	/*!< in/out: record */
 	dict_index_t*	index,	/*!< in: clustered index of the record */
 	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
-	ibool		val,	/*!< in: value to set */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr)	/*!< in: mtr */
-	__attribute__((nonnull));
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************//**
 Sets a secondary index record delete mark to TRUE or FALSE.
 @return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
 UNIV_INTERN
-ulint
+dberr_t
 btr_cur_del_mark_set_sec_rec(
 /*=========================*/
 	ulint		flags,	/*!< in: locking flag */
 	btr_cur_t*	cursor,	/*!< in: cursor */
 	ibool		val,	/*!< in: value to set */
 	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
 /*************************************************************//**
 Tries to compress a page of the tree if it seems useful. It is assumed
 that mtr holds an x-latch on the tree and on the cursor page. To avoid
@@ -384,16 +453,27 @@ but no latch on the whole tree.
 @return	TRUE if success, i.e., the page did not become too empty */
 UNIV_INTERN
 ibool
-btr_cur_optimistic_delete(
-/*======================*/
+btr_cur_optimistic_delete_func(
+/*===========================*/
 	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
 				cursor stays valid: if deletion succeeds,
 				on function exit it points to the successor
 				of the deleted record */
-	mtr_t*		mtr);	/*!< in: mtr; if this function returns
+# ifdef UNIV_DEBUG
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+# endif /* UNIV_DEBUG */
+	mtr_t*		mtr)	/*!< in: mtr; if this function returns
 				TRUE on a leaf page of a secondary
 				index, the mtr must be committed
 				before latching any further pages */
+	__attribute__((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+#  define btr_cur_optimistic_delete(cursor, flags, mtr)		\
+	btr_cur_optimistic_delete_func(cursor, flags, mtr)
+# else /* UNIV_DEBUG */
+#  define btr_cur_optimistic_delete(cursor, flags, mtr)		\
+	btr_cur_optimistic_delete_func(cursor, mtr)
+# endif /* UNIV_DEBUG */
 /*************************************************************//**
 Removes the record on which the tree cursor is positioned. Tries
 to compress the page if its fillfactor drops below a threshold
@@ -406,7 +486,7 @@ UNIV_INTERN
 ibool
 btr_cur_pessimistic_delete(
 /*=======================*/
-	ulint*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+	dberr_t*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
 				the latter may occur because we may have
 				to update node pointers on upper levels,
 				and in the case of variable length keys
@@ -419,8 +499,10 @@ btr_cur_pessimistic_delete(
 				if compression does not occur, the cursor
 				stays valid: it points to successor of
 				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
 	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /***********************************************************//**
 Parses a redo log record of updating a record in-place.
@@ -474,8 +556,10 @@ btr_estimate_n_rows_in_range(
 	ulint		mode2);	/*!< in: search mode for range end */
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
-each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals.
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
 If innodb_stats_method is nulls_ignored, we also record the number of
 non-null values for each prefix and stored the estimates in
 array index->stat_n_non_null_key_vals. */
@@ -529,7 +613,7 @@ The fields are stored on pages allocated from leaf node
 file segment of the index tree.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 UNIV_INTERN
-enum db_err
+dberr_t
 btr_store_big_rec_extern_fields(
 /*============================*/
 	dict_index_t*	index,		/*!< in: index of rec; the index tree
@@ -596,6 +680,23 @@ btr_copy_externally_stored_field_prefix(
 				a lock or a page latch */
 	ulint		local_len);/*!< in: length of data, in bytes */
 /*******************************************************************//**
+Copies an externally stored field of a record to mem heap.  The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+UNIV_INTERN
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+	ulint*		len,	/*!< out: length of the whole field */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		local_len,/*!< in: length of data */
+	mem_heap_t*	heap);	/*!< in: mem heap */
+/*******************************************************************//**
 Copies an externally stored field of a record to mem heap.
 @return	the field copied to heap, or NULL if the field is incomplete */
 UNIV_INTERN
@@ -646,8 +747,7 @@ limit, merging it to a neighbor is tried */
 /** A slot in the path array. We store here info on a search path down the
 tree. Each slot contains data on a single level of the tree. */
 
-typedef struct btr_path_struct	btr_path_t;
-struct btr_path_struct{
+struct btr_path_t{
 	ulint	nth_rec;	/*!< index of the record
 				where the page cursor stopped on
 				this level (index in alphabetical
@@ -684,7 +784,7 @@ enum btr_cur_method {
 
 /** The tree cursor: the definition appears here only for the compiler
 to know struct size! */
-struct btr_cur_struct {
+struct btr_cur_t {
 	dict_index_t*	index;		/*!< index where positioned */
 	page_cur_t	page_cur;	/*!< page cursor */
 	purge_node_t*	purge_node;	/*!< purge node, for BTR_DELETE */
@@ -721,7 +821,7 @@ struct btr_cur_struct {
 					for comparison to the adjacent user
 					record if that record is on a
 					different leaf page! (See the note in
-					row_ins_duplicate_key.) */
+					row_ins_duplicate_error_in_clust.) */
 	ulint		up_bytes;	/*!< number of matched bytes to the
 					right at the time cursor positioned;
 					only used internally in searches: not
diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic
index 5fc4651ca13..080866c7465 100644
--- a/storage/xtradb/include/btr0cur.ic
+++ b/storage/xtradb/include/btr0cur.ic
@@ -48,7 +48,7 @@ btr_cur_get_page_cur(
 {
 	return(&((btr_cur_t*) cursor)->page_cur);
 }
-#endif /* UNIV_DEBUG */
+
 /*********************************************************//**
 Returns the buffer block on which the tree cursor is positioned.
 @return	pointer to buffer block */
@@ -56,7 +56,7 @@ UNIV_INLINE
 buf_block_t*
 btr_cur_get_block(
 /*==============*/
-	btr_cur_t*	cursor)	/*!< in: tree cursor */
+	const btr_cur_t*	cursor)	/*!< in: tree cursor */
 {
 	return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
 }
@@ -68,10 +68,11 @@ UNIV_INLINE
 rec_t*
 btr_cur_get_rec(
 /*============*/
-	btr_cur_t*	cursor)	/*!< in: tree cursor */
+	const btr_cur_t*	cursor)	/*!< in: tree cursor */
 {
-	return(page_cur_get_rec(&(cursor->page_cur)));
+	return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
 }
+#endif /* UNIV_DEBUG */
 
 /*********************************************************//**
 Returns the compressed page on which the tree cursor is positioned.
@@ -109,18 +110,6 @@ btr_cur_get_page(
 }
 
 /*********************************************************//**
-Returns the index of a cursor.
-@return	index */
-UNIV_INLINE
-dict_index_t*
-btr_cur_get_index(
-/*==============*/
-	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
-{
-	return(cursor->index);
-}
-
-/*********************************************************//**
 Positions a tree cursor at a given record. */
 UNIV_INLINE
 void
diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h
index 4312f73ca4a..973fae382ab 100644
--- a/storage/xtradb/include/btr0pcur.h
+++ b/storage/xtradb/include/btr0pcur.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -92,9 +92,10 @@ Initializes and opens a persistent cursor to an index tree. It should be
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -108,7 +109,7 @@ btr_pcur_open_func(
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mtr */
 #define btr_pcur_open(i,t,md,l,c,m)				\
-	btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
+	btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
 /**************************************************************//**
 Opens an persistent cursor to an index tree without initializing the
 cursor. */
@@ -145,13 +146,16 @@ UNIV_INLINE
 void
 btr_pcur_open_at_index_side(
 /*========================*/
-	ibool		from_left,	/*!< in: TRUE if open to the low end,
-					FALSE if to the high end */
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
 	dict_index_t*	index,		/*!< in: index */
 	ulint		latch_mode,	/*!< in: latch mode */
-	btr_pcur_t*	pcur,		/*!< in: cursor */
-	ibool		do_init,	/*!< in: TRUE if should be initialized */
-	mtr_t*		mtr);		/*!< in: mtr */
+	btr_pcur_t*	pcur,		/*!< in/out: cursor */
+	bool		init_pcur,	/*!< in: whether to initialize pcur */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
 /**************************************************************//**
 Gets the up_match value for a pcur after a search.
 @return number of matched fields at the cursor or to the right if
@@ -208,8 +212,17 @@ btr_pcur_open_at_rnd_pos_func(
 #define btr_pcur_open_at_rnd_pos(i,l,c,m)				\
 	btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
 /**************************************************************//**
-Frees the possible old_rec_buf buffer of a persistent cursor and sets the
-latch mode of the persistent cursor to BTR_NO_LATCHES. */
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
 UNIV_INLINE
 void
 btr_pcur_close(
@@ -451,14 +464,14 @@ btr_pcur_move_to_prev_on_page(
 /* The persistent B-tree cursor structure. This is used mainly for SQL
 selects, updates, and deletes. */
 
-struct btr_pcur_struct{
+struct btr_pcur_t{
 	btr_cur_t	btr_cur;	/*!< a B-tree cursor */
 	ulint		latch_mode;	/*!< see TODO note below!
 					BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
 					BTR_MODIFY_TREE, or BTR_NO_LATCHES,
 					depending on the latching state of
 					the page and tree where the cursor is
-					positioned; the last value means that
+					positioned; BTR_NO_LATCHES means that
 					the cursor is not currently positioned:
 					we say then that the cursor is
 					detached; it can be restored to
diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic
index 696dfc728dc..79afd7c322e 100644
--- a/storage/xtradb/include/btr0pcur.ic
+++ b/storage/xtradb/include/btr0pcur.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -406,9 +406,10 @@ Initializes and opens a persistent cursor to an index tree. It should be
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -428,14 +429,14 @@ btr_pcur_open_func(
 
 	btr_pcur_init(cursor);
 
-	cursor->latch_mode = latch_mode;
+	cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
 	cursor->search_mode = mode;
 
 	/* Search with the tree cursor */
 
 	btr_cursor = btr_pcur_get_btr_cur(cursor);
 
-	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+	btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
 				    btr_cursor, 0, file, line, mtr);
 	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
 
@@ -495,28 +496,26 @@ UNIV_INLINE
 void
 btr_pcur_open_at_index_side(
 /*========================*/
-	ibool		from_left,	/*!< in: TRUE if open to the low end,
-					FALSE if to the high end */
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
 	dict_index_t*	index,		/*!< in: index */
 	ulint		latch_mode,	/*!< in: latch mode */
-	btr_pcur_t*	pcur,		/*!< in: cursor */
-	ibool		do_init,	/*!< in: TRUE if should be initialized */
-	mtr_t*		mtr)		/*!< in: mtr */
+	btr_pcur_t*	pcur,		/*!< in/out: cursor */
+	bool		init_pcur,	/*!< in: whether to initialize pcur */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
-	pcur->latch_mode = latch_mode;
+	pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
 
-	if (from_left) {
-		pcur->search_mode = PAGE_CUR_G;
-	} else {
-		pcur->search_mode = PAGE_CUR_L;
-	}
+	pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
 
-	if (do_init) {
+	if (init_pcur) {
 		btr_pcur_init(pcur);
 	}
 
 	btr_cur_open_at_index_side(from_left, index, latch_mode,
-				   btr_pcur_get_btr_cur(pcur), mtr);
+				   btr_pcur_get_btr_cur(pcur), level, mtr);
 	pcur->pos_state = BTR_PCUR_IS_POSITIONED;
 
 	pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
@@ -555,7 +554,16 @@ btr_pcur_open_at_rnd_pos_func(
 
 /**************************************************************//**
 Frees the possible memory heap of a persistent cursor and sets the latch
-mode of the persistent cursor to BTR_NO_LATCHES. */
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
 UNIV_INLINE
 void
 btr_pcur_close(
diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h
index 6fa7a2d87bf..9f9c2b04191 100644
--- a/storage/xtradb/include/btr0sea.h
+++ b/storage/xtradb/include/btr0sea.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -68,7 +68,8 @@ UNIV_INLINE
 btr_search_t*
 btr_search_get_info(
 /*================*/
-	dict_index_t*	index);	/*!< in: index */
+	dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull));
 /*****************************************************************//**
 Creates and initializes a search info struct.
 @return	own: search info struct */
@@ -141,13 +142,6 @@ btr_search_drop_page_hash_index(
 				s- or x-latched, or an index page
 				for which we know that
 				block->buf_fix_count == 0 */
-/************************************************************************
-Drops a page hash index based on index */
-UNIV_INTERN
-void
-btr_search_drop_page_hash_index_on_index(
-/*=====================================*/
-	dict_index_t*	index);		/* in: record descriptor */
 /********************************************************************//**
 Drops a possible page hash index when a page is evicted from the buffer pool
 or freed in a file segment. */
@@ -201,20 +195,24 @@ btr_search_validate(void);
 #endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
 
 /********************************************************************//**
-New functions to control split btr_search_index */
+Returns the adaptive hash index table for a given index.
+@return the adaptive hash index table for the given index */
 UNIV_INLINE
 hash_table_t*
 btr_search_get_hash_table(
 /*======================*/
 	const dict_index_t*	index)	/*!< in: index */
-	__attribute__((nonnull,pure,warn_unused_result));
+	__attribute__((pure,warn_unused_result));
 
+/********************************************************************//**
+Returns the adaptive hash index latch for a given index.
+@return the adaptive hash index latch for the given index */
 UNIV_INLINE
-rw_lock_t*
+prio_rw_lock_t*
 btr_search_get_latch(
 /*=================*/
 	const dict_index_t*	index)	/*!< in: index */
-	__attribute__((nonnull,pure,warn_unused_result));
+	__attribute__((pure,warn_unused_result));
 
 /*********************************************************************//**
 Returns the AHI partition number corresponding to a given index ID. */
@@ -234,29 +232,45 @@ btr_search_index_init(
 	dict_index_t*	index)	/*!< in: index */
 	__attribute__((nonnull));
 
+/********************************************************************//**
+Latches all adaptive hash index latches in exclusive mode.  */
 UNIV_INLINE
 void
 btr_search_x_lock_all(void);
 /*========================*/
 
+/********************************************************************//**
+Releases all adaptive hash index latches held in exclusive mode.  */
 UNIV_INLINE
 void
 btr_search_x_unlock_all(void);
 /*==========================*/
 
 #ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked all the adaptive hash index latches in the
+specified mode.
+
+@return true if all latches are locked by the current thread, false
+otherwise.  */
+UNIV_INLINE
+bool
+btr_search_own_all(
+/*===============*/
+	ulint lock_type)
+	__attribute__((warn_unused_result));
 /********************************************************************//**
 Checks if the thread owns any adaptive hash latches in either S or X mode.
-@return	TRUE if the thread owns at least one latch in any mode. */
+@return	true if the thread owns at least one latch in any mode. */
 UNIV_INLINE
-ibool
+bool
 btr_search_own_any(void)
 /*=====================*/
 	 __attribute__((warn_unused_result));
 #endif
 
 /** The search info struct in an index */
-struct btr_search_struct{
+struct btr_search_t{
 	ulint	ref_count;	/*!< Number of blocks in this index tree
 				that have search index built
 				i.e. block->index points to this index.
@@ -305,19 +319,16 @@ struct btr_search_struct{
 #endif /* UNIV_SEARCH_PERF_STAT */
 #ifdef UNIV_DEBUG
 	ulint	magic_n;	/*!< magic number @see BTR_SEARCH_MAGIC_N */
-/** value of btr_search_struct::magic_n, used in assertions */
+/** value of btr_search_t::magic_n, used in assertions */
 # define BTR_SEARCH_MAGIC_N	1112765
 #endif /* UNIV_DEBUG */
 };
 
 /** The hash index system */
-typedef struct btr_search_sys_struct	btr_search_sys_t;
-
-/** The hash index system */
-struct btr_search_sys_struct{
-	hash_table_t**	hash_tables;	/*!< the array of adaptive hash index,
-					tables mapping dtuple_fold values
-					to rec_t pointers on index pages */
+struct btr_search_sys_t{
+	hash_table_t**	hash_tables;	/*!< the array of adaptive hash index
+					tables, mapping dtuple_fold values to
+					rec_t pointers on index pages */
 };
 
 /** The adaptive hash index */
diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic
index 3f0dfdaa511..3cbcff75f31 100644
--- a/storage/xtradb/include/btr0sea.ic
+++ b/storage/xtradb/include/btr0sea.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -45,8 +45,6 @@ btr_search_get_info(
 /*================*/
 	dict_index_t*	index)	/*!< in: index */
 {
-	ut_ad(index);
-
 	return(index->search_info);
 }
 
@@ -62,8 +60,8 @@ btr_search_info_update(
 	btr_search_t*	info;
 
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(btr_search_get_latch(index->id), RW_LOCK_SHARED));
-	ut_ad(!rw_lock_own(btr_search_get_latch(index->id), RW_LOCK_EX));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
 	info = btr_search_get_info(index);
@@ -83,8 +81,9 @@ btr_search_info_update(
 	btr_search_info_update_slow(info, cursor);
 }
 
-/*********************************************************************//**
-New functions to control split btr_search_index */
+/********************************************************************//**
+Returns the adaptive hash index table for a given index.
+@return the adaptive hash index table for the given index */
 UNIV_INLINE
 hash_table_t*
 btr_search_get_hash_table(
@@ -97,8 +96,11 @@ btr_search_get_hash_table(
 	return(index->search_table);
 }
 
+/********************************************************************//**
+Returns the adaptive hash index latch for a given index.
+@return the adaptive hash index latch for the given index */
 UNIV_INLINE
-rw_lock_t*
+prio_rw_lock_t*
 btr_search_get_latch(
 /*=================*/
 	const dict_index_t*	index)	/*!< in: index */
@@ -138,6 +140,8 @@ btr_search_index_init(
 		btr_search_sys->hash_tables[btr_search_get_key(index->id)];
 }
 
+/********************************************************************//**
+Latches all adaptive hash index latches in exclusive mode.  */
 UNIV_INLINE
 void
 btr_search_x_lock_all(void)
@@ -150,6 +154,8 @@ btr_search_x_lock_all(void)
 	}
 }
 
+/********************************************************************//**
+Releases all adaptive hash index latches held in exclusive mode.  */
 UNIV_INLINE
 void
 btr_search_x_unlock_all(void)
@@ -163,11 +169,34 @@ btr_search_x_unlock_all(void)
 }
 
 #ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked all the adaptive hash index latches in the
+specified mode.
+
+@return true if all latches are locked by the current thread, false
+otherwise.  */
+UNIV_INLINE
+bool
+btr_search_own_all(
+/*===============*/
+	ulint lock_type)
+{
+	ulint	i;
+
+	for (i = 0; i < btr_search_index_num; i++) {
+		if (!rw_lock_own(&btr_search_latch_arr[i], lock_type)) {
+			return(false);
+		}
+	}
+
+	return(true);
+}
+
 /********************************************************************//**
 Checks if the thread owns any adaptive hash latches in either S or X mode.
-@return	TRUE if the thread owns at least one latch in any mode. */
+@return	true if the thread owns at least one latch in any mode. */
 UNIV_INLINE
-ibool
+bool
 btr_search_own_any(void)
 /*====================*/
 {
@@ -176,10 +205,10 @@ btr_search_own_any(void)
 	for (i = 0; i < btr_search_index_num; i++) {
 		if (rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_SHARED) ||
 		    rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_EX)) {
-			return(TRUE);
+			return(true);
 		}
 	}
 
-	return(FALSE);
+	return(false);
 }
-#endif
+#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h
index a7cd64df276..cd0392e7951 100644
--- a/storage/xtradb/include/btr0types.h
+++ b/storage/xtradb/include/btr0types.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -33,11 +33,11 @@ Created 2/17/1996 Heikki Tuuri
 #include "sync0rw.h"
 
 /** Persistent cursor */
-typedef struct btr_pcur_struct		btr_pcur_t;
+struct btr_pcur_t;
 /** B-tree cursor */
-typedef struct btr_cur_struct		btr_cur_t;
+struct btr_cur_t;
 /** B-tree search information for the adaptive hash index */
-typedef struct btr_search_struct	btr_search_t;
+struct btr_search_t;
 
 #ifndef UNIV_HOTBACKUP
 
@@ -55,23 +55,21 @@ but do NOT protect:
 Bear in mind (3) and (4) when using the hash indexes.
 */
 
-extern rw_lock_t*	btr_search_latch_arr;
+extern prio_rw_lock_t*	btr_search_latch_arr;
 
 #endif /* UNIV_HOTBACKUP */
 
-/** The latch protecting the adaptive search system */
-//#define btr_search_latch	(*btr_search_latch_temp)
-
 /** Flag: has the search system been enabled?
 Protected by btr_search_latch. */
 extern char	btr_search_enabled;
 
+/** Number of adaptive hash index partitions */
 extern ulint	btr_search_index_num;
 
 #ifdef UNIV_BLOB_DEBUG
 # include "buf0types.h"
 /** An index->blobs entry for keeping track of off-page column references */
-typedef struct btr_blob_dbg_struct btr_blob_dbg_t;
+struct btr_blob_dbg_t;
 
 /** Insert to index->blobs a reference to an off-page column.
 @param index	the index tree
diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h
index 7060316dad9..a86fc87e3d3 100644
--- a/storage/xtradb/include/buf0buddy.h
+++ b/storage/xtradb/include/buf0buddy.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,8 +36,8 @@ Created December 2006 by Marko Makela
 
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may be released and reacquired.
+buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any
+block->mutex.  The buf_pool->LRU_list_mutex may be released and reacquired.
 This function should only be used for allocating compressed page frames.
 @return	allocated block, never NULL */
 UNIV_INLINE
@@ -47,14 +47,13 @@ buf_buddy_alloc(
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
 					the page resides */
 	ulint		size,		/*!< in: compressed page size
-					(between PAGE_ZIP_MIN_SIZE and
+					(between UNIV_ZIP_SIZE_MIN and
 					UNIV_PAGE_SIZE) */
-	ibool*		lru,		/*!< in: pointer to a variable
+	ibool*		lru)		/*!< in: pointer to a variable
 					that will be assigned TRUE if
 				       	storage was allocated from the
-				       	LRU list and buf_pool->mutex was
-				       	temporarily released */
-	ibool		have_page_hash_mutex)
+					LRU list and buf_pool->LRU_list_mutex
+					was temporarily released */
 	__attribute__((malloc, nonnull));
 
 /**********************************************************************//**
@@ -67,9 +66,8 @@ buf_buddy_free(
 					the block resides */
 	void*		buf,		/*!< in: block to be freed, must not
 					be pointed to by the buffer pool */
-	ulint		size,		/*!< in: block size,
+	ulint		size)		/*!< in: block size,
 					up to UNIV_PAGE_SIZE */
-	ibool		have_page_hash_mutex)
 	__attribute__((nonnull));
 
 #ifndef UNIV_NONINL
diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic
index d7053881caa..020442016d0 100644
--- a/storage/xtradb/include/buf0buddy.ic
+++ b/storage/xtradb/include/buf0buddy.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,8 +35,8 @@ Created December 2006 by Marko Makela
 
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool_mutex may be released and reacquired.
+buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any
+block->mutex.  The buf_pool->LRU_list_mutex may be released and reacquired.
 @return	allocated block, never NULL */
 UNIV_INTERN
 void*
@@ -45,12 +45,11 @@ buf_buddy_alloc_low(
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
 	ulint		i,		/*!< in: index of buf_pool->zip_free[],
 					or BUF_BUDDY_SIZES */
-	ibool*		lru,		/*!< in: pointer to a variable that
+	ibool*		lru)		/*!< in: pointer to a variable that
 					will be assigned TRUE if storage was
 					allocated from the LRU list and
-					buf_pool->mutex was temporarily
-					released */
-	ibool		have_page_hash_mutex)
+					buf_pool->LRU_list_mutex was
+					temporarily released */
 	__attribute__((malloc, nonnull));
 
 /**********************************************************************//**
@@ -62,9 +61,8 @@ buf_buddy_free_low(
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	void*		buf,		/*!< in: block to be freed, must not be
 					pointed to by the buffer pool */
-	ulint		i,		/*!< in: index of buf_pool->zip_free[],
+	ulint		i)		/*!< in: index of buf_pool->zip_free[],
 					or BUF_BUDDY_SIZES */
-	ibool		have_page_hash_mutex)
 	__attribute__((nonnull));
 
 /**********************************************************************//**
@@ -79,7 +77,7 @@ buf_buddy_get_slot(
 	ulint	i;
 	ulint	s;
 
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 
 	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
 	}
@@ -90,8 +88,8 @@ buf_buddy_get_slot(
 
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may be released and reacquired.
+buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any
+block->mutex.  The buf_pool->LRU_list_mutex may be released and reacquired.
 This function should only be used for allocating compressed page frames.
 @return	allocated block, never NULL */
 UNIV_INLINE
@@ -101,22 +99,21 @@ buf_buddy_alloc(
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
 					the page resides */
 	ulint		size,		/*!< in: compressed page size
-					(between PAGE_ZIP_MIN_SIZE and
+					(between UNIV_ZIP_SIZE_MIN and
 					UNIV_PAGE_SIZE) */
-	ibool*		lru,		/*!< in: pointer to a variable
+	ibool*		lru)		/*!< in: pointer to a variable
 					that will be assigned TRUE if
 				       	storage was allocated from the
-				       	LRU list and buf_pool->mutex was
-				       	temporarily released */
-	ibool		have_page_hash_mutex)
+					LRU list and buf_pool->LRU_list_mutex
+					was temporarily released */
 {
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
 	return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size),
-					   lru, have_page_hash_mutex));
+					   lru));
 }
 
 /**********************************************************************//**
@@ -129,28 +126,14 @@ buf_buddy_free(
 					the block resides */
 	void*		buf,		/*!< in: block to be freed, must not
 					be pointed to by the buffer pool */
-	ulint		size,		/*!< in: block size,
+	ulint		size)		/*!< in: block size,
 					up to UNIV_PAGE_SIZE */
-	ibool		have_page_hash_mutex)
 {
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
-	if (!have_page_hash_mutex) {
-		mutex_enter(&buf_pool->LRU_list_mutex);
-		rw_lock_x_lock(&buf_pool->page_hash_latch);
-	}
-
-	mutex_enter(&buf_pool->zip_free_mutex);
-	buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size), TRUE);
-	mutex_exit(&buf_pool->zip_free_mutex);
-
-	if (!have_page_hash_mutex) {
-		mutex_exit(&buf_pool->LRU_list_mutex);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	}
+	buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size));
 }
 
 #ifdef UNIV_MATERIALIZE
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
index 701e820a23f..ba2f413429c 100644
--- a/storage/xtradb/include/buf0buf.h
+++ b/storage/xtradb/include/buf0buf.h
@@ -36,6 +36,7 @@ Created 11/5/1995 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "ut0rbt.h"
 #include "os0proc.h"
+#include "log0log.h"
 
 /** @name Modes for buf_page_get_gen */
 /* @{ */
@@ -68,14 +69,18 @@ Created 11/5/1995 Heikki Tuuri
 					position of the block. */
 /* @} */
 
-#define MAX_BUFFER_POOLS_BITS	6 	/*!< Number of bits to representing
+#define MAX_BUFFER_POOLS_BITS	6	/*!< Number of bits used to represent
 					a buffer pool ID */
-#define MAX_BUFFER_POOLS	(1 << MAX_BUFFER_POOLS_BITS)
+
+#define MAX_BUFFER_POOLS 	(1 << MAX_BUFFER_POOLS_BITS)
 					/*!< The maximum number of buffer
 					pools that can be defined */
 
-#define BUF_POOL_WATCH_SIZE 1		/*!< Maximum number of concurrent
+#define BUF_POOL_WATCH_SIZE		(srv_n_purge_threads + 1)
+					/*!< Maximum number of concurrent
 					buffer pool watches */
+#define MAX_PAGE_HASH_LOCKS	1024	/*!< The maximum number of
+					page_hash locks */
 
 extern	buf_pool_t*	buf_pool_ptr;	/*!< The buffer pools
 					of the database */
@@ -84,8 +89,6 @@ extern ibool		buf_debug_prints;/*!< If this is set TRUE, the program
 					prints info whenever read or flush
 					occurs */
 #endif /* UNIV_DEBUG */
-extern ulint srv_buf_pool_write_requests; /*!< variable to count write request
-					  issued */
 extern ulint srv_buf_pool_instances;
 extern ulint srv_buf_pool_curr_size;
 #else /* !UNIV_HOTBACKUP */
@@ -97,13 +100,11 @@ extern buf_block_t*	back_block2;	/*!< second block, for page reorganize */
 #define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
 
 /** @brief States of a control block
-@see buf_page_struct
+@see buf_page_t
 
 The enumeration values must be 0..7. */
 enum buf_page_state {
-	BUF_BLOCK_ZIP_FREE = 0,		/*!< contains a free
-					compressed page */
-	BUF_BLOCK_POOL_WATCH = 0,	/*!< a sentinel for the buffer pool
+	BUF_BLOCK_POOL_WATCH,		/*!< a sentinel for the buffer pool
 					watch, element of buf_pool->watch[] */
 	BUF_BLOCK_ZIP_PAGE,		/*!< contains a clean
 					compressed page */
@@ -127,7 +128,7 @@ enum buf_page_state {
 
 /** This structure defines information we will fetch from each buffer pool. It
 will be used to print table IO stats */
-struct buf_pool_info_struct{
+struct buf_pool_info_t{
 	/* General buffer pool info */
 	ulint	pool_unique_id;		/*!< Buffer Pool ID */
 	ulint	pool_size;		/*!< Buffer Pool size in pages */
@@ -141,10 +142,12 @@ struct buf_pool_info_struct{
 	ulint	n_pend_reads;		/*!< buf_pool->n_pend_reads, pages
 					pending read */
 	ulint	n_pending_flush_lru;	/*!< Pages pending flush in LRU */
+	ulint	n_pending_flush_single_page;/*!< Pages pending to be
+					flushed as part of single page
+					flushes issued by various user
+					threads */
 	ulint	n_pending_flush_list;	/*!< Pages pending flush in FLUSH
 					LIST */
-	ulint	n_pending_flush_single_page;/*!< Pages pending flush in
-					BUF_FLUSH_SINGLE_PAGE list */
 	ulint	n_pages_made_young;	/*!< number of pages made young */
 	ulint	n_pages_not_made_young;	/*!< number of pages not made young */
 	ulint	n_pages_read;		/*!< buf_pool->n_pages_read */
@@ -197,51 +200,20 @@ struct buf_pool_info_struct{
 					interval */
 };
 
-typedef struct buf_pool_info_struct	buf_pool_info_t;
-
 /** The occupied bytes of lists in all buffer pools */
-struct buf_pools_list_size_struct {
+struct buf_pools_list_size_t {
 	ulint	LRU_bytes;		/*!< LRU size in bytes */
 	ulint	unzip_LRU_bytes;	/*!< unzip_LRU size in bytes */
 	ulint	flush_list_bytes;	/*!< flush_list size in bytes */
 };
 
-typedef struct buf_pools_list_size_struct	buf_pools_list_size_t;
-
 #ifndef UNIV_HOTBACKUP
-/********************************************************************//**
-Acquire mutex on all buffer pool instances */
-UNIV_INLINE
-void
-buf_pool_mutex_enter_all(void);
-/*===========================*/
-
-/********************************************************************//**
-Release mutex on all buffer pool instances */
-UNIV_INLINE
-void
-buf_pool_mutex_exit_all(void);
-/*==========================*/
-
-/********************************************************************//**
-*/
-UNIV_INLINE
-void
-buf_pool_page_hash_x_lock_all(void);
-/*================================*/
-
-/********************************************************************//**
-*/
-UNIV_INLINE
-void
-buf_pool_page_hash_x_unlock_all(void);
-/*==================================*/
 
 /********************************************************************//**
 Creates the buffer pool.
-@return	own: buf_pool object, NULL if not enough memory or error */
+@return	DB_SUCCESS if success, DB_ERROR if not enough memory or error */
 UNIV_INTERN
-ulint
+dberr_t
 buf_pool_init(
 /*=========*/
 	ulint	size,		/*!< in: Size of the total pool in bytes */
@@ -295,9 +267,10 @@ Gets the smallest oldest_modification lsn for any page in the pool. Returns
 zero if all modified pages have been flushed to disk.
 @return	oldest modification in pool, zero if none */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 buf_pool_get_oldest_modification(void);
 /*==================================*/
+
 /********************************************************************//**
 Allocates a buf_page_t descriptor. This function must succeed. In case
 of failure we assert in this function. */
@@ -369,8 +342,7 @@ buf_page_optimistic_get(
 /*====================*/
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
 	buf_block_t*	block,	/*!< in: guessed block */
-	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
-				..._GUESS_ON_CLOCK */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mini-transaction */
@@ -392,7 +364,7 @@ buf_page_get_known_nowait(
 /*******************************************************************//**
 Given a tablespace id and page number tries to get that page. If the
 page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the kernel mutex. */
+Suitable for using when holding the lock_sys_t::mutex. */
 UNIV_INTERN
 const buf_block_t*
 buf_page_try_get_func(
@@ -404,7 +376,7 @@ buf_page_try_get_func(
 	mtr_t*		mtr);	/*!< in: mini-transaction */
 
 /** Tries to get a page. If the page is not in the buffer pool it is
-not loaded.  Suitable for using when holding the kernel mutex.
+not loaded.  Suitable for using when holding the lock_sys_t::mutex.
 @param space_id	in: tablespace id
 @param page_no	in: page number
 @param mtr	in: mini-transaction
@@ -517,15 +489,6 @@ buf_page_peek(
 /*==========*/
 	ulint	space,	/*!< in: space id */
 	ulint	offset);/*!< in: page number */
-/********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset);/*!< in: page number */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 /********************************************************************//**
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
@@ -598,14 +561,14 @@ Gets the youngest modification log sequence number for a frame.
 Returns zero if not file page or no modification occurred yet.
 @return	newest modification to page */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 buf_page_get_newest_modification(
 /*=============================*/
 	const buf_page_t*	bpage);	/*!< in: block containing the
 					page frame */
 /********************************************************************//**
 Increments the modify clock of a frame by 1. The caller must (1) own the
-buf_pool->mutex and block bufferfix count has to be zero, (2) or own an x-lock
+LRU list mutex and block bufferfix count has to be zero, (2) or own an x-lock
 on the block. */
 UNIV_INLINE
 void
@@ -650,46 +613,18 @@ buf_block_buf_fix_inc_func(
 # define buf_block_modify_clock_inc(block) ((void) 0)
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
-Calculates a page checksum which is stored to the page when it is written
-to a file. Note that we must be careful to calculate the same value
-on 32-bit and 64-bit architectures.
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum(
-/*=======================*/
-	const byte*	page);	/*!< in: buffer page */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum_32(
-/*==========================*/
-	const byte*	page);	/*!< in: buffer page */
-/********************************************************************//**
-In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
-looked at the first few bytes of the page. This calculates that old
-checksum.
-NOTE: we must first store the new formula checksum to
-FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
-because this takes that field as an input!
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_old_checksum(
-/*=======================*/
-	const byte*	 page);	/*!< in: buffer page */
-/********************************************************************//**
 Checks if a page is corrupt.
 @return	TRUE if corrupted */
 UNIV_INTERN
 ibool
 buf_page_is_corrupted(
 /*==================*/
-	ibool		check_lsn,	/*!< in: TRUE if we need to check
+	bool		check_lsn,	/*!< in: true if we need to check
 					and complain about the LSN */
 	const byte*	read_buf,	/*!< in: a database page */
 	ulint		zip_size)	/*!< in: size of compressed page;
 					0 for uncompressed pages */
-	__attribute__((warn_unused_result));
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Gets the space id, page offset, and byte offset within page of a
@@ -723,6 +658,17 @@ buf_pool_contains_zip(
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	const void*	data);		/*!< in: pointer to compressed page */
 #endif /* UNIV_DEBUG */
+
+/***********************************************************************
+FIXME_FTS: Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+                        /* out: pointer to frame */
+        byte*   ptr);   /* in: pointer to a frame */
+
+
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /*********************************************************************//**
 Validates the buffer pool data structure.
@@ -757,10 +703,10 @@ buf_page_print(
 	const byte*	read_buf,	/*!< in: a database page */
 	ulint		zip_size,	/*!< in: compressed page size, or
 					0 for uncompressed pages */
-	ulint		flags)		/*!< in: 0 or
+	ulint		flags);		/*!< in: 0 or
 					BUF_PAGE_PRINT_NO_CRASH or
 					BUF_PAGE_PRINT_NO_FULL */
-	__attribute__((nonnull));
+
 /********************************************************************//**
 Decompress a block.
 @return	TRUE if successful */
@@ -781,12 +727,12 @@ buf_get_latched_pages_number(void);
 /*==============================*/
 #endif /* UNIV_DEBUG */
 /*********************************************************************//**
-Returns the number of pending buf pool ios.
-@return	number of pending I/O operations */
+Returns the number of pending buf pool read ios.
+@return	number of pending read I/O operations */
 UNIV_INTERN
 ulint
-buf_get_n_pending_ios(void);
-/*=======================*/
+buf_get_n_pending_read_ios(void);
+/*============================*/
 /*********************************************************************//**
 Prints info of the buffer i/o. */
 UNIV_INTERN
@@ -840,8 +786,8 @@ pool.
 @return	number of pending i/o operations */
 UNIV_INTERN
 ulint
-buf_pool_check_num_pending_io(void);
-/*===============================*/
+buf_pool_check_no_pending_io(void);
+/*==============================*/
 /*********************************************************************//**
 Invalidates the file pages in the buffer pool when an archive recovery is
 completed. All the file pages buffered must be in a replaceable state when
@@ -928,26 +874,17 @@ buf_page_belongs_to_unzip_LRU(
 Gets the mutex of a block.
 @return	pointer to mutex protecting bpage */
 UNIV_INLINE
-mutex_t*
+ib_mutex_t*
 buf_page_get_mutex(
 /*===============*/
 	const buf_page_t*	bpage)	/*!< in: pointer to control block */
 	__attribute__((pure));
 
-/*************************************************************************
-Gets the mutex of a block and enter the mutex with consistency. */
-UNIV_INLINE
-mutex_t*
-buf_page_get_mutex_enter(
-/*=========================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-	__attribute__((pure));
-
 /*********************************************************************//**
 Get the flush type of a page.
 @return	flush type */
 UNIV_INLINE
-enum buf_flush
+buf_flush_t
 buf_page_get_flush_type(
 /*====================*/
 	const buf_page_t*	bpage)	/*!< in: buffer page */
@@ -959,7 +896,7 @@ void
 buf_page_set_flush_type(
 /*====================*/
 	buf_page_t*	bpage,		/*!< in: buffer page */
-	enum buf_flush	flush_type);	/*!< in: flush type */
+	buf_flush_t	flush_type);	/*!< in: flush type */
 /*********************************************************************//**
 Map a block to a file page. */
 UNIV_INLINE
@@ -970,7 +907,7 @@ buf_block_set_file_page(
 	ulint			space,	/*!< in: tablespace id */
 	ulint			page_no);/*!< in: page number */
 /*********************************************************************//**
-Gets the io_fix state of a block.  Requires that the block mutex is held.
+Gets the io_fix state of a block.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -979,17 +916,7 @@ buf_page_get_io_fix(
 	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
 	__attribute__((pure));
 /*********************************************************************//**
-Gets the io_fix state of a block.  Does not assert that the block mutex is
-held, to be used in the cases where it is safe not to hold it.
-@return	io_fix state */
-UNIV_INLINE
-enum buf_io_fix
-buf_page_get_io_fix_unlocked(
-/*=========================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
-	__attribute__((pure));
-/*********************************************************************//**
-Gets the io_fix state of a block.  Requires that the block mutex is held.
+Gets the io_fix state of a block.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -998,14 +925,15 @@ buf_block_get_io_fix(
 	const buf_block_t*	block)	/*!< in: pointer to the control block */
 	__attribute__((pure));
 /*********************************************************************//**
-Gets the io_fix state of a block.  Does not assert that the block mutex is
-held, to be used in the cases where it is safe not to hold it.
+Gets the io_fix state of a block.  Does not assert that the
+buf_page_get_mutex() mutex is held, to be used in the cases where it is safe
+not to hold it.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
-buf_block_get_io_fix_unlocked(
-/*==========================*/
-	const buf_block_t*	block)	/*!< in: pointer to the control block */
+buf_page_get_io_fix_unlocked(
+/*=========================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
 	__attribute__((pure));
 /*********************************************************************//**
 Sets the io_fix state of a block. */
@@ -1025,7 +953,7 @@ buf_block_set_io_fix(
 	enum buf_io_fix	io_fix);/*!< in: io_fix state */
 /*********************************************************************//**
 Makes a block sticky. A sticky block implies that even after we release
-the buf_pool->mutex and the block->mutex:
+the buf_pool->LRU_list_mutex and the block->mutex:
 * it cannot be removed from the flush_list
 * the block descriptor cannot be relocated
 * it cannot be removed from the LRU list
@@ -1173,7 +1101,7 @@ buf_block_get_zip_size(
 Gets the compressed page descriptor corresponding to an uncompressed page
 if applicable. */
 #define buf_block_get_page_zip(block) \
-	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+	((block)->page.zip.data ? &(block)->page.zip : NULL)
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
 Gets the block to whose frame the pointer is pointing to.
@@ -1229,7 +1157,7 @@ UNIV_INTERN
 buf_page_t*
 buf_page_init_for_read(
 /*===================*/
-	ulint*		err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
 	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size, or 0 */
@@ -1241,9 +1169,9 @@ buf_page_init_for_read(
 /********************************************************************//**
 Completes an asynchronous read or write request of a file page to or from
 the buffer pool.
-@return TRUE if successful */
+@return true if successful */
 UNIV_INTERN
-ibool
+bool
 buf_page_io_complete(
 /*=================*/
 	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
@@ -1267,14 +1195,6 @@ buf_pool_index(
 /*===========*/
 	const buf_pool_t*	buf_pool)	/*!< in: buffer pool */
 	__attribute__((nonnull, const));
-/********************************************************************//**
-*/
-UNIV_INTERN
-buf_block_t*
-buf_page_from_array(
-/*================*/
-	buf_pool_t*	buf_pool,
-	ulint		n_block);
 /******************************************************************//**
 Returns the buffer pool instance given a page instance
 @return buf_pool */
@@ -1316,35 +1236,83 @@ UNIV_INLINE
 buf_page_t*
 buf_page_hash_get_low(
 /*==================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		space,		/*!< in: space id */
-	ulint		offset,		/*!< in: offset of the page
-					within space */
-	ulint		fold);		/*!< in: buf_page_address_fold(
-					space, offset) */
+	buf_pool_t*	buf_pool,/*!< buffer pool instance */
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space */
+	ulint		fold);	/*!< in: buf_page_address_fold(space, offset) */
 /******************************************************************//**
 Returns the control block of a file page, NULL if not found.
-@return	block, NULL if not found or not a real control block */
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return	block, NULL if not found */
 UNIV_INLINE
 buf_page_t*
-buf_page_hash_get(
-/*==============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_page_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset);	/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	prio_rw_lock_t**	lock,	/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode);	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
 /******************************************************************//**
-Returns the control block of a file page, NULL if not found
-or an uncompressed page frame does not exist.
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
 @return	block, NULL if not found */
 UNIV_INLINE
 buf_block_t*
-buf_block_hash_get(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_block_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset);	/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	prio_rw_lock_t**	lock,	/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode);	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
+/* There are four different ways we can try to get a bpage or block
+from the page hash:
+1) Caller already holds the appropriate page hash lock: in that case call
+the buf_page_hash_get_low() function.
+2) Caller wants to hold page hash lock in x-mode
+3) Caller wants to hold page hash lock in s-mode
+4) Caller doesn't want to hold page hash lock */
+#define buf_page_hash_get_s_locked(b, s, o, l)			\
+	buf_page_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_page_hash_get_x_locked(b, s, o, l)			\
+	buf_page_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_page_hash_get(b, s, o)				\
+	buf_page_hash_get_locked(b, s, o, NULL, 0)
+
+#define buf_block_hash_get_s_locked(b, s, o, l)			\
+	buf_block_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_block_hash_get_x_locked(b, s, o, l)			\
+	buf_block_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_block_hash_get(b, s, o)				\
+	buf_block_hash_get_locked(b, s, o, NULL, 0)
+
 /*********************************************************************//**
 Gets the current length of the free list of buffer blocks.
 @return	length of the free list */
@@ -1430,44 +1398,82 @@ buf_get_nth_chunk_block(
 	ulint		n,		/*!< in: nth chunk in the buffer pool */
 	ulint*		chunk_size);	/*!< in: chunk size */
 
+/********************************************************************//**
+Calculate the checksum of a page from compressed table and update the page. */
+UNIV_INTERN
+void
+buf_flush_update_zip_checksum(
+/*==========================*/
+	buf_frame_t*	page,		/*!< in/out: Page to update */
+	ulint		zip_size,	/*!< in: Compressed page size */
+	lsn_t		lsn);		/*!< in: Lsn to stamp on the page */
+
 #endif /* !UNIV_HOTBACKUP */
 
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Checks if buf_pool->zip_mutex is owned and is serving for a given page as its
+block mutex.
+@return true if buf_pool->zip_mutex is owned. */
+UNIV_INLINE
+bool
+buf_own_zip_mutex_for_page(
+/*=======================*/
+	const buf_page_t*	bpage)
+	__attribute__((nonnull,warn_unused_result));
+#endif /* UNIV_DEBUG */
+
 /** The common buffer control block structure
 for compressed and uncompressed frames */
 
 /** Number of bits used for buffer page states. */
 #define BUF_PAGE_STATE_BITS	3
 
-struct buf_page_struct{
+struct buf_page_t{
 	/** @name General fields
 	None of these bit-fields must be modified without holding
-	buf_page_get_mutex() [buf_block_struct::mutex or
+	buf_page_get_mutex() [buf_block_t::mutex or
 	buf_pool->zip_mutex], since they can be stored in the same
-	machine word.  Some of these fields are additionally protected
-	by buf_pool->mutex. */
+	machine word.  */
 	/* @{ */
 
-	unsigned	space:32;	/*!< tablespace id; also protected
-					by buf_pool->mutex. */
-	unsigned	offset:32;	/*!< page number; also protected
-					by buf_pool->mutex. */
+	unsigned	space:32;	/*!< tablespace id. */
+	unsigned	offset:32;	/*!< page number. */
 
 	unsigned	state:BUF_PAGE_STATE_BITS;
-					/*!< state of the control block; also
-					protected by buf_pool->mutex.
+					/*!< state of the control block.
 					State transitions from
 					BUF_BLOCK_READY_FOR_USE to
 					BUF_BLOCK_MEMORY need not be
 					protected by buf_page_get_mutex().
-					@see enum buf_page_state */
+					@see enum buf_page_state.
+					State changes that are relevant
+					to page_hash are additionally
+					protected by the appropriate
+					page_hash mutex i.e.: if a page
+					is in page_hash or is being
+					added to/removed from page_hash
+					then the corresponding changes
+					must also be protected by
+					page_hash mutex. */
 #ifndef UNIV_HOTBACKUP
 	unsigned	flush_type:2;	/*!< if this block is currently being
 					flushed to disk, this tells the
-					flush_type.
-					@see enum buf_flush */
-	unsigned	io_fix:2;	/*!< type of pending I/O operation;
-					also protected by buf_pool->mutex
-					@see enum buf_io_fix */
+					flush_type.  Writes during flushing are
+					protected by the buf_page_get_mutex()
+					mutex and the corresponding flush state
+					mutex.
+					@see buf_flush_t */
+	unsigned	io_fix:2;	/*!< type of pending I/O operation.
+					Transitions from BUF_IO_NONE to
+					BUF_IO_WRITE and back are protected by
+					the buf_page_get_mutex() mutex and the
+					corresponding flush state mutex.  The
+					flush state mutex protection for io_fix
+					and flush_type is not strictly
+					required, but it ensures consistent
+					buffer pool instance state snapshots in
+					buf_pool_validate_instance(). */
 	unsigned	buf_fix_count:19;/*!< count of how manyfold this block
 					is currently bufferfixed */
 	unsigned	buf_pool_index:6;/*!< index number of the buffer pool
@@ -1479,7 +1485,7 @@ struct buf_page_struct{
 #endif /* !UNIV_HOTBACKUP */
 	page_zip_des_t	zip;		/*!< compressed page; zip.data
 					(but not the data it points to) is
-					also protected by buf_pool->mutex;
+					protected by buf_pool->zip_mutex;
 					state == BUF_BLOCK_ZIP_PAGE and
 					zip.data == NULL means an active
 					buf_pool->watch */
@@ -1492,15 +1498,13 @@ struct buf_page_struct{
 	ibool		in_zip_hash;	/*!< TRUE if in buf_pool->zip_hash */
 #endif /* UNIV_DEBUG */
 
-	/** @name Page flushing fields
-	All these are protected by buf_pool->mutex. */
+	/** @name Page flushing fields */
 	/* @{ */
 
-	/* UT_LIST_NODE_T(buf_page_t) list; */
+	UT_LIST_NODE_T(buf_page_t) list;
 					/*!< based on state, this is a
 					list node, protected either by
-					buf_pool->mutex or by
-					buf_pool->flush_list_mutex,
+					a corresponding list mutex,
 					in one of the following lists in
 					buf_pool:
 
@@ -1508,13 +1512,13 @@ struct buf_page_struct{
 					- BUF_BLOCK_FILE_PAGE:	flush_list
 					- BUF_BLOCK_ZIP_DIRTY:	flush_list
 					- BUF_BLOCK_ZIP_PAGE:	zip_clean
-					- BUF_BLOCK_ZIP_FREE:	zip_free[]
 
 					If bpage is part of flush_list
 					then the node pointers are
 					covered by buf_pool->flush_list_mutex.
 					Otherwise these pointers are
-					protected by buf_pool->mutex.
+					protected by a corresponding list
+					mutex.
 
 					The contents of the list node
 					is undefined if !in_flush_list
@@ -1524,10 +1528,6 @@ struct buf_page_struct{
 					BUF_BLOCK_REMOVE_HASH or
 					BUF_BLOCK_READY_IN_USE. */
 
-	/* resplit for optimistic use */
-	UT_LIST_NODE_T(buf_page_t) free;
-	UT_LIST_NODE_T(buf_page_t) flush_list;
-	UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */
 #ifdef UNIV_DEBUG
 	ibool		in_flush_list;	/*!< TRUE if in buf_pool->flush_list;
 					when buf_pool->flush_list_mutex is
@@ -1541,17 +1541,17 @@ struct buf_page_struct{
 					reads can happen while holding
 					any one of the two mutexes */
 	ibool		in_free_list;	/*!< TRUE if in buf_pool->free; when
-					buf_pool->mutex is free, the following
-					should hold: in_free_list
+					buf_pool->free_list_mutex is free, the
+					following should hold: in_free_list
 					== (state == BUF_BLOCK_NOT_USED) */
 #endif /* UNIV_DEBUG */
-	ib_uint64_t	newest_modification;
+	lsn_t		newest_modification;
 					/*!< log sequence number of
 					the youngest modification to
 					this block, zero if not
 					modified. Protected by block
 					mutex */
-	ib_uint64_t	oldest_modification;
+	lsn_t		oldest_modification;
 					/*!< log sequence number of
 					the START of the log entry
 					written of the oldest
@@ -1565,20 +1565,21 @@ struct buf_page_struct{
 					reads can happen while holding
 					any one of the two mutexes */
 	/* @} */
-	/** @name LRU replacement algorithm fields
-	These fields are protected by buf_pool->mutex only (not
-	buf_pool->zip_mutex or buf_block_struct::mutex). */
+	/** @name LRU replacement algorithm fields */
 	/* @{ */
 
 	UT_LIST_NODE_T(buf_page_t) LRU;
 					/*!< node of the LRU list */
-//#ifdef UNIV_DEBUG
+#ifdef UNIV_DEBUG
 	ibool		in_LRU_list;	/*!< TRUE if the page is in
 					the LRU list; used in
 					debugging */
-//#endif /* UNIV_DEBUG */
+#endif /* UNIV_DEBUG */
 	unsigned	old:1;		/*!< TRUE if the block is in the old
-					blocks in buf_pool->LRU_old */
+					blocks in buf_pool->LRU_old.  Protected
+					by the LRU list mutex.  May be read for
+					heuristics purposes under the block
+					mutex instead. */
 	unsigned	freed_page_clock:31;/*!< the value of
 					buf_pool->freed_page_clock
 					when this block was the last
@@ -1595,15 +1596,17 @@ struct buf_page_struct{
 	ibool		is_corrupt;
 # if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 	ibool		file_page_was_freed;
-					/*!< this is set to TRUE when fsp
-					frees a page in buffer pool */
+					/*!< this is set to TRUE when
+					fsp frees a page in buffer pool;
+					protected by buf_pool->zip_mutex
+					or buf_block_t::mutex. */
 # endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
 };
 
 /** The buffer control block structure */
 
-struct buf_block_struct{
+struct buf_block_t{
 
 	/** @name General fields */
 	/* @{ */
@@ -1622,14 +1625,13 @@ struct buf_block_struct{
 					a block is in the unzip_LRU list
 					if page.state == BUF_BLOCK_FILE_PAGE
 					and page.zip.data != NULL */
-//#ifdef UNIV_DEBUG
+#ifdef UNIV_DEBUG
 	ibool		in_unzip_LRU_list;/*!< TRUE if the page is in the
 					decompressed LRU list;
 					used in debugging */
-//#endif /* UNIV_DEBUG */
-	mutex_t		mutex;		/*!< mutex protecting this block:
-					state (also protected by the buffer
-					pool mutex), io_fix, buf_fix_count,
+#endif /* UNIV_DEBUG */
+	ib_mutex_t		mutex;		/*!< mutex protecting this block:
+					state, io_fix, buf_fix_count,
 					and accessed; we introduce this new
 					mutex in InnoDB-5.1 to relieve
 					contention on the buffer pool mutex */
@@ -1638,8 +1640,8 @@ struct buf_block_struct{
 	unsigned	lock_hash_val:32;/*!< hashed value of the page address
 					in the record lock hash table;
 					protected by buf_block_t::lock
-					(or buf_block_t::mutex, buf_pool->mutex
-				        in buf_page_get_gen(),
+					(or buf_block_t::mutex in
+					buf_page_get_gen(),
 					buf_page_init_for_read()
 					and buf_page_create()) */
 	ibool		check_index_page_at_flush;
@@ -1662,8 +1664,8 @@ struct buf_block_struct{
 					positioning: if the modify clock has
 					not changed, we know that the pointer
 					is still valid; this field may be
-					changed if the thread (1) owns the
-					pool mutex and the page is not
+					changed if the thread (1) owns the LRU
+					list mutex and the page is not
 					bufferfixed, or (2) the thread has an
 					x-latch on the block */
 	/* @} */
@@ -1686,11 +1688,11 @@ struct buf_block_struct{
 	/** @name Hash search fields
 	These 5 fields may only be modified when we have
 	an x-latch on btr_search_latch AND
-	- we are holding an s-latch or x-latch on buf_block_struct::lock or
-	- we know that buf_block_struct::buf_fix_count == 0.
+	- we are holding an s-latch or x-latch on buf_block_t::lock or
+	- we know that buf_block_t::buf_fix_count == 0.
 
 	An exception to this is when we init or create a page
-	in the buffer pool in buf0buf.c.
+	in the buffer pool in buf0buf.cc.
 
 	Another exception is that assigning block->index = NULL
 	is allowed whenever holding an x-latch on btr_search_latch. */
@@ -1745,25 +1747,36 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */
 #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
 /* @} */
 
-/** A chunk of buffers.  The buffer pool is allocated in chunks. */
-struct buf_chunk_struct{
-	ulint		mem_size;	/*!< allocated size of the chunk */
-	ulint		size;		/*!< size of frames[] and blocks[] */
-	void*		mem;		/*!< pointer to the memory area which
-					was allocated for the frames */
-	buf_block_t*	blocks;		/*!< array of buffer control blocks */
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+	union {
+		ulint	size;	/*!< size of the block */
+		byte	bytes[FIL_PAGE_DATA];
+				/*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+				== BUF_BUDDY_FREE_STAMP denotes a free
+				block. If the space_id field of buddy
+				block != BUF_BUDDY_FREE_STAMP, the block
+				is not in any zip_free list. If the
+				space_id is BUF_BUDDY_FREE_STAMP then
+				stamp[0] will contain the
+				buddy block size. */
+	} stamp;
+
+	buf_page_t	bpage;	/*!< Embedded bpage descriptor */
+	UT_LIST_NODE_T(buf_buddy_free_t) list;
+				/*!< Node of zip_free list */
 };
 
 /** @brief The buffer pool statistics structure. */
-struct buf_pool_stat_struct{
+struct buf_pool_stat_t{
 	ulint	n_page_gets;	/*!< number of page gets performed;
 				also successful searches through
 				the adaptive hash index are
-				counted as page gets; this field
-				is NOT protected by the buffer
-				pool mutex */
-	ulint	n_pages_read;	/*!< number read operations */
-	ulint	n_pages_written;/*!< number write operations */
+				counted as page gets. */
+	ulint	n_pages_read;	/*!< number read operations.  Accessed
+				atomically. */
+	ulint	n_pages_written;/*!< number write operations.  Accessed
+				atomically.*/
 	ulint	n_pages_created;/*!< number of pages created
 				in the pool with no read */
 	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
@@ -1781,10 +1794,11 @@ struct buf_pool_stat_struct{
 				buf_page_peek_if_too_old() */
 	ulint	LRU_bytes;	/*!< LRU size in bytes */
 	ulint	flush_list_bytes;/*!< flush_list size in bytes */
+	ulint	buf_lru_flush_page_count;
 };
 
 /** Statistics of buddy blocks of a given size. */
-struct buf_buddy_stat_struct {
+struct buf_buddy_stat_t {
 	/** Number of blocks allocated from the buddy system. */
 	ulint		used;
 	/** Number of blocks relocated by the buddy system. */
@@ -1798,21 +1812,20 @@ struct buf_buddy_stat_struct {
 NOTE! The definition appears here only for other modules of this
 directory (buf) to see it. Do not use from outside! */
 
-struct buf_pool_struct{
+struct buf_pool_t{
 
 	/** @name General fields */
 	/* @{ */
-	mutex_t		mutex;		/*!< Buffer pool mutex of this
-					instance */
-	mutex_t		zip_mutex;	/*!< Zip mutex of this buffer
+	ib_mutex_t		zip_mutex;	/*!< Zip mutex of this buffer
 					pool instance, protects compressed
 					only pages (of type buf_page_t, not
 					buf_block_t */
-	mutex_t		LRU_list_mutex;
-	rw_lock_t	page_hash_latch;
-	mutex_t		free_list_mutex;
-	mutex_t		zip_free_mutex;
-	mutex_t		zip_hash_mutex;
+	ib_prio_mutex_t	LRU_list_mutex;
+	ib_prio_mutex_t	free_list_mutex;
+	ib_mutex_t	zip_free_mutex;
+	ib_mutex_t	zip_hash_mutex;
+	ib_mutex_t	flush_state_mutex;	/*!< Flush state protection
+					mutex */
 	ulint		instance_no;	/*!< Array index of this buffer
 					pool instance */
 	ulint		old_pool_size;  /*!< Old pool size in bytes */
@@ -1823,30 +1836,34 @@ struct buf_pool_struct{
 	ulint		buddy_n_frames; /*!< Number of frames allocated from
 					the buffer pool to the buddy system */
 #endif
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-	ulint		mutex_exit_forbidden; /*!< Forbid release mutex */
-#endif
 	ulint		n_chunks;	/*!< number of buffer pool chunks */
 	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
 	ulint		curr_size;	/*!< current pool size in pages */
+	ulint		read_ahead_area;/*!< size in pages of the area which
+					the read-ahead algorithms read if
+					invoked */
 	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
 					buf_block_t file pages,
 					buf_page_in_file() == TRUE,
-					indexed by (space_id, offset) */
+					indexed by (space_id, offset).
+					page_hash is protected by an
+					array of rw-locks. */
 	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
 					whose frames are allocated to the
 					zip buddy system,
 					indexed by block->frame */
 	ulint		n_pend_reads;	/*!< number of pending read
-					operations */
-	ulint		n_pend_unzip;	/*!< number of pending decompressions */
+					operations.  Accessed atomically */
+	ulint		n_pend_unzip;	/*!< number of pending decompressions.
+					Accessed atomically */
 
 	time_t		last_printout_time;
 					/*!< when buf_print_io was last time
-					called */
+					called.  Accesses not protected */
 	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
 					/*!< Statistics of buddy system,
-					indexed by block size */
+					indexed by block size.  Protected by
+					zip_free_mutex. */
 	buf_pool_stat_t	stat;		/*!< current statistics */
 	buf_pool_stat_t	old_stat;	/*!< old statistics */
 
@@ -1856,22 +1873,29 @@ struct buf_pool_struct{
 
 	/* @{ */
 
-	mutex_t		flush_list_mutex;/*!< mutex protecting the
+	ib_mutex_t		flush_list_mutex;/*!< mutex protecting the
 					flush list access. This mutex
 					protects flush_list, flush_rbt
 					and bpage::list pointers when
 					the bpage is on flush_list. It
 					also protects writes to
-					bpage::oldest_modification */
+					bpage::oldest_modification and
+					flush_list_hp */
+	const buf_page_t*	flush_list_hp;/*!< "hazard pointer"
+					used during scan of flush_list
+					while doing flush list batch.
+					Protected by flush_list_mutex */
 	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
 					/*!< base node of the modified block
 					list */
 	ibool		init_flush[BUF_FLUSH_N_TYPES];
 					/*!< this is TRUE when a flush of the
-					given type is being initialized */
+					given type is being initialized.
+					Protected by flush_state_mutex.  */
 	ulint		n_flush[BUF_FLUSH_N_TYPES];
 					/*!< this is the number of pending
-					writes in the given flush type */
+					writes in the given flush type.
+					Protected by flush_state_mutex.  */
 	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
 					/*!< this is in the set state
 					when there is no flush batch
@@ -1898,11 +1922,17 @@ struct buf_pool_struct{
 					billion! A thread is allowed
 					to read this for heuristic
 					purposes without holding any
-					mutex or latch */
-	ulint		LRU_flush_ended;/*!< when an LRU flush ends for a page,
-					this is incremented by one; this is
-					set to zero when a buffer block is
-					allocated */
+					mutex or latch.  For non-heuristic
+					purposes protected by LRU_list_mutex */
+	ibool		try_LRU_scan;	/*!< Set to FALSE when an LRU
+					scan for free block fails. This
+					flag is used to avoid repeated
+					scans of LRU list when we know
+					that there is no free block
+					available in the scan depth for
+					eviction. Set to TRUE whenever
+					we flush a batch from the
+					buffer pool. Accessed atomically. */
 	/* @} */
 
 	/** @name LRU replacement algorithm fields */
@@ -1923,14 +1953,15 @@ struct buf_pool_struct{
 	ulint		LRU_old_len;	/*!< length of the LRU list from
 					the block to which LRU_old points
 					onward, including that block;
-					see buf0lru.c for the restrictions
+					see buf0lru.cc for the restrictions
 					on this value; 0 if LRU_old == NULL;
 					NOTE: LRU_old_len must be adjusted
 					whenever LRU_old shrinks or grows! */
 
 	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
 					/*!< base node of the
-					unzip_LRU list */
+					unzip_LRU list.  The list is protected
+					by LRU list mutex. */
 
 	/* @} */
 	/** @name Buddy allocator fields
@@ -1942,35 +1973,23 @@ struct buf_pool_struct{
 	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
 					/*!< unmodified compressed pages */
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
+	UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
 					/*!< buddy free lists */
 
-	buf_page_t			watch[BUF_POOL_WATCH_SIZE];
+	buf_page_t*			watch;
 					/*!< Sentinel records for buffer
-					pool watches. Protected by
-				       	buf_pool->mutex. */
-
-//#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
-//# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
-//#endif
-#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
-# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
+					pool watches.  */
+
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
 #endif
 	/* @} */
 };
 
-/** @name Accessors for buf_pool->mutex.
-Use these instead of accessing buf_pool->mutex directly. */
+/** @name Accessors for buffer pool mutexes
+Use these instead of accessing buffer pool mutexes directly. */
 /* @{ */
 
-/** Test if a buffer pool mutex is owned. */
-#define buf_pool_mutex_own(b) mutex_own(&b->mutex)
-/** Acquire a buffer pool mutex. */
-/* the buf_pool_mutex is changed the latch order */
-#define buf_pool_mutex_enter(b) do {		\
-	mutex_enter(&b->mutex);		\
-} while (0)
-
 /** Test if flush list mutex is owned. */
 #define buf_flush_list_mutex_own(b) mutex_own(&b->flush_list_mutex)
 
@@ -1985,31 +2004,47 @@ Use these instead of accessing buf_pool->mutex directly. */
 
 
 
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/** Forbid the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_forbid(b) do {	\
-	ut_ad(buf_pool_mutex_own(b));		\
-	b->mutex_exit_forbidden++;		\
-} while (0)
-/** Allow the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_allow(b) do {	\
-	ut_ad(buf_pool_mutex_own(b));		\
-	ut_a(b->mutex_exit_forbidden);	\
-	b->mutex_exit_forbidden--;		\
-} while (0)
-/** Release the buffer pool mutex. */
-# define buf_pool_mutex_exit(b) do {		\
-	ut_a(!b->mutex_exit_forbidden);		\
-	mutex_exit(&b->mutex);			\
-} while (0)
-#else
-/** Forbid the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_forbid(b) ((void) 0)
-/** Allow the release of the buffer pool mutex. */
-# define buf_pool_mutex_exit_allow(b) ((void) 0)
-/** Release the buffer pool mutex. */
-# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex)
-#endif
+/** Get appropriate page_hash_lock. */
+# define buf_page_hash_lock_get(b, f)		\
+	hash_get_lock(b->page_hash, f)
+
+#ifdef UNIV_SYNC_DEBUG
+/** Test if page_hash lock is held in s-mode. */
+# define buf_page_hash_lock_held_s(b, p)		\
+	rw_lock_own(buf_page_hash_lock_get(b,		\
+		  buf_page_address_fold(p->space,	\
+					p->offset)),	\
+					RW_LOCK_SHARED)
+
+/** Test if page_hash lock is held in x-mode. */
+# define buf_page_hash_lock_held_x(b, p)		\
+	rw_lock_own(buf_page_hash_lock_get(b,		\
+		  buf_page_address_fold(p->space,	\
+					p->offset)),	\
+					RW_LOCK_EX)
+
+/** Test if page_hash lock is held in x or s-mode. */
+# define buf_page_hash_lock_held_s_or_x(b, p)		\
+	(buf_page_hash_lock_held_s(b, p)		\
+	 || buf_page_hash_lock_held_x(b, p))
+
+# define buf_block_hash_lock_held_s(b, p)		\
+	buf_page_hash_lock_held_s(b, &(p->page))
+
+# define buf_block_hash_lock_held_x(b, p)		\
+	buf_page_hash_lock_held_x(b, &(p->page))
+
+# define buf_block_hash_lock_held_s_or_x(b, p)		\
+	buf_page_hash_lock_held_s_or_x(b, &(p->page))
+#else /* UNIV_SYNC_DEBUG */
+# define buf_page_hash_lock_held_s(b, p)	(TRUE)
+# define buf_page_hash_lock_held_x(b, p)	(TRUE)
+# define buf_page_hash_lock_held_s_or_x(b, p)	(TRUE)
+# define buf_block_hash_lock_held_s(b, p)	(TRUE)
+# define buf_block_hash_lock_held_x(b, p)	(TRUE)
+# define buf_block_hash_lock_held_s_or_x(b, p)	(TRUE)
+#endif /* UNIV_SYNC_DEBUG */
+
 #endif /* !UNIV_HOTBACKUP */
 /* @} */
 
@@ -2057,6 +2092,32 @@ FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
 				(3) io_fix == 0.
 */
 
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Functor to validate the LRU list. */
+struct	CheckInLRUList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_LRU_list);
+	}
+};
+
+/** Functor to validate the free list. */
+struct	CheckInFreeList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_free_list);
+	}
+};
+
+struct	CheckUnzipLRUAndLRUList {
+	void	operator()(const buf_block_t* elem) const
+	{
+                ut_a(elem->page.in_LRU_list);
+                ut_a(elem->in_unzip_LRU_list);
+	}
+};
+#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */
+
 #ifndef UNIV_NONINL
 #include "buf0buf.ic"
 #endif
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
index 18c46b6412e..4ef354b11ab 100644
--- a/storage/xtradb/include/buf0buf.ic
+++ b/storage/xtradb/include/buf0buf.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -35,8 +35,18 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0flu.h"
 #include "buf0lru.h"
 #include "buf0rea.h"
+
+/** A chunk of buffers. The buffer pool is allocated in chunks. */
+struct buf_chunk_t{
+	ulint		mem_size;	/*!< allocated size of the chunk */
+	ulint		size;		/*!< size of frames[] and blocks[] */
+	void*		mem;		/*!< pointer to the memory area which
+					was allocated for the frames */
+	buf_block_t*	blocks;		/*!< array of buffer control blocks */
+};
+
+
 #include "srv0srv.h"
-#include "buf0types.h"
 
 /*********************************************************************//**
 Gets the current size of buffer buf_pool in bytes.
@@ -111,7 +121,7 @@ buf_page_get_freed_page_clock(
 /*==========================*/
 	const buf_page_t*	bpage)	/*!< in: block */
 {
-	/* This is sometimes read without holding buf_pool->mutex. */
+	/* This is sometimes read without holding any buffer pool mutex. */
 	return(bpage->freed_page_clock);
 }
 
@@ -163,7 +173,7 @@ buf_page_peek_if_too_old(
 {
 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
 
-	if (UNIV_UNLIKELY(buf_pool->freed_page_clock == 0)) {
+	if (buf_pool->freed_page_clock == 0) {
 		/* If eviction has not started yet, do not update the
 		statistics or move blocks in the LRU list.  This is
 		either the warm-up phase or an in-memory workload. */
@@ -198,7 +208,7 @@ buf_page_get_state(
 
 #ifdef UNIV_DEBUG
 	switch (state) {
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
@@ -238,7 +248,7 @@ buf_page_set_state(
 	enum buf_page_state	old_state	= buf_page_get_state(bpage);
 
 	switch (old_state) {
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 		ut_error;
 		break;
 	case BUF_BLOCK_ZIP_PAGE:
@@ -293,10 +303,8 @@ buf_page_in_file(
 	const buf_page_t*	bpage)	/*!< in: pointer to control block */
 {
 	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_FREE:
-		/* This is a free page in buf_pool->zip_free[].
-		Such pages should only be accessed by the buddy allocator. */
-		/* ut_error; */ /* optimistic */
+	case BUF_BLOCK_POOL_WATCH:
+		ut_error;
 		break;
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
@@ -332,23 +340,16 @@ buf_page_belongs_to_unzip_LRU(
 Gets the mutex of a block.
 @return	pointer to mutex protecting bpage */
 UNIV_INLINE
-mutex_t*
+ib_mutex_t*
 buf_page_get_mutex(
 /*===============*/
 	const buf_page_t*	bpage)	/*!< in: pointer to control block */
 {
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
-	if (/*equivalent to buf_pool_watch_is_sentinel(buf_pool, bpage)*/
-	    bpage >= &buf_pool->watch[0]
-	    && bpage < &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
-		/* TODO: this code is the interim. should be confirmed later. */
-		return(&buf_pool->zip_mutex);
-	}
-
 	switch (buf_page_get_state(bpage)) {
-	case BUF_BLOCK_ZIP_FREE:
-		/* ut_error; */ /* optimistic */
+	case BUF_BLOCK_POOL_WATCH:
+		ut_error;
 		return(NULL);
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
@@ -358,47 +359,25 @@ buf_page_get_mutex(
 	}
 }
 
-/*************************************************************************
-Gets the mutex of a block and enter the mutex with consistency. */
-UNIV_INLINE
-mutex_t*
-buf_page_get_mutex_enter(
-/*=========================*/
-	const buf_page_t*	bpage)	/*!< in: pointer to control block */
-{
-	mutex_t*	block_mutex;
-
-	while(1) {
-		block_mutex = buf_page_get_mutex(bpage);
-		if (!block_mutex)
-			return block_mutex;
-
-		mutex_enter(block_mutex);
-		if (block_mutex == buf_page_get_mutex(bpage))
-			return block_mutex;
-		mutex_exit(block_mutex);
-	}
-}
-
 /*********************************************************************//**
 Get the flush type of a page.
 @return	flush type */
 UNIV_INLINE
-enum buf_flush
+buf_flush_t
 buf_page_get_flush_type(
 /*====================*/
 	const buf_page_t*	bpage)	/*!< in: buffer page */
 {
-	enum buf_flush	flush_type = (enum buf_flush) bpage->flush_type;
+	buf_flush_t	flush_type = (buf_flush_t) bpage->flush_type;
 
 #ifdef UNIV_DEBUG
 	switch (flush_type) {
 	case BUF_FLUSH_LRU:
-	case BUF_FLUSH_SINGLE_PAGE:
 	case BUF_FLUSH_LIST:
+	case BUF_FLUSH_SINGLE_PAGE:
 		return(flush_type);
 	case BUF_FLUSH_N_TYPES:
-		break;
+		ut_error;
 	}
 	ut_error;
 #endif /* UNIV_DEBUG */
@@ -411,7 +390,7 @@ void
 buf_page_set_flush_type(
 /*====================*/
 	buf_page_t*	bpage,		/*!< in: buffer page */
-	enum buf_flush	flush_type)	/*!< in: flush type */
+	buf_flush_t	flush_type)	/*!< in: flush type */
 {
 	bpage->flush_type = flush_type;
 	ut_ad(buf_page_get_flush_type(bpage) == flush_type);
@@ -433,7 +412,7 @@ buf_block_set_file_page(
 }
 
 /*********************************************************************//**
-Gets the io_fix state of a block.  Requires that the block mutex is held.
+Gets the io_fix state of a block.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -446,8 +425,9 @@ buf_page_get_io_fix(
 }
 
 /*********************************************************************//**
-Gets the io_fix state of a block.  Does not assert that the block mutex is
-held, to be used in the cases where it is safe not to hold it.
+Gets the io_fix state of a block.  Does not assert that the
+buf_page_get_mutex() mutex is held, to be used in the cases where it is safe
+not to hold it.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -470,7 +450,7 @@ buf_page_get_io_fix_unlocked(
 }
 
 /*********************************************************************//**
-Gets the io_fix state of a block.  Requires that the block mutex is held.
+Gets the io_fix state of a block.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -482,8 +462,9 @@ buf_block_get_io_fix(
 }
 
 /*********************************************************************//**
-Gets the io_fix state of a block.  Does not assert that the block mutex is
-held, to be used in the cases where it is safe not to hold it.
+Gets the io_fix state of a block.  Does not assert that the
+buf_page_get_mutex() mutex is held, to be used in the cases where it is safe
+not to hold it.
 @return	io_fix state */
 UNIV_INLINE
 enum buf_io_fix
@@ -494,6 +475,7 @@ buf_block_get_io_fix_unlocked(
 	return(buf_page_get_io_fix_unlocked(&block->page));
 }
 
+
 /*********************************************************************//**
 Sets the io_fix state of a block. */
 UNIV_INLINE
@@ -503,10 +485,6 @@ buf_page_set_io_fix(
 	buf_page_t*	bpage,	/*!< in/out: control block */
 	enum buf_io_fix	io_fix)	/*!< in: io_fix state */
 {
-#ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 
 	bpage->io_fix = io_fix;
@@ -527,7 +505,7 @@ buf_block_set_io_fix(
 
 /*********************************************************************//**
 Makes a block sticky. A sticky block implies that even after we release
-the buf_pool->mutex and the block->mutex:
+the buf_pool->LRU_list_mutex and the block->mutex:
 * it cannot be removed from the flush_list
 * the block descriptor cannot be relocated
 * it cannot be removed from the LRU list
@@ -546,6 +524,7 @@ buf_page_set_sticky(
 #endif
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+	ut_ad(bpage->in_LRU_list);
 
 	bpage->io_fix = BUF_IO_PIN;
 }
@@ -558,10 +537,6 @@ buf_page_unset_sticky(
 /*==================*/
 	buf_page_t*	bpage)	/*!< in/out: control block */
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-#endif
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
 
@@ -577,15 +552,11 @@ buf_page_can_relocate(
 /*==================*/
 	const buf_page_t*	bpage)	/*!< control block being relocated */
 {
-#ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_ad(buf_page_in_file(bpage));
-	//ut_ad(bpage->in_LRU_list);
+	ut_ad(bpage->in_LRU_list);
 
-	return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE
+	return(buf_page_get_io_fix(bpage) == BUF_IO_NONE
 	       && bpage->buf_fix_count == 0);
 }
 
@@ -599,9 +570,13 @@ buf_page_is_old(
 	const buf_page_t*	bpage)	/*!< in: control block */
 {
 #ifdef UNIV_DEBUG
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 #endif
+	/* Buffer page mutex is not strictly required here for heuristic
+	purposes even if LRU mutex is not being held.  Keep the assertion
+	for now since all the callers hold it.  */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage))
+	      || mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(buf_page_in_file(bpage));
 
 	return(bpage->old);
@@ -620,7 +595,6 @@ buf_page_set_old(
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 #endif /* UNIV_DEBUG */
 	ut_a(buf_page_in_file(bpage));
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
 	ut_ad(bpage->in_LRU_list);
 
@@ -666,11 +640,7 @@ buf_page_set_accessed(
 /*==================*/
 	buf_page_t*	bpage)		/*!< in/out: control block */
 {
-#ifdef UNIV_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
-	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-#endif
 	ut_a(buf_page_in_file(bpage));
 
 	if (!bpage->access_time) {
@@ -689,7 +659,7 @@ buf_page_get_block(
 /*===============*/
 	buf_page_t*	bpage)	/*!< in: control block, or NULL */
 {
-	if (UNIV_LIKELY(bpage != NULL)) {
+	if (bpage != NULL) {
 		ut_ad(buf_page_in_file(bpage));
 
 		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
@@ -714,7 +684,7 @@ buf_block_get_frame(
 	SRV_CORRUPT_TABLE_CHECK(block, return(0););
 
 	switch (buf_block_get_state(block)) {
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
 	case BUF_BLOCK_NOT_USED:
@@ -780,6 +750,23 @@ buf_page_get_page_no(
 
 	return(bpage->offset);
 }
+/***********************************************************************
+FIXME_FTS Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+                        /* out: pointer to frame */
+        byte*   ptr)    /* in: pointer to a frame */
+{
+        buf_frame_t*    frame;
+
+        ut_ad(ptr);
+
+        frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+
+        return(frame);
+}
 
 /*********************************************************************//**
 Gets the page number of a block.
@@ -805,7 +792,8 @@ buf_page_get_zip_size(
 /*==================*/
 	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
 {
-	return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0);
+	return(bpage->zip.ssize
+	       ? (UNIV_ZIP_SIZE_MIN >> 1) << bpage->zip.ssize : 0);
 }
 
 /*********************************************************************//**
@@ -817,7 +805,8 @@ buf_block_get_zip_size(
 /*===================*/
 	const buf_block_t*	block)	/*!< in: pointer to the control block */
 {
-	return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0);
+	return(block->page.zip.ssize
+	       ? (UNIV_ZIP_SIZE_MIN >> 1) << block->page.zip.ssize : 0);
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -913,19 +902,13 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block)	/*!< in, own: block to be freed */
 {
-	//buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*)block);
-
-	//buf_pool_mutex_enter(buf_pool);
-
 	mutex_enter(&block->mutex);
 
 	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
 
-	buf_LRU_block_free_non_file_page(block, FALSE);
+	buf_LRU_block_free_non_file_page(block);
 
 	mutex_exit(&block->mutex);
-
-	//buf_pool_mutex_exit(buf_pool);
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -966,31 +949,31 @@ Gets the youngest modification log sequence number for a frame.
 Returns zero if not file page or no modification occurred yet.
 @return	newest modification to page */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 buf_page_get_newest_modification(
 /*=============================*/
 	const buf_page_t*	bpage)	/*!< in: block containing the
 					page frame */
 {
-	ib_uint64_t	lsn;
-	mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
+	lsn_t		lsn;
+	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
-	if (block_mutex && buf_page_in_file(bpage)) {
+	mutex_enter(block_mutex);
+
+	if (buf_page_in_file(bpage)) {
 		lsn = bpage->newest_modification;
 	} else {
 		lsn = 0;
 	}
 
-	if (block_mutex) {
-		mutex_exit(block_mutex);
-	}
+	mutex_exit(block_mutex);
 
 	return(lsn);
 }
 
 /********************************************************************//**
 Increments the modify clock of a frame by 1. The caller must (1) own the
-buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+LRU list mutex and block bufferfix count has to be zero, (2) or own an x-lock
 on the block. */
 UNIV_INLINE
 void
@@ -999,7 +982,7 @@ buf_block_modify_clock_inc(
 	buf_block_t*	block)	/*!< in: block */
 {
 #ifdef UNIV_SYNC_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*)block);
+	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*) block);
 
 	ut_ad((mutex_own(&buf_pool->LRU_list_mutex)
 	       && (block->page.buf_fix_count == 0))
@@ -1108,22 +1091,24 @@ UNIV_INLINE
 buf_page_t*
 buf_page_hash_get_low(
 /*==================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		space,		/*!< in: space id */
-	ulint		offset,		/*!< in: offset of the page
-					within space */
-	ulint		fold)		/*!< in: buf_page_address_fold(
-					space, offset) */
+	buf_pool_t*	buf_pool,/*!< buffer pool instance */
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space */
+	ulint		fold)	/*!< in: buf_page_address_fold(space, offset) */
 {
 	buf_page_t*	bpage;
 
-	ut_ad(buf_pool);
-	//ut_ad(buf_pool_mutex_own(buf_pool));
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX)
-	      || rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED));
-#endif
-	ut_ad(fold == buf_page_address_fold(space, offset));
+	ulint		hash_fold;
+	prio_rw_lock_t*	hash_lock;
+
+	hash_fold = buf_page_address_fold(space, offset);
+	ut_ad(hash_fold == fold);
+
+	hash_lock = hash_get_lock(buf_pool->page_hash, fold);
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)
+	      || rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
 	/* Look for the page in the hash table */
 
@@ -1148,46 +1133,145 @@ buf_page_hash_get_low(
 
 /******************************************************************//**
 Returns the control block of a file page, NULL if not found.
-@return	block, NULL if not found or not a real control block */
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return	block, NULL if not found */
 UNIV_INLINE
 buf_page_t*
-buf_page_hash_get(
-/*==============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_page_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset)		/*!< in: offset of the page
-					within space */
-{
-	buf_page_t*	bpage;
-	ulint		fold	= buf_page_address_fold(space, offset);
+	ulint		offset,		/*!< in: page number */
+	prio_rw_lock_t**	lock,	/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode)	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
+{
+	buf_page_t*	bpage = NULL;
+	ulint		fold;
+	prio_rw_lock_t*	hash_lock;
+	ulint		mode = RW_LOCK_SHARED;
+
+	if (lock != NULL) {
+		*lock = NULL;
+		ut_ad(lock_mode == RW_LOCK_EX
+		      || lock_mode == RW_LOCK_SHARED);
+		mode = lock_mode;
+	}
 
-	bpage	= buf_page_hash_get_low(buf_pool, space, offset, fold);
+	fold = buf_page_address_fold(space, offset);
+	hash_lock = hash_get_lock(buf_pool->page_hash, fold);
 
-	if (bpage && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+	      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (mode == RW_LOCK_SHARED) {
+		rw_lock_s_lock(hash_lock);
+	} else {
+		rw_lock_x_lock(hash_lock);
+	}
+
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+	if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) {
 		bpage = NULL;
+		goto unlock_and_exit;
+	}
+
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(offset == bpage->offset);
+	ut_ad(space == bpage->space);
+
+	if (lock == NULL) {
+		/* The caller wants us to release the page_hash lock */
+		goto unlock_and_exit;
+	} else {
+		/* To be released by the caller */
+		*lock = hash_lock;
+		goto exit;
 	}
 
+unlock_and_exit:
+	if (mode == RW_LOCK_SHARED) {
+		rw_lock_s_unlock(hash_lock);
+	} else {
+		rw_lock_x_unlock(hash_lock);
+	}
+exit:
 	return(bpage);
 }
 
 /******************************************************************//**
-Returns the control block of a file page, NULL if not found
-or an uncompressed page frame does not exist.
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
 @return	block, NULL if not found */
 UNIV_INLINE
 buf_block_t*
-buf_block_hash_get(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_block_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the block,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset)		/*!< in: offset of the page
-					within space */
-{
-	buf_block_t*	block;
+	ulint		offset,		/*!< in: page number */
+	prio_rw_lock_t**	lock,	/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode)	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
+{
+	buf_page_t*	bpage = buf_page_hash_get_locked(buf_pool,
+							 space,
+							 offset,
+							 lock,
+							 lock_mode);
+	buf_block_t*	block = buf_page_get_block(bpage);
+
+	if (block) {
+		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!lock || rw_lock_own(*lock, lock_mode));
+#endif /* UNIV_SYNC_DEBUG */
+		return(block);
+	} else if (bpage) {
+		/* It is not a block. Just a bpage; lock may be NULL */
+		ut_ad(buf_page_in_file(bpage));
 
-	block = buf_page_get_block(buf_page_hash_get(buf_pool, space, offset));
+		if (lock) {
+			if (lock_mode == RW_LOCK_SHARED) {
+				rw_lock_s_unlock(*lock);
+			} else {
+				rw_lock_x_unlock(*lock);
+			}
+			*lock = NULL;
+		}
+		return(NULL);
+	}
 
-	return(block);
+	ut_ad(!bpage);
+	ut_ad(lock == NULL || *lock == NULL);
+	return(NULL);
 }
 
 /********************************************************************//**
@@ -1204,18 +1288,9 @@ buf_page_peek(
 	ulint	space,	/*!< in: space id */
 	ulint	offset)	/*!< in: page number */
 {
-	const buf_page_t*	bpage;
 	buf_pool_t*		buf_pool = buf_pool_get(space, offset);
 
-	//buf_pool_mutex_enter(buf_pool);
-	rw_lock_s_lock(&buf_pool->page_hash_latch);
-
-	bpage = buf_page_hash_get(buf_pool, space, offset);
-
-	//buf_pool_mutex_exit(buf_pool);
-	rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
-	return(bpage != NULL);
+	return(buf_page_hash_get(buf_pool, space, offset) != NULL);
 }
 
 /********************************************************************//**
@@ -1248,7 +1323,7 @@ buf_page_release_zip(
 		bpage->buf_fix_count--;
 		mutex_exit(&block->mutex);
 		return;
-	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_NOT_USED:
 	case BUF_BLOCK_READY_FOR_USE:
 	case BUF_BLOCK_MEMORY:
@@ -1256,7 +1331,6 @@ buf_page_release_zip(
 		break;
 	}
 
-	
 	ut_error;
 }
 
@@ -1308,73 +1382,6 @@ buf_block_dbg_add_level(
 	sync_thread_add_level(&block->lock, level, FALSE);
 }
 #endif /* UNIV_SYNC_DEBUG */
-/********************************************************************//**
-Acquire mutex on all buffer pool instances. */
-UNIV_INLINE
-void
-buf_pool_mutex_enter_all(void)
-/*==========================*/
-{
-	ulint   i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_pool_mutex_enter(buf_pool);
-	}
-}
-
-/********************************************************************//**
-Release mutex on all buffer pool instances. */
-UNIV_INLINE
-void
-buf_pool_mutex_exit_all(void)
-/*=========================*/
-{
-	ulint   i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		buf_pool_mutex_exit(buf_pool);
-	}
-}
-
-/********************************************************************//**
-*/
-UNIV_INLINE
-void
-buf_pool_page_hash_x_lock_all(void)
-/*===============================*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		rw_lock_x_lock(&buf_pool->page_hash_latch);
-	}
-}
-
-/********************************************************************//**
-*/
-UNIV_INLINE
-void
-buf_pool_page_hash_x_unlock_all(void)
-/*=================================*/
-{
-	ulint	i;
-
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-		rw_lock_x_unlock(&buf_pool->page_hash_latch);
-	}
-}
 /*********************************************************************//**
 Get the nth chunk's buffer block in the specified buffer pool.
 @return the nth chunk's buffer block. */
@@ -1392,4 +1399,26 @@ buf_get_nth_chunk_block(
 	*chunk_size = chunk->size;
 	return(chunk->blocks);
 }
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Checks if buf_pool->zip_mutex is owned and is serving for a given page as its
+block mutex.
+@return true if buf_pool->zip_mutex is owned. */
+UNIV_INLINE
+bool
+buf_own_zip_mutex_for_page(
+/*=======================*/
+	const buf_page_t*	bpage)
+{
+	buf_pool_t*	buf_pool	= buf_pool_from_bpage(bpage);
+
+	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE
+	      || buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+	ut_ad(buf_page_get_mutex(bpage) == &buf_pool->zip_mutex);
+
+	return(mutex_own(&buf_pool->zip_mutex));
+}
+#endif /* UNIV_DEBUG */
+
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0checksum.h b/storage/xtradb/include/buf0checksum.h
new file mode 100644
index 00000000000..cd21781dc6e
--- /dev/null
+++ b/storage/xtradb/include/buf0checksum.h
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0checksum_h
+#define buf0checksum_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "buf0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return	algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+	srv_checksum_algorithm_t	algo);	/*!< in: algorithm */
+
+extern ulong	srv_checksum_algorithm;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* buf0checksum_h */
diff --git a/storage/xtradb/include/buf0dblwr.h b/storage/xtradb/include/buf0dblwr.h
new file mode 100644
index 00000000000..1b9336f4002
--- /dev/null
+++ b/storage/xtradb/include/buf0dblwr.h
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#ifndef buf0dblwr_h
+#define buf0dblwr_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "log0log.h"
+
+#ifndef UNIV_HOTBACKUP
+
+/** Doublewrite system */
+extern buf_dblwr_t*	buf_dblwr;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool		buf_dblwr_being_created;
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void);
+/*==================*/
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+buf_dblwr_init_or_restore_pages(
+/*============================*/
+	ibool	restore_corrupt_pages);	/*!< in: TRUE=restore pages */
+/****************************************************************//**
+Frees the doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void);
+/*================*/
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
+	buf_flush_t		flush_type);/*!< in: flush type */
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+	ulint	page_no);	/*!< in: page number */
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+	buf_page_t*	bpage);	/*!< in: buffer block to write */
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void);
+/*=================================*/
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, sync it, then write
+the page to the datafile and sync the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+	buf_page_t*	bpage,	/*!< in: buffer block to write */
+	bool		sync);	/*!< in: true if sync IO requested */
+
+/** Doublewrite control struct */
+struct buf_dblwr_t{
+	ib_mutex_t	mutex;	/*!< mutex protecting the first_free
+				field and write_buf */
+	ulint		block1;	/*!< the page number of the first
+				doublewrite block (64 pages) */
+	ulint		block2;	/*!< page number of the second block */
+	ulint		first_free;/*!< first free position in write_buf
+				measured in units of UNIV_PAGE_SIZE */
+	ulint		b_reserved;/*!< number of slots currently reserved
+				for batch flush. */
+	os_event_t	b_event;/*!< event where threads wait for a
+				batch flush to end. */
+	ulint		s_reserved;/*!< number of slots currently
+				reserved for single page flushes. */
+	os_event_t	s_event;/*!< event where threads wait for a
+				single page flush slot. */
+	bool*		in_use;	/*!< flag used to indicate if a slot is
+				in use. Only used for single page
+				flushes. */
+	bool		batch_running;/*!< set to TRUE if currently a batch
+				is being written from the doublewrite
+				buffer. */
+	byte*		write_buf;/*!< write buffer used in writing to the
+				doublewrite buffer, aligned to an
+				address divisible by UNIV_PAGE_SIZE
+				(which is required by Windows aio) */
+	byte*		write_buf_unaligned;/*!< pointer to write_buf,
+				but unaligned */
+	buf_page_t**	buf_block_arr;/*!< array to store pointers to
+				the buffer blocks which have been
+				cached to write_buf */
+};
+
+
+#endif /* UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/buf0dump.h b/storage/xtradb/include/buf0dump.h
new file mode 100644
index 00000000000..c704a8e97e0
--- /dev/null
+++ b/storage/xtradb/include/buf0dump.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+#include "univ.i"
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start();
+/*============*/
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start();
+/*============*/
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because the whole MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort();
+/*============*/
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and when woken up either performs a dump or load and sleeps
+again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+	void*	arg);				/*!< in: a dummy parameter
+						required by os_thread_create */
+
+#endif /* buf0dump_h */
diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h
index 81085ab9552..f4542e7c206 100644
--- a/storage/xtradb/include/buf0flu.h
+++ b/storage/xtradb/include/buf0flu.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,13 +28,16 @@ Created 11/5/1995 Heikki Tuuri
 
 #include "univ.i"
 #include "ut0byte.h"
+#include "log0log.h"
 #ifndef UNIV_HOTBACKUP
 #include "mtr0types.h"
 #include "buf0types.h"
-#include "log0log.h"
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern ibool buf_page_cleaner_is_active;
 
 /********************************************************************//**
-Remove a block from the flush list of modified blocks. */
+Remove a block from the flush list of modified blocks.  */
 UNIV_INTERN
 void
 buf_flush_remove(
@@ -57,23 +60,6 @@ void
 buf_flush_write_complete(
 /*=====================*/
 	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
-/*********************************************************************//**
-Flushes pages from the end of the LRU list if there is too small
-a margin of replaceable pages there. If buffer pool is NULL it
-means flush free margin on all buffer pool instances. */
-UNIV_INTERN
-void
-buf_flush_free_margin(
-/*==================*/
-	 buf_pool_t*	buf_pool,
-	ibool		wait);
-/*********************************************************************//**
-Flushes pages from the end of all the LRU lists. */
-UNIV_INTERN
-void
-buf_flush_free_margins(
-/*=========================*/
-	ibool		wait);
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Initializes a page for writing to the tablespace. */
@@ -81,17 +67,17 @@ UNIV_INTERN
 void
 buf_flush_init_for_writing(
 /*=======================*/
-	byte*		page,		/*!< in/out: page */
-	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
-	ib_uint64_t	newest_lsn);	/*!< in: newest modification lsn
-					to the page */
+	byte*	page,		/*!< in/out: page */
+	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
+	lsn_t	newest_lsn);	/*!< in: newest modification lsn
+				to the page */
 #ifndef UNIV_HOTBACKUP
 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
 /********************************************************************//**
 Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: buf_pool->mutex and block->mutex must be held upon entering this
-function, and they will be released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
+NOTE: block->mutex must be held upon entering this function, and it will be
+released by this function after flushing.  This is loosely based on
+buf_flush_batch() and buf_flush_page().
 @return TRUE if the page was flushed and the mutexes released */
 UNIV_INTERN
 ibool
@@ -102,38 +88,40 @@ buf_flush_page_try(
 	__attribute__((nonnull, warn_unused_result));
 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 /*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-NOTE: The calling thread may own latches to pages: to avoid deadlocks,
-this function must be written so that it cannot end up waiting for these
-latches!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
-ulint
-buf_flush_LRU(
-/*==========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		min_n);		/*!< in: wished minimum mumber of blocks
-					flushed (it is not guaranteed that the
-					actual number is that big, though) */
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the flush_list of
+This utility flushes dirty blocks from the end of the flush list of
 all buffer pool instances.
 NOTE: The calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
+@return true if a batch was queued successfully for each buffer pool
+instance. false if another batch of same type was already running in
+at least one of the buffer pool instances */
 UNIV_INTERN
-ulint
+bool
 buf_flush_list(
-/*============*/
+/*===========*/
 	ulint		min_n,		/*!< in: wished minimum mumber of blocks
 					flushed (it is not guaranteed that the
 					actual number is that big, though) */
-	ib_uint64_t	lsn_limit);	/*!< in the case BUF_FLUSH_LIST all
+	lsn_t		lsn_limit,	/*!< in the case BUF_FLUSH_LIST all
 					blocks whose oldest_modification is
 					smaller than this should be flushed
 					(if their number does not exceed
 					min_n), otherwise ignored */
+	ulint*		n_processed);	/*!< out: the number of pages
+					which were processed is passed
+					back to caller. Ignored if NULL */
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+	buf_pool_t*	buf_pool);	/*!< in/out: buffer pool instance */
 /******************************************************************//**
 Waits until a flush batch of the given type ends */
 UNIV_INTERN
@@ -141,7 +129,7 @@ void
 buf_flush_wait_batch_end(
 /*=====================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	enum buf_flush	type);		/*!< in: BUF_FLUSH_LRU
+	buf_flush_t	type);		/*!< in: BUF_FLUSH_LRU
 					or BUF_FLUSH_LIST */
 /******************************************************************//**
 Waits until a flush batch of the given type ends. This is called by
@@ -152,7 +140,7 @@ void
 buf_flush_wait_batch_end_wait_only(
 /*===============================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	enum buf_flush	type);		/*!< in: BUF_FLUSH_LRU
+	buf_flush_t	type);		/*!< in: BUF_FLUSH_LRU
 					or BUF_FLUSH_LIST */
 /********************************************************************//**
 This function should be called at a mini-transaction commit, if a page was
@@ -171,9 +159,9 @@ void
 buf_flush_recv_note_modification(
 /*=============================*/
 	buf_block_t*	block,		/*!< in: block which is modified */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
 					set of mtr's */
-	ib_uint64_t	end_lsn);	/*!< in: end lsn of the last mtr in the
+	lsn_t		end_lsn);	/*!< in: end lsn of the last mtr in the
 					set of mtr's */
 /********************************************************************//**
 Returns TRUE if the file page block is immediately suitable for replacement,
@@ -185,43 +173,33 @@ buf_flush_ready_for_replace(
 /*========================*/
 	buf_page_t*	bpage);	/*!< in: buffer control block, must be
 				buf_page_in_file(bpage) and in the LRU list */
-
-/** @brief Statistics for selecting flush rate based on redo log
-generation speed.
-
-These statistics are generated for heuristics used in estimating the
-rate at which we should flush the dirty blocks to avoid bursty IO
-activity. Note that the rate of flushing not only depends on how many
-dirty pages we have in the buffer pool but it is also a fucntion of
-how much redo the workload is generating and at what rate. */
-
-struct buf_flush_stat_struct
-{
-	ib_uint64_t	redo;		/**< amount of redo generated. */
-	ulint		n_flushed;	/**< number of pages flushed. */
-};
-
-/** Statistics for selecting flush rate of dirty pages. */
-typedef struct buf_flush_stat_struct buf_flush_stat_t;
-/*********************************************************************
-Update the historical stats that we are collecting for flush rate
-heuristics at the end of each interval. */
-UNIV_INTERN
-void
-buf_flush_stat_update(void);
-/*=======================*/
-/*********************************************************************
-Determines the fraction of dirty pages that need to be flushed based
-on the speed at which we generate redo log. Note that if redo log
-is generated at significant rate without a corresponding increase
-in the number of dirty pages (for example, an in-memory workload)
-it can cause IO bursts of flushing. This function implements heuristics
-to avoid this burstiness.
-@return	number of dirty pages to be flushed / second */
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+	void*	arg);		/*!< in: a dummy parameter required by
+				os_thread_create */
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
 UNIV_INTERN
 ulint
-buf_flush_get_desired_flush_rate(void);
-/*==================================*/
+buf_flush_LRU_tail(void);
+/*====================*/
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void);
+/*==============================*/
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /******************************************************************//**
@@ -250,16 +228,66 @@ void
 buf_flush_free_flush_rbt(void);
 /*==========================*/
 
-/** When buf_flush_free_margin is called, it tries to make this many blocks
-available to replacement in the free list and at the end of the LRU list (to
-make sure that a read-ahead batch can be read efficiently in a single
-sweep). */
-#define BUF_FLUSH_FREE_BLOCK_MARGIN(b)	(5 + BUF_READ_AHEAD_AREA(b))
-/** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */
-#define BUF_FLUSH_EXTRA_MARGIN(b)	((BUF_FLUSH_FREE_BLOCK_MARGIN(b) / 4 \
-					+ 100) / srv_buf_pool_instances)
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this
+function, and it will be released by this function. */
+UNIV_INTERN
+void
+buf_flush_page(
+/*===========*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_page_t*	bpage,		/*!< in: buffer control block */
+	buf_flush_t	flush_type,	/*!< in: type of flush */
+	bool		sync)		/*!< in: true if sync IO request */
+	__attribute__((nonnull));
+/********************************************************************//**
+Returns true if the block is modified and ready for flushing.
+@return	true if can flush immediately */
+UNIV_INTERN
+bool
+buf_flush_ready_for_flush(
+/*======================*/
+	buf_page_t*	bpage,	/*!< in: buffer control block, must be
+				buf_page_in_file(bpage) */
+	buf_flush_t	flush_type)/*!< in: type of flush */
+	__attribute__((warn_unused_result));
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush
+list in a particular buffer pool.
+@return	number of dirty pages present in a single buffer pool */
+UNIV_INTERN
+ulint
+buf_pool_get_dirty_pages_count(
+/*===========================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
+	ulint		id);		/*!< in: space id to check */
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush list.
+@return	count of dirty pages present in all the buffer pools */
+UNIV_INTERN
+ulint
+buf_flush_get_dirty_pages_count(
+/*============================*/
+	ulint		id);		/*!< in: space id to check */
+#endif /* UNIV_DEBUG */
+
 #endif /* !UNIV_HOTBACKUP */
 
+/******************************************************************//**
+Check if a flush list flush is in progress for any buffer pool instance for
+heuristic purposes.
+@return true if flush list flush is in progress  */
+UNIV_INLINE
+bool
+buf_flush_flush_list_in_progress(void)
+/*==================================*/
+	__attribute__((warn_unused_result));
+
 #ifndef UNIV_NONINL
 #include "buf0flu.ic"
 #endif
diff --git a/storage/xtradb/include/buf0flu.ic b/storage/xtradb/include/buf0flu.ic
index c8d95d1849c..b1e64def462 100644
--- a/storage/xtradb/include/buf0flu.ic
+++ b/storage/xtradb/include/buf0flu.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,7 @@ Created 11/5/1995 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "buf0buf.h"
 #include "mtr0mtr.h"
+#include "srv0srv.h"
 
 /********************************************************************//**
 Inserts a modified block into the flush list. */
@@ -35,7 +36,7 @@ buf_flush_insert_into_flush_list(
 /*=============================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn);		/*!< in: oldest modification */
+	lsn_t		lsn);		/*!< in: oldest modification */
 /********************************************************************//**
 Inserts a modified block into the flush list in the right sorted position.
 This function is used by recovery, because there the modifications do not
@@ -46,7 +47,7 @@ buf_flush_insert_sorted_into_flush_list(
 /*====================================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn);		/*!< in: oldest modification */
+	lsn_t		lsn);		/*!< in: oldest modification */
 
 /********************************************************************//**
 This function should be called at a mini-transaction commit, if a page was
@@ -61,14 +62,13 @@ buf_flush_note_modification(
 {
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 
-	ut_ad(block);
+	ut_ad(!srv_read_only_mode);
 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 	ut_ad(block->page.buf_fix_count > 0);
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
 	ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
 
@@ -81,6 +81,8 @@ buf_flush_note_modification(
 	block->page.newest_modification = mtr->end_lsn;
 
 	if (!block->page.oldest_modification) {
+		ut_a(mtr->made_dirty);
+		ut_ad(log_flush_order_mutex_own());
 		buf_flush_insert_into_flush_list(
 			buf_pool, block, mtr->start_lsn);
 	} else {
@@ -89,7 +91,7 @@ buf_flush_note_modification(
 
 	mutex_exit(&block->mutex);
 
-	++srv_buf_pool_write_requests;
+	srv_stats.buf_pool_write_requests.inc();
 }
 
 /********************************************************************//**
@@ -99,21 +101,20 @@ void
 buf_flush_recv_note_modification(
 /*=============================*/
 	buf_block_t*	block,		/*!< in: block which is modified */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
 					set of mtr's */
-	ib_uint64_t	end_lsn)	/*!< in: end lsn of the last mtr in the
+	lsn_t		end_lsn)	/*!< in: end lsn of the last mtr in the
 					set of mtr's */
 {
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 
-	ut_ad(block);
+	ut_ad(!srv_read_only_mode);
 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 	ut_ad(block->page.buf_fix_count > 0);
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
 	ut_ad(log_flush_order_mutex_own());
 
@@ -134,3 +135,24 @@ buf_flush_recv_note_modification(
 
 }
 #endif /* !UNIV_HOTBACKUP */
+
+/******************************************************************//**
+Check if a flush list flush is in progress for any buffer pool instance,
+for heuristic purposes. No mutex is acquired here while reading the fields.
+@return true if flush list flush is in progress */
+UNIV_INLINE
+bool
+buf_flush_flush_list_in_progress(void)
+/*==================================*/
+{
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		/* Look at each buffer pool instance in turn. */
+		const buf_pool_t* buf_pool = buf_pool_from_array(i);
+		if (buf_pool->init_flush[BUF_FLUSH_LIST]
+		    || buf_pool->n_flush[BUF_FLUSH_LIST]) {
+			/* init_flush or n_flush is nonzero here. */
+			return(true);
+		}
+	}
+	return(false);
+}
diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h
index 4b415214fa5..6415540178c 100644
--- a/storage/xtradb/include/buf0lru.h
+++ b/storage/xtradb/include/buf0lru.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,22 +27,13 @@ Created 11/5/1995 Heikki Tuuri
 #define buf0lru_h
 
 #include "univ.i"
+#ifndef UNIV_HOTBACKUP
 #include "ut0byte.h"
 #include "buf0types.h"
 
-/******************************************************************//**
-Tries to remove LRU flushed blocks from the end of the LRU list and put them
-to the free list. This is beneficial for the efficiency of the insert buffer
-operation, as flushed pages from non-unique non-clustered indexes are here
-taken out of the buffer pool, and their inserts redirected to the insert
-buffer. Otherwise, the flushed blocks could get modified again before read
-operations need new buffer blocks, and the i/o work done in flushing would be
-wasted. */
-UNIV_INTERN
-void
-buf_LRU_try_free_flushed_blocks(
-/*============================*/
-	buf_pool_t*	buf_pool);	/*!< in: buffer pool instance */
+// Forward declaration
+struct trx_t;
+
 /******************************************************************//**
 Returns TRUE if less than 25 % of the buffer pool is available. This can be
 used in heuristics to prevent huge transactions eating up the whole buffer
@@ -60,18 +51,19 @@ These are low-level functions
 /** Minimum LRU list length for which the LRU_old pointer is defined */
 #define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
 
-/** Maximum LRU list search length in buf_flush_LRU_recommendation() */
-#define BUF_LRU_FREE_SEARCH_LEN(b)	(5 + 2 * BUF_READ_AHEAD_AREA(b))
-
 /******************************************************************//**
-Removes all pages belonging to a given tablespace. */
+Flushes all dirty pages or removes all pages belonging
+to a given tablespace. A PROBLEM: if readahead is being started, what
+guarantees that it will not try to read in pages after this operation
+has completed? */
 UNIV_INTERN
 void
 buf_LRU_flush_or_remove_pages(
 /*==========================*/
-	ulint			id,	/*!< in: space id */
-	enum buf_remove_t	buf_remove);/*!< in: remove or flush
-					strategy */
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove,	/*!< in: remove or flush strategy */
+	const trx_t*	trx);		/*!< to check if the operation must
+					be interrupted */
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /********************************************************************//**
@@ -87,40 +79,35 @@ buf_LRU_insert_zip_clean(
 Try to free a block.  If bpage is a descriptor of a compressed-only
 page, the descriptor object will be freed as well.
 
-NOTE: This will temporarily release buf_pool_mutex.  Furthermore, the
-page frame will no longer be accessible via bpage.
+NOTE: If this function returns true, it will release the LRU list mutex,
+and temporarily release and relock the buf_page_get_mutex() mutex.
+Furthermore, the page frame will no longer be accessible via bpage.  If this
+function returns false, the buf_page_get_mutex() might be temporarily released
+and relocked too.
+
+The caller must hold the LRU list and buf_page_get_mutex() mutexes.
 
-The caller must hold buf_page_get_mutex(bpage) and release this mutex
-after the call.  No other buf_page_get_mutex() may be held when
-calling this function.
-@return TRUE if freed, FALSE otherwise. */
+@return true if freed, false otherwise. */
 UNIV_INTERN
-ibool
-buf_LRU_free_block(
-/*===============*/
+bool
+buf_LRU_free_page(
+/*==============*/
 	buf_page_t*	bpage,	/*!< in: block to be freed */
-	ibool		zip,	/*!< in: TRUE if should remove also the
+	bool		zip)	/*!< in: true if should remove also the
 				compressed page of an uncompressed page */
-	ibool*		have_LRU_mutex)
 	__attribute__((nonnull));
 /******************************************************************//**
 Try to free a replaceable block.
 @return	TRUE if found and freed */
 UNIV_INTERN
 ibool
-buf_LRU_search_and_free_block(
-/*==========================*/
+buf_LRU_scan_and_free_block(
+/*========================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		n_iterations);	/*!< in: how many times this has
-					been called repeatedly without
-					result: a high value means that
-					we should search farther; if
-					n_iterations < 10, then we search
-					n_iterations / 10 * buf_pool->curr_size
-					pages from the end of the LRU list; if
-					n_iterations < 5, then we will
-					also search n_iterations / 5
-					of the unzip_LRU list. */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					'old' blocks. */
+	__attribute__((nonnull,warn_unused_result));
 /******************************************************************//**
 Returns a free block from the buf_pool.  The block is taken off the
 free list.  If it is empty, returns NULL.
@@ -134,6 +121,27 @@ buf_LRU_get_free_only(
 Returns a free block from the buf_pool. The block is taken off the
 free list. If it is empty, blocks are moved from the end of the
 LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from free list, success:done
+  * if there is an LRU flush batch in progress:
+    * wait for batch to end: retry free list
+  * if buf_pool->try_LRU_scan is set
+    * scan LRU up to srv_LRU_scan_depth to find a clean block
+    * the above will put the block on free list
+    * success:retry the free list
+  * flush one dirty page from tail of LRU to disk
+    * the above will put the block on free list
+    * success: retry the free list
+* iteration 1:
+  * same as iteration 0 except:
+    * scan whole LRU list
+    * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+  * same as iteration 1 but sleep 100ms
 @return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
 UNIV_INTERN
 buf_block_t*
@@ -141,15 +149,22 @@ buf_LRU_get_free_block(
 /*===================*/
 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
 	__attribute__((nonnull,warn_unused_result));
-
+/******************************************************************//**
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list.
+@return	TRUE if should use unzip_LRU */
+UNIV_INTERN
+ibool
+buf_LRU_evict_from_unzip_LRU(
+/*=========================*/
+	buf_pool_t*	buf_pool);
 /******************************************************************//**
 Puts a block back to the free list. */
 UNIV_INTERN
 void
 buf_LRU_block_free_non_file_page(
 /*=============================*/
-	buf_block_t*	block,	/*!< in: block, must not contain a file page */
-	ibool		have_page_hash_mutex);
+	buf_block_t*	block);	/*!< in: block, must not contain a file page */
 /******************************************************************//**
 Adds a block to the LRU list. Please make sure that the zip_size is
 already set into the page zip when invoking the function, so that we
@@ -206,18 +221,6 @@ UNIV_INTERN
 void
 buf_LRU_stat_update(void);
 /*=====================*/
-/********************************************************************//**
-Dump the LRU page list to the specific file. */
-UNIV_INTERN
-ibool
-buf_LRU_file_dump(void);
-/*===================*/
-/********************************************************************//**
-Read the pages based on the specific file.*/
-UNIV_INTERN
-ibool
-buf_LRU_file_restore(void);
-/*======================*/
 
 /******************************************************************//**
 Remove one page from LRU list and put it to free list */
@@ -279,21 +282,18 @@ extern uint	buf_LRU_old_threshold_ms;
 These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
 and page_zip_decompress() operations.  Based on the statistics we decide
 if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */
-struct buf_LRU_stat_struct
+struct buf_LRU_stat_t
 {
 	ulint	io;	/**< Counter of buffer pool I/O operations. */
 	ulint	unzip;	/**< Counter of page_zip_decompress operations. */
 };
 
-/** Statistics for selecting the LRU list for eviction. */
-typedef struct buf_LRU_stat_struct buf_LRU_stat_t;
-
 /** Current operation counters.  Not protected by any mutex.
 Cleared by buf_LRU_stat_update(). */
 extern buf_LRU_stat_t	buf_LRU_stat_cur;
 
 /** Running sum of past values of buf_LRU_stat_cur.
-Updated by buf_LRU_stat_update().  Protected by buf_pool->mutex. */
+Updated by buf_LRU_stat_update().  */
 extern buf_LRU_stat_t	buf_LRU_stat_sum;
 
 /********************************************************************//**
@@ -307,4 +307,6 @@ Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
 #include "buf0lru.ic"
 #endif
 
+#endif /* !UNIV_HOTBACKUP */
+
 #endif
diff --git a/storage/xtradb/include/buf0lru.ic b/storage/xtradb/include/buf0lru.ic
index d1a89b9fbee..6e0da7a2588 100644
--- a/storage/xtradb/include/buf0lru.ic
+++ b/storage/xtradb/include/buf0lru.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/buf0rea.h b/storage/xtradb/include/buf0rea.h
index 613b89e9f5c..9adeaa7455a 100644
--- a/storage/xtradb/include/buf0rea.h
+++ b/storage/xtradb/include/buf0rea.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,38 +27,20 @@ Created 11/5/1995 Heikki Tuuri
 #define buf0rea_h
 
 #include "univ.i"
-#include "trx0types.h"
 #include "buf0types.h"
 
 /********************************************************************//**
-Low-level function which reads a page asynchronously from a file to the
-buffer buf_pool if it is not already there, in which case does nothing.
-Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
-flag is cleared and the x-lock released by an i/o-handler thread.
-@return 1 if a read request was queued, 0 if the page already resided
-in buf_pool, or if the page is in the doublewrite buffer blocks in
-which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped 
-@return 1 if read request is issued. 0 if it is not */
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
 UNIV_INTERN
-ulint
-buf_read_page_low(
-/*==============*/
-	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
-			trying to read from a non-existent tablespace, or a
-			tablespace which is just now being dropped */
-	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
-	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
-			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
-			at read-ahead functions) */
+ibool
+buf_read_page(
+/*==========*/
 	ulint	space,	/*!< in: space id */
-	ulint	zip_size,/*!< in: compressed page size, or 0 */
-	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
-	ib_int64_t tablespace_version, /*!< in: if the space memory object has
-			this timestamp different from what we are giving here,
-			treat the tablespace as dropped; this is a timestamp we
-			use to stop dangling page reads from a tablespace
-			which we have DISCARDed + IMPORTed back */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
 	ulint	offset,	/*!< in: page number */
 	trx_t*	trx);
 /********************************************************************//**
@@ -69,12 +51,10 @@ released by the i/o-handler thread.
 @return TRUE if page has been read in, FALSE in case of failure */
 UNIV_INTERN
 ibool
-buf_read_page(
-/*==========*/
+buf_read_page_async(
+/*================*/
 	ulint	space,	/*!< in: space id */
-	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
-	ulint	offset, /*!< in: page number */
-	trx_t*	trx);
+	ulint	offset);/*!< in: page number */
 /********************************************************************//**
 Applies a random read-ahead in buf_pool if there are at least a threshold
 value of accessed pages from the random read-ahead area. Does not read any
@@ -142,7 +122,7 @@ UNIV_INTERN
 void
 buf_read_ibuf_merge_pages(
 /*======================*/
-	ibool		sync,		/*!< in: TRUE if the caller
+	bool		sync,		/*!< in: true if the caller
 					wants this function to wait
 					for the highest address page
 					to get read in, before this
@@ -184,13 +164,16 @@ buf_read_recv_pages(
 
 /** The size in pages of the area which the read-ahead algorithms read if
 invoked */
-#define	BUF_READ_AHEAD_AREA(b)		64
+#define	BUF_READ_AHEAD_AREA(b)		((b)->read_ahead_area)
 
 /** @name Modes used in read-ahead @{ */
 /** read only pages belonging to the insert buffer tree */
 #define BUF_READ_IBUF_PAGES_ONLY	131
 /** read any page */
 #define BUF_READ_ANY_PAGE		132
+/** read any page, but ignore (return an error) if a page does not exist
+instead of crashing like BUF_READ_ANY_PAGE does */
+#define BUF_READ_IGNORE_NONEXISTENT_PAGES 1024
 /* @} */
 
 #endif
diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h
index 9a0af8b648b..e19eb04a2ce 100644
--- a/storage/xtradb/include/buf0types.h
+++ b/storage/xtradb/include/buf0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,33 +26,45 @@ Created 11/17/1995 Heikki Tuuri
 #ifndef buf0types_h
 #define buf0types_h
 
-#include "page0types.h"
-
 /** Buffer page (uncompressed or compressed) */
-typedef	struct buf_page_struct		buf_page_t;
+struct buf_page_t;
 /** Buffer block for which an uncompressed page exists */
-typedef	struct buf_block_struct		buf_block_t;
+struct buf_block_t;
 /** Buffer pool chunk comprising buf_block_t */
-typedef struct buf_chunk_struct		buf_chunk_t;
+struct buf_chunk_t;
 /** Buffer pool comprising buf_chunk_t */
-typedef	struct buf_pool_struct		buf_pool_t;
+struct buf_pool_t;
 /** Buffer pool statistics struct */
-typedef	struct buf_pool_stat_struct	buf_pool_stat_t;
+struct buf_pool_stat_t;
 /** Buffer pool buddy statistics struct */
-typedef	struct buf_buddy_stat_struct	buf_buddy_stat_t;
+struct buf_buddy_stat_t;
+/** Doublewrite memory struct */
+struct buf_dblwr_t;
 
 /** A buffer frame. @see page_t */
 typedef	byte	buf_frame_t;
 
 /** Flags for flush types */
-enum buf_flush {
+enum buf_flush_t {
 	BUF_FLUSH_LRU = 0,		/*!< flush via the LRU list */
-	BUF_FLUSH_SINGLE_PAGE,		/*!< flush a single page */
 	BUF_FLUSH_LIST,			/*!< flush via the flush list
 					of dirty blocks */
+	BUF_FLUSH_SINGLE_PAGE,		/*!< flush via the LRU list
+					but only a single page */
 	BUF_FLUSH_N_TYPES		/*!< index of last element + 1  */
 };
 
+/** Algorithm to remove the pages for a tablespace from the buffer pool.
+See buf_LRU_flush_or_remove_pages(). */
+enum buf_remove_t {
+	BUF_REMOVE_ALL_NO_WRITE,	/*!< Remove all pages from the buffer
+					pool, don't write or sync to disk */
+	BUF_REMOVE_FLUSH_NO_WRITE,	/*!< Remove only, from the flush list,
+					don't write or sync to disk */
+	BUF_REMOVE_FLUSH_WRITE		/*!< Flush dirty pages to disk only,
+					don't remove from the buffer pool */
+};
+
 /** Flags for io_fix types */
 enum buf_io_fix {
 	BUF_IO_NONE = 0,		/**< no pending I/O */
@@ -63,30 +75,79 @@ enum buf_io_fix {
 					the flush_list */
 };
 
-/** Algorithm to remove the pages for a tablespace from the buffer pool.
-@See buf_LRU_flush_or_remove_pages(). */
-enum buf_remove_t {
-	BUF_REMOVE_ALL_NO_WRITE,	/*!< Remove all pages from the buffer
-					pool, don't write or sync to disk */
-	BUF_REMOVE_FLUSH_NO_WRITE	/*!< Remove only, from the flush list,
-					don't write or sync to disk */
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+	SRV_CHECKSUM_ALGORITHM_CRC32,		/*!< Write crc32, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,	/*!< Write crc32, allow crc32
+						when reading */
+	SRV_CHECKSUM_ALGORITHM_INNODB,		/*!< Write innodb, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_INNODB,	/*!< Write innodb, allow
+						innodb when reading */
+	SRV_CHECKSUM_ALGORITHM_NONE,		/*!< Write none, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_NONE	/*!< Write none, allow none
+						when reading */
+};
+
+/** Alternatives for srv_cleaner_lsn_age_factor, set through the
+innodb_cleaner_lsn_age_factor variable */
+enum srv_cleaner_lsn_age_factor_t {
+	SRV_CLEANER_LSN_AGE_FACTOR_LEGACY,	/*!< Original Oracle MySQL 5.6
+						formula */
+	SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT
+						/*!< Percona Server 5.6 formula
+						that returns lower values than
+						the legacy option for low
+						checkpoint ages, and higher
+						values for high ages.  This
+						stabilizes the checkpoint age
+						at a higher level. */
+};
+
+/** Alternatives for srv_foreground_preflush, set through the
+innodb_foreground_preflush variable */
+enum srv_foreground_preflush_t {
+	SRV_FOREGROUND_PREFLUSH_SYNC_PREFLUSH,	/*!< Original Oracle MySQL 5.6
+						behavior of performing a sync
+						flush list flush */
+	SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF	/*!< Exponential backoff wait
+						for the page cleaner to flush
+						for us */
+};
+
+/** Alternatives for srv_empty_free_list_algorithm, set through the
+innodb_empty_free_list_algorithm variable */
+enum srv_empty_free_list_t {
+	SRV_EMPTY_FREE_LIST_LEGACY,	/*!< Original Oracle MySQL 5.6
+					algorithm */
+	SRV_EMPTY_FREE_LIST_BACKOFF	/*!< Percona Server 5.6 algorithm that
+					loops in a progressive backoff until a
+					free page is produced by the cleaner
+					thread */
+ };
 
 /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
 /* @{ */
-#define BUF_BUDDY_LOW_SHIFT	PAGE_ZIP_MIN_SIZE_SHIFT
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT	UNIV_ZIP_SIZE_SHIFT_MIN
 
-#define BUF_BUDDY_LOW		(1 << BUF_BUDDY_LOW_SHIFT)
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW		(1U << BUF_BUDDY_LOW_SHIFT)
 
+/** Actual number of buddy sizes based on current page size */
 #define BUF_BUDDY_SIZES		(UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
-#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT)
-					/*!< number of buddy sizes */
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX	\
+				- BUF_BUDDY_LOW_SHIFT)
 
 /** twice the maximum block size of the buddy system;
 the underlying memory is aligned by this amount:
 this must be equal to UNIV_PAGE_SIZE */
-#define BUF_BUDDY_HIGH	((ulint)BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+#define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
 /* @} */
 
-#endif
-
+#endif /* buf0types.h */
diff --git a/storage/xtradb/include/data0data.h b/storage/xtradb/include/data0data.h
index c6e864dafc9..a548c7b89b3 100644
--- a/storage/xtradb/include/data0data.h
+++ b/storage/xtradb/include/data0data.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,7 +35,7 @@ Created 5/30/1994 Heikki Tuuri
 
 /** Storage for overflow data in a big record, that is, a clustered
 index record which needs external storage of data fields */
-typedef struct big_rec_struct		big_rec_t;
+struct big_rec_t;
 
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
@@ -45,7 +45,8 @@ UNIV_INLINE
 dtype_t*
 dfield_get_type(
 /*============*/
-	const dfield_t*	field);	/*!< in: SQL data field */
+	const dfield_t*	field)	/*!< in: SQL data field */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets pointer to the data in a field.
 @return	pointer to data */
@@ -53,7 +54,8 @@ UNIV_INLINE
 void*
 dfield_get_data(
 /*============*/
-	const dfield_t* field);	/*!< in: field */
+	const dfield_t* field)	/*!< in: field */
+	__attribute__((nonnull, warn_unused_result));
 #else /* UNIV_DEBUG */
 # define dfield_get_type(field) (&(field)->type)
 # define dfield_get_data(field) ((field)->data)
@@ -65,7 +67,8 @@ void
 dfield_set_type(
 /*============*/
 	dfield_t*	field,	/*!< in: SQL data field */
-	dtype_t*	type);	/*!< in: pointer to data type struct */
+	const dtype_t*	type)	/*!< in: pointer to data type struct */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Gets length of field data.
 @return	length of data; UNIV_SQL_NULL if SQL null data */
@@ -73,7 +76,8 @@ UNIV_INLINE
 ulint
 dfield_get_len(
 /*===========*/
-	const dfield_t* field);	/*!< in: field */
+	const dfield_t* field)	/*!< in: field */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets length in a field. */
 UNIV_INLINE
@@ -81,7 +85,8 @@ void
 dfield_set_len(
 /*===========*/
 	dfield_t*	field,	/*!< in: field */
-	ulint		len);	/*!< in: length or UNIV_SQL_NULL */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Determines if a field is SQL NULL
 @return	nonzero if SQL null data */
@@ -89,7 +94,8 @@ UNIV_INLINE
 ulint
 dfield_is_null(
 /*===========*/
-	const dfield_t* field);	/*!< in: field */
+	const dfield_t* field)	/*!< in: field */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Determines if a field is externally stored
 @return	nonzero if externally stored */
@@ -97,14 +103,16 @@ UNIV_INLINE
 ulint
 dfield_is_ext(
 /*==========*/
-	const dfield_t* field);	/*!< in: field */
+	const dfield_t* field)	/*!< in: field */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets the "external storage" flag */
 UNIV_INLINE
 void
 dfield_set_ext(
 /*===========*/
-	dfield_t*	field);	/*!< in/out: field */
+	dfield_t*	field)	/*!< in/out: field */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Sets pointer to the data and length in a field. */
 UNIV_INLINE
@@ -113,14 +121,16 @@ dfield_set_data(
 /*============*/
 	dfield_t*	field,	/*!< in: field */
 	const void*	data,	/*!< in: data */
-	ulint		len);	/*!< in: length or UNIV_SQL_NULL */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	__attribute__((nonnull(1)));
 /*********************************************************************//**
 Sets a data field to SQL NULL. */
 UNIV_INLINE
 void
 dfield_set_null(
 /*============*/
-	dfield_t*	field);	/*!< in/out: field */
+	dfield_t*	field)	/*!< in/out: field */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Writes an SQL null field full of zeros. */
 UNIV_INLINE
@@ -128,7 +138,8 @@ void
 data_write_sql_null(
 /*================*/
 	byte*	data,	/*!< in: pointer to a buffer of size len */
-	ulint	len);	/*!< in: SQL null size in bytes */
+	ulint	len)	/*!< in: SQL null size in bytes */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Copies the data and len fields. */
 UNIV_INLINE
@@ -136,7 +147,8 @@ void
 dfield_copy_data(
 /*=============*/
 	dfield_t*	field1,	/*!< out: field to copy to */
-	const dfield_t*	field2);/*!< in: field to copy from */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Copies a data field to another. */
 UNIV_INLINE
@@ -144,7 +156,8 @@ void
 dfield_copy(
 /*========*/
 	dfield_t*	field1,	/*!< out: field to copy to */
-	const dfield_t*	field2);/*!< in: field to copy from */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Copies the data pointed to by a data field. */
 UNIV_INLINE
@@ -152,7 +165,9 @@ void
 dfield_dup(
 /*=======*/
 	dfield_t*	field,	/*!< in/out: data field */
-	mem_heap_t*	heap);	/*!< in: memory heap where allocated */
+	mem_heap_t*	heap)	/*!< in: memory heap where allocated */
+	__attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Tests if two data fields are equal.
 If len==0, tests the data length and content for equality.
@@ -170,13 +185,15 @@ dfield_datas_are_binary_equal(
 /*********************************************************************//**
 Tests if dfield data length and content is equal to the given.
 @return	TRUE if equal */
-UNIV_INTERN
+UNIV_INLINE
 ibool
 dfield_data_is_binary_equal(
 /*========================*/
 	const dfield_t*	field,	/*!< in: field */
 	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
-	const byte*	data);	/*!< in: data */
+	const byte*	data)	/*!< in: data */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Gets number of fields in a data tuple.
 @return	number of fields */
@@ -184,7 +201,8 @@ UNIV_INLINE
 ulint
 dtuple_get_n_fields(
 /*================*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
 Gets nth field of a tuple.
@@ -205,7 +223,8 @@ UNIV_INLINE
 ulint
 dtuple_get_info_bits(
 /*=================*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets info bits in a data tuple. */
 UNIV_INLINE
@@ -213,7 +232,8 @@ void
 dtuple_set_info_bits(
 /*=================*/
 	dtuple_t*	tuple,		/*!< in: tuple */
-	ulint		info_bits);	/*!< in: info bits */
+	ulint		info_bits)	/*!< in: info bits */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Gets number of fields used in record comparisons.
 @return	number of fields used in comparisons in rem0cmp.* */
@@ -221,7 +241,8 @@ UNIV_INLINE
 ulint
 dtuple_get_n_fields_cmp(
 /*====================*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets number of fields used in record comparisons. */
 UNIV_INLINE
@@ -229,8 +250,9 @@ void
 dtuple_set_n_fields_cmp(
 /*====================*/
 	dtuple_t*	tuple,		/*!< in: tuple */
-	ulint		n_fields_cmp);	/*!< in: number of fields used in
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
 					comparisons in rem0cmp.* */
+	__attribute__((nonnull));
 
 /* Estimate the number of bytes that are going to be allocated when
 creating a new dtuple_t object */
@@ -249,7 +271,8 @@ dtuple_create_from_mem(
 /*===================*/
 	void*	buf,		/*!< in, out: buffer to use */
 	ulint	buf_size,	/*!< in: buffer size */
-	ulint	n_fields);	/*!< in: number of fields */
+	ulint	n_fields)	/*!< in: number of fields */
+	__attribute__((nonnull, warn_unused_result));
 
 /**********************************************************//**
 Creates a data tuple to a memory heap. The default value for number
@@ -262,19 +285,8 @@ dtuple_create(
 	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
 				is created, DTUPLE_EST_ALLOC(n_fields)
 				bytes will be allocated from this heap */
-	ulint		n_fields); /*!< in: number of fields */
-
-/**********************************************************//**
-Wrap data fields in a tuple. The default value for number
-of fields used in record comparisons for this tuple is n_fields.
-@return	data tuple */
-UNIV_INLINE
-const dtuple_t*
-dtuple_from_fields(
-/*===============*/
-	dtuple_t*	tuple,		/*!< in: storage for data tuple */
-	const dfield_t*	fields,		/*!< in: fields */
-	ulint		n_fields);	/*!< in: number of fields */
+	ulint		n_fields)/*!< in: number of fields */
+	__attribute__((nonnull, malloc));
 
 /*********************************************************************//**
 Sets number of fields used in a tuple. Normally this is set in
@@ -284,7 +296,8 @@ void
 dtuple_set_n_fields(
 /*================*/
 	dtuple_t*	tuple,		/*!< in: tuple */
-	ulint		n_fields);	/*!< in: number of fields */
+	ulint		n_fields)	/*!< in: number of fields */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Copies a data tuple to another.  This is a shallow copy; if a deep copy
 is desired, dfield_dup() will have to be invoked on each field.
@@ -294,8 +307,9 @@ dtuple_t*
 dtuple_copy(
 /*========*/
 	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
-	mem_heap_t*	heap);	/*!< in: memory heap
+	mem_heap_t*	heap)	/*!< in: memory heap
 				where the tuple is created */
+	__attribute__((nonnull, malloc));
 /**********************************************************//**
 The following function returns the sum of data lengths of a tuple. The space
 occupied by the field structs or the tuple struct is not counted.
@@ -305,7 +319,8 @@ ulint
 dtuple_get_data_size(
 /*=================*/
 	const dtuple_t*	tuple,	/*!< in: typed data tuple */
-	ulint		comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Computes the number of externally stored fields in a data tuple.
 @return	number of fields */
@@ -313,7 +328,8 @@ UNIV_INLINE
 ulint
 dtuple_get_n_ext(
 /*=============*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull));
 /************************************************************//**
 Compare two data tuples, respecting the collation of character fields.
 @return 1, 0 , -1 if tuple1 is greater, equal, less, respectively,
@@ -323,7 +339,8 @@ int
 dtuple_coll_cmp(
 /*============*/
 	const dtuple_t*	tuple1,	/*!< in: tuple 1 */
-	const dtuple_t*	tuple2);/*!< in: tuple 2 */
+	const dtuple_t*	tuple2)	/*!< in: tuple 2 */
+	__attribute__((nonnull, warn_unused_result));
 /************************************************************//**
 Folds a prefix given as the number of fields of a tuple.
 @return	the folded value */
@@ -336,7 +353,7 @@ dtuple_fold(
 	ulint		n_bytes,/*!< in: number of bytes to fold in an
 				incomplete last field */
 	index_id_t	tree_id)/*!< in: index tree id */
-	__attribute__((pure));
+	__attribute__((nonnull, pure, warn_unused_result));
 /*******************************************************************//**
 Sets types of fields binary in a tuple. */
 UNIV_INLINE
@@ -344,7 +361,8 @@ void
 dtuple_set_types_binary(
 /*====================*/
 	dtuple_t*	tuple,	/*!< in: data tuple */
-	ulint		n);	/*!< in: number of fields to set */
+	ulint		n)	/*!< in: number of fields to set */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Checks if a dtuple contains an SQL null value.
 @return	TRUE if some field is SQL null */
@@ -352,7 +370,8 @@ UNIV_INLINE
 ibool
 dtuple_contains_null(
 /*=================*/
-	const dtuple_t*	tuple);	/*!< in: dtuple */
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************//**
 Checks that a data field is typed. Asserts an error if not.
 @return	TRUE if ok */
@@ -360,7 +379,8 @@ UNIV_INTERN
 ibool
 dfield_check_typed(
 /*===============*/
-	const dfield_t*	field);	/*!< in: data field */
+	const dfield_t*	field)	/*!< in: data field */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************//**
 Checks that a data tuple is typed. Asserts an error if not.
 @return	TRUE if ok */
@@ -368,7 +388,8 @@ UNIV_INTERN
 ibool
 dtuple_check_typed(
 /*===============*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************//**
 Checks that a data tuple is typed.
 @return	TRUE if ok */
@@ -376,7 +397,8 @@ UNIV_INTERN
 ibool
 dtuple_check_typed_no_assert(
 /*=========================*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 #ifdef UNIV_DEBUG
 /**********************************************************//**
 Validates the consistency of a tuple which must be complete, i.e,
@@ -386,7 +408,8 @@ UNIV_INTERN
 ibool
 dtuple_validate(
 /*============*/
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* UNIV_DEBUG */
 /*************************************************************//**
 Pretty prints a dfield value according to its data type. */
@@ -394,7 +417,8 @@ UNIV_INTERN
 void
 dfield_print(
 /*=========*/
-	const dfield_t*	dfield);/*!< in: dfield */
+	const dfield_t*	dfield)	/*!< in: dfield */
+	__attribute__((nonnull));
 /*************************************************************//**
 Pretty prints a dfield value according to its data type. Also the hex string
 is printed if a string contains non-printable characters. */
@@ -402,7 +426,8 @@ UNIV_INTERN
 void
 dfield_print_also_hex(
 /*==================*/
-	const dfield_t*	dfield);	 /*!< in: dfield */
+	const dfield_t*	dfield)	 /*!< in: dfield */
+	__attribute__((nonnull));
 /**********************************************************//**
 The following function prints the contents of a tuple. */
 UNIV_INTERN
@@ -410,7 +435,8 @@ void
 dtuple_print(
 /*=========*/
 	FILE*		f,	/*!< in: output stream */
-	const dtuple_t*	tuple);	/*!< in: tuple */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull));
 /**************************************************************//**
 Moves parts of long fields in entry to the big record vector so that
 the size of tuple drops below the maximum record size allowed in the
@@ -425,8 +451,9 @@ dtuple_convert_big_rec(
 /*===================*/
 	dict_index_t*	index,	/*!< in: index */
 	dtuple_t*	entry,	/*!< in/out: index entry */
-	ulint*		n_ext);	/*!< in/out: number of
+	ulint*		n_ext)	/*!< in/out: number of
 				externally stored columns */
+	__attribute__((nonnull, malloc, warn_unused_result));
 /**************************************************************//**
 Puts back to entry the data stored in vector. Note that to ensure the
 fields in entry can accommodate the data, vector must have been created
@@ -437,21 +464,23 @@ dtuple_convert_back_big_rec(
 /*========================*/
 	dict_index_t*	index,	/*!< in: index */
 	dtuple_t*	entry,	/*!< in: entry whose data was put to vector */
-	big_rec_t*	vector);/*!< in, own: big rec vector; it is
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
 				freed in this function */
+	__attribute__((nonnull));
 /**************************************************************//**
 Frees the memory in a big rec vector. */
 UNIV_INLINE
 void
 dtuple_big_rec_free(
 /*================*/
-	big_rec_t*	vector);	/*!< in, own: big rec vector; it is
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
 				freed in this function */
+	__attribute__((nonnull));
 
 /*######################################################################*/
 
 /** Structure for an SQL data field */
-struct dfield_struct{
+struct dfield_t{
 	void*		data;	/*!< pointer to data */
 	unsigned	ext:1;	/*!< TRUE=externally stored, FALSE=local */
 	unsigned	len:32;	/*!< data length; UNIV_SQL_NULL if SQL null */
@@ -459,7 +488,7 @@ struct dfield_struct{
 };
 
 /** Structure for an SQL data tuple of fields (logical record) */
-struct dtuple_struct {
+struct dtuple_t {
 	ulint		info_bits;	/*!< info bits of an index record:
 					the default is 0; this field is used
 					if an index record is built from
@@ -479,15 +508,13 @@ struct dtuple_struct {
 #ifdef UNIV_DEBUG
 	ulint		magic_n;	/*!< magic number, used in
 					debug assertions */
-/** Value of dtuple_struct::magic_n */
+/** Value of dtuple_t::magic_n */
 # define		DATA_TUPLE_MAGIC_N	65478679
 #endif /* UNIV_DEBUG */
 };
 
 /** A slot for a field in a big rec vector */
-typedef struct big_rec_field_struct	big_rec_field_t;
-/** A slot for a field in a big rec vector */
-struct big_rec_field_struct {
+struct big_rec_field_t {
 	ulint		field_no;	/*!< field number in record */
 	ulint		len;		/*!< stored data length, in bytes */
 	const void*	data;		/*!< stored data */
@@ -495,7 +522,7 @@ struct big_rec_field_struct {
 
 /** Storage format for overflow data in a big record, that is, a
 clustered index record which needs external storage of data fields */
-struct big_rec_struct {
+struct big_rec_t {
 	mem_heap_t*	heap;		/*!< memory heap from which
 					allocated */
 	ulint		n_fields;	/*!< number of stored fields */
diff --git a/storage/xtradb/include/data0data.ic b/storage/xtradb/include/data0data.ic
index 2059eefaf89..6937d55d211 100644
--- a/storage/xtradb/include/data0data.ic
+++ b/storage/xtradb/include/data0data.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -54,7 +54,7 @@ void
 dfield_set_type(
 /*============*/
 	dfield_t*	field,	/*!< in: SQL data field */
-	dtype_t*	type)	/*!< in: pointer to data type struct */
+	const dtype_t*	type)	/*!< in: pointer to data type struct */
 {
 	ut_ad(field && type);
 
@@ -138,7 +138,7 @@ dfield_is_ext(
 {
 	ut_ad(field);
 
-	return(UNIV_UNLIKELY(field->ext));
+	return(field->ext);
 }
 
 /*********************************************************************//**
@@ -228,6 +228,7 @@ dfield_dup(
 	}
 }
 
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Tests if two data fields are equal.
 If len==0, tests the data length and content for equality.
@@ -258,6 +259,23 @@ dfield_datas_are_binary_equal(
 }
 
 /*********************************************************************//**
+Tests if a field has the given length and, unless SQL NULL, the given data.
+@return	TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+{
+	const ibool	same_len = (len == dfield_get_len(field));
+	return(same_len && (len == UNIV_SQL_NULL
+			    || !memcmp(dfield_get_data(field), data, len)));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
 Gets info bits in a data tuple.
 @return	info bits */
 UNIV_INLINE
@@ -389,6 +407,8 @@ dtuple_create_from_mem(
 		}
 	}
 #endif
+	UNIV_MEM_ASSERT_W(tuple->fields, n_fields * sizeof *tuple->fields);
+	UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
 	return(tuple);
 }
 
@@ -416,30 +436,6 @@ dtuple_create(
 
 	tuple = dtuple_create_from_mem(buf, buf_size, n_fields);
 
-#ifdef UNIV_DEBUG
-	UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
-#endif
-
-	return(tuple);
-}
-
-/**********************************************************//**
-Wrap data fields in a tuple. The default value for number
-of fields used in record comparisons for this tuple is n_fields.
-@return	data tuple */
-UNIV_INLINE
-const dtuple_t*
-dtuple_from_fields(
-/*===============*/
-	dtuple_t*	tuple,		/*!< in: storage for data tuple */
-	const dfield_t*	fields,		/*!< in: fields */
-	ulint		n_fields)	/*!< in: number of fields */
-{
-	tuple->info_bits = 0;
-	tuple->n_fields = tuple->n_fields_cmp = n_fields;
-	tuple->fields = (dfield_t*) fields;
-	ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N);
-
 	return(tuple);
 }
 
diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h
index 25d68de6646..111664b0b52 100644
--- a/storage/xtradb/include/data0type.h
+++ b/storage/xtradb/include/data0type.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -33,10 +33,20 @@ extern ulint	data_mysql_default_charset_coll;
 #define DATA_MYSQL_BINARY_CHARSET_COLL 63
 
 /* SQL data type struct */
-typedef struct dtype_struct		dtype_t;
+struct dtype_t;
+
+/** Comparison types of the SQL LIKE operator, by wildcard position */
+enum ib_like_t {
+	IB_LIKE_EXACT,                  /*!< no wildcard, e.g. STRING */
+	IB_LIKE_PREFIX,                 /*!< prefix match, e.g. STRING% */
+	IB_LIKE_SUFFIX,                 /*!< suffix match, e.g. %STRING */
+	IB_LIKE_SUBSTR,                 /*!< substring match, e.g. %STRING% */
+	IB_LIKE_REGEXP                  /*!< Future */
+};
 
 /*-------------------------------------------*/
 /* The 'MAIN TYPE' of a column */
+#define DATA_MISSING	0	/* missing column */
 #define	DATA_VARCHAR	1	/* character varying of the
 				latin1_swedish_ci charset-collation; note
 				that the MySQL format for this, DATA_BINARY,
@@ -139,6 +149,8 @@ be less than 256 */
 
 #define	DATA_N_SYS_COLS 3	/* number of system columns defined above */
 
+#define DATA_FTS_DOC_ID	3	/* Used as FTS DOC ID column */
+
 #define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
 
 /* Flags ORed to the precise data type */
@@ -182,6 +194,12 @@ because in GCC it returns a long. */
 /* Get mbmaxlen from mbminmaxlen. */
 #define DATA_MBMAXLEN(mbminmaxlen) ((ulint) ((mbminmaxlen) / DATA_MBMAX))
 
+/* Collation numbers are now 15 bits wide (values up to 32767) */
+#define MAX_CHAR_COLL_NUM	32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK		MAX_CHAR_COLL_NUM
+
 #ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Gets the MySQL type code from a dtype.
@@ -450,6 +468,20 @@ dtype_new_read_for_order_and_null_size(
 /*===================================*/
 	dtype_t*	type,	/*!< in: type struct */
 	const byte*	buf);	/*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Writes the type's SQL name (e.g. BIGINT UNSIGNED) built from mtype,prtype,len
+@return name, the caller's buffer holding the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+	unsigned	mtype,	/*!< in: main data type, DATA_* */
+	unsigned	prtype,	/*!< in: precise type with flags */
+	unsigned	len,	/*!< in: length, e.g. 4 gives INT */
+	char*		name,	/*!< out: buffer for the SQL name */
+	unsigned	name_sz);/*!< in: size of the name buffer */
+
 #endif /* !UNIV_HOTBACKUP */
 
 /*********************************************************************//**
@@ -476,15 +508,15 @@ dtype_read_for_order_and_null_size()
 dtype_new_read_for_order_and_null_size()
 sym_tab_add_null_lit() */
 
-struct dtype_struct{
-	unsigned	mtype:8;	/*!< main data type */
-	unsigned	prtype:24;	/*!< precise type; MySQL data
+struct dtype_t{
+	unsigned	prtype:32;	/*!< precise type; MySQL data
 					type, charset code, flags to
 					indicate nullability,
 					signedness, whether this is a
 					binary string, whether this is
 					a true VARCHAR where MySQL
 					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
 
 	/* the remaining fields do not affect alphabetical ordering: */
 
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
index 410970ac50e..d489bef89a8 100644
--- a/storage/xtradb/include/data0type.ic
+++ b/storage/xtradb/include/data0type.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,6 +23,8 @@ Data types
 Created 1/16/1996 Heikki Tuuri
 *******************************************************/
 
+#include <string.h> /* strlen() */
+
 #include "mach0data.h"
 #ifndef UNIV_HOTBACKUP
 # include "ha_prototypes.h"
@@ -36,7 +38,7 @@ dtype_get_charset_coll(
 /*===================*/
 	ulint	prtype)	/*!< in: precise data type */
 {
-	return((prtype >> 16) & 0xFFUL);
+	return((prtype >> 16) & CHAR_COLL_MASK);
 }
 
 /*********************************************************************//**
@@ -259,8 +261,8 @@ dtype_get_pad_char(
 	switch (mtype) {
 	case DATA_FIXBINARY:
 	case DATA_BINARY:
-		if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype)
-				  == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+		if (dtype_get_charset_coll(prtype)
+		    == DATA_MYSQL_BINARY_CHARSET_COLL) {
 			/* Starting from 5.0.18, do not pad
 			VARBINARY or BINARY columns. */
 			return(ULINT_UNDEFINED);
@@ -312,11 +314,11 @@ dtype_new_store_for_order_and_null_size(
 	buf[0] = (byte)(type->mtype & 0xFFUL);
 
 	if (type->prtype & DATA_BINARY_TYPE) {
-		buf[0] = buf[0] | 128;
+		buf[0] |= 128;
 	}
 
 	/* In versions < 4.1.2 we had:	if (type->prtype & DATA_NONLATIN1) {
-	buf[0] = buf[0] | 64;
+	buf[0] |= 64;
 	}
 	*/
 
@@ -326,7 +328,7 @@ dtype_new_store_for_order_and_null_size(
 
 	mach_write_to_2(buf + 2, len & 0xFFFFUL);
 
-	ut_ad(dtype_get_charset_coll(type->prtype) < 256);
+	ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
 	mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
 
 	if (type->prtype & DATA_NOT_NULL) {
@@ -353,7 +355,7 @@ dtype_read_for_order_and_null_size(
 	type->prtype = buf[1];
 
 	if (buf[0] & 128) {
-		type->prtype = type->prtype | DATA_BINARY_TYPE;
+		type->prtype |= DATA_BINARY_TYPE;
 	}
 
 	type->len = mach_read_from_2(buf + 2);
@@ -393,10 +395,10 @@ dtype_new_read_for_order_and_null_size(
 
 	type->len = mach_read_from_2(buf + 2);
 
-	charset_coll = mach_read_from_2(buf + 4) & 0x7fff;
+	charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
 
 	if (dtype_is_string_type(type->mtype)) {
-		ut_a(charset_coll < 256);
+		ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
 
 		if (charset_coll == 0) {
 			/* This insert buffer record was inserted with MySQL
@@ -412,6 +414,101 @@ dtype_new_read_for_order_and_null_size(
 	}
 	dtype_set_mblen(type);
 }
+
+/*********************************************************************//**
+Writes the type's SQL name (e.g. BIGINT UNSIGNED) built from mtype,prtype,len
+@return name; it is left untouched if name_sz is 0 */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+	unsigned	mtype,	/*!< in: mtype */
+	unsigned	prtype,	/*!< in: prtype */
+	unsigned	len,	/*!< in: len */
+	char*		name,	/*!< out: SQL name */
+	unsigned	name_sz)/*!< in: size of the name buffer */
+{
+/* Helper: appends " UNSIGNED" to name when prtype has DATA_UNSIGNED */
+#define APPEND_UNSIGNED()					\
+	do {							\
+		if (prtype & DATA_UNSIGNED) {			\
+			ut_snprintf(name + strlen(name),	\
+				    name_sz - strlen(name),	\
+				    " UNSIGNED");		\
+		}						\
+	} while (0)
+	if (name_sz == 0) {
+		return(name);	/* no room even for the terminating NUL */
+	}
+	ut_snprintf(name, name_sz, "UNKNOWN");
+
+	switch (mtype) {
+	case DATA_INT:
+		switch (len) {
+		case 1:
+			ut_snprintf(name, name_sz, "TINYINT");
+			break;
+		case 2:
+			ut_snprintf(name, name_sz, "SMALLINT");
+			break;
+		case 3:
+			ut_snprintf(name, name_sz, "MEDIUMINT");
+			break;
+		case 4:
+			ut_snprintf(name, name_sz, "INT");
+			break;
+		case 8:
+			ut_snprintf(name, name_sz, "BIGINT");
+			break;
+		}
+		APPEND_UNSIGNED();
+		break;
+	case DATA_FLOAT:
+		ut_snprintf(name, name_sz, "FLOAT");
+		APPEND_UNSIGNED();
+		break;
+	case DATA_DOUBLE:
+		ut_snprintf(name, name_sz, "DOUBLE");
+		APPEND_UNSIGNED();
+		break;
+	case DATA_FIXBINARY:
+		ut_snprintf(name, name_sz, "BINARY(%u)", len);
+		break;
+	case DATA_CHAR:
+	case DATA_MYSQL:
+		ut_snprintf(name, name_sz, "CHAR(%u)", len);
+		break;
+	case DATA_VARCHAR:
+	case DATA_VARMYSQL:
+		ut_snprintf(name, name_sz, "VARCHAR(%u)", len);
+		break;
+	case DATA_BINARY:
+		ut_snprintf(name, name_sz, "VARBINARY(%u)", len);
+		break;
+	case DATA_BLOB:
+		switch (len) {
+		case 9:
+			ut_snprintf(name, name_sz, "TINYBLOB");
+			break;
+		case 10:
+			ut_snprintf(name, name_sz, "BLOB");
+			break;
+		case 11:
+			ut_snprintf(name, name_sz, "MEDIUMBLOB");
+			break;
+		case 12:
+			ut_snprintf(name, name_sz, "LONGBLOB");
+			break;
+		}
+	}
+#undef APPEND_UNSIGNED	/* do not leak the helper into including files */
+	if (prtype & DATA_NOT_NULL) {
+		ut_snprintf(name + strlen(name), name_sz - strlen(name),
+			    " NOT NULL");
+	}
+	return(name);
+}
+
 #endif /* !UNIV_HOTBACKUP */
 
 /***********************************************************************//**
diff --git a/storage/xtradb/include/data0types.h b/storage/xtradb/include/data0types.h
index 245aca599c0..bd2bb577611 100644
--- a/storage/xtradb/include/data0types.h
+++ b/storage/xtradb/include/data0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,10 +27,10 @@ Created 9/21/2000 Heikki Tuuri
 #define data0types_h
 
 /* SQL data field struct */
-typedef struct dfield_struct	dfield_t;
+struct dfield_t;
 
 /* SQL data tuple struct */
-typedef struct dtuple_struct	dtuple_t;
+struct dtuple_t;
 
 #endif
 
diff --git a/storage/xtradb/include/db0err.h b/storage/xtradb/include/db0err.h
index 4d0e3051fe6..af651c61b66 100644
--- a/storage/xtradb/include/db0err.h
+++ b/storage/xtradb/include/db0err.h
@@ -27,7 +27,7 @@ Created 5/24/1996 Heikki Tuuri
 #define db0err_h
 
 
-enum db_err {
+enum dberr_t {
 	DB_SUCCESS_LOCKED_REC = 9,	/*!< like DB_SUCCESS, but a new
 					explicit record lock was created */
 	DB_SUCCESS = 10,
@@ -42,79 +42,91 @@ enum db_err {
 	DB_ROLLBACK,
 	DB_DUPLICATE_KEY,
 	DB_QUE_THR_SUSPENDED,
-	DB_MISSING_HISTORY,		/* required history data has been
+	DB_MISSING_HISTORY,		/*!< required history data has been
 					deleted due to lack of space in
 					rollback segment */
 	DB_CLUSTER_NOT_FOUND = 30,
 	DB_TABLE_NOT_FOUND,
-	DB_MUST_GET_MORE_FILE_SPACE,	/* the database has to be stopped
+	DB_MUST_GET_MORE_FILE_SPACE,	/*!< the database has to be stopped
 					and restarted with more file space */
 	DB_TABLE_IS_BEING_USED,
-	DB_TOO_BIG_RECORD,		/* a record in an index would not fit
+	DB_TOO_BIG_RECORD,		/*!< a record in an index would not fit
 					on a compressed page, or it would
 					become bigger than 1/2 free space in
 					an uncompressed page frame */
-	DB_LOCK_WAIT_TIMEOUT,		/* lock wait lasted too long */
-	DB_NO_REFERENCED_ROW,		/* referenced key value not found
+	DB_LOCK_WAIT_TIMEOUT,		/*!< lock wait lasted too long */
+	DB_NO_REFERENCED_ROW,		/*!< referenced key value not found
 					for a foreign key in an insert or
 					update of a row */
-	DB_ROW_IS_REFERENCED,		/* cannot delete or update a row
+	DB_ROW_IS_REFERENCED,		/*!< cannot delete or update a row
 					because it contains a key value
 					which is referenced */
-	DB_CANNOT_ADD_CONSTRAINT,	/* adding a foreign key constraint
+	DB_CANNOT_ADD_CONSTRAINT,	/*!< adding a foreign key constraint
 					to a table failed */
-	DB_CORRUPTION,			/* data structure corruption noticed */
-	DB_CANNOT_DROP_CONSTRAINT,	/* dropping a foreign key constraint
+	DB_CORRUPTION,			/*!< data structure corruption noticed */
+	DB_CANNOT_DROP_CONSTRAINT,	/*!< dropping a foreign key constraint
 					from a table failed */
-	DB_NO_SAVEPOINT,		/* no savepoint exists with the given
+	DB_NO_SAVEPOINT,		/*!< no savepoint exists with the given
 					name */
-	DB_TABLESPACE_ALREADY_EXISTS,	/* we cannot create a new single-table
+	DB_TABLESPACE_EXISTS,		/*!< we cannot create a new single-table
 					tablespace because a file of the same
 					name already exists */
-	DB_TABLESPACE_DELETED,		/* tablespace does not exist or is
+	DB_TABLESPACE_DELETED,		/*!< tablespace was deleted or is
 					being dropped right now */
-	DB_LOCK_TABLE_FULL,		/* lock structs have exhausted the
+	DB_TABLESPACE_NOT_FOUND,	/*!< Attempt to delete a tablespace
+					instance that was not found in the
+					tablespace hash table */
+	DB_LOCK_TABLE_FULL,		/*!< lock structs have exhausted the
 					buffer pool (for big transactions,
 					InnoDB stores the lock structs in the
 					buffer pool) */
-	DB_FOREIGN_DUPLICATE_KEY,	/* foreign key constraints
+	DB_FOREIGN_DUPLICATE_KEY,	/*!< foreign key constraints
 					activated by the operation would
 					lead to a duplicate key in some
 					table */
-	DB_TOO_MANY_CONCURRENT_TRXS,	/* when InnoDB runs out of the
+	DB_TOO_MANY_CONCURRENT_TRXS,	/*!< when InnoDB runs out of the
 					preconfigured undo slots, this can
 					only happen when there are too many
 					concurrent transactions */
-	DB_UNSUPPORTED,			/* when InnoDB sees any artefact or
+	DB_UNSUPPORTED,			/*!< when InnoDB sees any artefact or
 					a feature that it can't recoginize or
 					work with e.g., FT indexes created by
 					a later version of the engine. */
 
-	DB_PRIMARY_KEY_IS_NULL,		/* a column in the PRIMARY KEY
-					was found to be NULL */
+	DB_INVALID_NULL,		/*!< a NOT NULL column was found to
+					be NULL during table rebuild */
 
-	DB_STATS_DO_NOT_EXIST,		/* an operation that requires the
+	DB_STATS_DO_NOT_EXIST,		/*!< an operation that requires the
 					persistent storage, used for recording
 					table and index statistics, was
 					requested but this storage does not
 					exist itself or the stats for a given
 					table do not exist */
-	DB_FOREIGN_EXCEED_MAX_CASCADE,	/* Foreign key constraint related
+	DB_FOREIGN_EXCEED_MAX_CASCADE,	/*!< Foreign key constraint related
 					cascading delete/update exceeds
 					maximum allowed depth */
-	DB_CHILD_NO_INDEX,		/* the child (foreign) table does not
-					have an index that contains the
+	DB_CHILD_NO_INDEX,		/*!< the child (foreign) table does
+					not have an index that contains the
 					foreign keys as its prefix columns */
-	DB_PARENT_NO_INDEX,		/* the parent table does not
+	DB_PARENT_NO_INDEX,		/*!< the parent table does not
 					have an index that contains the
 					foreign keys as its prefix columns */
-	DB_TOO_BIG_INDEX_COL,		/* index column size exceeds maximum
-					limit */
-	DB_INDEX_CORRUPT,		/* we have corrupted index */
-	DB_UNDO_RECORD_TOO_BIG,		/* the undo log record is too big */
+	DB_TOO_BIG_INDEX_COL,		/*!< index column size exceeds
+					maximum limit */
+	DB_INDEX_CORRUPT,		/*!< we have corrupted index */
+	DB_UNDO_RECORD_TOO_BIG,		/*!< the undo log record is too big */
+	DB_READ_ONLY,			/*!< Update operation attempted in
+					a read-only transaction */
+	DB_FTS_INVALID_DOCID,		/* FTS Doc ID cannot be zero */
 	DB_TABLE_IN_FK_CHECK,		/* table is being used in foreign
 					key check */
-	DB_IDENTIFIER_TOO_LONG,		/* Identifier name too long */
+	DB_ONLINE_LOG_TOO_BIG,		/*!< Modification log grew too big
+					during online index creation */
+
+	DB_IO_ERROR,			/*!< Generic IO error */
+	DB_IDENTIFIER_TOO_LONG,		/*!< Identifier name too long */
+	DB_FTS_EXCEED_RESULT_CACHE_LIMIT,	/*!< FTS query memory
+					exceeds result cache limit */
 
 	/* The following are partial failure codes */
 	DB_FAIL = 1000,
@@ -124,7 +136,23 @@ enum db_err {
 	DB_ZIP_OVERFLOW,
 	DB_RECORD_NOT_FOUND = 1500,
 	DB_END_OF_INDEX,
-	DB_SEARCH_ABORTED_BY_USER= 1533
+	DB_DICT_CHANGED,		/*!< Some part of table dictionary has
+					changed. Such as index dropped or
+					foreign key dropped */
+
+	DB_SEARCH_ABORTED_BY_USER= 1533,
+
+	/* The following are API-only error codes. */
+	DB_DATA_MISMATCH = 2000,	/*!< Column update or read failed
+					because the types mismatch */
+
+	DB_SCHEMA_NOT_LOCKED,		/*!< If an API function expects the
+					schema to be locked in exclusive mode
+					and if it's not then that API function
+					will return this error code */
+
+	DB_NOT_FOUND			/*!< Generic error code for "Not found"
+					type of errors */
 };
 
 #endif
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
index 27e87d16750..a994c9d8ff1 100644
--- a/storage/xtradb/include/dict0boot.h
+++ b/storage/xtradb/include/dict0boot.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -58,6 +58,13 @@ dict_hdr_get_new_id(
 	ulint*		space_id);	/*!< out: space id
 					(not assigned if NULL) */
 /**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+/**********************************************************************//**
 Returns a new row id.
 @return	the new id */
 UNIV_INLINE
@@ -82,38 +89,32 @@ dict_sys_write_row_id(
 	row_id_t	row_id);/*!< in: row id */
 /*****************************************************************//**
 Initializes the data dictionary memory structures when the database is
-started. This function is also called when the data dictionary is created. */
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
 UNIV_INTERN
-void
-dict_boot(void);
+dberr_t
+dict_boot(void)
 /*===========*/
-/*****************************************************************//**
-Creates and initializes the data dictionary at the database creation. */
-UNIV_INTERN
-void
-dict_create(void);
-/*=============*/
-
-/*****************************************************************//**
-Verifies the SYS_STATS table by scanning its clustered index.  This
-function may only be called at InnoDB startup time.
-
-@return	TRUE if SYS_STATS was verified successfully */
-UNIV_INTERN
-ibool
-dict_verify_xtradb_sys_stats(void);
-/*==============================*/
+	__attribute__((warn_unused_result));
 
 /*****************************************************************//**
-Discard the existing dictionary cache SYS_STATS information, create and
-add it there anew.  Does not touch the old SYS_STATS tablespace page
-under the assumption that they are corrupted or overwritten for other
-purposes. */
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
 UNIV_INTERN
-void
-dict_recreate_xtradb_sys_stats(void);
-/*================================*/
+dberr_t
+dict_create(void)
+/*=============*/
+	__attribute__((warn_unused_result));
 
+/*********************************************************************//**
+Check if a table id belongs to a system table.
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+	table_id_t	id)		/*!< in: table id to check */
+	__attribute__((warn_unused_result));
 
 /* Space id and page no where the dictionary header resides */
 #define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
@@ -124,7 +125,6 @@ dict_recreate_xtradb_sys_stats(void);
 #define DICT_COLUMNS_ID		2
 #define DICT_INDEXES_ID		3
 #define DICT_FIELDS_ID		4
-#define DICT_STATS_ID		6
 /* The following is a secondary index on SYS_TABLES */
 #define DICT_TABLE_IDS_ID	5
 
@@ -134,7 +134,6 @@ dict_recreate_xtradb_sys_stats(void);
 					indexes; ibuf tables and indexes are
 					assigned as the id the number
 					DICT_IBUF_ID_MIN plus the space id */
-#define DICT_IBUF_ID_MIN	0xFFFFFFFF00000000ULL
 
 /* The offset of the dictionary header on the page */
 #define	DICT_HDR		FSEG_PAGE_DATA
@@ -142,44 +141,200 @@ dict_recreate_xtradb_sys_stats(void);
 /*-------------------------------------------------------------*/
 /* Dictionary header offsets */
 #define DICT_HDR_ROW_ID		0	/* The latest assigned row id */
-#define	DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
-#define	DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
-#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id, or 0*/
-#define	DICT_HDR_MIX_ID_LOW	28	/* Obsolete,always DICT_HDR_FIRST_ID */
-#define	DICT_HDR_TABLES		32	/* Root of the table index tree */
-#define	DICT_HDR_TABLE_IDS	36	/* Root of the table index tree */
-#define	DICT_HDR_COLUMNS	40	/* Root of the column index tree */
-#define	DICT_HDR_INDEXES	44	/* Root of the index index tree */
-#define	DICT_HDR_FIELDS		48	/* Root of the index field
-					index tree */
-#define	DICT_HDR_STATS		52	/* Root of the stats tree */
+#define DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
+#define DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id,or 0*/
+#define DICT_HDR_MIX_ID_LOW	28	/* Obsolete,always DICT_HDR_FIRST_ID*/
+#define DICT_HDR_TABLES		32	/* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS	36	/* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS	40	/* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES	44	/* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS		48	/* Root of SYS_FIELDS clust index */
 
 #define DICT_HDR_FSEG_HEADER	56	/* Segment header for the tablespace
 					segment into which the dictionary
 					header is created */
-
-#define	DICT_HDR_XTRADB_MARK	256	/* Flag to distinguish expansion of XtraDB */
 /*-------------------------------------------------------------*/
 
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+	DICT_COL__SYS_TABLES__NAME		= 0,
+	DICT_COL__SYS_TABLES__ID		= 1,
+	DICT_COL__SYS_TABLES__N_COLS		= 2,
+	DICT_COL__SYS_TABLES__TYPE		= 3,
+	DICT_COL__SYS_TABLES__MIX_ID		= 4,
+	DICT_COL__SYS_TABLES__MIX_LEN		= 5,
+	DICT_COL__SYS_TABLES__CLUSTER_ID	= 6,
+	DICT_COL__SYS_TABLES__SPACE		= 7,
+	DICT_NUM_COLS__SYS_TABLES		= 8
+};
 /* The field numbers in the SYS_TABLES clustered index */
-#define DICT_SYS_TABLES_TYPE_FIELD		5
-
+enum dict_fld_sys_tables_enum {
+	DICT_FLD__SYS_TABLES__NAME		= 0,
+	DICT_FLD__SYS_TABLES__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_TABLES__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_TABLES__ID		= 3,
+	DICT_FLD__SYS_TABLES__N_COLS		= 4,
+	DICT_FLD__SYS_TABLES__TYPE		= 5,
+	DICT_FLD__SYS_TABLES__MIX_ID		= 6,
+	DICT_FLD__SYS_TABLES__MIX_LEN		= 7,
+	DICT_FLD__SYS_TABLES__CLUSTER_ID	= 8,
+	DICT_FLD__SYS_TABLES__SPACE		= 9,
+	DICT_NUM_FIELDS__SYS_TABLES		= 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+	DICT_FLD__SYS_TABLE_IDS__ID		= 0,
+	DICT_FLD__SYS_TABLE_IDS__NAME		= 1,
+	DICT_NUM_FIELDS__SYS_TABLE_IDS		= 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+	DICT_COL__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_COL__SYS_COLUMNS__POS		= 1,
+	DICT_COL__SYS_COLUMNS__NAME		= 2,
+	DICT_COL__SYS_COLUMNS__MTYPE		= 3,
+	DICT_COL__SYS_COLUMNS__PRTYPE		= 4,
+	DICT_COL__SYS_COLUMNS__LEN		= 5,
+	DICT_COL__SYS_COLUMNS__PREC		= 6,
+	DICT_NUM_COLS__SYS_COLUMNS		= 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+	DICT_FLD__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_FLD__SYS_COLUMNS__POS		= 1,
+	DICT_FLD__SYS_COLUMNS__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_COLUMNS__NAME		= 4,
+	DICT_FLD__SYS_COLUMNS__MTYPE		= 5,
+	DICT_FLD__SYS_COLUMNS__PRTYPE		= 6,
+	DICT_FLD__SYS_COLUMNS__LEN		= 7,
+	DICT_FLD__SYS_COLUMNS__PREC		= 8,
+	DICT_NUM_FIELDS__SYS_COLUMNS		= 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+	DICT_COL__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_COL__SYS_INDEXES__ID		= 1,
+	DICT_COL__SYS_INDEXES__NAME		= 2,
+	DICT_COL__SYS_INDEXES__N_FIELDS		= 3,
+	DICT_COL__SYS_INDEXES__TYPE		= 4,
+	DICT_COL__SYS_INDEXES__SPACE		= 5,
+	DICT_COL__SYS_INDEXES__PAGE_NO		= 6,
+	DICT_NUM_COLS__SYS_INDEXES		= 7
+};
 /* The field numbers in the SYS_INDEXES clustered index */
-#define DICT_SYS_INDEXES_PAGE_NO_FIELD	 8
-#define DICT_SYS_INDEXES_SPACE_NO_FIELD	 7
-#define DICT_SYS_INDEXES_TYPE_FIELD	 6
-#define DICT_SYS_INDEXES_NAME_FIELD	 4
+enum dict_fld_sys_indexes_enum {
+	DICT_FLD__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_FLD__SYS_INDEXES__ID		= 1,
+	DICT_FLD__SYS_INDEXES__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_INDEXES__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_INDEXES__NAME		= 4,
+	DICT_FLD__SYS_INDEXES__N_FIELDS		= 5,
+	DICT_FLD__SYS_INDEXES__TYPE		= 6,
+	DICT_FLD__SYS_INDEXES__SPACE		= 7,
+	DICT_FLD__SYS_INDEXES__PAGE_NO		= 8,
+	DICT_NUM_FIELDS__SYS_INDEXES		= 9
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+	DICT_COL__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_COL__SYS_FIELDS__POS		= 1,
+	DICT_COL__SYS_FIELDS__COL_NAME		= 2,
+	DICT_NUM_COLS__SYS_FIELDS		= 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+	DICT_FLD__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_FLD__SYS_FIELDS__POS		= 1,
+	DICT_FLD__SYS_FIELDS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FIELDS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_FIELDS__COL_NAME		= 4,
+	DICT_NUM_FIELDS__SYS_FIELDS		= 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+	DICT_COL__SYS_FOREIGN__ID		= 0,
+	DICT_COL__SYS_FOREIGN__FOR_NAME		= 1,
+	DICT_COL__SYS_FOREIGN__REF_NAME		= 2,
+	DICT_COL__SYS_FOREIGN__N_COLS		= 3,
+	DICT_NUM_COLS__SYS_FOREIGN		= 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+	DICT_FLD__SYS_FOREIGN__ID		= 0,
+	DICT_FLD__SYS_FOREIGN__DB_TRX_ID	= 1,
+	DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_FOREIGN__FOR_NAME		= 3,
+	DICT_FLD__SYS_FOREIGN__REF_NAME		= 4,
+	DICT_FLD__SYS_FOREIGN__N_COLS		= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN		= 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME	= 0,
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__ID	= 1,
+	DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME	= 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+	DICT_COL__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_COL__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME	= 2,
+	DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME	= 3,
+	DICT_NUM_COLS__SYS_FOREIGN_COLS			= 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+	DICT_FLD__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_FLD__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR		= 3,
+	DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME	= 4,
+	DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME	= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN_COLS		= 6
+};
+/* The columns in SYS_TABLESPACES */
+enum dict_col_sys_tablespaces_enum {
+	DICT_COL__SYS_TABLESPACES__SPACE		= 0,
+	DICT_COL__SYS_TABLESPACES__NAME			= 1,
+	DICT_COL__SYS_TABLESPACES__FLAGS		= 2,
+	DICT_NUM_COLS__SYS_TABLESPACES			= 3
+};
+/* The field numbers in the SYS_TABLESPACES clustered index */
+enum dict_fld_sys_tablespaces_enum {
+	DICT_FLD__SYS_TABLESPACES__SPACE		= 0,
+	DICT_FLD__SYS_TABLESPACES__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR		= 2,
+	DICT_FLD__SYS_TABLESPACES__NAME			= 3,
+	DICT_FLD__SYS_TABLESPACES__FLAGS		= 4,
+	DICT_NUM_FIELDS__SYS_TABLESPACES		= 5
+};
+/* The columns in SYS_DATAFILES */
+enum dict_col_sys_datafiles_enum {
+	DICT_COL__SYS_DATAFILES__SPACE			= 0,
+	DICT_COL__SYS_DATAFILES__PATH			= 1,
+	DICT_NUM_COLS__SYS_DATAFILES			= 2
+};
+/* The field numbers in the SYS_DATAFILES clustered index */
+enum dict_fld_sys_datafiles_enum {
+	DICT_FLD__SYS_DATAFILES__SPACE			= 0,
+	DICT_FLD__SYS_DATAFILES__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR		= 2,
+	DICT_FLD__SYS_DATAFILES__PATH			= 3,
+	DICT_NUM_FIELDS__SYS_DATAFILES			= 4
+};
 
-#define DICT_SYS_STATS_DIFF_VALS_FIELD	 4
-#define DICT_SYS_STATS_NON_NULL_VALS_FIELD	5
+/* A number of the columns above occur in multiple tables.  These are the
+lengths of those fields. */
+#define	DICT_FLD_LEN_SPACE	4
+#define	DICT_FLD_LEN_FLAGS	4
 
 /* When a row id which is zero modulo this number (which must be a power of
 two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
 updated */
 #define DICT_HDR_ROW_ID_WRITE_MARGIN	256
 
-#define DICT_HDR_XTRADB_FLAG		0x5854524144425F31ULL	/* "XTRADB_1" */
-
 #ifndef UNIV_NONINL
 #include "dict0boot.ic"
 #endif
diff --git a/storage/xtradb/include/dict0boot.ic b/storage/xtradb/include/dict0boot.ic
index 5fa33837640..2b156a4f672 100644
--- a/storage/xtradb/include/dict0boot.ic
+++ b/storage/xtradb/include/dict0boot.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,15 +24,6 @@ Created 4/18/1996 Heikki Tuuri
 *******************************************************/
 
 /**********************************************************************//**
-Writes the current value of the row id counter to the dictionary header file
-page. */
-UNIV_INTERN
-void
-dict_hdr_flush_row_id(void);
-/*=======================*/
-
-
-/**********************************************************************//**
 Returns a new row id.
 @return	the new id */
 UNIV_INLINE
@@ -90,4 +81,16 @@ dict_sys_write_row_id(
 	mach_write_to_6(field, row_id);
 }
 
+/*********************************************************************//**
+Check if a table id belongs to a system table (id < DICT_HDR_FIRST_ID).
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+	table_id_t	id)		/*!< in: table id to check */
+{
+	return(id < DICT_HDR_FIRST_ID);
+}
+
 
diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h
index 762ab54a353..6ec1079957b 100644
--- a/storage/xtradb/include/dict0crea.h
+++ b/storage/xtradb/include/dict0crea.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -42,7 +42,9 @@ tab_create_graph_create(
 /*====================*/
 	dict_table_t*	table,	/*!< in: table to create, built as a memory data
 				structure */
-	mem_heap_t*	heap);	/*!< in: heap where created */
+	mem_heap_t*	heap,	/*!< in: heap where created */
+	bool		commit);/*!< in: true if the commit node should be
+				added to the query graph */
 /*********************************************************************//**
 Creates an index create graph.
 @return	own: index create node */
@@ -52,15 +54,9 @@ ind_create_graph_create(
 /*====================*/
 	dict_index_t*	index,	/*!< in: index to create, built as a memory data
 				structure */
-	mem_heap_t*	heap);	/*!< in: heap where created */
-/*********************************************************************//**
-*/
-UNIV_INTERN
-ind_node_t*
-ind_insert_stats_graph_create(
-/*==========================*/
-	dict_index_t*	index,
-	mem_heap_t*	heap);
+	mem_heap_t*	heap,	/*!< in: heap where created */
+	bool		commit);/*!< in: true if the commit node should be
+				added to the query graph */
 /***********************************************************//**
 Creates a table. This is a high-level function used in SQL execution graphs.
 @return	query thread to run next or NULL */
@@ -70,13 +66,6 @@ dict_create_table_step(
 /*===================*/
 	que_thr_t*	thr);	/*!< in: query thread */
 /***********************************************************//**
-*/
-UNIV_INTERN
-que_thr_t*
-dict_insert_stats_step(
-/*===================*/
-	que_thr_t*	thr);
-/***********************************************************//**
 Creates an index. This is a high-level function used in SQL execution
 graphs.
 @return	query thread to run next or NULL */
@@ -114,14 +103,28 @@ dict_drop_index_tree(
 	mtr_t*	mtr);	/*!< in: mtr having the latch on the record page */
 /****************************************************************//**
 Creates the foreign key constraints system tables inside InnoDB
-at database creation or database start if they are not found or are
+at server bootstrap or server start if they are not found or are
 not of the right form.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_or_check_foreign_constraint_tables(void);
 /*================================================*/
 /********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18.  */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+	ulint*		id_nr,	/*!< in/out: number to use in id generation;
+				incremented if used */
+	const char*	name,	/*!< in: table name */
+	dict_foreign_t*	foreign)/*!< in/out: foreign key */
+	__attribute__((nonnull));
+/********************************************************************//**
 Adds foreign key definitions to data dictionary tables in the database. We
 look at table->foreign_list, and also generate names to constraints that were
 not named by the user. A generated constraint has a name of the format
@@ -130,7 +133,7 @@ given locally for this table, that is, the number is not global, as in the
 old format constraints < 4.0.18 it used to be.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_add_foreigns_to_dictionary(
 /*===================================*/
 	ulint		start_id,/*!< in: if we are actually doing ALTER TABLE
@@ -142,11 +145,46 @@ dict_create_add_foreigns_to_dictionary(
 				so far has no constraints for which the name
 				was generated here */
 	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx);	/*!< in: transaction */
+	trx_t*		trx)	/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_tablespace(void);
+/*=====================================*/
+/********************************************************************//**
+Add a single tablespace definition to the data dictionary tables in the
+database.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_tablespace_to_dictionary(
+/*=====================================*/
+	ulint		space,		/*!< in: tablespace id */
+	const char*	name,		/*!< in: tablespace name */
+	ulint		flags,		/*!< in: tablespace flags */
+	const char*	path,		/*!< in: tablespace path */
+	trx_t*		trx,		/*!< in: transaction */
+	bool		commit);	/*!< in: if true then commit the
+					transaction */
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+	const char*		name,	/*!< in: table name */
+	const dict_foreign_t*	foreign,/*!< in: foreign key */
+	trx_t*			trx)	/*!< in/out: dictionary transaction */
+	__attribute__((nonnull, warn_unused_result));
 
 /* Table create node structure */
-
-struct tab_node_struct{
+struct tab_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_TABLE_CREATE */
 	dict_table_t*	table;	/*!< table to create, built as a memory data
 				structure with dict_mem_... functions */
@@ -175,7 +213,7 @@ struct tab_node_struct{
 
 /* Index create node struct */
 
-struct ind_node_struct{
+struct ind_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_INDEX_CREATE */
 	dict_index_t*	index;	/*!< index to create, built as a memory data
 				structure with dict_mem_... functions */
@@ -185,7 +223,6 @@ struct ind_node_struct{
 	ins_node_t*	field_def; /* child node which does the inserts of
 				the field definitions; the row to be inserted
 				is built by the parent node  */
-	ins_node_t*	stats_def;
 	commit_node_t*	commit_node;
 				/* child node which performs a commit after
 				a successful index creation */
@@ -196,7 +233,6 @@ struct ind_node_struct{
 	dict_table_t*	table;	/*!< table which owns the index */
 	dtuple_t*	ind_row;/* index definition row built */
 	ulint		field_no;/* next field definition to insert */
-	ulint		stats_no;
 	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage */
 };
 
@@ -206,7 +242,6 @@ struct ind_node_struct{
 #define	INDEX_CREATE_INDEX_TREE	3
 #define	INDEX_COMMIT_WORK	4
 #define	INDEX_ADD_TO_CACHE	5
-#define	INDEX_BUILD_STATS_COLS	6
 
 #ifndef UNIV_NONINL
 #include "dict0crea.ic"
diff --git a/storage/xtradb/include/dict0crea.ic b/storage/xtradb/include/dict0crea.ic
index 36f77e5c7d1..2d0d9dcb858 100644
--- a/storage/xtradb/include/dict0crea.ic
+++ b/storage/xtradb/include/dict0crea.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,3 +23,76 @@ Database object creation
 Created 1/8/1996 Heikki Tuuri
 *******************************************************/
 
+#include "mem0mem.h"
+
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return true if temporary table */
+UNIV_INTERN
+bool
+row_is_mysql_tmp_table_name(
+/*========================*/
+	const char*     name) __attribute__((warn_unused_result));
+				/*!< in: table name in the form
+				'database/tablename' */
+
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18.
+@return DB_SUCCESS, or DB_IDENTIFIER_TOO_LONG if the new id is too long */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+	ulint*		id_nr,	/*!< in/out: number to use in id generation;
+				incremented if used */
+	const char*	name,	/*!< in: table name */
+	dict_foreign_t*	foreign)/*!< in/out: foreign key */
+{
+	if (foreign->id == NULL) {
+		/* Generate a new constraint id */
+		char*	id;
+
+		if (row_is_mysql_tmp_table_name(name)) {
+			id = static_cast<char*>(mem_heap_alloc(
+				foreign->heap, strlen(name) + 20));
+			/* no overflow if number < 1e13 */
+			sprintf(id, "%s_ibfk_%lu", name, (ulong) (*id_nr)++);
+		} else {
+			char	table_name[MAX_TABLE_NAME_LEN + 20] = "";
+			uint	errors = 0;
+
+			/* keep the zero-filled buffer NUL-terminated */
+			strncpy(table_name, name, sizeof(table_name) - 1);
+
+			innobase_convert_to_system_charset(
+				strchr(table_name, '/') + 1,
+				strchr(name, '/') + 1,
+				MAX_TABLE_NAME_LEN, &errors);
+
+			if (errors) {
+				strncpy(table_name, name,
+					sizeof(table_name) - 1);
+				table_name[sizeof(table_name) - 1] = '\0';
+			}
+			/* fits table_name; no overflow if number < 1e13 */
+			id = static_cast<char*>(mem_heap_alloc(
+				foreign->heap, strlen(table_name) + 20));
+			sprintf(id, "%s_ibfk_%lu", table_name,
+				(ulong) (*id_nr)++);
+
+			if (innobase_check_identifier_length(
+				strchr(id,'/') + 1)) {
+				return(DB_IDENTIFIER_TOO_LONG);
+			}
+		}
+		foreign->id = id;
+	}
+
+	return(DB_SUCCESS);
+}
+
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
index 8c6620b94b3..6669f60b95a 100644
--- a/storage/xtradb/include/dict0dict.h
+++ b/storage/xtradb/include/dict0dict.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,6 +28,7 @@ Created 1/8/1996 Heikki Tuuri
 #define dict0dict_h
 
 #include "univ.i"
+#include "db0err.h"
 #include "dict0types.h"
 #include "dict0mem.h"
 #include "data0type.h"
@@ -39,6 +41,7 @@ Created 1/8/1996 Heikki Tuuri
 #include "ut0rnd.h"
 #include "ut0byte.h"
 #include "trx0types.h"
+#include "row0types.h"
 
 #ifndef UNIV_HOTBACKUP
 # include "sync0sync.h"
@@ -49,7 +52,8 @@ UNIV_INTERN
 void
 dict_casedn_str(
 /*============*/
-	char*	a);	/*!< in/out: string to put in lower case */
+	char*	a)	/*!< in/out: string to put in lower case */
+	__attribute__((nonnull));
 /********************************************************************//**
 Get the database name length in a table name.
 @return	database name length */
@@ -57,34 +61,88 @@ UNIV_INTERN
 ulint
 dict_get_db_name_len(
 /*=================*/
-	const char*	name);	/*!< in: table name in the form
+	const char*	name)	/*!< in: table name in the form
 				dbname '/' tablename */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Open a table from its database and table name. This is currently used by
+the foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+UNIV_INTERN
+char*
+dict_get_referenced_table(
+/*======================*/
+	const char*	name,		/*!< in: foreign key table name */
+	const char*	database_name,	/*!< in: table db name */
+	ulint		database_name_len,/*!< in: db name length */
+	const char*	table_name,	/*!< in: table name */
+	ulint		table_name_len,	/*!< in: table name length */
+	dict_table_t**	table,		/*!< out: table object or NULL */
+	mem_heap_t*	heap);		/*!< in: heap memory */
+/*********************************************************************//**
+Frees a foreign key struct. */
+UNIV_INTERN
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+UNIV_INTERN
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+	dict_table_t*	table);		/*!< in: table in the dictionary
+					memory cache */
 /********************************************************************//**
 Return the end of table name where we have removed dbname and '/'.
 @return	table name */
-
+UNIV_INTERN
 const char*
 dict_remove_db_name(
 /*================*/
-	const char*	name);	/*!< in: table name in the form
+	const char*	name)	/*!< in: table name in the form
 				dbname '/' tablename */
+	__attribute__((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table */
+enum dict_table_op_t {
+	/** Expect the tablespace to exist. */
+	DICT_TABLE_OP_NORMAL = 0,
+	/** Drop any orphan indexes after an aborted online index creation */
+	DICT_TABLE_OP_DROP_ORPHAN,
+	/** Silently load the tablespace if it does not exist,
+	and do not load the definitions of incomplete indexes. */
+	DICT_TABLE_OP_LOAD_TABLESPACE
+};
+
 /**********************************************************************//**
 Returns a table object based on table id.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get_on_id(
-/*=================*/
-        table_id_t	table_id,	/*!< in: table id */
-        trx_t*		trx);		/*!< in: transaction handle */
+dict_table_open_on_id(
+/*==================*/
+	table_id_t	table_id,	/*!< in: table id */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	dict_table_op_t	table_op)	/*!< in: operation to perform */
+	__attribute__((warn_unused_result));
 /********************************************************************//**
-Decrements the count of open MySQL handles to a table. */
+Decrements the count of open handles to a table. */
 UNIV_INTERN
 void
-dict_table_decrement_handle_count(
-/*==============================*/
+dict_table_close(
+/*=============*/
 	dict_table_t*	table,		/*!< in/out: table */
-	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop)	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Inits the data dictionary module. */
 UNIV_INTERN
@@ -108,7 +166,8 @@ UNIV_INLINE
 ulint
 dict_col_get_mbminlen(
 /*==================*/
-	const dict_col_t*	col);	/*!< in: column */
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets the maximum number of bytes per character.
 @return maximum multi-byte char size, in bytes */
@@ -116,7 +175,8 @@ UNIV_INLINE
 ulint
 dict_col_get_mbmaxlen(
 /*==================*/
-	const dict_col_t*	col);	/*!< in: column */
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets the minimum and maximum number of bytes per character. */
 UNIV_INLINE
@@ -126,8 +186,9 @@ dict_col_set_mbminmaxlen(
 	dict_col_t*	col,		/*!< in/out: column */
 	ulint		mbminlen,	/*!< in: minimum multi-byte
 					character size, in bytes */
-	ulint		mbmaxlen);	/*!< in: minimum multi-byte
+	ulint		mbmaxlen)	/*!< in: maximum multi-byte
 					character size, in bytes */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Gets the column data type. */
 UNIV_INLINE
@@ -135,10 +196,11 @@ void
 dict_col_copy_type(
 /*===============*/
 	const dict_col_t*	col,	/*!< in: column */
-	dtype_t*		type);	/*!< out: data type */
+	dtype_t*		type)	/*!< out: data type */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Determine bytes of column prefix to be stored in the undo log. Please
-note if the table format is UNIV_FORMAT_A (< DICT_TF_FORMAT_ZIP), no prefix
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
 needs to be stored in the undo log.
 @return bytes of column prefix to be stored in the undo log */
 UNIV_INLINE
@@ -146,9 +208,9 @@ ulint
 dict_max_field_len_store_undo(
 /*==========================*/
 	dict_table_t*		table,	/*!< in: table */
-	const dict_col_t*	col);	/*!< in: column which index prefix
+	const dict_col_t*	col)	/*!< in: column which index prefix
 					is based on */
-
+	__attribute__((nonnull, warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
@@ -159,7 +221,8 @@ ibool
 dict_col_type_assert_equal(
 /*=======================*/
 	const dict_col_t*	col,	/*!< in: column */
-	const dtype_t*		type);	/*!< in: data type */
+	const dtype_t*		type)	/*!< in: data type */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* UNIV_DEBUG */
 #ifndef UNIV_HOTBACKUP
 /***********************************************************************//**
@@ -169,7 +232,8 @@ UNIV_INLINE
 ulint
 dict_col_get_min_size(
 /*==================*/
-	const dict_col_t*	col);	/*!< in: column */
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Returns the maximum size of the column.
 @return	maximum size */
@@ -177,7 +241,8 @@ UNIV_INLINE
 ulint
 dict_col_get_max_size(
 /*==================*/
-	const dict_col_t*	col);	/*!< in: column */
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Returns the size of a fixed size column, 0 if not a fixed size column.
 @return	fixed size, or 0 */
@@ -186,7 +251,8 @@ ulint
 dict_col_get_fixed_size(
 /*====================*/
 	const dict_col_t*	col,	/*!< in: column */
-	ulint			comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
 For fixed length types it is the fixed length of the type, otherwise 0.
@@ -196,8 +262,8 @@ ulint
 dict_col_get_sql_null_size(
 /*=======================*/
 	const dict_col_t*	col,	/*!< in: column */
-	ulint			comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
-
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets the column number.
 @return	col->ind, table column position (starting from 0) */
@@ -205,7 +271,8 @@ UNIV_INLINE
 ulint
 dict_col_get_no(
 /*============*/
-	const dict_col_t*	col);	/*!< in: column */
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets the column position in the clustered index. */
 UNIV_INLINE
@@ -213,7 +280,8 @@ ulint
 dict_col_get_clust_pos(
 /*===================*/
 	const dict_col_t*	col,		/*!< in: table column */
-	const dict_index_t*	clust_index);	/*!< in: clustered index */
+	const dict_index_t*	clust_index)	/*!< in: clustered index */
+	__attribute__((nonnull, warn_unused_result));
 /****************************************************************//**
 If the given column name is reserved for InnoDB system columns, return
 TRUE.
@@ -222,14 +290,16 @@ UNIV_INTERN
 ibool
 dict_col_name_is_reserved(
 /*======================*/
-	const char*	name);	/*!< in: column name */
+	const char*	name)	/*!< in: column name */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Acquire the autoinc lock. */
 UNIV_INTERN
 void
 dict_table_autoinc_lock(
 /*====================*/
-	dict_table_t*	table);	/*!< in/out: table */
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
 /********************************************************************//**
 Unconditionally set the autoinc counter. */
 UNIV_INTERN
@@ -237,7 +307,8 @@ void
 dict_table_autoinc_initialize(
 /*==========================*/
 	dict_table_t*	table,	/*!< in/out: table */
-	ib_uint64_t	value);	/*!< in: next value to assign to a row */
+	ib_uint64_t	value)	/*!< in: next value to assign to a row */
+	__attribute__((nonnull));
 /********************************************************************//**
 Reads the next autoinc value (== autoinc counter value), 0 if not yet
 initialized.
@@ -246,7 +317,8 @@ UNIV_INTERN
 ib_uint64_t
 dict_table_autoinc_read(
 /*====================*/
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Updates the autoinc counter if the value supplied is greater than the
 current value. */
@@ -256,14 +328,16 @@ dict_table_autoinc_update_if_greater(
 /*=================================*/
 
 	dict_table_t*	table,	/*!< in/out: table */
-	ib_uint64_t	value);	/*!< in: value which was assigned to a row */
+	ib_uint64_t	value)	/*!< in: value which was assigned to a row */
+	__attribute__((nonnull));
 /********************************************************************//**
 Release the autoinc lock. */
 UNIV_INTERN
 void
 dict_table_autoinc_unlock(
 /*======================*/
-	dict_table_t*	table);	/*!< in/out: table */
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /**********************************************************************//**
 Adds system columns to a table object. */
@@ -272,7 +346,8 @@ void
 dict_table_add_system_columns(
 /*==========================*/
 	dict_table_t*	table,	/*!< in/out: table */
-	mem_heap_t*	heap);	/*!< in: temporary heap */
+	mem_heap_t*	heap)	/*!< in: temporary heap */
+	__attribute__((nonnull));
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Adds a table object to the dictionary cache. */
@@ -280,27 +355,32 @@ UNIV_INTERN
 void
 dict_table_add_to_cache(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	mem_heap_t*	heap);	/*!< in: temporary heap */
+	dict_table_t*	table,		/*!< in: table */
+	ibool		can_be_evicted,	/*!< in: TRUE if can be evicted*/
+	mem_heap_t*	heap)		/*!< in: temporary heap */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Removes a table object from the dictionary cache. */
 UNIV_INTERN
 void
 dict_table_remove_from_cache(
 /*=========================*/
-	dict_table_t*	table);	/*!< in, own: table */
+	dict_table_t*	table)	/*!< in, own: table */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Renames a table object.
-@return	TRUE if success */
+@return	DB_SUCCESS or error code */
 UNIV_INTERN
-ibool
+dberr_t
 dict_table_rename_in_cache(
 /*=======================*/
 	dict_table_t*	table,		/*!< in/out: table */
 	const char*	new_name,	/*!< in: new name */
-	ibool		rename_also_foreigns);/*!< in: in ALTER TABLE we want
+	ibool		rename_also_foreigns)
+					/*!< in: in ALTER TABLE we want
 					to preserve the original table name
 					in constraints which reference it */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Removes an index from the dictionary cache. */
 UNIV_INTERN
@@ -308,7 +388,8 @@ void
 dict_index_remove_from_cache(
 /*=========================*/
 	dict_table_t*	table,	/*!< in/out: table */
-	dict_index_t*	index);	/*!< in, own: index */
+	dict_index_t*	index)	/*!< in, own: index */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Change the id of a table object in the dictionary cache. This is used in
 DISCARD TABLESPACE. */
@@ -317,7 +398,16 @@ void
 dict_table_change_id_in_cache(
 /*==========================*/
 	dict_table_t*	table,	/*!< in/out: table object already in cache */
-	table_id_t	new_id);/*!< in: new id to set */
+	table_id_t	new_id)	/*!< in: new id to set */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+UNIV_INTERN
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign constraint */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Adds a foreign key constraint object to the dictionary cache. May free
 the object if there already is an object with the same identifier in.
@@ -325,14 +415,20 @@ At least one of foreign table or referenced table must already be in
 the dictionary cache!
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_foreign_add_to_cache(
 /*======================*/
-	dict_foreign_t*		foreign,	/*!< in, own: foreign key
-						constraint */
-	ibool			check_charsets,	/*!< in: TRUE=check charset
-						compatibility */
-	dict_err_ignore_t	ignore_err);	/*!< in: error to be ignored */
+	dict_foreign_t*		foreign,
+				/*!< in, own: foreign key constraint */
+	const char**		col_names,
+				/*!< in: column names, or NULL to use
+				foreign->foreign_table->col_names */
+	bool			check_charsets,
+				/*!< in: whether to check charset
+				compatibility */
+	dict_err_ignore_t	ignore_err)
+				/*!< in: error to be ignored */
+	__attribute__((nonnull(1), warn_unused_result));
 /*********************************************************************//**
 Check if the index is referenced by a foreign key, if TRUE return the
 matching instance NULL otherwise.
@@ -343,7 +439,8 @@ dict_foreign_t*
 dict_table_get_referenced_constraint(
 /*=================================*/
 	dict_table_t*	table,	/*!< in: InnoDB table */
-	dict_index_t*	index);	/*!< in: InnoDB index */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Checks if a table is referenced by foreign keys.
 @return	TRUE if table is referenced by a foreign key */
@@ -351,17 +448,33 @@ UNIV_INTERN
 ibool
 dict_table_is_referenced_by_foreign_key(
 /*====================================*/
-	const dict_table_t*	table);	/*!< in: InnoDB table */
+	const dict_table_t*	table)	/*!< in: InnoDB table */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
-Replace the index in the foreign key list that matches this index's
-definition with an equivalent index. */
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
 UNIV_INTERN
-void
-dict_table_replace_index_in_foreign_list(
-/*=====================================*/
-	dict_table_t*	table,  /*!< in/out: table */
-	dict_index_t*	index,	/*!< in: index to be replaced */
-	const trx_t*	trx);	/*!< in: transaction handle */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+	dict_table_t*		table,  /*!< in/out: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const dict_index_t*	index)	/*!< in: index to be replaced */
+	__attribute__((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+	THD*		thd,		/*!< in: MySQL thread handle */
+	const char*	str,		/*!< in: string to scan for keyword */
+	const char*	keyword)	/*!< in: keyword to look for */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Checks if a index is defined for a foreign key constraint. Index is a part
 of a foreign key constraint if the index is referenced by foreign key
@@ -373,7 +486,8 @@ dict_foreign_t*
 dict_table_get_foreign_constraint(
 /*==============================*/
 	dict_table_t*	table,	/*!< in: InnoDB table */
-	dict_index_t*	index);	/*!< in: InnoDB index */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Scans a table create SQL string and adds to the data dictionary
 the foreign key constraints declared in the string. This function
@@ -383,7 +497,7 @@ bot participating tables. The indexes are allowed to contain more
 fields than mentioned in the constraint.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 dict_create_foreign_constraints(
 /*============================*/
 	trx_t*		trx,		/*!< in: transaction */
@@ -399,15 +513,16 @@ dict_create_foreign_constraints(
 	const char*	name,		/*!< in: table full name in the
 					normalized form
 					database_name/table_name */
-	ibool		reject_fks);	/*!< in: if TRUE, fail with error
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
 					code DB_CANNOT_ADD_CONSTRAINT if
 					any foreign keys are found. */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
 @return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
 constraint id does not match */
 UNIV_INTERN
-ulint
+dberr_t
 dict_foreign_parse_drop_constraints(
 /*================================*/
 	mem_heap_t*	heap,			/*!< in: heap from which we can
@@ -416,85 +531,57 @@ dict_foreign_parse_drop_constraints(
 	dict_table_t*	table,			/*!< in: table */
 	ulint*		n,			/*!< out: number of constraints
 						to drop */
-	const char***	constraints_to_drop);	/*!< out: id's of the
+	const char***	constraints_to_drop)	/*!< out: id's of the
 						constraints to drop */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
-Returns a table object and optionally increment its MySQL open handle count.
+Returns a table object and increments its open handle count.
 NOTE! This is a high-level function to be used mainly from outside the
-'dict' directory. Inside this directory dict_table_get_low is usually the
-appropriate function.
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get(
-/*===========*/
-	const char*		table_name,
-					/*!< in: table name */
-	ibool			inc_mysql_count,
-					/*!< in: whether to increment the open
-					handle count on the table */
-	dict_err_ignore_t	ignore_err);
-					/*!< in: errors to ignore when loading
-					the table */
-/**********************************************************************//**
-Returns a index object, based on table and index id, and memoryfixes it.
-@return	index, NULL if does not exist */
-UNIV_INTERN
-dict_index_t*
-dict_index_get_on_id_low(
-/*=====================*/
-	dict_table_t*	table,		/*!< in: table */
-	index_id_t	index_id);	/*!< in: index id */
-/**********************************************************************//**
-Checks if a table is in the dictionary cache.
-@return	table, NULL if not found */
-
-UNIV_INLINE
-dict_table_t*
-dict_table_check_if_in_cache_low(
-/*=============================*/
-	const char*	table_name);	/*!< in: table name */
-/**********************************************************************//**
-Gets a table; loads it to the dictionary cache if necessary. A low-level
-function.
-@return	table, NULL if not found */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_low(
-/*===============*/
+dict_table_open_on_name(
+/*====================*/
 	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop,	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
 	dict_err_ignore_t
-			ignore_err);	/*!< in: error to be ignored when
-					loading a table definition */
-/**********************************************************************//**
-Returns a table object based on table id.
-@return	table, NULL if does not exist */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_on_id_low(
-/*=====================*/
-	table_id_t	table_id);	/*!< in: table id */
-/**********************************************************************//**
-Find an index that is equivalent to the one passed in and is not marked
-for deletion.
-@return	index equivalent to foreign->foreign_index, or NULL */
-UNIV_INTERN
-dict_index_t*
-dict_foreign_find_equiv_index(
-/*==========================*/
-	dict_foreign_t*	foreign);/*!< in: foreign key */
-/**********************************************************************//**
-Returns an index object by matching on the name and column names and
-if more than one index matches return the index with the max id
+			ignore_err)	/*!< in: error to be ignored when
+					loading the table */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
 @return	matching index, NULL if not found */
 UNIV_INTERN
 dict_index_t*
-dict_table_get_index_by_max_id(
-/*===========================*/
-	dict_table_t*	table,	/*!< in: table */
-	const char*	name,	/*!< in: the index name to find */
-	const char**	columns,/*!< in: array of column names */
-	ulint		n_cols);/*!< in: number of columns */
+dict_foreign_find_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	__attribute__((nonnull(1,3), warn_unused_result));
 /**********************************************************************//**
 Returns a column's name.
 @return column name. NOTE: not guaranteed to stay valid if table is
@@ -504,29 +591,16 @@ const char*
 dict_table_get_col_name(
 /*====================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			col_nr);/*!< in: column number */
-
+	ulint			col_nr)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
-Prints a table definition. */
+Prints table data. */
 UNIV_INTERN
 void
 dict_table_print(
 /*=============*/
-	dict_table_t*	table);	/*!< in: table */
-/**********************************************************************//**
-Prints a table data. */
-UNIV_INTERN
-void
-dict_table_print_low(
-/*=================*/
-	dict_table_t*	table);	/*!< in: table */
-/**********************************************************************//**
-Prints a table data when we know the table name. */
-UNIV_INTERN
-void
-dict_table_print_by_name(
-/*=====================*/
-	const char*	name);	/*!< in: table name */
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Outputs info on foreign keys of a table. */
 UNIV_INTERN
@@ -539,7 +613,8 @@ dict_print_info_on_foreign_keys(
 				of SHOW TABLE STATUS */
 	FILE*		file,	/*!< in: file where to print */
 	trx_t*		trx,	/*!< in: transaction */
-	dict_table_t*	table);	/*!< in: table */
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Outputs info on a foreign key of a table in a format suitable for
 CREATE TABLE. */
@@ -550,7 +625,8 @@ dict_print_info_on_foreign_key_in_create_format(
 	FILE*		file,		/*!< in: file where to print */
 	trx_t*		trx,		/*!< in: transaction */
 	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
-	ibool		add_newline);	/*!< in: whether to add a newline */
+	ibool		add_newline)	/*!< in: whether to add a newline */
+	__attribute__((nonnull(1,3)));
 /********************************************************************//**
 Displays the names of the index and the table. */
 UNIV_INTERN
@@ -558,8 +634,38 @@ void
 dict_index_name_print(
 /*==================*/
 	FILE*			file,	/*!< in: output stream */
-	trx_t*			trx,	/*!< in: transaction */
-	const dict_index_t*	index);	/*!< in: index to print */
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to print */
+	__attribute__((nonnull(1,3)));
+/*********************************************************************//**
+Checks whether the given index qualifies for the columns named in the
+array, honouring the optional constraints requested through types_idx,
+check_charsets and check_null.
+@return	whether the index qualifies */
+UNIV_INTERN
+bool
+dict_foreign_qualify_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	index,	/*!< in: index to check */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	__attribute__((nonnull(1,3), warn_unused_result));
 #ifdef UNIV_DEBUG
 /********************************************************************//**
 Gets the first index on the table (the clustered index).
@@ -568,7 +674,17 @@ UNIV_INLINE
 dict_index_t*
 dict_table_get_first_index(
 /*=======================*/
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return	index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the next index on the table.
 @return	index, NULL if none left */
@@ -576,9 +692,11 @@ UNIV_INLINE
 dict_index_t*
 dict_table_get_next_index(
 /*======================*/
-	const dict_index_t*	index);	/*!< in: index */
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 #else /* UNIV_DEBUG */
 # define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
 # define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
 #endif /* UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
@@ -633,6 +751,17 @@ dict_index_is_sec_or_ibuf(
 	const dict_index_t*	index)	/*!< in: index */
 	__attribute__((nonnull, pure, warn_unused_result));
 
+/************************************************************************
+Gets all the FTS indexes for the table. NOTE: must not be called for
+tables which do not have an FTS-index. */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+				/* out: number of indexes collected */
+	dict_table_t*	table,	/* in: table */
+	ib_vector_t*	indexes)/* out: vector for collecting FTS indexes */
+	__attribute__((nonnull));
 /********************************************************************//**
 Gets the number of user-defined columns in a table in the dictionary
 cache.
@@ -662,6 +791,35 @@ dict_table_get_n_cols(
 /*==================*/
 	const dict_table_t*	table)	/*!< in: table */
 	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the approximately estimated number of rows in the table.
+@return	estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
 #ifdef UNIV_DEBUG
 /********************************************************************//**
 Gets the nth column of a table.
@@ -671,7 +829,8 @@ dict_col_t*
 dict_table_get_nth_col(
 /*===================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			pos);	/*!< in: position of column */
+	ulint			pos)	/*!< in: position of column */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the given system column of a table.
 @return	pointer to column object */
@@ -680,7 +839,8 @@ dict_col_t*
 dict_table_get_sys_col(
 /*===================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys);	/*!< in: DATA_ROW_ID, ... */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	__attribute__((nonnull, warn_unused_result));
 #else /* UNIV_DEBUG */
 #define dict_table_get_nth_col(table, pos) \
 ((table)->cols + (pos))
@@ -695,7 +855,8 @@ ulint
 dict_table_get_sys_col_no(
 /*======================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			sys);	/*!< in: DATA_ROW_ID, ... */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
 Returns the minimum data size of an index record.
@@ -704,7 +865,8 @@ UNIV_INLINE
 ulint
 dict_index_get_min_size(
 /*====================*/
-	const dict_index_t*	index);	/*!< in: index */
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Check whether the table uses the compact page format.
@@ -713,7 +875,8 @@ UNIV_INLINE
 ibool
 dict_table_is_comp(
 /*===============*/
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Determine the file format of a table.
 @return	file format version */
@@ -721,23 +884,53 @@ UNIV_INLINE
 ulint
 dict_table_get_format(
 /*==================*/
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Determine the file format from a dict_table_t::flags.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+	ulint		flags)		/*!< in: dict_table_t::flags */
+	__attribute__((warn_unused_result));
 /********************************************************************//**
-Set the file format of a table. */
+Set the various values in a dict_table_t::flags pointer. */
 UNIV_INLINE
 void
-dict_table_set_format(
-/*==================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	ulint		format);/*!< in: file format version */
+dict_tf_set(
+/*========*/
+	ulint*		flags,		/*!< in/out: table flags */
+	rec_format_t	format,		/*!< in: file format */
+	ulint		zip_ssize,	/*!< in: zip shift size */
+	bool		remote_path)	/*!< in: table uses DATA DIRECTORY */
+	__attribute__((nonnull));
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field.  The following chart shows
+the translation of the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@return	tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+	__attribute__((const));
 /********************************************************************//**
 Extract the compressed page size from table flags.
 @return	compressed page size, or 0 if not compressed */
 UNIV_INLINE
 ulint
-dict_table_flags_to_zip_size(
-/*=========================*/
-	ulint	flags)	/*!< in: flags */
+dict_tf_get_zip_size(
+/*=================*/
+	ulint	flags)			/*!< in: flags */
 	__attribute__((const));
 /********************************************************************//**
 Check whether the table uses the compressed compact page format.
@@ -746,7 +939,8 @@ UNIV_INLINE
 ulint
 dict_table_zip_size(
 /*================*/
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Obtain exclusive locks on all index trees of the table. This is to prevent
@@ -756,15 +950,16 @@ UNIV_INLINE
 void
 dict_table_x_lock_indexes(
 /*======================*/
-	dict_table_t*	table);	/*!< in: table */
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Release the exclusive locks on all index tree. */
 UNIV_INLINE
 void
 dict_table_x_unlock_indexes(
 /*========================*/
-	dict_table_t*	table);	/*!< in: table */
-#endif /* !UNIV_HOTBACKUP */
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
 /********************************************************************//**
 Checks if a column is in the ordering columns of the clustered index of a
 table. Column prefixes are treated like whole columns.
@@ -774,8 +969,17 @@ ibool
 dict_table_col_in_clustered_key(
 /*============================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			n);	/*!< in: column number */
-#ifndef UNIV_HOTBACKUP
+	ulint			n)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+	dict_table_t*   table)		/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Copies types of columns contained in table to tuple and sets all
 fields of the tuple to the SQL NULL value.  This function should
@@ -785,7 +989,20 @@ void
 dict_table_copy_types(
 /*==================*/
 	dtuple_t*		tuple,	/*!< in/out: data tuple */
-	const dict_table_t*	table);	/*!< in: table */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/********************************************************************
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/* in: table */
+	ulint		delay)	/* in: time in microseconds to wait between
+				checks of bg_threads. */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Looks for an index with the given id. NOTE that we do not reserve
 the dictionary mutex: this function is for emergency purposes like
@@ -795,21 +1012,34 @@ UNIV_INTERN
 dict_index_t*
 dict_index_find_on_id_low(
 /*======================*/
-	index_id_t	id);	/*!< in: index id */
+	index_id_t	id)	/*!< in: index id */
+	__attribute__((warn_unused_result));
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of an FK relationship and should not currently be used in
+any user transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+	ulint		max_tables,	/*!< in: max tables allowed in cache */
+	ulint		pct_check);	/*!< in: max percent to check */
 /**********************************************************************//**
 Adds an index to the dictionary cache.
 @return	DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
 UNIV_INTERN
-ulint
+dberr_t
 dict_index_add_to_cache(
 /*====================*/
 	dict_table_t*	table,	/*!< in: table on which the index is */
 	dict_index_t*	index,	/*!< in, own: index; NOTE! The index memory
 				object is freed in this function! */
 	ulint		page_no,/*!< in: root page number of the index */
-	ibool		strict);/*!< in: TRUE=refuse to create the index
+	ibool		strict)	/*!< in: TRUE=refuse to create the index
 				if records could be too big to fit in
 				an B-tree page */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Removes an index from the dictionary cache. */
 UNIV_INTERN
@@ -817,7 +1047,8 @@ void
 dict_index_remove_from_cache(
 /*=========================*/
 	dict_table_t*	table,	/*!< in/out: table */
-	dict_index_t*	index);	/*!< in, own: index */
+	dict_index_t*	index)	/*!< in, own: index */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index,
@@ -827,9 +1058,10 @@ UNIV_INLINE
 ulint
 dict_index_get_n_fields(
 /*====================*/
-	const dict_index_t*	index);	/*!< in: an internal
+	const dict_index_t*	index)	/*!< in: an internal
 					representation of index (in
 					the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index
 that uniquely determine the position of an index entry in the index, if
@@ -840,8 +1072,9 @@ UNIV_INLINE
 ulint
 dict_index_get_n_unique(
 /*====================*/
-	const dict_index_t*	index);	/*!< in: an internal representation
+	const dict_index_t*	index)	/*!< in: an internal representation
 					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index
 which uniquely determine the position of an index entry in the index, if
@@ -851,8 +1084,9 @@ UNIV_INLINE
 ulint
 dict_index_get_n_unique_in_tree(
 /*============================*/
-	const dict_index_t*	index);	/*!< in: an internal representation
+	const dict_index_t*	index)	/*!< in: an internal representation
 					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the number of user-defined ordering fields in the index. In the internal
 representation we add the row id to the ordering fields to make all indexes
@@ -863,8 +1097,9 @@ UNIV_INLINE
 ulint
 dict_index_get_n_ordering_defined_by_user(
 /*======================================*/
-	const dict_index_t*	index);	/*!< in: an internal representation
+	const dict_index_t*	index)	/*!< in: an internal representation
 					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
 #ifdef UNIV_DEBUG
 /********************************************************************//**
 Gets the nth field of an index.
@@ -874,7 +1109,8 @@ dict_field_t*
 dict_index_get_nth_field(
 /*=====================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			pos);	/*!< in: position of field */
+	ulint			pos)	/*!< in: position of field */
+	__attribute__((nonnull, warn_unused_result));
 #else /* UNIV_DEBUG */
 # define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
 #endif /* UNIV_DEBUG */
@@ -886,7 +1122,8 @@ const dict_col_t*
 dict_index_get_nth_col(
 /*===================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			pos);	/*!< in: position of the field */
+	ulint			pos)	/*!< in: position of the field */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Gets the column number of the nth field in an index.
 @return	column number */
@@ -895,17 +1132,19 @@ ulint
 dict_index_get_nth_col_no(
 /*======================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			pos);	/*!< in: position of the field */
+	ulint			pos)	/*!< in: position of the field */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Looks for column n in an index.
 @return position in internal representation of the index;
 ULINT_UNDEFINED if not contained */
-UNIV_INTERN
+UNIV_INLINE
 ulint
 dict_index_get_nth_col_pos(
 /*=======================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			n);	/*!< in: column number */
+	ulint			n)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Looks for column n in an index.
 @return position in internal representation of the index;
@@ -916,8 +1155,9 @@ dict_index_get_nth_col_or_prefix_pos(
 /*=================================*/
 	const dict_index_t*	index,		/*!< in: index */
 	ulint			n,		/*!< in: column number */
-	ibool			inc_prefix);	/*!< in: TRUE=consider
+	ibool			inc_prefix)	/*!< in: TRUE=consider
 						column prefixes too */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Returns TRUE if the index contains a column or a prefix of that column.
 @return	TRUE if contains the column or its prefix */
@@ -926,7 +1166,8 @@ ibool
 dict_index_contains_col_or_prefix(
 /*==============================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			n);	/*!< in: column number */
+	ulint			n)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Looks for a matching field in an index. The column has to be the same. The
 column in index must be complete, or must contain a prefix longer than the
@@ -940,7 +1181,8 @@ dict_index_get_nth_field_pos(
 /*=========================*/
 	const dict_index_t*	index,	/*!< in: index from which to search */
 	const dict_index_t*	index2,	/*!< in: index */
-	ulint			n);	/*!< in: field number in index2 */
+	ulint			n)	/*!< in: field number in index2 */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Looks for column n position in the clustered index.
 @return	position in internal representation of the clustered index */
@@ -949,7 +1191,8 @@ ulint
 dict_table_get_nth_col_pos(
 /*=======================*/
 	const dict_table_t*	table,	/*!< in: table */
-	ulint			n);	/*!< in: column number */
+	ulint			n)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Returns the position of a system column in an index.
 @return	position, ULINT_UNDEFINED if not contained */
@@ -958,7 +1201,8 @@ ulint
 dict_index_get_sys_col_pos(
 /*=======================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			type);	/*!< in: DATA_ROW_ID, ... */
+	ulint			type)	/*!< in: DATA_ROW_ID, ... */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Adds a column to index. */
 UNIV_INTERN
@@ -968,7 +1212,8 @@ dict_index_add_col(
 	dict_index_t*		index,		/*!< in/out: index */
 	const dict_table_t*	table,		/*!< in: table */
 	dict_col_t*		col,		/*!< in: column */
-	ulint			prefix_len);	/*!< in: column prefix length */
+	ulint			prefix_len)	/*!< in: column prefix length */
+	__attribute__((nonnull));
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
 Copies types of fields contained in index to tuple. */
@@ -978,8 +1223,9 @@ dict_index_copy_types(
 /*==================*/
 	dtuple_t*		tuple,		/*!< in/out: data tuple */
 	const dict_index_t*	index,		/*!< in: index */
-	ulint			n_fields);	/*!< in: number of
+	ulint			n_fields)	/*!< in: number of
 						field types to copy */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Gets the field column.
@@ -988,7 +1234,8 @@ UNIV_INLINE
 const dict_col_t*
 dict_field_get_col(
 /*===============*/
-	const dict_field_t*	field);	/*!< in: index field */
+	const dict_field_t*	field)	/*!< in: index field */
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Returns an index object if it is found in the dictionary cache.
@@ -998,7 +1245,8 @@ UNIV_INTERN
 dict_index_t*
 dict_index_get_if_in_cache_low(
 /*===========================*/
-	index_id_t	index_id);	/*!< in: index id */
+	index_id_t	index_id)	/*!< in: index id */
+	__attribute__((warn_unused_result));
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /**********************************************************************//**
 Returns an index object if it is found in the dictionary cache.
@@ -1007,7 +1255,8 @@ UNIV_INTERN
 dict_index_t*
 dict_index_get_if_in_cache(
 /*=======================*/
-	index_id_t	index_id);	/*!< in: index id */
+	index_id_t	index_id)	/*!< in: index id */
+	__attribute__((warn_unused_result));
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #ifdef UNIV_DEBUG
 /**********************************************************************//**
@@ -1019,7 +1268,17 @@ ibool
 dict_index_check_search_tuple(
 /*==========================*/
 	const dict_index_t*	index,	/*!< in: index tree */
-	const dtuple_t*		tuple);	/*!< in: tuple used in a search */
+	const dtuple_t*		tuple)	/*!< in: tuple used in a search */
+	__attribute__((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+	/** Require all indexes to be complete. */
+	CHECK_ALL_COMPLETE,
+	/** Allow aborted online index creation. */
+	CHECK_ABORTED_OK,
+	/** Allow partial indexes to exist. */
+	CHECK_PARTIAL_OK
+};
 /**********************************************************************//**
 Check for duplicate index entries in a table [using the index name] */
 UNIV_INTERN
@@ -1028,8 +1287,9 @@ dict_table_check_for_dup_indexes(
 /*=============================*/
 	const dict_table_t*	table,	/*!< in: Check for dup indexes
 					in this table */
-	ibool			tmp_ok);/*!< in: TRUE=allow temporary
-					index names */
+	enum check_name		check)	/*!< in: whether and when to allow
+					temporary index names */
+	__attribute__((nonnull));
 #endif /* UNIV_DEBUG */
 /**********************************************************************//**
 Builds a node pointer out of a physical record and a page number.
@@ -1045,8 +1305,9 @@ dict_index_build_node_ptr(
 					pointer */
 	mem_heap_t*		heap,	/*!< in: memory heap where pointer
 					created */
-	ulint			level);	/*!< in: level of rec in tree:
+	ulint			level)	/*!< in: level of rec in tree:
 					0 means leaf level */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Copies an initial segment of a physical record, long enough to specify an
 index entry uniquely.
@@ -1061,7 +1322,8 @@ dict_index_copy_rec_order_prefix(
 	ulint*			n_fields,/*!< out: number of fields copied */
 	byte**			buf,	/*!< in/out: memory buffer for the
 					copied prefix, or NULL */
-	ulint*			buf_size);/*!< in/out: buffer size */
+	ulint*			buf_size)/*!< in/out: buffer size */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Builds a typed data tuple out of a physical record.
 @return	own: data tuple */
@@ -1072,7 +1334,8 @@ dict_index_build_data_tuple(
 	dict_index_t*	index,	/*!< in: index */
 	rec_t*		rec,	/*!< in: record for which to build data tuple */
 	ulint		n_fields,/*!< in: number of data fields */
-	mem_heap_t*	heap);	/*!< in: memory heap where tuple created */
+	mem_heap_t*	heap)	/*!< in: memory heap where tuple created */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets the space id of the root of the index tree.
 @return	space id */
@@ -1080,7 +1343,8 @@ UNIV_INLINE
 ulint
 dict_index_get_space(
 /*=================*/
-	const dict_index_t*	index);	/*!< in: index */
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets the space id of the root of the index tree. */
 UNIV_INLINE
@@ -1088,7 +1352,8 @@ void
 dict_index_set_space(
 /*=================*/
 	dict_index_t*	index,	/*!< in/out: index */
-	ulint		space);	/*!< in: space id */
+	ulint		space)	/*!< in: space id */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Gets the page number of the root of the index tree.
 @return	page number */
@@ -1096,15 +1361,17 @@ UNIV_INLINE
 ulint
 dict_index_get_page(
 /*================*/
-	const dict_index_t*	tree);	/*!< in: index */
+	const dict_index_t*	tree)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Gets the read-write lock of the index tree.
 @return	read-write lock */
 UNIV_INLINE
-rw_lock_t*
+prio_rw_lock_t*
 dict_index_get_lock(
 /*================*/
-	dict_index_t*	index);	/*!< in: index */
+	dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Returns free space reserved for future updates of records. This is
 relevant only in the case of many consecutive inserts, as updates
@@ -1114,49 +1381,48 @@ UNIV_INLINE
 ulint
 dict_index_get_space_reserve(void);
 /*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+	const dict_index_t*	index)	/*!< in: secondary index */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+	dict_index_t*			index,	/*!< in/out: index */
+	enum online_index_status	status)	/*!< in: status */
+	__attribute__((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Calculates the minimum record length in an index. */
 UNIV_INTERN
 ulint
 dict_index_calc_min_rec_len(
 /*========================*/
-	const dict_index_t*	index);	/*!< in: index */
-
-/** Calculate new statistics if 1 / 16 of table has been modified
-since the last time a statistics batch was run.
-We calculate statistics at most every 16th round, since we may have
-a counter table which is very small and updated very often.
-@param t table
-@return true if the table has changed too much and stats need to be
-recalculated
-*/
-#define DICT_TABLE_CHANGED_TOO_MUCH(t) \
-	((ib_int64_t) (t)->stat_modified_counter > 16 + (t)->stat_n_rows / 16)
-
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		only_calc_if_missing_stats,/*!< in: only
-					update/recalc the stats if they have
-					not been initialized yet, otherwise
-					do nothing */
-	ibool		sync,
-	ibool		only_calc_if_changed_too_much);/*!< in: only
-					update/recalc the stats if the table
-					has been changed too much since the
-					last stats update/recalc */
-/*********************************************************************//**
-*/
-UNIV_INTERN
-ibool
-dict_is_older_statistics(
-/*=====================*/
-	dict_index_t*	index);
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
 /********************************************************************//**
 Reserves the dictionary system mutex for MySQL. */
 UNIV_INTERN
@@ -1178,8 +1444,9 @@ void
 dict_table_stats_lock(
 /*==================*/
 	const dict_table_t*	table,		/*!< in: table */
-	ulint			latch_mode);	/*!< in: RW_S_LATCH or
+	ulint			latch_mode)	/*!< in: RW_S_LATCH or
 						RW_X_LATCH */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Unlock the latch that has been locked by dict_table_stats_lock() */
 UNIV_INTERN
@@ -1187,8 +1454,9 @@ void
 dict_table_stats_unlock(
 /*====================*/
 	const dict_table_t*	table,		/*!< in: table */
-	ulint			latch_mode);	/*!< in: RW_S_LATCH or
+	ulint			latch_mode)	/*!< in: RW_S_LATCH or
 						RW_X_LATCH */
+	__attribute__((nonnull));
 /********************************************************************//**
 Checks if the database name in two table names is the same.
 @return	TRUE if same db name */
@@ -1198,8 +1466,9 @@ dict_tables_have_same_db(
 /*=====================*/
 	const char*	name1,	/*!< in: table name in the form
 				dbname '/' tablename */
-	const char*	name2);	/*!< in: table name in the form
+	const char*	name2)	/*!< in: table name in the form
 				dbname '/' tablename */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Removes an index from the cache */
 UNIV_INTERN
@@ -1207,7 +1476,8 @@ void
 dict_index_remove_from_cache(
 /*=========================*/
 	dict_table_t*	table,	/*!< in/out: table */
-	dict_index_t*	index);	/*!< in, own: index */
+	dict_index_t*	index)	/*!< in, own: index */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Get index by name
 @return	index, NULL if does not exist */
@@ -1216,7 +1486,8 @@ dict_index_t*
 dict_table_get_index_on_name(
 /*=========================*/
 	dict_table_t*	table,	/*!< in: table */
-	const char*	name);	/*!< in: name of the index to find */
+	const char*	name)	/*!< in: name of the index to find */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 In case there is more than one index with the same name return the index
 with the min(id).
@@ -1226,17 +1497,53 @@ dict_index_t*
 dict_table_get_index_on_name_and_min_id(
 /*====================================*/
 	dict_table_t*	table,	/*!< in: table */
-	const char*	name);	/*!< in: name of the index to find */
-
+	const char*	name)	/*!< in: name of the index to find */
+	__attribute__((nonnull, warn_unused_result));
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+				/* out: ULINT_UNDEFINED if no match else
+				the offset within the vector */
+	ib_vector_t*	indexes,/* in: vector containing only FTS indexes */
+	ulint		col_no)	/* in: col number to search for */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Move a table to the non LRU end of the LRU list. */
 UNIV_INTERN
 void
-dict_table_LRU_trim(
-/*================*/
-	dict_table_t*	self);
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from LRU to non-LRU */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from non-LRU to LRU */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Move to the most recently used segment of the LRU list. */
+UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+	dict_table_t*	table)	/*!< in: table to move to MRU */
+	__attribute__((nonnull));
+
+/** Maximum number of columns in a foreign key constraint. Please note that
+MySQL has a much lower limit on the number of columns allowed in a foreign
+key constraint */
+#define MAX_NUM_FK_COLUMNS		500
+
 /* Buffers for storing detailed information about the latest foreign key
 and unique key errors */
 extern FILE*	dict_foreign_err_file;
-extern mutex_t	dict_foreign_err_mutex; /* mutex protecting the buffers */
+extern ib_mutex_t	dict_foreign_err_mutex; /* mutex protecting the buffers */
 
 /** the dictionary system */
 extern dict_sys_t*	dict_sys;
@@ -1244,8 +1551,8 @@ extern dict_sys_t*	dict_sys;
 extern rw_lock_t	dict_operation_lock;
 
 /* Dictionary system struct */
-struct dict_sys_struct{
-	mutex_t		mutex;		/*!< mutex protecting the data
+struct dict_sys_t{
+	ib_prio_mutex_t		mutex;		/*!< mutex protecting the data
 					dictionary; protects also the
 					disk-based dictionary system tables;
 					this mutex serializes CREATE TABLE
@@ -1262,8 +1569,6 @@ struct dict_sys_struct{
 					on name */
 	hash_table_t*	table_id_hash;	/*!< hash table of the tables, based
 					on id */
-	UT_LIST_BASE_NODE_T(dict_table_t)
-			table_LRU;	/*!< LRU list of tables */
 	ulint		size;		/*!< varying space in bytes occupied
 					by the data dictionary table and
 					index objects */
@@ -1271,7 +1576,14 @@ struct dict_sys_struct{
 	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
 	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
 	dict_table_t*	sys_fields;	/*!< SYS_FIELDS table */
-	dict_table_t*	sys_stats;	/*!< SYS_STATS table */
+
+	/*=============================*/
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_LRU;	/*!< List of tables that can be evicted
+					from the cache */
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_non_LRU;	/*!< List of tables that can't be
+					evicted from the cache */
 };
 #endif /* !UNIV_HOTBACKUP */
 
@@ -1287,6 +1599,80 @@ void
 dict_ind_init(void);
 /*===============*/
 
+/* Auxiliary structs for checking a table definition @{ */
+
+/* This struct is used to specify the name and type that a column must
+have when checking a table's schema. */
+struct dict_col_meta_t {
+	const char*	name;		/* column name */
+	ulint		mtype;		/* required column main type */
+	ulint		prtype_mask;	/* required column precise type mask;
+					if this is non-zero then all the
+					bits it has set must also be set
+					in the column's prtype */
+	ulint		len;		/* required column length */
+};
+
+/* This struct is used for checking whether a given table exists and
+whether it has a predefined schema (number of columns and columns names
+and types) */
+struct dict_table_schema_t {
+	const char*		table_name;	/* the name of the table whose
+						structure we are checking */
+	ulint			n_cols;		/* the number of columns the
+						table must have */
+	dict_col_meta_t*	columns;	/* metadata for the columns;
+						this array has n_cols
+						elements */
+	ulint			n_foreign;	/* number of foreign keys this
+						table has, pointing to other
+						tables (where this table is
+						FK child) */
+	ulint			n_referenced;	/* number of foreign keys other
+						tables have, pointing to this
+						table (where this table is
+						parent) */
+};
+/* @} */
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+dberr_t
+dict_table_schema_check(
+/*====================*/
+	dict_table_schema_t*	req_schema,	/*!< in/out: required table
+						schema */
+	char*			errstr,		/*!< out: human readable error
+						message if != DB_SUCCESS and
+						!= DB_TABLE_NOT_FOUND is
+						returned */
+	size_t			errstr_sz)	/*!< in: errstr size */
+	__attribute__((nonnull, warn_unused_result));
+/* @} */
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+UNIV_INTERN
+void
+dict_fs2utf8(
+/*=========*/
+	const char*	db_and_table,	/*!< in: database and table names,
+					e.g. d@i1b/a@q1b@1Kc */
+	char*		db_utf8,	/*!< out: database name, e.g. dцb */
+	size_t		db_utf8_size,	/*!< in: db_utf8 size */
+	char*		table_utf8,	/*!< out: table name, e.g. aюbØc */
+	size_t		table_utf8_size)/*!< in: table_utf8 size */
+	__attribute__((nonnull));
+
 /**********************************************************************//**
 Closes the data dictionary module. */
 UNIV_INTERN
@@ -1302,7 +1688,7 @@ ulint
 dict_table_is_corrupted(
 /*====================*/
 	const dict_table_t*	table)	/*!< in: table */
-	__attribute__((nonnull, pure, warn_unused_result));
+	__attribute__((nonnull, warn_unused_result));
 
 /**********************************************************************//**
 Check whether the index is corrupted.
@@ -1312,7 +1698,7 @@ ulint
 dict_index_is_corrupted(
 /*====================*/
 	const dict_index_t*	index)	/*!< in: index */
-	__attribute__((nonnull, pure, warn_unused_result));
+	__attribute__((nonnull, warn_unused_result));
 
 #endif /* !UNIV_HOTBACKUP */
 /**********************************************************************//**
@@ -1322,7 +1708,9 @@ UNIV_INTERN
 void
 dict_set_corrupted(
 /*===============*/
-	dict_index_t*	index)		/*!< in/out: index */
+	dict_index_t*	index,	/*!< in/out: index */
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	ctx)	/*!< in: context */
 	UNIV_COLD __attribute__((nonnull));
 
 /**********************************************************************//**
@@ -1334,7 +1722,8 @@ void
 dict_set_corrupted_index_cache_only(
 /*================================*/
 	dict_index_t*	index,		/*!< in/out: index */
-	dict_table_t*	table);		/*!< in/out: table */
+	dict_table_t*	table)		/*!< in/out: table */
+	__attribute__((nonnull));
 
 /**********************************************************************//**
 Flags a table with specified space_id corrupted in the table dictionary
@@ -1346,6 +1735,75 @@ dict_set_corrupted_by_space(
 /*========================*/
 	ulint		space_id);	/*!< in: space ID */
 
+/********************************************************************//**
+Validate the table flags.
+@return	true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+/*=============*/
+	ulint		flags)		/*!< in: table flags */
+	__attribute__((warn_unused_result));
+
+/********************************************************************//**
+Check if the tablespace for the table has been discarded.
+@return	true if the tablespace has been discarded. */
+UNIV_INLINE
+bool
+dict_table_is_discarded(
+/*====================*/
+	const dict_table_t*	table)	/*!< in: table to check */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+/********************************************************************//**
+Check if it is a temporary table.
+@return	true if temporary table flag is set. */
+UNIV_INLINE
+bool
+dict_table_is_temporary(
+/*====================*/
+	const dict_table_t*	table)	/*!< in: table to check */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_success(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
+	__attribute__((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_failure(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which a page may not compress */
+UNIV_INTERN
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+	dict_index_t*	index)	/*!< in: index for which page size
+				is requested */
+	__attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name */
+UNIV_INTERN
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+	ulint	table_flag);		/*!< in: row format setting */
+
+#endif /* !UNIV_HOTBACKUP */
 /*************************************************************************
 set is_corrupt flag by space_id*/
 
diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic
index 1d2eb34042d..c261d6a3aee 100644
--- a/storage/xtradb/include/dict0dict.ic
+++ b/storage/xtradb/include/dict0dict.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,7 +27,9 @@ Created 1/8/1996 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "dict0load.h"
 #include "rem0types.h"
+#include "fsp0fsp.h"
 #include "srv0srv.h"
+#include "sync0rw.h" /* RW_S_LATCH */
 
 /*********************************************************************//**
 Gets the minimum number of bytes per character.
@@ -103,7 +105,7 @@ dict_col_type_assert_equal(
 
 	ut_ad(col->mtype == type->mtype);
 	ut_ad(col->prtype == type->prtype);
-	ut_ad(col->len == type->len);
+	//ut_ad(col->len == type->len);
 # ifndef UNIV_HOTBACKUP
 	ut_ad(col->mbminmaxlen == type->mbminmaxlen);
 # endif /* !UNIV_HOTBACKUP */
@@ -145,7 +147,7 @@ ulint
 dict_col_get_fixed_size(
 /*====================*/
 	const dict_col_t*	col,	/*!< in: column */
-	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
 {
 	return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
 					col->mbminmaxlen, comp));
@@ -222,6 +224,22 @@ dict_table_get_first_index(
 }
 
 /********************************************************************//**
+Gets the last index on the table.
+@return	index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+				->indexes));
+}
+
+/********************************************************************//**
 Gets the next index on the table.
 @return	index, NULL if none left */
 UNIV_INLINE
@@ -250,7 +268,7 @@ dict_index_is_clust(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_CLUSTERED));
+	return(index->type & DICT_CLUSTERED);
 }
 /********************************************************************//**
 Check whether the index is unique.
@@ -264,7 +282,7 @@ dict_index_is_unique(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_UNIQUE));
+	return(index->type & DICT_UNIQUE);
 }
 
 /********************************************************************//**
@@ -279,7 +297,22 @@ dict_index_is_ibuf(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_IBUF));
+	return(index->type & DICT_IBUF);
+}
+
+/********************************************************************//**
+Check whether the index is a universal index tree.
+@return	nonzero for universal tree, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_univ(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->type & DICT_UNIVERSAL);
 }
 
 /********************************************************************//**
@@ -298,7 +331,7 @@ dict_index_is_sec_or_ibuf(
 
 	type = index->type;
 
-	return(UNIV_LIKELY(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)));
+	return(!(type & DICT_CLUSTERED) || (type & DICT_IBUF));
 }
 
 /********************************************************************//**
@@ -349,6 +382,56 @@ dict_table_get_n_cols(
 	return(table->n_cols);
 }
 
+/********************************************************************//**
+Gets the approximately estimated number of rows in the table.
+@return	estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table->stat_initialized);
+
+	return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	if (table->stat_initialized) {
+		ib_uint64_t	n_rows = table->stat_n_rows;
+		if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+			table->stat_n_rows = n_rows + 1;
+		}
+	}
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	if (table->stat_initialized) {
+		ib_uint64_t	n_rows = table->stat_n_rows;
+		if (n_rows > 0) {
+			table->stat_n_rows = n_rows - 1;
+		}
+	}
+}
+
 #ifdef UNIV_DEBUG
 /********************************************************************//**
 Gets the nth column of a table.
@@ -420,11 +503,196 @@ dict_table_is_comp(
 {
 	ut_ad(table);
 
-#if DICT_TF_COMPACT != TRUE
-#error
+#if DICT_TF_COMPACT != 1
+#error "DICT_TF_COMPACT must be 1"
 #endif
 
-	return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+	return(table->flags & DICT_TF_COMPACT);
+}
+
+/************************************************************************
+Check if the table has an FTS index. */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+				/* out: TRUE if table has an FTS index */
+	dict_table_t*   table)  /* in: table */
+{
+	ut_ad(table);
+
+	return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/********************************************************************//**
+Validate the table flags.
+@return	true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+/*=============*/
+	ulint	flags)		/*!< in: table flags */
+{
+	ulint	compact = DICT_TF_GET_COMPACT(flags);
+	ulint	zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+	ulint	unused = DICT_TF_GET_UNUSED(flags);
+
+	/* Make sure there are no bits that we do not know about. */
+	if (unused != 0) {
+
+		return(false);
+
+	} else if (atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing keys in secondary indexes to be made from
+		data stored off-page in the clustered index. */
+
+		if (!compact) {
+			return(false);
+		}
+
+	} else if (zip_ssize) {
+
+		/* Antelope does not support COMPRESSED row format. */
+		return(false);
+	}
+
+	if (zip_ssize) {
+
+		/* COMPRESSED row format must have compact and atomic_blobs
+		bits set and validate the number is within allowed range. */
+
+		if (!compact
+		    || !atomic_blobs
+		    || zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+
+			return(false);
+		}
+	}
+
+	/* CREATE TABLE ... DATA DIRECTORY is supported for any row format,
+	so the DATA_DIR flag is compatible with all other table flags. */
+
+	return(true);
+}
+
+/********************************************************************//**
+Validate a SYS_TABLES TYPE field and return it.
+@return	Same as input after validating it as a SYS_TABLES TYPE field.
+If there is an error, return ULINT_UNDEFINED. */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_validate(
+/*==========================*/
+	ulint	type,		/*!< in: SYS_TABLES.TYPE */
+	ulint	n_cols)		/*!< in: SYS_TABLES.N_COLS */
+{
+	ulint	low_order_bit = DICT_TF_GET_COMPACT(type);
+	ulint	redundant = !(n_cols & DICT_N_COLS_COMPACT);
+	ulint	zip_ssize = DICT_TF_GET_ZIP_SSIZE(type);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type);
+	ulint	unused = DICT_TF_GET_UNUSED(type);
+
+	/* The low order bit of SYS_TABLES.TYPE is always set to 1.
+	If the format is UNIV_FORMAT_B or higher, this field is the same
+	as dict_table_t::flags. Zero is not allowed here. */
+	if (!low_order_bit) {
+		return(ULINT_UNDEFINED);
+	}
+
+	if (redundant) {
+		if (zip_ssize || atomic_blobs) {
+			return(ULINT_UNDEFINED);
+		}
+	}
+
+	/* Make sure there are no bits that we do not know about. */
+	if (unused) {
+		return(ULINT_UNDEFINED);
+	}
+
+	if (atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing keys in secondary indexes to be made from
+		data stored off-page in the clustered index.
+
+		The DICT_N_COLS_COMPACT flag should be in N_COLS,
+		but we already know that. */
+
+	} else if (zip_ssize) {
+		/* Antelope does not support COMPRESSED format. */
+		return(ULINT_UNDEFINED);
+	}
+
+	if (zip_ssize) {
+		/* COMPRESSED row format must have low_order_bit and
+		atomic_blobs bits set and the DICT_N_COLS_COMPACT flag
+		should be in N_COLS, but we already know about the
+		low_order_bit and DICT_N_COLS_COMPACT flags. */
+		if (!atomic_blobs) {
+			return(ULINT_UNDEFINED);
+		}
+
+		/* Validate that the number is within allowed range. */
+		if (zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+			return(ULINT_UNDEFINED);
+		}
+	}
+
+	/* There is nothing to validate for the data_dir field.
+	CREATE TABLE ... DATA DIRECTORY is supported for any row
+	format, so the DATA_DIR flag is compatible with any other
+	table flags. However, it is not used with TEMPORARY tables.*/
+
+	/* Return the validated SYS_TABLES.TYPE. */
+	return(type);
+}
+
+/********************************************************************//**
+Determine the record format from dict_table_t::flags
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set.
+@return	record format (one of the REC_FORMAT_* values) */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+	ulint		flags)	/*!< in: dict_table_t::flags */
+{
+	ut_a(dict_tf_is_valid(flags));
+
+	if (!DICT_TF_GET_COMPACT(flags)) {
+		return(REC_FORMAT_REDUNDANT);
+	}
+
+	if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		return(REC_FORMAT_COMPACT);
+	}
+
+	if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+		return(REC_FORMAT_COMPRESSED);
+	}
+
+	return(REC_FORMAT_DYNAMIC);
+}
+
+/********************************************************************//**
+Determine the file format from a dict_table_t::flags.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+	ulint		flags)	/*!< in: dict_table_t::flags */
+{
+	if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		return(UNIV_FORMAT_B);
+	}
+
+	return(UNIV_FORMAT_A);
 }
 
 /********************************************************************//**
@@ -438,41 +706,166 @@ dict_table_get_format(
 {
 	ut_ad(table);
 
-	return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+	return(dict_tf_get_format(table->flags));
 }
 
 /********************************************************************//**
-Determine the file format of a table. */
+Set the record format and zip size in dict_table_t::flags.  If zip size
+is not needed, it should be 0. */
 UNIV_INLINE
 void
-dict_table_set_format(
-/*==================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	ulint		format)	/*!< in: file format version */
+dict_tf_set(
+/*========*/
+	ulint*		flags,		/*!< in/out: table flags */
+	rec_format_t	format,		/*!< in: record format */
+	ulint		zip_ssize,	/*!< in: zip shift size */
+	bool		use_data_dir)	/*!< in: table uses DATA DIRECTORY */
 {
-	ut_ad(table);
+	switch (format) {
+	case REC_FORMAT_REDUNDANT:
+		*flags = 0;
+		ut_ad(zip_ssize == 0);
+		break;
+	case REC_FORMAT_COMPACT:
+		*flags = DICT_TF_COMPACT;
+		ut_ad(zip_ssize == 0);
+		break;
+	case REC_FORMAT_COMPRESSED:
+		*flags = DICT_TF_COMPACT
+			| (1 << DICT_TF_POS_ATOMIC_BLOBS)
+			| (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+		break;
+	case REC_FORMAT_DYNAMIC:
+		*flags = DICT_TF_COMPACT
+			| (1 << DICT_TF_POS_ATOMIC_BLOBS);
+		ut_ad(zip_ssize == 0);
+		break;
+	}
 
-	table->flags = (table->flags & ~DICT_TF_FORMAT_MASK)
-		| (format << DICT_TF_FORMAT_SHIFT);
+	if (use_data_dir) {
+		*flags |= (1 << DICT_TF_POS_DATA_DIR);
+	}
 }
 
 /********************************************************************//**
-Extract the compressed page size from table flags.
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field.  The following chart shows
+the translation of the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@return	tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+	ulint	table_flags)	/*!< in: dict_table_t::flags */
+{
+	ulint fsp_flags;
+
+	DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
+			return(ULINT_UNDEFINED););
+
+	/* Adjust bit zero. */
+	fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+	/* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+	fsp_flags |= table_flags & DICT_TF_MASK_ZIP_SSIZE;
+	fsp_flags |= table_flags & DICT_TF_MASK_ATOMIC_BLOBS;
+
+	/* In addition, tablespace flags also contain the page size. */
+	fsp_flags |= fsp_flags_set_page_size(fsp_flags, UNIV_PAGE_SIZE);
+
+	/* The DATA_DIR flag is in a different position in fsp_flag */
+	fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags)
+		     ? FSP_FLAGS_MASK_DATA_DIR : 0;
+
+	ut_a(fsp_flags_is_valid(fsp_flags));
+
+	return(fsp_flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer from SYS_TABLES.TYPE to dict_table_t::flags
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |     1     |    1    |     1
+dict_table_t::flags |     0     |    1    |     1
+==================================================================
+@return	ulint containing dict_table_t::flags */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_to_tf(
+/*=======================*/
+	ulint	type,	/*!< in: SYS_TABLES.TYPE field */
+	ulint	n_cols)	/*!< in: SYS_TABLES.N_COLS field */
+{
+	ulint	flags;
+	ulint	redundant = !(n_cols & DICT_N_COLS_COMPACT);
+
+	/* Adjust bit zero. */
+	flags = redundant ? 0 : 1;
+
+	/* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+	flags |= type & (DICT_TF_MASK_ZIP_SSIZE
+			 | DICT_TF_MASK_ATOMIC_BLOBS
+			 | DICT_TF_MASK_DATA_DIR);
+
+	return(flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags |     0     |    1    |     1
+SYS_TABLES.TYPE     |     1     |    1    |     1
+==================================================================
+@return	ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+{
+	ulint type;
+
+	ut_a(dict_tf_is_valid(flags));
+
+	/* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+	type = 1;
+
+	/* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+	type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+			 | DICT_TF_MASK_ATOMIC_BLOBS
+			 | DICT_TF_MASK_DATA_DIR);
+
+	return(type);
+}
+
+/********************************************************************//**
+Extract the compressed page size from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
 @return	compressed page size, or 0 if not compressed */
 UNIV_INLINE
 ulint
-dict_table_flags_to_zip_size(
-/*=========================*/
+dict_tf_get_zip_size(
+/*=================*/
 	ulint	flags)	/*!< in: flags */
 {
-	ulint	zip_size = flags & DICT_TF_ZSSIZE_MASK;
+	ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+	ulint zip_size = (zip_ssize
+			  ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+			  : 0);
 
-	if (UNIV_UNLIKELY(zip_size)) {
-		zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
-			 << (zip_size >> DICT_TF_ZSSIZE_SHIFT));
-
-		ut_ad(zip_size <= UNIV_PAGE_SIZE);
-	}
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	return(zip_size);
 }
@@ -488,7 +881,7 @@ dict_table_zip_size(
 {
 	ut_ad(table);
 
-	return(dict_table_flags_to_zip_size(table->flags));
+	return(dict_tf_get_zip_size(table->flags));
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -535,6 +928,7 @@ dict_table_x_unlock_indexes(
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
+
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index,
 including fields added by the dictionary system.
@@ -644,7 +1038,7 @@ dict_index_get_sys_col_pos(
 {
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
-	ut_ad(!(index->type & DICT_UNIVERSAL));
+	ut_ad(!dict_index_is_univ(index));
 
 	if (dict_index_is_clust(index)) {
 
@@ -697,6 +1091,20 @@ dict_index_get_nth_col_no(
 	return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
 }
 
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n)	/*!< in: column number */
+{
+	return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
+}
+
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
 Returns the minimum data size of an index record.
@@ -767,7 +1175,7 @@ dict_index_get_page(
 Gets the read-write lock of the index tree.
 @return	read-write lock */
 UNIV_INLINE
-rw_lock_t*
+prio_rw_lock_t*
 dict_index_get_lock(
 /*================*/
 	dict_index_t*	index)	/*!< in: index */
@@ -791,119 +1199,133 @@ dict_index_get_space_reserve(void)
 	return(UNIV_PAGE_SIZE / 16);
 }
 
-/**********************************************************************//**
-Checks if a table is in the dictionary cache.
-@return	table, NULL if not found */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
 UNIV_INLINE
-dict_table_t*
-dict_table_check_if_in_cache_low(
-/*=============================*/
-	const char*	table_name)	/*!< in: table name */
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+	const dict_index_t*	index)	/*!< in: secondary index */
 {
-	dict_table_t*	table;
-	ulint		table_fold;
-
-	ut_ad(table_name);
-	ut_ad(mutex_own(&(dict_sys->mutex)));
+	enum online_index_status	status;
 
-	/* Look for the table name in the hash table */
-	table_fold = ut_fold_string(table_name);
+	status = (enum online_index_status) index->online_status;
 
-	HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
-		    dict_table_t*, table, ut_ad(table->cached),
-		    !strcmp(table->name, table_name));
+	/* Without the index->lock protection, the online
+	status can change from ONLINE_INDEX_CREATION to
+	ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+	row_log_apply() once log application is done. So to make
+	sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE
+	you should always do the recheck after acquiring index->lock */
 
-	/* make young in table_LRU */
-	if (table) {
-		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
-		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+#ifdef UNIV_DEBUG
+	switch (status) {
+	case ONLINE_INDEX_COMPLETE:
+	case ONLINE_INDEX_CREATION:
+	case ONLINE_INDEX_ABORTED:
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		return(status);
 	}
-
-	return(table);
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(status);
 }
 
-/**********************************************************************//**
-Gets a table; loads it to the dictionary cache if necessary. A low-level
-function.
-@return	table, NULL if not found */
+/********************************************************************//**
+Sets the status of online index creation. */
 UNIV_INLINE
-dict_table_t*
-dict_table_get_low(
-/*===============*/
-	const char*	table_name,	/*!< in: table name */
-	dict_err_ignore_t
-			ignore_err)	/*!< in: error to be ignored when
-					loading a table definition */
+void
+dict_index_set_online_status(
+/*=========================*/
+	dict_index_t*			index,	/*!< in/out: index */
+	enum online_index_status	status)	/*!< in: status */
 {
-	dict_table_t*	table;
+	ut_ad(!(index->type & DICT_FTS));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+	switch (dict_index_get_online_status(index)) {
+	case ONLINE_INDEX_COMPLETE:
+	case ONLINE_INDEX_CREATION:
+		break;
+	case ONLINE_INDEX_ABORTED:
+		ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+		break;
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
 
-	ut_ad(table_name);
-	ut_ad(mutex_own(&(dict_sys->mutex)));
+	index->online_status = status;
+	ut_ad(dict_index_get_online_status(index) == status);
+}
 
-	table = dict_table_check_if_in_cache_low(table_name);
-
-	if (table && table->corrupted
-	    && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
-		fprintf(stderr, "InnoDB: table");
-		ut_print_name(stderr, NULL, TRUE, table->name);
-		if (srv_load_corrupted) {
-			fputs(" is corrupted, but"
-			      " innodb_force_load_corrupted is set\n", stderr);
-		} else {
-			fputs(" is corrupted\n", stderr);
-			return(NULL);
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+#ifdef UNIV_DEBUG
+	if (dict_index_is_clust(index)) {
+		switch (dict_index_get_online_status(index)) {
+		case ONLINE_INDEX_CREATION:
+			return(true);
+		case ONLINE_INDEX_COMPLETE:
+			return(false);
+		case ONLINE_INDEX_ABORTED:
+		case ONLINE_INDEX_ABORTED_DROPPED:
+			break;
 		}
+		ut_ad(0);
+		return(false);
 	}
+#endif /* UNIV_DEBUG */
 
-	if (table == NULL) {
-		table = dict_load_table(table_name, TRUE, ignore_err);
-	}
-
-	ut_ad(!table || table->cached);
-
-	return(table);
+	return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+			     != ONLINE_INDEX_COMPLETE));
 }
 
 /**********************************************************************//**
-Returns a table object based on table id.
-@return	table, NULL if does not exist */
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
 UNIV_INLINE
-dict_table_t*
-dict_table_get_on_id_low(
+ulint
+dict_table_is_fts_column(
 /*=====================*/
-	table_id_t	table_id)	/*!< in: table id */
+	ib_vector_t*	indexes,/*!< in: vector containing only FTS indexes */
+	ulint		col_no)	/*!< in: col number to search for */
+
 {
-	dict_table_t*	table;
-	ulint		fold;
+	ulint		i;
 
-	ut_ad(mutex_own(&(dict_sys->mutex)));
+	for (i = 0; i < ib_vector_size(indexes); ++i) {
+		dict_index_t*	index;
 
-	/* Look for the table name in the hash table */
-	fold = ut_fold_ull(table_id);
+		index = (dict_index_t*) ib_vector_getp(indexes, i);
 
-	HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
-		    dict_table_t*, table, ut_ad(table->cached),
-		    table->id == table_id);
-	if (table == NULL) {
-		table = dict_load_table_on_id(table_id);
-	}
+		if (dict_index_contains_col_or_prefix(index, col_no)) {
 
-	/* make young in table_LRU */
-	if (table) {
-		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
-		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+			return(i);
+		}
 	}
 
-	ut_ad(!table || table->cached);
-
-	/* TODO: should get the type information from MySQL */
-
-	return(table);
+	return(ULINT_UNDEFINED);
 }
 
 /**********************************************************************//**
 Determine bytes of column prefix to be stored in the undo log. Please
-note if the table format is UNIV_FORMAT_A (< DICT_TF_FORMAT_ZIP), no prefix
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
 needs to be stored in the undo log.
 @return bytes of column prefix to be stored in the undo log */
 UNIV_INLINE
@@ -914,9 +1336,9 @@ dict_max_field_len_store_undo(
 	const dict_col_t*	col)	/*!< in: column which index prefix
 					is based on */
 {
-	ulint   prefix_len = 0;
+	ulint	prefix_len = 0;
 
-	if (dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP)
+	if (dict_table_get_format(table) >= UNIV_FORMAT_B)
 	{
 		prefix_len = col->max_prefix
 			? col->max_prefix
@@ -938,7 +1360,7 @@ dict_table_is_corrupted(
 	ut_ad(table);
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 
-	return(UNIV_UNLIKELY(table->corrupted));
+	return(table->corrupted);
 }
 
 /********************************************************************//**
@@ -953,8 +1375,32 @@ dict_index_is_corrupted(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY((index->type & DICT_CORRUPT)
-	       || (index->table && index->table->corrupted)));
+	return((index->type & DICT_CORRUPT)
+	       || (index->table && index->table->corrupted));
+}
+
+/********************************************************************//**
+Check if the tablespace for the table has been discarded.
+@return	true if the tablespace has been discarded. */
+UNIV_INLINE
+bool
+dict_table_is_discarded(
+/*====================*/
+	const dict_table_t*	table)	/*!< in: table to check */
+{
+	return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED));
+}
+
+/********************************************************************//**
+Check if it is a temporary table.
+@return	true if temporary table flag is set. */
+UNIV_INLINE
+bool
+dict_table_is_temporary(
+/*====================*/
+	const dict_table_t*	table)	/*!< in: table to check */
+{
+	return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY));
 }
 
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h
index 5bb015346ac..030190b1a8e 100644
--- a/storage/xtradb/include/dict0load.h
+++ b/storage/xtradb/include/dict0load.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,38 +29,46 @@ Created 4/24/1996 Heikki Tuuri
 
 #include "univ.i"
 #include "dict0types.h"
+#include "trx0types.h"
 #include "ut0byte.h"
 #include "mem0mem.h"
 #include "btr0types.h"
 
-/** enum that defines all 6 system table IDs */
-enum dict_system_table_id {
+/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */
+enum dict_system_id_t {
 	SYS_TABLES = 0,
 	SYS_INDEXES,
 	SYS_COLUMNS,
 	SYS_FIELDS,
 	SYS_FOREIGN,
 	SYS_FOREIGN_COLS,
-	SYS_STATS,
+	SYS_TABLESPACES,
+	SYS_DATAFILES,
 
 	/* This must be last item. Defines the number of system tables. */
 	SYS_NUM_SYSTEM_TABLES
 };
 
-typedef enum dict_system_table_id	dict_system_id_t;
-
-/** Status bit for dict_process_sys_tables_rec() */
-enum dict_table_info {
+/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */
+enum dict_table_info_t {
 	DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t
 					structure with information from
 					a SYS_TABLES record */
-	DICT_TABLE_LOAD_FROM_CACHE = 1,	/*!< Check first whether dict_table_t
+	DICT_TABLE_LOAD_FROM_CACHE = 1	/*!< Check first whether dict_table_t
 					is in the cache, if so, return it */
-	DICT_TABLE_UPDATE_STATS = 2	/*!< whether to update statistics
-					when loading SYS_TABLES information. */
 };
 
-typedef enum dict_table_info	dict_table_info_t;
+/** Check type for dict_check_tablespaces_and_store_max_id() */
+enum dict_check_t {
+	/** No user tablespaces have been opened
+	(no crash recovery, no transactions recovered). */
+	DICT_CHECK_NONE_LOADED = 0,
+	/** Some user tablespaces may have been opened
+	(no crash recovery; recovered table locks for transactions). */
+	DICT_CHECK_SOME_LOADED,
+	/** All user tablespaces have been opened (crash recovery). */
+	DICT_CHECK_ALL_LOADED
+};
 
 /********************************************************************//**
 In a crash recovery we already have all the tablespace objects created.
@@ -74,7 +82,7 @@ UNIV_INTERN
 void
 dict_check_tablespaces_and_store_max_id(
 /*====================================*/
-	ibool	in_crash_recovery);	/*!< in: are we doing a crash recovery */
+	dict_check_t	dict_check);	/*!< in: how to check */
 /********************************************************************//**
 Finds the first table name in the given database.
 @return own: table name, NULL if does not exist; the caller must free
@@ -156,12 +164,28 @@ dict_load_field_low(
 	byte*		last_index_id,	/*!< in: last index id */
 	mem_heap_t*	heap,		/*!< in/out: memory heap
 					for temporary storage */
-	const rec_t*	rec,		/*!< in: SYS_FIELDS record */
-	char*		addition_err_str,/*!< out: additional error message
-					that requires information to be
-					filled, or NULL */
-	ulint		err_str_len);	/*!< in: length of addition_err_str
-					in bytes */
+	const rec_t*	rec);		/*!< in: SYS_FIELDS record */
+/********************************************************************//**
+Using the table->heap, copy the null-terminated filepath into
+table->data_dir_path and put a null byte before the extension.
+This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path.
+Make this data directory path only if it has not yet been saved. */
+UNIV_INTERN
+void
+dict_save_data_dir_path(
+/*====================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	char*		filepath);	/*!< in: filepath of tablespace */
+/*****************************************************************//**
+Make sure the data_file_name is saved in dict_table_t if needed. Try to
+read it from the file dictionary first, then from SYS_DATAFILES. */
+UNIV_INTERN
+void
+dict_get_and_save_data_dir_path(
+/*============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	bool		dict_mutex_own);	/*!< in: true if dict_sys->mutex
+					is owned already */
 /********************************************************************//**
 Loads a table definition and also all its index definitions, and also
 the cluster definition if the table is a member in a cluster. Also loads
@@ -187,7 +211,9 @@ UNIV_INTERN
 dict_table_t*
 dict_load_table_on_id(
 /*==================*/
-	table_id_t	table_id);	/*!< in: table id */
+	table_id_t		table_id,	/*!< in: table id */
+	dict_err_ignore_t	ignore_err);	/*!< in: errors to ignore
+						when loading the table */
 /********************************************************************//**
 This function is called when the database is booted.
 Loads system table index definitions except for the clustered index which
@@ -205,16 +231,19 @@ cache already contains all constraints where the other relevant table is
 already in the dictionary cache.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 dict_load_foreigns(
 /*===============*/
 	const char*		table_name,	/*!< in: table name */
-	ibool			check_recursive,/*!< in: Whether to check
+	const char**		col_names,	/*!< in: column names, or NULL
+						to use table->col_names */
+	bool			check_recursive,/*!< in: Whether to check
 						recursive load of tables
 						chained by FK */
-	ibool			check_charsets,	/*!< in: TRUE=check charsets
-						compatibility */
-	dict_err_ignore_t	ignore_err);	/*!< in: error to be ignored */
+	bool			check_charsets,	/*!< in: whether to check
+						charset compatibility */
+	dict_err_ignore_t	ignore_err)	/*!< in: error to be ignored */
+	__attribute__((nonnull(1), warn_unused_result));
 /********************************************************************//**
 Prints to the standard output information on all tables found in the data
 dictionary system table. */
@@ -251,15 +280,17 @@ both monitor table output and information schema innodb_sys_tables output.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_tables_rec(
-/*========================*/
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
 	mem_heap_t*	heap,		/*!< in: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_TABLES record */
 	dict_table_t**	table,		/*!< out: dict_table_t to fill */
-	dict_table_info_t status);	/*!< in: status bit controls
+	dict_table_info_t status,	/*!< in: status bit controls
 					options such as whether we shall
 					look for dict_table_t from cache
 					first */
+	mtr_t*		mtr);		/*!< in/out: mini-transaction,
+					will be committed */
 /********************************************************************//**
 This function parses a SYS_INDEXES record and populate a dict_index_t
 structure with the information from the record. For detail information
@@ -331,19 +362,65 @@ dict_process_sys_foreign_col_rec(
 					in referenced table */
 	ulint*		pos);		/*!< out: column position */
 /********************************************************************//**
-This function parses a SYS_STATS record and extract necessary
-information from the record and return to caller.
+This function parses a SYS_TABLESPACES record, extracts necessary
+information from the record and returns to caller.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_stats_rec(
-/*=============================*/
+dict_process_sys_tablespaces(
+/*=========================*/
 	mem_heap_t*	heap,		/*!< in/out: heap memory */
-	const rec_t*	rec,		/*!< in: current SYS_STATS rec */
-	index_id_t*	index_id,	/*!< out: INDEX_ID */
-	ulint*		key_cols,	/*!< out: KEY_COLS */
-	ib_uint64_t*	diff_vals,	/*!< out: DIFF_VALS */
-	ib_uint64_t*	non_null_vals);	/*!< out: NON_NULL_VALS */
+	const rec_t*	rec,		/*!< in: current SYS_TABLESPACES rec */
+	ulint*		space,		/*!< out: space id */
+	const char**	name,		/*!< out: tablespace name */
+	ulint*		flags);		/*!< out: tablespace flags */
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns to caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_DATAFILES rec */
+	ulint*		space,		/*!< out: space id */
+	const char**	path);		/*!< out: datafile path */
+/********************************************************************//**
+Get the filepath for a spaceid from SYS_DATAFILES. This function provides
+a temporary heap which is used for the table lookup, but not for the path.
+The caller must free the memory for the path returned. This function can
+return NULL if the space ID is not found in SYS_DATAFILES, then the caller
+will assume that the ibd file is in the normal datadir.
+@return	own: A copy of the first datafile found in SYS_DATAFILES.PATH for
+the given space ID. NULL if space ID is zero or not found. */
+UNIV_INTERN
+char*
+dict_get_first_path(
+/*================*/
+	ulint		space,	/*!< in: space id */
+	const char*	name);	/*!< in: tablespace name */
+/********************************************************************//**
+Update the record for space_id in SYS_TABLESPACES to this filepath.
+@return	DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_update_filepath(
+/*=================*/
+	ulint		space_id,	/*!< in: space id */
+	const char*	filepath);	/*!< in: filepath */
+/********************************************************************//**
+Insert records into SYS_TABLESPACES and SYS_DATAFILES.
+@return	DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_insert_tablespace_and_filepath(
+/*================================*/
+	ulint		space,		/*!< in: space id */
+	const char*	name,		/*!< in: tablespace name */
+	const char*	filepath,	/*!< in: filepath */
+	ulint		fsp_flags);	/*!< in: tablespace flags */
+
 #ifndef UNIV_NONINL
 #include "dict0load.ic"
 #endif
diff --git a/storage/xtradb/include/dict0load.ic b/storage/xtradb/include/dict0load.ic
index da224db7927..2c0f1ff38a5 100644
--- a/storage/xtradb/include/dict0load.ic
+++ b/storage/xtradb/include/dict0load.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index 717c7532dc9..bde0ce16094 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -27,9 +28,13 @@ Created 1/8/1996 Heikki Tuuri
 #define dict0mem_h
 
 #include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
 #include "dict0types.h"
 #include "data0type.h"
 #include "mem0mem.h"
+#include "row0types.h"
 #include "rem0types.h"
 #include "btr0types.h"
 #ifndef UNIV_HOTBACKUP
@@ -43,6 +48,10 @@ Created 1/8/1996 Heikki Tuuri
 #include "ut0byte.h"
 #include "hash0hash.h"
 #include "trx0types.h"
+#include "fts0fts.h"
+
+/* Forward declaration. */
+struct ib_rbt_t;
 
 /** Type flags of an index: OR'ing of the flags is allowed to define a
 combination of types */
@@ -54,73 +63,170 @@ combination of types */
 #define	DICT_IBUF	8	/*!< insert buffer tree */
 #define	DICT_CORRUPT	16	/*!< bit to store the corrupted flag
 				in SYS_INDEXES.TYPE */
+#define	DICT_FTS	32	/* FTS index; can't be combined with the
+				other flags */
 
-#define	DICT_IT_BITS	5	/*!< number of bits used for
+#define	DICT_IT_BITS	6	/*!< number of bits used for
 				SYS_INDEXES.TYPE */
 /* @} */
 
+#if 0 /* not implemented, retained for history */
 /** Types for a table object */
 #define DICT_TABLE_ORDINARY		1 /*!< ordinary table */
-#if 0 /* not implemented */
 #define	DICT_TABLE_CLUSTER_MEMBER	2
 #define	DICT_TABLE_CLUSTER		3 /* this means that the table is
 					  really a cluster definition */
 #endif
 
-/** Table flags.  All unused bits must be 0. */
-/* @{ */
-#define DICT_TF_COMPACT			1	/* Compact page format.
-						This must be set for
-						new file formats
-						(later than
-						DICT_TF_FORMAT_51). */
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
 
-/** Compressed page size (0=uncompressed, up to 15 compressed sizes) */
-/* @{ */
-#define DICT_TF_ZSSIZE_SHIFT		1
-#define DICT_TF_ZSSIZE_MASK		(15 << DICT_TF_ZSSIZE_SHIFT)
-#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1)
-/* @} */
+==================== Low order flags bit =========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |     1     |    1    |     1
+dict_table_t::flags |     0     |    1    |     1
+FSP_SPACE_FLAGS     |     0     |    0    |     1
+fil_space_t::flags  |     0     |    0    |     1
 
-/** File format */
-/* @{ */
-#define DICT_TF_FORMAT_SHIFT		5	/* file format */
-#define DICT_TF_FORMAT_MASK		\
-((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
-#define DICT_TF_FORMAT_51		0	/*!< InnoDB/MySQL up to 5.1 */
-#define DICT_TF_FORMAT_ZIP		1	/*!< InnoDB plugin for 5.1:
-						compressed tables,
-						new BLOB treatment */
-/** Maximum supported file format */
-#define DICT_TF_FORMAT_MAX		DICT_TF_FORMAT_ZIP
-
-/** Minimum supported file format */
-#define DICT_TF_FORMAT_MIN		DICT_TF_FORMAT_51
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
 
-/* @} */
-#define DICT_TF_BITS			6	/*!< number of flag bits */
-#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
-# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
-#endif
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE.  Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT		0	/*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT			1	/*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e. ROW_FORMAT != REDUNDANT */
+#define DICT_N_COLS_COMPACT	0x80000000UL
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT		1
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE		4
+/** Width of the ATOMIC_BLOBS flag.  The Antelope file formats broke up
+BLOB and TEXT fields, storing the first 768 bytes in the clustered index.
+Barracuda row formats store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS	1
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR		1
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS	(DICT_TF_WIDTH_COMPACT		\
+			+ DICT_TF_WIDTH_ZIP_SSIZE	\
+			+ DICT_TF_WIDTH_ATOMIC_BLOBS	\
+			+ DICT_TF_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in table flags */
+#define DICT_TF_BIT_MASK	(~(~0 << DICT_TF_BITS))
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT		0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE		(DICT_TF_POS_COMPACT		\
+					+ DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS	(DICT_TF_POS_ZIP_SSIZE		\
+					+ DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR		(DICT_TF_POS_ATOMIC_BLOBS	\
+					+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define DICT_TF_POS_UNUSED		(DICT_TF_POS_DATA_DIR		\
+					+ DICT_TF_WIDTH_DATA_DIR)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT				\
+		((~(~0 << DICT_TF_WIDTH_COMPACT))	\
+		<< DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE				\
+		((~(~0 << DICT_TF_WIDTH_ZIP_SSIZE))	\
+		<< DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS			\
+		((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS))	\
+		<< DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR				\
+		((~(~0 << DICT_TF_WIDTH_DATA_DIR))	\
+		<< DICT_TF_POS_DATA_DIR)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags)			\
+		((flags & DICT_TF_MASK_COMPACT)		\
+		>> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags)			\
+		((flags & DICT_TF_MASK_ZIP_SSIZE)	\
+		>> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags)			\
+		((flags & DICT_TF_MASK_ATOMIC_BLOBS)	\
+		>> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags)			\
+		((flags & DICT_TF_MASK_DATA_DIR)	\
+		>> DICT_TF_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define DICT_TF_GET_UNUSED(flags)			\
+		(flags >> DICT_TF_POS_UNUSED)
 /* @} */
 
-/** @brief Additional table flags.
+#ifndef UNIV_INNOCHECKSUM
+
+/** @brief Table Flags set number 2.
 
 These flags will be stored in SYS_TABLES.MIX_LEN.  All unused flags
 will be written as 0.  The column may contain garbage for tables
 created with old versions of InnoDB that only implemented
-ROW_FORMAT=REDUNDANT. */
+ROW_FORMAT=REDUNDANT.  InnoDB engines do not check these flags
+for unknown bits in order to preserve backward compatibility. */
 /* @{ */
-#define DICT_TF2_SHIFT			DICT_TF_BITS
-						/*!< Shift value for
-						table->flags. */
-#define DICT_TF2_TEMPORARY		1	/*!< TRUE for tables from
-						CREATE TEMPORARY TABLE. */
-#define DICT_TF2_BITS			(DICT_TF2_SHIFT + 1)
-						/*!< Total number of bits
-						in table->flags. */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS			6
+#define DICT_TF2_BIT_MASK		~(~0 << DICT_TF2_BITS)
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY		1
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID		2
+/** The table has an FTS index */
+#define DICT_TF2_FTS			4
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID		8
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_TABLESPACE		16
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED		32
 /* @} */
 
+#define DICT_TF2_FLAG_SET(table, flag)				\
+	(table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag)			\
+	(table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag)			\
+	(table->flags2 &= ~(flag))
+
 /** Tables could be chained together with Foreign key constraint. When
 first load the parent table, we would load all of its descedents.
 This could result in rescursive calls and out of stack error eventually.
@@ -146,11 +252,10 @@ dict_mem_table_create(
 /*==================*/
 	const char*	name,		/*!< in: table name */
 	ulint		space,		/*!< in: space where the clustered index
-					of the table is placed; this parameter
-					is ignored if the table is made
-					a member of a cluster */
+					of the table is placed */
 	ulint		n_cols,		/*!< in: number of columns */
-	ulint		flags);		/*!< in: table flags */
+	ulint		flags,		/*!< in: table flags */
+	ulint		flags2);	/*!< in: table flags2 */
 /****************************************************************//**
 Free a table memory object. */
 UNIV_INTERN
@@ -169,7 +274,19 @@ dict_mem_table_add_col(
 	const char*	name,	/*!< in: column name, or NULL */
 	ulint		mtype,	/*!< in: main datatype */
 	ulint		prtype,	/*!< in: precise type */
-	ulint		len);	/*!< in: precision */
+	ulint		len)	/*!< in: precision */
+	__attribute__((nonnull(1)));
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+UNIV_INTERN
+void
+dict_mem_table_col_rename(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	unsigned	nth_col,/*!< in: column index */
+	const char*	from,	/*!< in: old column name */
+	const char*	to)	/*!< in: new column name */
+	__attribute__((nonnull));
 /**********************************************************************//**
 This function populates a dict_col_t memory structure with
 supplied information. */
@@ -267,20 +384,31 @@ dict_mem_referenced_table_name_lookup_set(
 	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
 	ibool		do_alloc);	/*!< in: is an alloc needed */
 
+/*******************************************************************//**
+Create a temporary tablename.
+@return temporary tablename suitable for InnoDB use */
+UNIV_INTERN __attribute__((nonnull, warn_unused_result))
+char*
+dict_mem_create_temporary_tablename(
+/*================================*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	dbtab,	/*!< in: database/table name */
+	table_id_t	id);	/*!< in: InnoDB table id */
+
 /** Data structure for a column in a table */
-struct dict_col_struct{
+struct dict_col_t{
 	/*----------------------*/
 	/** The following are copied from dtype_t,
 	so that all bit-fields can be packed tightly. */
 	/* @{ */
-	unsigned	mtype:8;	/*!< main data type */
-	unsigned	prtype:24;	/*!< precise type; MySQL data
+	unsigned	prtype:32;	/*!< precise type; MySQL data
 					type, charset code, flags to
 					indicate nullability,
 					signedness, whether this is a
 					binary string, whether this is
 					a true VARCHAR where MySQL
 					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
 
 	/* the remaining fields do not affect alphabetical ordering: */
 
@@ -327,17 +455,16 @@ files would be at risk! */
 
 /** Find out maximum indexed column length by its table format.
 For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
-field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For new
-barracuda format, the length could be REC_VERSION_56_MAX_INDEX_COL_LEN
-(3072) bytes */
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+Barracuda row formats COMPRESSED and DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
 #define DICT_MAX_FIELD_LEN_BY_FORMAT(table)				\
-		((dict_table_get_format(table) < DICT_TF_FORMAT_ZIP)	\
+		((dict_table_get_format(table) < UNIV_FORMAT_B)		\
 			? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)		\
 			: REC_VERSION_56_MAX_INDEX_COL_LEN)
 
 #define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)			\
-		((((flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT)\
-		    < DICT_TF_FORMAT_ZIP)				\
+		((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B)	\
 			? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)		\
 			: REC_VERSION_56_MAX_INDEX_COL_LEN)
 
@@ -345,7 +472,7 @@ barracuda format, the length could be REC_VERSION_56_MAX_INDEX_COL_LEN
 #define DICT_MAX_FIXED_COL_LEN		DICT_ANTELOPE_MAX_INDEX_COL_LEN
 
 /** Data structure for a field in an index */
-struct dict_field_struct{
+struct dict_field_t{
 	dict_col_t*	col;		/*!< pointer to the table column */
 	const char*	name;		/*!< name of the column */
 	unsigned	prefix_len:12;	/*!< 0 or the length of the column
@@ -361,11 +488,63 @@ struct dict_field_struct{
 					DICT_ANTELOPE_MAX_INDEX_COL_LEN */
 };
 
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN			(128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT		(5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR				(128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong	zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong	zip_pad_max;
+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */
+struct zip_pad_info_t {
+	os_fast_mutex_t	mutex;	/*!< mutex protecting the info */
+	ulint		pad;	/*!< number of bytes used as pad */
+	ulint		success;/*!< successful compression ops during
+				current round (see ZIP_PAD_ROUND_LEN) */
+	ulint		failure;/*!< failed compression ops during
+				current round (see ZIP_PAD_ROUND_LEN) */
+	ulint		n_rounds;/*!< number of currently successful
+				rounds */
+};
+
 /** Data structure for an index.  Most fields will be
 initialized to 0, NULL or FALSE in dict_mem_index_create(). */
-struct dict_index_struct{
+struct dict_index_t{
 	index_id_t	id;	/*!< id of the index */
-	rw_lock_t*	search_latch; /*!< latch protecting the AHI partition
+	prio_rw_lock_t*	search_latch; /*!< latch protecting the AHI partition
 				      corresponding to this index */
 	hash_table_t*	search_table; /*!< hash table protected by
 				      search_latch */
@@ -403,30 +582,47 @@ struct dict_index_struct{
 	unsigned	cached:1;/*!< TRUE if the index object is in the
 				dictionary cache */
 	unsigned	to_be_dropped:1;
-				/*!< TRUE if this index is marked to be
-				dropped in ha_innobase::prepare_drop_index(),
-				otherwise FALSE. Protected by
-				dict_sys->mutex, dict_operation_lock and
-				index->lock.*/
+				/*!< TRUE if the index is to be dropped;
+				protected by dict_operation_lock */
+	unsigned	online_status:2;
+				/*!< enum online_index_status.
+				Transitions from ONLINE_INDEX_COMPLETE (to
+				ONLINE_INDEX_CREATION) are protected
+				by dict_operation_lock and
+				dict_sys->mutex. Other changes are
+				protected by index->lock. */
 	dict_field_t*	fields;	/*!< array of field descriptions */
 #ifndef UNIV_HOTBACKUP
 	UT_LIST_NODE_T(dict_index_t)
 			indexes;/*!< list of indexes of the table */
-	btr_search_t*	search_info; /*!< info used in optimistic searches */
+	btr_search_t*	search_info;
+				/*!< info used in optimistic searches */
+	row_log_t*	online_log;
+				/*!< the log of modifications
+				during online index creation;
+				valid when online_status is
+				ONLINE_INDEX_CREATION */
 	/*----------------------*/
 	/** Statistics for query optimization */
 	/* @{ */
-	ib_int64_t*	stat_n_diff_key_vals;
+	ib_uint64_t*	stat_n_diff_key_vals;
 				/*!< approximate number of different
 				key values for this index, for each
-				n-column prefix where n <=
-				dict_get_n_unique(index); we
+				n-column prefix where 1 <= n <=
+				dict_get_n_unique(index) (the array is
+				indexed from 0 to n_uniq-1); we
 				periodically calculate new
 				estimates */
-	ib_int64_t*	stat_n_non_null_key_vals;
+	ib_uint64_t*	stat_n_sample_sizes;
+				/*!< number of pages that were sampled
+				to calculate each of stat_n_diff_key_vals[],
+				e.g. stat_n_sample_sizes[3] pages were sampled
+				to get the number stat_n_diff_key_vals[3]. */
+	ib_uint64_t*	stat_n_non_null_key_vals;
 				/* approximate number of non-null key values
 				for this index, for each column where
-				n < dict_get_n_unique(index); This
+				1 <= n <= dict_get_n_unique(index) (the array
+				is indexed from 0 to n_uniq-1); This
 				is used when innodb_stats_method is
 				"nulls_ignored". */
 	ulint		stat_index_size;
@@ -436,30 +632,52 @@ struct dict_index_struct{
 				/*!< approximate number of leaf pages in the
 				index tree */
 	/* @} */
-	rw_lock_t	lock;	/*!< read-write lock protecting the
+	prio_rw_lock_t	lock;	/*!< read-write lock protecting the
 				upper levels of the index tree */
 	trx_id_t	trx_id; /*!< id of the transaction that created this
 				index, or 0 if the index existed
 				when InnoDB was started up */
+	zip_pad_info_t	zip_pad;/*!< Information about state of
+				compression failures and successes */
 #endif /* !UNIV_HOTBACKUP */
 #ifdef UNIV_BLOB_DEBUG
-	mutex_t		blobs_mutex;
+	ib_mutex_t		blobs_mutex;
 				/*!< mutex protecting blobs */
-	void*		blobs;	/*!< map of (page_no,heap_no,field_no)
+	ib_rbt_t*	blobs;	/*!< map of (page_no,heap_no,field_no)
 				to first_blob_page_no; protected by
 				blobs_mutex; @see btr_blob_dbg_t */
 #endif /* UNIV_BLOB_DEBUG */
 #ifdef UNIV_DEBUG
 	ulint		magic_n;/*!< magic number */
-/** Value of dict_index_struct::magic_n */
+/** Value of dict_index_t::magic_n */
 # define DICT_INDEX_MAGIC_N	76789786
 #endif
 };
 
+/** The status of online index creation; see dict_index_t::online_status */
+enum online_index_status {
+	/** the index is complete and ready for access */
+	ONLINE_INDEX_COMPLETE = 0,
+	/** the index is being created, online
+	(allowing concurrent modifications) */
+	ONLINE_INDEX_CREATION,
+	/** secondary index creation was aborted and the index
+	should be dropped as soon as index->table->n_ref_count reaches 0,
+	or online table rebuild was aborted and the clustered index
+	of the original table should soon be restored to
+	ONLINE_INDEX_COMPLETE */
+	ONLINE_INDEX_ABORTED,
+	/** the online index creation was aborted, the index was
+	dropped from the data dictionary and the tablespace, and it
+	should be dropped from the data dictionary cache as soon as
+	index->table->n_ref_count reaches 0. */
+	ONLINE_INDEX_ABORTED_DROPPED
+};
+
 /** Data structure for a foreign key constraint; an example:
 FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D).  Most fields will be
 initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
-struct dict_foreign_struct{
+struct dict_foreign_t{
 	mem_heap_t*	heap;		/*!< this object is allocated from
 					this memory heap */
 	char*		id;		/*!< id of the constraint as a
@@ -510,10 +728,9 @@ a foreign key constraint is enforced, therefore RESTRICT just means no flag */
 #define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32	/*!< ON UPDATE NO ACTION */
 /* @} */
 
-
 /** Data structure for a database table.  Most fields will be
 initialized to 0, NULL or FALSE in dict_mem_table_create(). */
-struct dict_table_struct{
+struct dict_table_t{
 	table_id_t	id;	/*!< id of the table */
 	mem_heap_t*	heap;	/*!< memory heap */
 	char*		name;	/*!< table name */
@@ -523,26 +740,39 @@ struct dict_table_struct{
 				innodb_file_per_table is defined in my.cnf;
 				in Unix this is usually /tmp/..., in Windows
 				temp\... */
+	char*		data_dir_path; /*!< NULL or the directory path
+				specified by DATA DIRECTORY */
 	unsigned	space:32;
 				/*!< space where the clustered index of the
 				table is placed */
-	unsigned	flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
+	unsigned	flags:DICT_TF_BITS;	/*!< DICT_TF_... */
+	unsigned	flags2:DICT_TF2_BITS;	/*!< DICT_TF2_... */
 	unsigned	ibd_file_missing:1;
 				/*!< TRUE if this is in a single-table
 				tablespace and the .ibd file is missing; then
 				we must return in ha_innodb.cc an error if the
 				user tries to query such an orphaned table */
-	unsigned	tablespace_discarded:1;
-				/*!< this flag is set TRUE when the user
-				calls DISCARD TABLESPACE on this
-				table, and reset to FALSE in IMPORT
-				TABLESPACE */
 	unsigned	cached:1;/*!< TRUE if the table object has been added
 				to the dictionary cache */
+	unsigned	to_be_dropped:1;
+				/*!< TRUE if the table is to be dropped, but
+				not yet actually dropped (could be in the bk
+				drop list); It is turned on at the beginning
+				of row_drop_table_for_mysql() and turned off
+				just before we start to update system tables
+				for the drop. It is protected by
+				dict_operation_lock */
 	unsigned	n_def:10;/*!< number of columns defined so far */
 	unsigned	n_cols:10;/*!< number of columns */
+	unsigned	can_be_evicted:1;
+				/*!< TRUE if it's not an InnoDB system table
+				or a table that has no FK relationships */
 	unsigned	corrupted:1;
 				/*!< TRUE if table is corrupted */
+	unsigned	drop_aborted:1;
+				/*!< TRUE if some indexes should be dropped
+				after ONLINE_INDEX_ABORTED
+				or ONLINE_INDEX_ABORTED_DROPPED */
 	dict_col_t*	cols;	/*!< array of column descriptions */
 	const char*	col_names;
 				/*!< Column names packed in a character string
@@ -564,12 +794,6 @@ struct dict_table_struct{
 				which refer to this table */
 	UT_LIST_NODE_T(dict_table_t)
 			table_LRU; /*!< node of the LRU list of tables */
-	ulint		n_mysql_handles_opened;
-				/*!< count of how many handles MySQL has opened
-				to this table; dropping of the table is
-				NOT allowed until this count gets to zero;
-				MySQL does NOT itself check the number of
-				open handles at drop */
 	unsigned	fk_max_recusive_level:8;
 				/*!< maximum recursive level we support when
 				loading tables chained together with FK
@@ -582,6 +806,12 @@ struct dict_table_struct{
 				on the table: we cannot drop the table while
 				there are foreign key checks running on
 				it! */
+	trx_id_t	def_trx_id;
+				/*!< transaction id that last touched
+				the table definition, either when
+				loading the definition or CREATE
+				TABLE, or ALTER TABLE (prepare,
+				commit, and rollback phases) */
 	trx_id_t	query_cache_inv_trx_id;
 				/*!< transactions whose trx id is
 				smaller than this number are not
@@ -590,8 +820,6 @@ struct dict_table_struct{
 				with undo logs commits, it sets this
 				to the value of the trx id counter for
 				the tables it had an IX lock on */
-	UT_LIST_BASE_NODE_T(lock_t)
-			locks; /*!< list of locks on the table */
 #ifdef UNIV_DEBUG
 	/*----------------------*/
 	ibool		does_not_fit_in_memory;
@@ -611,18 +839,60 @@ struct dict_table_struct{
 				/*!< flag: TRUE if the maximum length of
 				a single row exceeds BIG_ROW_SIZE;
 				initialized in dict_table_add_to_cache() */
-				/** Statistics for query optimization.
-				The following stat_* members are usually
-				protected by dict_table_stats_lock(). In
-				some exceptional cases (performance critical
-				code paths) we access or modify stat_n_rows
-				and stat_modified_counter without any
-				protection. */
+				/** Statistics for query optimization */
 				/* @{ */
 	unsigned	stat_initialized:1; /*!< TRUE if statistics have
 				been calculated the first time
 				after database startup or table creation */
-	ib_int64_t	stat_n_rows;
+	ib_time_t	stats_last_recalc;
+				/*!< Timestamp of last recalc of the stats */
+	ib_uint32_t	stat_persistent;
+				/*!< The two bits below are set in the
+				::stat_persistent member and have the following
+				meaning:
+				1. _ON=0, _OFF=0, no explicit persistent stats
+				setting for this table, the value of the global
+				srv_stats_persistent is used to determine
+				whether the table has persistent stats enabled
+				or not
+				2. _ON=0, _OFF=1, persistent stats are
+				explicitly disabled for this table, regardless
+				of the value of the global srv_stats_persistent
+				3. _ON=1, _OFF=0, persistent stats are
+				explicitly enabled for this table, regardless
+				of the value of the global srv_stats_persistent
+				4. _ON=1, _OFF=1, not allowed, we assert if
+				this ever happens. */
+#define DICT_STATS_PERSISTENT_ON	(1 << 1)
+#define DICT_STATS_PERSISTENT_OFF	(1 << 2)
+	ib_uint32_t	stats_auto_recalc;
+				/*!< The two bits below are set in the
+				::stats_auto_recalc member and have
+				the following meaning:
+				1. _ON=0, _OFF=0, no explicit auto recalc
+				setting for this table, the value of the global
+				srv_stats_persistent_auto_recalc is used to
+				determine whether the table has auto recalc
+				enabled or not
+				2. _ON=0, _OFF=1, auto recalc is explicitly
+				disabled for this table, regardless of the
+				value of the global
+				srv_stats_persistent_auto_recalc
+				3. _ON=1, _OFF=0, auto recalc is explicitly
+				enabled for this table, regardless of the
+				value of the global
+				srv_stats_persistent_auto_recalc
+				4. _ON=1, _OFF=1, not allowed, we assert if
+				this ever happens. */
+#define DICT_STATS_AUTO_RECALC_ON	(1 << 1)
+#define DICT_STATS_AUTO_RECALC_OFF	(1 << 2)
+	ulint		stats_sample_pages;
+				/*!< the number of pages to sample for this
+				table during persistent stats estimation;
+				if this is 0, then the value of the global
+				srv_stats_persistent_sample_pages will be
+				used instead. */
+	ib_uint64_t	stat_n_rows;
 				/*!< approximate number of rows in the table;
 				we periodically calculate new estimates */
 	ulint		stat_clustered_index_size;
@@ -630,19 +900,36 @@ struct dict_table_struct{
 				database pages */
 	ulint		stat_sum_of_other_index_sizes;
 				/*!< other indexes in database pages */
-	ulint		stat_modified_counter;
+	ib_uint64_t	stat_modified_counter;
 				/*!< when a row is inserted, updated,
 				or deleted,
 				we add 1 to this number; we calculate new
 				estimates for the stat_... values for the
-				table and the indexes at an interval of 2 GB
-				or when about 1 / 16 of table has been
-				modified; also when the estimate operation is
+				table and the indexes when about 1 / 16 of
+				table has been modified;
+				also when the estimate operation is
 				called for MySQL SHOW TABLE STATUS; the
 				counter is reset to zero at statistics
 				calculation; this counter is not protected by
 				any latch, because this is only used for
 				heuristics */
+#define BG_STAT_NONE		0
+#define BG_STAT_IN_PROGRESS	(1 << 0)
+				/*!< BG_STAT_IN_PROGRESS is set in
+				stats_bg_flag when the background
+				stats code is working on this table. The DROP
+				TABLE code waits for this to be cleared
+				before proceeding. */
+#define BG_STAT_SHOULD_QUIT	(1 << 1)
+				/*!< BG_STAT_SHOULD_QUIT is set in
+				stats_bg_flag when DROP TABLE starts
+				waiting on BG_STAT_IN_PROGRESS to be cleared,
+				the background stats thread will detect this
+				and will eventually quit sooner */
+	byte		stats_bg_flag;
+				/*!< see BG_STAT_* above.
+				Writes are covered by dict_sys->mutex.
+				Dirty reads are possible. */
 				/* @} */
 	/*----------------------*/
 				/**!< The following fields are used by the
@@ -652,8 +939,8 @@ struct dict_table_struct{
 				whether a transaction has locked the AUTOINC
 				lock we keep a pointer to the transaction
 				here in the autoinc_trx variable. This is to
-				avoid acquiring the kernel mutex and scanning
-				the vector in trx_t.
+				avoid acquiring the lock_sys_t::mutex and
+				scanning the vector in trx_t.
 
 				When an AUTOINC lock has to wait, the
 				corresponding lock instance is created on
@@ -668,7 +955,7 @@ struct dict_table_struct{
 				space from the lock heap of the trx:
 				otherwise the lock heap would grow rapidly
 				if we do a large insert from a select */
-	mutex_t		autoinc_mutex;
+	ib_mutex_t		autoinc_mutex;
 				/*!< mutex protecting the autoincrement
 				counter */
 	ib_uint64_t	autoinc;/*!< autoinc counter value to give to the
@@ -677,22 +964,46 @@ struct dict_table_struct{
 				/*!< This counter is used to track the number
 				of granted and pending autoinc locks on this
 				table. This value is set after acquiring the
-				kernel mutex but we peek the contents to
+				lock_sys_t::mutex but we peek the contents to
 				determine whether other transactions have
 				acquired the AUTOINC lock or not. Of course
 				only one transaction can be granted the
 				lock but there can be multiple waiters. */
-	const trx_t*		autoinc_trx;
+	const trx_t*	autoinc_trx;
 				/*!< The transaction that currently holds the
-				the AUTOINC lock on this table. */
+				AUTOINC lock on this table.
+				Protected by lock_sys->mutex. */
+	fts_t*		fts;	/* FTS specific state variables */
 				/* @} */
 	/*----------------------*/
+
+	ib_quiesce_t	 quiesce;/*!< Quiescing states, protected by the
+				dict_index_t::lock. ie. we can only change
+				the state if we acquire all the latches
+				(dict_index_t::lock) in X mode of this table's
+				indexes. */
+
+	/*----------------------*/
+	ulint		n_rec_locks;
+				/*!< Count of the number of record locks on
+				this table. We use this to determine whether
+				we can evict the table from the dictionary
+				cache. It is protected by lock_sys->mutex. */
+	ulint		n_ref_count;
+				/*!< count of how many handles are opened
+				to this table; dropping of the table is
+				NOT allowed until this count gets to zero;
+				MySQL does NOT itself check the number of
+				open handles at drop */
+	UT_LIST_BASE_NODE_T(lock_t)
+			locks;	/*!< list of locks on the table; protected
+				by lock_sys->mutex */
 	ibool		is_corrupt;
 #endif /* !UNIV_HOTBACKUP */
 
 #ifdef UNIV_DEBUG
 	ulint		magic_n;/*!< magic number */
-/** Value of dict_table_struct::magic_n */
+/** Value of dict_table_t::magic_n */
 # define DICT_TABLE_MAGIC_N	76333786
 #endif /* UNIV_DEBUG */
 };
@@ -701,4 +1012,6 @@ struct dict_table_struct{
 #include "dict0mem.ic"
 #endif
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
diff --git a/storage/xtradb/include/dict0mem.ic b/storage/xtradb/include/dict0mem.ic
index 41dacb1c643..38d51f61789 100644
--- a/storage/xtradb/include/dict0mem.ic
+++ b/storage/xtradb/include/dict0mem.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -57,16 +57,18 @@ dict_mem_fill_index_struct(
 		index->fields = NULL;
 	}
 
-        index->type = type;
+	/* Assign a ulint to a 4-bit-mapped field.
+	Only the low-order 4 bits are assigned. */
+	index->type = type;
 #ifndef UNIV_HOTBACKUP
-        index->space = (unsigned int) space;
-        index->page = FIL_NULL;
+	index->space = (unsigned int) space;
+	index->page = FIL_NULL;
 #endif /* !UNIV_HOTBACKUP */
-        index->table_name = table_name;
-        index->n_fields = (unsigned int) n_fields;
-        /* The '1 +' above prevents allocation
-        of an empty mem block */
+	index->table_name = table_name;
+	index->n_fields = (unsigned int) n_fields;
+	/* The '1 +' above prevents allocation
+	of an empty mem block */
 #ifdef UNIV_DEBUG
-        index->magic_n = DICT_INDEX_MAGIC_N;
+	index->magic_n = DICT_INDEX_MAGIC_N;
 #endif /* UNIV_DEBUG */
 }
diff --git a/storage/xtradb/include/dict0priv.h b/storage/xtradb/include/dict0priv.h
new file mode 100644
index 00000000000..9a3c8e22992
--- /dev/null
+++ b/storage/xtradb/include/dict0priv.h
@@ -0,0 +1,63 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0priv.h
+Data dictionary private functions
+
+Created  Fri 2 Jul 2010 13:30:38 EST - Sunny Bains
+*******************************************************/
+
+#ifndef dict0priv_h
+#define dict0priv_h
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. Note: Not to be called from outside dict0*c functions.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name);		/*!< in: table name */
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name);		/*!< in: table name */
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*=====================*/
+	table_id_t		table_id,	/*!< in: table id */
+	dict_err_ignore_t	ignore_err);	/*!< in: errors to ignore
+						when loading the table */
+
+#ifndef UNIV_NONINL
+#include "dict0priv.ic"
+#endif
+
+#endif /* dict0priv.h */
diff --git a/storage/xtradb/include/dict0priv.ic b/storage/xtradb/include/dict0priv.ic
new file mode 100644
index 00000000000..30ba8fb60aa
--- /dev/null
+++ b/storage/xtradb/include/dict0priv.ic
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0priv.ic
+Data dictionary system private include file
+
+Created  Wed 13 Oct 2010 16:10:14 EST Sunny Bains
+***********************************************************************/
+
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0priv.h"
+#ifndef UNIV_HOTBACKUP
+
+/**********************************************************************//**
+Gets a table by name; loads it to the dictionary cache if not yet cached.
+A low-level function; ut_ad() checks that dict_sys->mutex is owned.
+@return	table; NULL if not found, or corrupted and !srv_load_corrupted */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name)	/*!< in: table name */
+{
+	dict_table_t*	table;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	table = dict_table_check_if_in_cache_low(table_name);
+
+	if (table && table->corrupted) {
+		fprintf(stderr, "InnoDB: table");
+		ut_print_name(stderr, NULL, TRUE, table->name);
+		if (srv_load_corrupted) {
+			fputs(" is corrupted, but"
+			      " innodb_force_load_corrupted is set\n", stderr);
+		} else {
+			fputs(" is corrupted\n", stderr);
+			return(NULL);
+		}
+	}
+
+	if (table == NULL) {
+		table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
+	}
+
+	ut_ad(!table || table->cached);
+
+	return(table);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id, loading it if it is not cached.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*======================*/
+	table_id_t		table_id,	/*!< in: table id */
+	dict_err_ignore_t	ignore_err)	/*!< in: errors to ignore
+						when loading the table */
+{
+	dict_table_t*	table;
+	ulint		fold;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table id in the table id hash table */
+	fold = ut_fold_ull(table_id);
+
+	HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    table->id == table_id);
+	if (table == NULL) {
+		table = dict_load_table_on_id(table_id, ignore_err);
+	}
+
+	ut_ad(!table || table->cached);
+
+	/* TODO: should get the type information from MySQL */
+
+	return(table);
+}
+
+/**********************************************************************//**
+Looks a table up by name in dict_sys->table_hash; does no loading itself.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name)	/*!< in: table name */
+{
+	dict_table_t*	table;
+	ulint		table_fold;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table name in the hash table */
+	table_fold = ut_fold_string(table_name);
+
+	HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    !strcmp(table->name, table_name));
+	return(table);
+}
+#endif /*! UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/dict0stats.h b/storage/xtradb/include/dict0stats.h
new file mode 100644
index 00000000000..186f90e3694
--- /dev/null
+++ b/storage/xtradb/include/dict0stats.h
@@ -0,0 +1,202 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "univ.i"
+
+#include "db0err.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+	DICT_STATS_RECALC_PERSISTENT,/*!< (re) calculate the
+				statistics using a precise and slow
+				algo and save them to the persistent
+				storage, if the persistent storage is
+				not present then emit a warning and
+				fall back to transient stats */
+	DICT_STATS_RECALC_TRANSIENT,/*!< (re) calculate the statistics
+				using an imprecise quick algo
+				without saving the results
+				persistently */
+	DICT_STATS_EMPTY_TABLE,	/*!< Write all zeros (or 1 where it makes sense)
+				into a table and its indexes' statistics
+				members. The resulting stats correspond to an
+				empty table. If the table is using persistent
+				statistics, then they are saved on disk. */
+	DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /*!< fetch the stats
+				from the persistent storage if the in-memory
+				structures have not been initialized yet,
+				otherwise do nothing */
+};
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced. */
+UNIV_INTERN
+void
+dict_stats_update_transient(
+/*========================*/
+	dict_table_t*	table);	/*!< in/out: table */
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ibool		ps_on,	/*!< in: persistent stats explicitly enabled */
+	ibool		ps_off)	/*!< in: persistent stats explicitly disabled */
+	__attribute__((nonnull));
+
+/*********************************************************************//**
+Check whether persistent statistics is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_is_persistent_enabled(
+/*=============================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+	dict_table_t*	table,			/*!< in/out: table */
+	ibool		auto_recalc_on,		/*!< in: explicitly enabled */
+	ibool		auto_recalc_off);	/*!< in: explicitly disabled */
+
+/*********************************************************************//**
+Check whether auto recalc is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_auto_recalc_is_enabled(
+/*==============================*/
+	const dict_table_t*	table);	/*!< in: table */
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+	dict_table_t*	table);	/*!< in/out: table */
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option);
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent storage */
+
+/*********************************************************************//**
+Removes the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+This function creates its own trx and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+	const char*	tname,	/*!< in: table name */
+	const char*	iname,	/*!< in: index name */
+	char*		errstr, /*!< out: error message if != DB_SUCCESS
+				is returned */
+	ulint		errstr_sz);/*!< in: size of the errstr buffer */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage if it exists and if there is data stored for the table.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+	const char*	table_name,	/*!< in: table name */
+	char*		errstr,		/*!< out: error message
+					if != DB_SUCCESS is returned */
+	ulint		errstr_sz);	/*!< in: size of errstr buffer */
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+UNIV_INTERN
+void
+dict_stats_update_for_index(
+/*========================*/
+	dict_index_t*	index)	/*!< in/out: index */
+	__attribute__((nonnull));
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+	const char*	old_name,	/*!< in: old table name */
+	const char*	new_name,	/*!< in: new table name */
+	char*		errstr,		/*!< out: error string if != DB_SUCCESS
+					is returned */
+	size_t		errstr_sz);	/*!< in: errstr size */
+
+#ifndef UNIV_NONINL
+#include "dict0stats.ic"
+#endif
+
+#endif /* dict0stats_h */
diff --git a/storage/xtradb/include/dict0stats.ic b/storage/xtradb/include/dict0stats.ic
new file mode 100644
index 00000000000..8fb31678af9
--- /dev/null
+++ b/storage/xtradb/include/dict0stats.ic
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.ic
+Code used for calculating and manipulating table statistics.
+
+Created Jan 23, 2012 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "dict0dict.h" /* dict_table_stats_lock() */
+#include "dict0types.h" /* dict_table_t */
+#include "srv0srv.h" /* srv_stats_persistent, srv_stats_auto_recalc */
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ibool		ps_on,	/*!< in: persistent stats explicitly enabled */
+	ibool		ps_off)	/*!< in: persistent stats explicitly disabled */
+{
+	/* Not allowed to have both flags set, but a CREATE or ALTER
+	statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would
+	end up having both set. In this case we clear the OFF flag. */
+	if (ps_on && ps_off) {
+		ps_off = FALSE;
+	}
+
+	ib_uint32_t	stat_persistent = 0;
+
+	if (ps_on) {
+		stat_persistent |= DICT_STATS_PERSISTENT_ON;
+	}
+
+	if (ps_off) {
+		stat_persistent |= DICT_STATS_PERSISTENT_OFF;
+	}
+
+	/* no lock is taken here; we rely on this assignment being atomic */
+	table->stat_persistent = stat_persistent;
+}
+
+/*********************************************************************//**
+Check whether persistent statistics is enabled for a given table.
+@return TRUE if enabled, FALSE if disabled, srv_stats_persistent if unset */
+UNIV_INLINE
+ibool
+dict_stats_is_persistent_enabled(
+/*=============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	/* Because of the nature of this check (non-locking) it is possible
+	that a table becomes:
+	* PS-disabled immediately after this function has returned TRUE or
+	* PS-enabled immediately after this function has returned FALSE.
+	This means that it is possible that we do:
+	+ dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has
+	  just been PS-disabled or
+	+ dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
+	  just been PS-enabled.
+	This is acceptable. Avoiding this would mean that we would have to
+	protect the ::stat_persistent with dict_table_stats_lock() like the
+	other ::stat_ members which would be too big a performance penalty,
+	especially when this function is called from
+	row_update_statistics_if_needed(). */
+
+	/* we rely on this read to be atomic */
+	ib_uint32_t	stat_persistent = table->stat_persistent;
+
+	if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+		ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF));
+		return(TRUE);
+	} else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+		return(FALSE);
+	} else {
+		return(srv_stats_persistent);
+	}
+}
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. At most one flag may be set. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+	dict_table_t*	table,			/*!< in/out: table */
+	ibool		auto_recalc_on,		/*!< in: explicitly enabled */
+	ibool		auto_recalc_off)	/*!< in: explicitly disabled */
+{
+	ut_ad(!auto_recalc_on || !auto_recalc_off);
+
+	ib_uint32_t	stats_auto_recalc = 0;
+
+	if (auto_recalc_on) {
+		stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON;
+	}
+
+	if (auto_recalc_off) {
+		stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF;
+	}
+
+	/* no lock is taken here; we rely on this assignment being atomic */
+	table->stats_auto_recalc = stats_auto_recalc;
+}
+
+/*********************************************************************//**
+Check whether auto recalc is enabled for a given table.
+@return TRUE if enabled, FALSE if disabled, srv_stats_auto_recalc if unset */
+UNIV_INLINE
+ibool
+dict_stats_auto_recalc_is_enabled(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	/* we rely on this read to be atomic */
+	ib_uint32_t	stats_auto_recalc = table->stats_auto_recalc;
+
+	if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) {
+		ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF));
+		return(TRUE);
+	} else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) {
+		return(FALSE);
+	} else {
+		return(srv_stats_auto_recalc);
+	}
+}
+
+/*********************************************************************//**
+Initialize table's stats when first opening a table; no-op if initialized. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+	if (table->stat_initialized) {
+		return;
+	}
+
+	dict_stats_upd_option_t	opt;
+
+	if (dict_stats_is_persistent_enabled(table)) {
+		opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+	} else {
+		opt = DICT_STATS_RECALC_TRANSIENT;
+	}
+
+	dict_stats_update(table, opt);
+}
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table (n_ref_count
+is 0). Used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	ut_a(table->n_ref_count == 0);
+
+	dict_table_stats_lock(table, RW_X_LATCH);
+
+	if (!table->stat_initialized) {
+		dict_table_stats_unlock(table, RW_X_LATCH);
+		return;
+	}
+
+	table->stat_initialized = FALSE;
+
+#ifdef UNIV_DEBUG_VALGRIND
+	UNIV_MEM_INVALID(&table->stat_n_rows,
+			 sizeof(table->stat_n_rows));
+	UNIV_MEM_INVALID(&table->stat_clustered_index_size,
+			 sizeof(table->stat_clustered_index_size));
+	UNIV_MEM_INVALID(&table->stat_sum_of_other_index_sizes,
+			 sizeof(table->stat_sum_of_other_index_sizes));
+	UNIV_MEM_INVALID(&table->stat_modified_counter,
+			 sizeof(table->stat_modified_counter));
+
+	dict_index_t*   index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		ulint	n_uniq = dict_index_get_n_unique(index);
+
+		UNIV_MEM_INVALID(
+			index->stat_n_diff_key_vals,
+			n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+		UNIV_MEM_INVALID(
+			index->stat_n_sample_sizes,
+			n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+		UNIV_MEM_INVALID(
+			index->stat_n_non_null_key_vals,
+			n_uniq * sizeof(index->stat_n_non_null_key_vals[0]));
+		UNIV_MEM_INVALID(
+			&index->stat_index_size,
+			sizeof(index->stat_index_size));
+		UNIV_MEM_INVALID(
+			&index->stat_n_leaf_pages,
+			sizeof(index->stat_n_leaf_pages));
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	dict_table_stats_unlock(table, RW_X_LATCH);
+}
diff --git a/storage/xtradb/include/dict0stats_bg.h b/storage/xtradb/include/dict0stats_bg.h
new file mode 100644
index 00000000000..e866ab419fe
--- /dev/null
+++ b/storage/xtradb/include/dict0stats_bg.h
@@ -0,0 +1,127 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.h
+Code used for background table and index stats gathering.
+
+Created Apr 26, 2012 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_bg_h
+#define dict0stats_bg_h
+
+#include "univ.i"
+
+#include "dict0types.h" /* dict_table_t, table_id_t */
+#include "os0sync.h" /* os_event_t */
+#include "os0thread.h" /* DECLARE_THREAD */
+
+/** Event to wake up the stats thread */
+extern os_event_t	dict_stats_event;
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_add(
+/*=======================*/
+	const dict_table_t*	table);	/*!< in: table to add */
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool
+(the counterpart of dict_stats_recalc_pool_add()). */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+	const dict_table_t*	table);	/*!< in: table to remove */
+
+/** Yield the data dictionary latch (unlock, sleep, re-lock) when waiting
+for the background thread to stop accessing a table.
+@param trx	transaction holding the data dictionary locks */
+#define DICT_STATS_BG_YIELD(trx)	do {	\
+	row_mysql_unlock_data_dictionary(trx);	\
+	os_thread_sleep(250000);		\
+	row_mysql_lock_data_dictionary(trx);	\
+} while (0)
+
+/*****************************************************************//**
+Request the background collection of statistics to stop for a table.
+@retval true when no background process is active
+@retval false when it is not safe to modify the table definition */
+UNIV_INLINE
+bool
+dict_stats_stop_bg(
+/*===============*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((warn_unused_result));
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary() and this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys->mutex. */
+UNIV_INTERN
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx);	/*!< in/out: transaction to use for
+				unlocking/locking the data dict */
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread().
+Must be called before dict_stats_thread() is started. */
+UNIV_INTERN
+void
+dict_stats_thread_init();
+/*====================*/
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_thread_init(), must be called
+after dict_stats_thread() has exited. */
+UNIV_INTERN
+void
+dict_stats_thread_deinit();
+/*======================*/
+
+/*****************************************************************//**
+This is the thread for background stats gathering. It pops tables from
+the auto recalc list and processes them, eventually recalculating their
+statistics.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(dict_stats_thread)(
+/*==============================*/
+	void*	arg);	/*!< in: a dummy parameter
+			required by os_thread_create */
+
+# ifndef UNIV_NONINL
+#  include "dict0stats_bg.ic"
+# endif
+
+#endif /* dict0stats_bg_h */
diff --git a/storage/xtradb/include/dict0stats_bg.ic b/storage/xtradb/include/dict0stats_bg.ic
new file mode 100644
index 00000000000..87e3225de58
--- /dev/null
+++ b/storage/xtradb/include/dict0stats_bg.ic
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.ic
+Code used for background table and index stats gathering.
+
+Created Feb 8, 2013 Marko Makela
+*******************************************************/
+
+/*****************************************************************//**
+Request the background collection of statistics to stop for a table.
+@retval true when no background process is active (BG_STAT_IN_PROGRESS unset)
+@retval false when unsafe to modify the table; BG_STAT_SHOULD_QUIT was set */
+UNIV_INLINE
+bool
+dict_stats_stop_bg(
+/*===============*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) {
+		return(true);
+	}
+
+	table->stats_bg_flag |= BG_STAT_SHOULD_QUIT;
+	return(false);
+}
diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h
index 330e6a25114..6acb6a2dcbe 100644
--- a/storage/xtradb/include/dict0types.h
+++ b/storage/xtradb/include/dict0types.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,20 +26,24 @@ Created 1/8/1996 Heikki Tuuri
 #ifndef dict0types_h
 #define dict0types_h
 
-typedef struct dict_sys_struct		dict_sys_t;
-typedef struct dict_col_struct		dict_col_t;
-typedef struct dict_field_struct	dict_field_t;
-typedef struct dict_index_struct	dict_index_t;
-typedef struct dict_table_struct	dict_table_t;
-typedef struct dict_foreign_struct	dict_foreign_t;
+struct dict_sys_t;
+struct dict_col_t;
+struct dict_field_t;
+struct dict_index_t;
+struct dict_table_t;
+struct dict_foreign_t;
 
-typedef struct ind_node_struct		ind_node_t;
-typedef struct tab_node_struct		tab_node_t;
+struct ind_node_t;
+struct tab_node_t;
 
 /* Space id and page no where the dictionary header resides */
 #define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
 #define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
 
+/* The ibuf table's and indexes' IDs are assigned as the number
+DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN	0xFFFFFFFF00000000ULL
+
 typedef ib_id_t		table_id_t;
 typedef ib_id_t		index_id_t;
 
@@ -48,17 +52,32 @@ the table and index will be marked as "corrupted", and caller will
 be responsible to deal with corrupted table or index.
 Note: please define the IGNORE_ERR_* as bits, so their value can
 be or-ed together */
-enum dict_err_ignore {
-        DICT_ERR_IGNORE_NONE = 0,        /*!< no error to ignore */
-        DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root
+enum dict_err_ignore_t {
+	DICT_ERR_IGNORE_NONE = 0,	/*!< no error to ignore */
+	DICT_ERR_IGNORE_INDEX_ROOT = 1,	/*!< ignore error if index root
 					page is FIL_NULL or incorrect value */
 	DICT_ERR_IGNORE_CORRUPT = 2,	/*!< skip corrupted indexes */
 	DICT_ERR_IGNORE_FK_NOKEY = 4,	/*!< ignore error if any foreign
 					key is missing */
-        DICT_ERR_IGNORE_ALL = 0xFFFF	/*!< ignore all errors */
+	DICT_ERR_IGNORE_RECOVER_LOCK = 8,
+					/*!< Used when recovering table locks
+					for resurrected transactions.
+					Silently load a missing
+					tablespace, and do not load
+					incomplete index definitions. */
+	DICT_ERR_IGNORE_ALL = 0xFFFF	/*!< ignore all errors */
+};
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+	QUIESCE_NONE,
+	QUIESCE_START,			/*!< Initialise, prepare to start */
+	QUIESCE_COMPLETE		/*!< All done */
 };
 
-typedef enum dict_err_ignore		dict_err_ignore_t;
+/** Prefix for tmp tables, adopted from sql/table.h */
+#define tmp_file_prefix		"#sql"
+#define tmp_file_prefix_length	4
 
 #define TEMP_TABLE_PREFIX                "#sql"
 #define TEMP_TABLE_PATH_PREFIX           "/" TEMP_TABLE_PREFIX
diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h
index 62ed862e82c..7f23302d1ff 100644
--- a/storage/xtradb/include/dyn0dyn.h
+++ b/storage/xtradb/include/dyn0dyn.h
@@ -31,10 +31,9 @@ Created 2/5/1996 Heikki Tuuri
 #include "mem0mem.h"
 
 /** A block in a dynamically allocated array */
-typedef struct dyn_block_struct		dyn_block_t;
+struct dyn_block_t;
 /** Dynamically allocated array */
-typedef dyn_block_t			dyn_array_t;
-
+typedef dyn_block_t		dyn_array_t;
 
 /** This is the initial 'payload' size of a dynamic array;
 this must be > MLOG_BUF_MARGIN + 30! */
@@ -171,7 +170,7 @@ dyn_push_string(
 /** @brief A block in a dynamically allocated array.
 NOTE! Do not access the fields of the struct directly: the definition
 appears here only for the compiler to know its size! */
-struct dyn_block_struct{
+struct dyn_block_t{
 	mem_heap_t*	heap;	/*!< in the first block this is != NULL
 				if dynamic allocation has been needed */
 	ulint		used;	/*!< number of data bytes used in this block;
diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic
index 177877ed1fd..0296554e2ee 100644
--- a/storage/xtradb/include/dyn0dyn.ic
+++ b/storage/xtradb/include/dyn0dyn.ic
@@ -23,9 +23,9 @@ The dynamically allocated array
 Created 2/5/1996 Heikki Tuuri
 *******************************************************/
 
-/** Value of dyn_block_struct::magic_n */
+/** Value of dyn_block_t::magic_n */
 #define DYN_BLOCK_MAGIC_N	375767
-/** Flag for dyn_block_struct::used that indicates a full block */
+/** Flag for dyn_block_t::used that indicates a full block */
 #define DYN_BLOCK_FULL_FLAG	0x1000000UL
 
 /************************************************************//**
@@ -63,7 +63,7 @@ dyn_block_get_data(
 {
 	ut_ad(block);
 
-	return((byte*) block->data);
+	return(const_cast<byte*>(block->data));
 }
 
 /*********************************************************************//**
@@ -245,7 +245,7 @@ dyn_array_get_element(
 	ut_ad(block);
 	ut_ad(dyn_block_get_used(block) >= pos);
 
-	return((byte*) block->data + pos);
+	return(const_cast<byte*>(block->data) + pos);
 }
 
 /************************************************************//**
diff --git a/storage/xtradb/include/eval0eval.h b/storage/xtradb/include/eval0eval.h
index c12df320b88..e3b1e6c16b6 100644
--- a/storage/xtradb/include/eval0eval.h
+++ b/storage/xtradb/include/eval0eval.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/eval0eval.ic b/storage/xtradb/include/eval0eval.ic
index d0ca4c9bea5..e4b1dd08017 100644
--- a/storage/xtradb/include/eval0eval.ic
+++ b/storage/xtradb/include/eval0eval.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -70,7 +70,7 @@ eval_node_ensure_val_buf(
 	dfield = que_node_get_val(node);
 	dfield_set_len(dfield, size);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (!data || que_node_get_val_buf_size(node) < size) {
 
@@ -110,12 +110,12 @@ eval_exp(
 {
 	if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
 
-		eval_sym((sym_node_t*)exp_node);
+		eval_sym((sym_node_t*) exp_node);
 
 		return;
 	}
 
-	eval_func(exp_node);
+	eval_func(static_cast<func_node_t*>(exp_node));
 }
 
 /*****************************************************************//**
@@ -132,7 +132,7 @@ eval_node_set_int_val(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data == NULL) {
 		data = eval_node_alloc_val_buf(node, 4);
@@ -140,7 +140,7 @@ eval_node_set_int_val(
 
 	ut_ad(dfield_get_len(dfield) == 4);
 
-	mach_write_to_4(data, (ulint)val);
+	mach_write_to_4(data, (ulint) val);
 }
 
 /*****************************************************************//**
@@ -152,13 +152,15 @@ eval_node_get_int_val(
 /*==================*/
 	que_node_t*	node)	/*!< in: expression node */
 {
+	const byte*	ptr;
 	dfield_t*	dfield;
 
 	dfield = que_node_get_val(node);
+	ptr = static_cast<byte*>(dfield_get_data(dfield));
 
 	ut_ad(dfield_get_len(dfield) == 4);
 
-	return((int)mach_read_from_4(dfield_get_data(dfield)));
+	return((int) mach_read_from_4(ptr));
 }
 
 /*****************************************************************//**
@@ -175,7 +177,7 @@ eval_node_get_ibool_val(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	ut_ad(data != NULL);
 
@@ -196,7 +198,7 @@ eval_node_set_ibool_val(
 
 	dfield = que_node_get_val(func_node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data == NULL) {
 		/* Allocate 1 byte to hold the value */
@@ -246,6 +248,8 @@ eval_node_copy_val(
 
 	dfield2 = que_node_get_val(node2);
 
-	eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
-				     dfield_get_len(dfield2));
+	eval_node_copy_and_alloc_val(
+		node1,
+		static_cast<byte*>(dfield_get_data(dfield2)),
+		dfield_get_len(dfield2));
 }
diff --git a/storage/xtradb/include/eval0proc.h b/storage/xtradb/include/eval0proc.h
index 450fd5a27c3..7755fb10343 100644
--- a/storage/xtradb/include/eval0proc.h
+++ b/storage/xtradb/include/eval0proc.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/eval0proc.ic b/storage/xtradb/include/eval0proc.ic
index 6949af1557b..81418bae2c9 100644
--- a/storage/xtradb/include/eval0proc.ic
+++ b/storage/xtradb/include/eval0proc.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,7 +40,7 @@ proc_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<proc_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
 
 	if (thr->prev_node == que_node_get_parent(node)) {
@@ -75,7 +75,7 @@ proc_eval_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<func_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
 
 	/* Evaluate the procedure */
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
index a7d8d87035b..472c57fcbfc 100644
--- a/storage/xtradb/include/fil0fil.h
+++ b/storage/xtradb/include/fil0fil.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,15 +27,27 @@ Created 10/25/1995 Heikki Tuuri
 #define fil0fil_h
 
 #include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
 #include "dict0types.h"
 #include "ut0byte.h"
 #include "os0file.h"
 #ifndef UNIV_HOTBACKUP
 #include "sync0rw.h"
 #include "ibuf0types.h"
+#include "log0log.h"
 #endif /* !UNIV_HOTBACKUP */
 #include "trx0types.h"
 
+#include <list>
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+typedef std::list<const char*> space_name_list_t;
+
 /** When mysqld is run, the default directory "." is the mysqld datadir,
 but in the MySQL Embedded Server Library and ibbackup it is not the default
 directory, and we must set the base file path explicitly */
@@ -58,12 +70,8 @@ typedef	byte	fil_faddr_t;	/*!< 'type' definition in C: an address
 
 #define	FIL_ADDR_SIZE	6	/* address size is 6 bytes */
 
-/** A struct for storing a space address FIL_ADDR, when it is used
-in C program data structures. */
-
-typedef struct fil_addr_struct	fil_addr_t;
 /** File space address */
-struct fil_addr_struct{
+struct fil_addr_t{
 	ulint	page;		/*!< page number within a space */
 	ulint	boffset;	/*!< byte offset within the page */
 };
@@ -71,6 +79,8 @@ struct fil_addr_struct{
 /** The null file address */
 extern fil_addr_t	fil_addr_null;
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /** The byte offsets on a file page for various variables @{ */
 #define FIL_PAGE_SPACE_OR_CHKSUM 0	/*!< in < MySQL-4.0.14 space id the
 					page belongs to (== 0) but in later
@@ -119,7 +129,6 @@ extern fil_addr_t	fil_addr_null;
 #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  34 /*!< starting from 4.1.x this
 					contains the space id of the page */
 #define FIL_PAGE_DATA		38	/*!< start of the data on the page */
-#define FIL_PAGE_DATA_ALIGN_32	40
 /* @} */
 /** File page trailer @{ */
 #define FIL_PAGE_END_LSN_OLD_CHKSUM 8	/*!< the low 4 bytes of this are used
@@ -148,6 +157,8 @@ extern fil_addr_t	fil_addr_null;
 					/*!< Last page type */
 /* @} */
 
+#ifndef UNIV_INNOCHECKSUM
+
 /** Space types @{ */
 #define FIL_TABLESPACE		501	/*!< tablespace */
 #define FIL_LOG			502	/*!< redo log */
@@ -161,6 +172,8 @@ extern ulint	fil_n_pending_log_flushes;
 /** Number of pending tablespace flushes */
 extern ulint	fil_n_pending_tablespace_flushes;
 
+/** Number of files currently open */
+extern ulint	fil_n_file_opened;
 
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
@@ -176,7 +189,7 @@ fil_space_get_version(
 Returns the latch of a file space.
 @return	latch protecting storage allocation */
 UNIV_INTERN
-rw_lock_t*
+prio_rw_lock_t*
 fil_space_get_latch(
 /*================*/
 	ulint	id,	/*!< in: space id */
@@ -192,17 +205,19 @@ fil_space_get_type(
 	ulint	id);	/*!< in: space id */
 #endif /* !UNIV_HOTBACKUP */
 /*******************************************************************//**
-Appends a new file to the chain of files of a space. File must be closed. */
+Appends a new file to the chain of files of a space. File must be closed.
+@return pointer to the file name, or NULL on error */
 UNIV_INTERN
-void
+char*
 fil_node_create(
 /*============*/
 	const char*	name,	/*!< in: file name (file must be closed) */
 	ulint		size,	/*!< in: file size in database blocks, rounded
 				downwards to an integer */
 	ulint		id,	/*!< in: space id where to append */
-	ibool		is_raw);/*!< in: TRUE if a raw device or
+	ibool		is_raw)	/*!< in: TRUE if a raw device or
 				a raw disk partition */
+	__attribute__((nonnull, warn_unused_result));
 #ifdef UNIV_LOG_ARCHIVE
 /****************************************************************//**
 Drops files from the start of a file space, so that its size is cut by
@@ -215,10 +230,18 @@ fil_space_truncate_start(
 	ulint	trunc_len);	/*!< in: truncate by this much; it is an error
 				if this does not equal to the combined size of
 				some initial files in the space */
+/****************************************************************//**
+Checks whether the file space contains a node with the given name. */
+UNIV_INTERN
+ibool
+fil_space_contains_node(
+/*====================*/
+	ulint	id,		/*!< in: space id */
+	char*	node_name);	/*!< in: node name */
 #endif /* UNIV_LOG_ARCHIVE */
 /*******************************************************************//**
-Creates a space memory object and puts it to the 'fil system' hash table. If
-there is an error, prints an error message to the .err log.
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
@@ -240,6 +263,16 @@ fil_assign_new_space_id(
 /*====================*/
 	ulint*	space_id);	/*!< in/out: space id */
 /*******************************************************************//**
+Returns the path from the first fil_node_t found for the space ID sent.
+The caller is responsible for freeing the memory allocated here for the
+value returned.
+@return	a copy of fil_node_t::path, NULL if space is zero or not found. */
+UNIV_INTERN
+char*
+fil_space_get_first_path(
+/*=====================*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
 Returns the size of the space in pages. The tablespace must be cached in the
 memory cache.
 @return	space size, 0 if space not found */
@@ -308,6 +341,14 @@ void
 fil_close_all_files(void);
 /*=====================*/
 /*******************************************************************//**
+Closes the redo log files. There must not be any pending i/o's or
+unflushed modifications in the files. */
+UNIV_INTERN
+void
+fil_close_log_files(
+/*================*/
+	bool	free);	/*!< in: whether to free the memory object */
+/*******************************************************************//**
 Sets the max tablespace id counter if the given number is bigger than the
 previous value. */
 UNIV_INTERN
@@ -321,12 +362,11 @@ Writes the flushed lsn and the latest archived log number to the page
 header of the first page of each data file in the system tablespace.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 fil_write_flushed_lsn_to_data_files(
 /*================================*/
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no);	/*!< in: latest archived log
-					file number */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no);	/*!< in: latest archived log file number */
 /*******************************************************************//**
 Reads the flushed lsn, arch no, and tablespace flag fields from a data
 file at database startup.
@@ -341,15 +381,10 @@ fil_read_first_page(
 						parameters below already
 						contain sensible data */
 	ulint*		flags,			/*!< out: tablespace flags */
-#ifdef UNIV_LOG_ARCHIVE
-	ulint*		min_arch_log_no,	/*!< out: min of archived
-						log numbers in data files */
-	ulint*		max_arch_log_no,	/*!< out: max of archived
-						log numbers in data files */
-#endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,	/*!< out: min of flushed
+	ulint*		space_id,		/*!< out: tablespace ID */
+	lsn_t*		min_flushed_lsn,	/*!< out: min of flushed
 						lsn values in data files */
-	ib_uint64_t*	max_flushed_lsn)	/*!< out: max of flushed
+	lsn_t*		max_flushed_lsn)	/*!< out: max of flushed
 						lsn values in data files */
 	__attribute__((warn_unused_result));
 /*******************************************************************//**
@@ -401,27 +436,44 @@ Deletes a single-table tablespace. The tablespace must be cached in the
 memory cache.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
+dberr_t
 fil_delete_tablespace(
 /*==================*/
-	ulint	id,		/*!< in: space id */
-	ibool	evict_all);	/*!< in: TRUE if we want all pages
-				evicted from LRU. */
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove);	/*!< in: specify the action to take
+					on the tables pages in the buffer
+					pool */
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Free all pages used by the tablespace.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+	trx_t*	trx,	/*!< in/out: Transaction covering the close */
+	ulint	id);	/*!< in: space id */
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
 Discards a single-table tablespace. The tablespace must be cached in the
 memory cache. Discarding is like deleting a tablespace, but
-1) we do not drop the table from the data dictionary;
-2) we remove all insert buffer entries for the tablespace immediately; in DROP
-TABLE they are only removed gradually in the background;
-3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
-as it originally had.
-@return	TRUE if success */
+
+ 1. We do not drop the table from the data dictionary;
+
+ 2. We remove all insert buffer entries for the tablespace immediately;
+    in DROP TABLE they are only removed gradually in the background;
+
+ 3. When the user does IMPORT TABLESPACE, the tablespace will have the
+    same id as it originally had.
+
+ 4. Free all the pages in use by the tablespace if rename=TRUE.
+@return	DB_SUCCESS or error */
 UNIV_INTERN
-ibool
+dberr_t
 fil_discard_tablespace(
 /*===================*/
-	ulint	id);	/*!< in: space id */
+	ulint	id)	/*!< in: space id */
+	__attribute__((warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /*******************************************************************//**
 Renames a single-table tablespace. The tablespace must be cached in the
@@ -431,16 +483,70 @@ UNIV_INTERN
 ibool
 fil_rename_tablespace(
 /*==================*/
-	const char*	old_name,	/*!< in: old table name in the standard
-					databasename/tablename format of
-					InnoDB, or NULL if we do the rename
-					based on the space id only */
+	const char*	old_name_in,	/*!< in: old table name in the
+					standard databasename/tablename
+					format of InnoDB, or NULL if we
+					do the rename based on the space
+					id only */
 	ulint		id,		/*!< in: space id */
-	const char*	new_name);	/*!< in: new table name in the standard
-					databasename/tablename format
-					of InnoDB */
+	const char*	new_name,	/*!< in: new table name in the
+					standard databasename/tablename
+					format of InnoDB */
+	const char*	new_path);	/*!< in: new full datafile path
+					if the tablespace is remotely
+					located, or NULL if it is located
+					in the normal data directory. */
 
 /*******************************************************************//**
+Allocates a file name for a single-table tablespace. The string must be freed
+by caller with mem_free().
+@return	own: file name */
+UNIV_INTERN
+char*
+fil_make_ibd_name(
+/*==============*/
+	const char*	name,		/*!< in: table name or a dir path */
+	bool		is_full_path);	/*!< in: TRUE if it is a dir path */
+/*******************************************************************//**
+Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link).
+The string must be freed by caller with mem_free().
+@return	own: file name */
+UNIV_INTERN
+char*
+fil_make_isl_name(
+/*==============*/
+	const char*	name);	/*!< in: table name */
+/*******************************************************************//**
+Creates a new InnoDB Symbolic Link (ISL) file.  It is always created
+under the 'datadir' of MySQL. The datadir is the directory of a
+running mysqld program. We can refer to it by simply using the path '.'.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_link_file(
+/*=================*/
+	const char*	tablename,	/*!< in: tablename */
+	const char*	filepath);	/*!< in: pathname of tablespace */
+/*******************************************************************//**
+Deletes an InnoDB Symbolic Link (ISL) file. */
+UNIV_INTERN
+void
+fil_delete_link_file(
+/*==================*/
+	const char*	tablename);	/*!< in: name of table */
+/*******************************************************************//**
+Reads an InnoDB Symbolic Link (ISL) file.
+It is always created under the 'datadir' of MySQL.  The name is of the
+form {databasename}/{tablename}, and the isl file is expected to be in a
+'{databasename}' directory called '{tablename}.isl'. The caller must free
+the memory of the null-terminated path returned if it is not null.
+@return	own: filepath found in link file, NULL if not found. */
+UNIV_INTERN
+char*
+fil_read_link_file(
+/*===============*/
+	const char*	name);		/*!< in: tablespace name */
+/*******************************************************************//**
 Creates a new single-table tablespace to a database directory of MySQL.
 Database directories are under the 'datadir' of MySQL. The datadir is the
 directory of a running mysqld program. We can refer to it by simply the
@@ -448,20 +554,20 @@ path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
 dir of the mysqld server.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 fil_create_new_single_table_tablespace(
 /*===================================*/
 	ulint		space_id,	/*!< in: space id */
 	const char*	tablename,	/*!< in: the table name in the usual
 					databasename/tablename format
-					of InnoDB, or a dir path to a temp
-					table */
-	ibool		is_temp,	/*!< in: TRUE if a table created with
-					CREATE TEMPORARY TABLE */
+					of InnoDB */
+	const char*	dir_path,	/*!< in: NULL or a dir path */
 	ulint		flags,		/*!< in: tablespace flags */
-	ulint		size);		/*!< in: the initial size of the
+	ulint		flags2,		/*!< in: table flags2 */
+	ulint		size)		/*!< in: the initial size of the
 					tablespace file in pages,
 					must be >= FIL_IBD_FILE_INITIAL_SIZE */
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
 Tries to open a single-table tablespace and optionally checks the space id is
@@ -472,44 +578,31 @@ NOTE that we assume this operation is used either at the database startup
 or under the protection of the dictionary mutex, so that two users cannot
 race here. This operation does not leave the file associated with the
 tablespace open, but closes it after we have looked at the space id in it.
-@return	TRUE if success */
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file.  This boolean may be initially FALSE, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@return	DB_SUCCESS or error code */
 UNIV_INTERN
-ibool
+dberr_t
 fil_open_single_table_tablespace(
 /*=============================*/
-	ibool		check_space_id,	/*!< in: should we check that the space
-					id in the file is right; we assume
-					that this function runs much faster
-					if no check is made, since accessing
-					the file inode probably is much
-					faster (the OS caches them) than
-					accessing the first page of the file */
+	bool		validate,	/*!< in: Do we validate tablespace? */
+	bool		fix_dict,	/*!< in: Can we fix the dictionary? */
 	ulint		id,		/*!< in: space id */
 	ulint		flags,		/*!< in: tablespace flags */
-	const char*	name,		/*!< in: table name in the
+	const char*	tablename,	/*!< in: table name in the
 					databasename/tablename format */
-	trx_t*		trx);		/*!< in: transaction. This is only used
-					for IMPORT TABLESPACE, must be NULL
-					otherwise */
-/********************************************************************//**
-It is possible, though very improbable, that the lsn's in the tablespace to be
-imported have risen above the current system lsn, if a lengthy purge, ibuf
-merge, or rollback was performed on a backup taken with ibbackup. If that is
-the case, reset page lsn's in the file. We assume that mysqld was shut down
-after it performed these cleanup operations on the .ibd file, so that it at
-the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
-first page of the .ibd file, and we can determine whether we need to reset the
-lsn's just by looking at that flush lsn.
-@return	TRUE if success */
-UNIV_INTERN
-ibool
-fil_reset_too_high_lsns(
-/*====================*/
-	const char*	name,		/*!< in: table name in the
-					databasename/tablename format */
-	ib_uint64_t	current_lsn);	/*!< in: reset lsn's if the lsn stamped
-					to FIL_PAGE_FILE_FLUSH_LSN in the
-					first page is too high */
+	const char*	filepath)	/*!< in: tablespace filepath */
+	__attribute__((nonnull(5), warn_unused_result));
+
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 At the server startup, if we need crash recovery, scans the database
@@ -520,13 +613,13 @@ in the doublewrite buffer, also to know where to apply log records where the
 space id is != 0.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 fil_load_single_table_tablespaces(void);
 /*===================================*/
 /*******************************************************************//**
 Returns TRUE if a single-table tablespace does not exist in the memory cache,
 or is being deleted there.
-@return	TRUE if does not exist or is being\ deleted */
+@return	TRUE if does not exist or is being deleted */
 UNIV_INTERN
 ibool
 fil_tablespace_deleted_or_being_deleted_in_mem(
@@ -555,21 +648,22 @@ fil_space_for_table_exists_in_mem(
 /*==============================*/
 	ulint		id,		/*!< in: space id */
 	const char*	name,		/*!< in: table name in the standard
-					'databasename/tablename' format or
-					the dir path to a temp table */
-	ibool		is_temp,	/*!< in: TRUE if created with CREATE
-					TEMPORARY TABLE */
+					'databasename/tablename' format */
 	ibool		mark_space,	/*!< in: in crash recovery, at database
 					startup we mark all spaces which have
 					an associated table in the InnoDB
 					data dictionary, so that
 					we can print a warning about orphaned
 					tablespaces */
-	ibool		print_error_if_does_not_exist);
+	ibool		print_error_if_does_not_exist,
 					/*!< in: print detailed error
 					information to the .err log if a
 					matching tablespace is not found from
 					memory */
+	bool		adjust_space,	/*!< in: whether to adjust the space
+					id on a tablespace mismatch */
+	mem_heap_t*	heap,		/*!< in: heap memory */
+	table_id_t	table_id);	/*!< in: table id */
 #else /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Extends all tablespaces to the size stored in the space header. During the
@@ -631,7 +725,7 @@ i/o on a tablespace which does not exist */
 	_fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL)
 
 UNIV_INTERN
-ulint
+dberr_t
 _fil_io(
 /*===*/
 	ulint	type,		/*!< in: OS_FILE_READ or OS_FILE_WRITE,
@@ -643,7 +737,7 @@ _fil_io(
 				because i/os are not actually handled until
 				all have been posted: use with great
 				caution! */
-	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	bool	sync,		/*!< in: true if synchronous aio is desired */
 	ulint	space_id,	/*!< in: space id */
 	ulint	zip_size,	/*!< in: compressed page size in bytes;
 				0 for uncompressed pages */
@@ -659,19 +753,12 @@ _fil_io(
 				appropriately aligned */
 	void*	message,	/*!< in: message for aio handler if non-sync
 				aio used, else ignored */
-	trx_t*	trx);
-/********************************************************************//**
-Confirm whether the parameters are valid or not */
-UNIV_INTERN
-ibool
-fil_is_exist(
-/*==============*/
-	ulint	space_id,	/*!< in: space id */
-	ulint	block_offset);	/*!< in: offset in number of blocks */
+	trx_t*	trx)
+	__attribute__((nonnull(8)));
 /**********************************************************************//**
 Waits for an aio operation to complete. This function is used to write the
 handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.c for more info). The thread specifies which
+into segments (see os0file.cc for more info). The thread specifies which
 segment it wants to wait for. */
 UNIV_INTERN
 void
@@ -686,9 +773,8 @@ UNIV_INTERN
 void
 fil_flush(
 /*======*/
-	ulint	space_id,	/*!< in: file space id (this can be a group of
+	ulint	space_id);	/*!< in: file space id (this can be a group of
 				log files or a tablespace of the database) */
-	ibool	metadata);
 /**********************************************************************//**
 Flushes to disk writes in file spaces of the given type possibly cached by
 the OS. */
@@ -755,6 +841,159 @@ fil_tablespace_is_being_deleted(
 /*============================*/
 	ulint		id);	/*!< in: space id */
 
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+UNIV_INTERN
+void
+fil_delete_file(
+/*============*/
+	const char*	path);	/*!< in: filepath of the ibd tablespace */
+
+/** Callback functor. */
+struct PageCallback {
+
+	/**
+	Default constructor; m_file is not initialized here, see set_file() */
+	PageCallback()
+		:
+		m_zip_size(),
+		m_page_size(),
+		m_filepath() UNIV_NOTHROW {}
+
+	virtual ~PageCallback() UNIV_NOTHROW {}
+
+	/**
+	Called for page 0 in the tablespace file at the start.
+	@param file_size - size of the file in bytes
+	@param block - contents of the first page in the tablespace file
+	@retval DB_SUCCESS or error code.*/
+	virtual dberr_t init(
+		os_offset_t		file_size,
+		const buf_block_t*	block) UNIV_NOTHROW = 0;
+
+	/**
+	Called for every page in the tablespace. If the page was not
+	updated then its state must be set to BUF_PAGE_NOT_USED. For
+	compressed tables the page descriptor memory will be at offset:
+       		block->frame + UNIV_PAGE_SIZE;
+	@param offset - physical offset within the file
+	@param block - block read from file, note it is not from the buffer pool
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t operator()(
+		os_offset_t 	offset,
+		buf_block_t*	block) UNIV_NOTHROW = 0;
+
+	/**
+	Set the name of the physical file and the file handle that is used
+	to open it for the file that is being iterated over.
+	@param filename - the physical name of the tablespace file.
+	@param file - OS file handle */
+	void set_file(const char* filename, os_file_t file) UNIV_NOTHROW
+	{
+		m_file = file;
+		m_filepath = filename;
+	}
+
+	/**
+	@return the space id of the tablespace */
+	virtual ulint get_space_id() const UNIV_NOTHROW = 0;
+
+	/** The compressed page size
+	@return the compressed page size */
+	ulint get_zip_size() const
+	{
+		return(m_zip_size);
+	}
+
+	/**
+	Set the tablespace compressed table size.
+	@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */
+	dberr_t set_zip_size(const buf_frame_t* page) UNIV_NOTHROW;
+
+	/** The tablespace page size
+	@return the tablespace page size */
+	ulint get_page_size() const
+	{
+		return(m_page_size);
+	}
+
+	/** Compressed table page size */
+	ulint			m_zip_size;
+
+	/** The tablespace page size. */
+	ulint			m_page_size;
+
+	/** File handle to the tablespace */
+	os_file_t		m_file;
+
+	/** Physical file path. */
+	const char*		m_filepath;
+
+protected:
+	// Disable copying
+	PageCallback(const PageCallback&);
+	PageCallback& operator=(const PageCallback&);
+};
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+	dict_table_t*		table,
+	ulint			n_io_buffers,
+	PageCallback&		callback)
+	__attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return	space id, ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+	const char*	name);	/*!< in: table name in the standard
+				'databasename/tablename' format */
+
+/**
+Iterate over all the spaces in the space list and fetch the
+tablespace names. It will return a copy of the name that must be
+freed by the caller using: delete[].
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fil_get_space_names(
+/*================*/
+	space_name_list_t&	space_name_list)
+				/*!< in/out: List for collecting the names. */
+	__attribute__((warn_unused_result));
+
+/****************************************************************//**
+Generate redo logs for swapping two .ibd files */
+UNIV_INTERN
+void
+fil_mtr_rename_log(
+/*===============*/
+	ulint		old_space_id,	/*!< in: tablespace id of the old
+					table. */
+	const char*	old_name,	/*!< in: old table name */
+	ulint		new_space_id,	/*!< in: tablespace id of the new
+					table */
+	const char*	new_name,	/*!< in: new table name */
+	const char*	tmp_name,	/*!< in: temp table name used while
+					swapping */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+
+#endif /* !UNIV_INNOCHECKSUM */
+
 /*************************************************************************
 Return local hash table informations. */
 
@@ -779,21 +1018,4 @@ fil_space_set_corrupt(
 /*==================*/
 	ulint	space_id);
 
-/****************************************************************//**
-Generate redo logs for swapping two .ibd files */
-UNIV_INTERN
-void
-fil_mtr_rename_log(
-/*===============*/
-	ulint		old_space_id,	/*!< in: tablespace id of the old
-					table. */
-	const char*	old_name,	/*!< in: old table name */
-	ulint		new_space_id,	/*!< in: tablespace id of the new
-					table */
-	const char*	new_name,	/*!< in: new table name */
-	const char*	tmp_name);	/*!< in: temp table name used while
-					swapping */
-
-typedef	struct fil_space_struct	fil_space_t;
-
-#endif
+#endif /* fil0fil_h */
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
index f07e3decc66..a587ccc9f20 100644
--- a/storage/xtradb/include/fsp0fsp.h
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -28,26 +28,108 @@ Created 12/18/1995 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "mtr0mtr.h"
 #include "fut0lst.h"
 #include "ut0byte.h"
 #include "page0types.h"
 #include "fsp0types.h"
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
 
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE	1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE	4
+/** Width of the ATOMIC_BLOBS flag.  The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to the two Barracuda row formats COMPRESSED and DYNAMIC. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS	1
 /** Number of flag bits used to indicate the tablespace page size */
 #define FSP_FLAGS_WIDTH_PAGE_SSIZE	4
+/** Width of the DATA_DIR flag.  This flag indicates that the tablespace
+is found in a remote location, not the default data directory. */
+#define FSP_FLAGS_WIDTH_DATA_DIR	1
+/** Width of all the currently known tablespace flags */
+#define FSP_FLAGS_WIDTH		(FSP_FLAGS_WIDTH_POST_ANTELOPE	\
+				+ FSP_FLAGS_WIDTH_ZIP_SSIZE	\
+				+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS	\
+				+ FSP_FLAGS_WIDTH_PAGE_SSIZE	\
+				+ FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in tablespace flags */
+#define FSP_FLAGS_MASK		(~(~0 << FSP_FLAGS_WIDTH))
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE	0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE		(FSP_FLAGS_POS_POST_ANTELOPE	\
+					+ FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS	(FSP_FLAGS_POS_ZIP_SSIZE	\
+					+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
 /** Zero relative shift position of the PAGE_SSIZE field */
-#define FSP_FLAGS_POS_PAGE_SSIZE	6
+#define FSP_FLAGS_POS_PAGE_SSIZE	(FSP_FLAGS_POS_ATOMIC_BLOBS	\
+					+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the DATA_DIR field */
+#define FSP_FLAGS_POS_DATA_DIR		(FSP_FLAGS_POS_PAGE_SSIZE	\
+					+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define FSP_FLAGS_POS_UNUSED		(FSP_FLAGS_POS_DATA_DIR	\
+					+ FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE				\
+		((~(~0 << FSP_FLAGS_WIDTH_POST_ANTELOPE))	\
+		<< FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE				\
+		((~(~0 << FSP_FLAGS_WIDTH_ZIP_SSIZE))		\
+		<< FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS				\
+		((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_BLOBS))	\
+		<< FSP_FLAGS_POS_ATOMIC_BLOBS)
 /** Bit mask of the PAGE_SSIZE field */
 #define FSP_FLAGS_MASK_PAGE_SSIZE				\
 		((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE))		\
 		<< FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the DATA_DIR field */
+#define FSP_FLAGS_MASK_DATA_DIR					\
+		((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR))		\
+		<< FSP_FLAGS_POS_DATA_DIR)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags)			\
+		(((flags) & FSP_FLAGS_MASK_POST_ANTELOPE)	\
+		>> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags)				\
+		(((flags) & FSP_FLAGS_MASK_ZIP_SSIZE)		\
+		>> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags)			\
+		(((flags) & FSP_FLAGS_MASK_ATOMIC_BLOBS)	\
+		>> FSP_FLAGS_POS_ATOMIC_BLOBS)
 /** Return the value of the PAGE_SSIZE field */
 #define FSP_FLAGS_GET_PAGE_SSIZE(flags)				\
 		((flags & FSP_FLAGS_MASK_PAGE_SSIZE)		\
 		>> FSP_FLAGS_POS_PAGE_SSIZE)
+/** Return the value of the DATA_DIR field */
+#define FSP_FLAGS_HAS_DATA_DIR(flags)				\
+		(((flags) & FSP_FLAGS_MASK_DATA_DIR)		\
+		>> FSP_FLAGS_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define FSP_FLAGS_GET_UNUSED(flags)				\
+		((flags) >> FSP_FLAGS_POS_UNUSED)
+
+/** Return the given tablespace flags with a PAGE_SSIZE ORed into the
+correct bits.  The flags argument itself is not modified. */
+#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize)			\
+		((flags) | ((ssize) << FSP_FLAGS_POS_PAGE_SSIZE))
 
 /* @} */
 
@@ -116,6 +198,142 @@ descriptor page, but used only in the first. */
 					FSP_FREE_LIMIT at a time */
 /* @} */
 
+#ifndef UNIV_INNOCHECKSUM
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef	byte	fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
+					/* the list node for linking
+					segment inode pages */
+
+#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define	FSEG_ID			0	/* 8 bytes of segment id: if this is 0,
+					it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED	8
+					/* number of used segment pages in
+					the FSEG_NOT_FULL list */
+#define	FSEG_FREE		12
+					/* list of free extents of this
+					segment */
+#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents */
+#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents */
+#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
+					/* magic number used in debugging */
+#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
+					/* array of individual pages
+					belonging to this segment in fsp
+					fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
+					/* number of slots in the array for
+					the fragment pages */
+#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
+					page number within space, FIL_NULL
+					means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE					\
+	(16 + 3 * FLST_BASE_NODE_SIZE			\
+	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+#define FSP_SEG_INODES_PER_PAGE(zip_size)		\
+	(((zip_size ? zip_size : UNIV_PAGE_SIZE)	\
+	  - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
+				/* Number of segment inodes which fit on a
+				single page */
+
+#define FSEG_MAGIC_N_VALUE	97937874
+
+#define	FSEG_FILLFACTOR		8	/* If this value is x, then if
+					the number of unused but reserved
+					pages in a segment is less than
+					reserved pages * 1/x, and there are
+					at least FSEG_FRAG_LIMIT used pages,
+					then we allow a new empty extent to
+					be added to the segment in
+					fseg_alloc_free_page. Otherwise, we
+					use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
+					/* If the segment has >= this many
+					used pages, it may be expanded by
+					allocating extents to the segment;
+					until that only individual fragment
+					pages are allocated from the space */
+
+#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
+					is at least this many extents, we
+					allow extents to be put to the free
+					list of the extent: at most
+					FSEG_FREE_LIST_MAX_LEN many */
+#define	FSEG_FREE_LIST_MAX_LEN	4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/*			EXTENT DESCRIPTOR
+			=================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple version to clean. */
+
+/*-------------------------------------*/
+#define	XDES_ID			0	/* The identifier of the segment
+					to which this extent belongs */
+#define XDES_FLST_NODE		8	/* The list node data structure
+					for the descriptors */
+#define	XDES_STATE		(FLST_NODE_SIZE + 8)
+					/* contains state information
+					of the extent */
+#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
+					/* Descriptor bitmap of the pages
+					in the extent */
+/*-------------------------------------*/
+
+#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
+#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
+					the page is free */
+#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
+					Index of the bit which tells if
+					there are old versions of tuples
+					on the page */
+/* States of a descriptor */
+#define	XDES_FREE		1	/* extent is in free list of space */
+#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
+					space */
+#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
+					space */
+#define	XDES_FSEG		4	/* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define	XDES_SIZE							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define	XDES_SIZE_MAX							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define	XDES_SIZE_MIN							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
 /* @} */
 
 /**********************************************************************//**
@@ -125,16 +343,6 @@ void
 fsp_init(void);
 /*==========*/
 /**********************************************************************//**
-Gets the current free limit of the system tablespace.  The free limit
-means the place of the first page which has never been put to the
-free list for allocation.  The space above that address is initialized
-to zero.  Sets also the global variable log_fsp_current_free_limit.
-@return	free limit in megabytes */
-UNIV_INTERN
-ulint
-fsp_header_get_free_limit(void);
-/*===========================*/
-/**********************************************************************//**
 Gets the size of the system tablespace from the tablespace header.  If
 we do not have an auto-extending data file, this should be equal to
 the size of the data files.  If there is an auto-extending data file,
@@ -177,9 +385,9 @@ fsp_header_get_zip_size(
 /*====================*/
 	const page_t*	page);	/*!< in: first page of a tablespace */
 /**********************************************************************//**
-Writes the space id and compressed page size to a tablespace header.
-This function is used past the buffer pool when we in fil0fil.c create
-a new single-table tablespace. */
+Writes the space id and flags to a tablespace header.  The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
 UNIV_INTERN
 void
 fsp_header_init_fields(
@@ -197,16 +405,16 @@ fsp_header_init(
 /*============*/
 	ulint	space,		/*!< in: space id */
 	ulint	size,		/*!< in: current size in blocks */
-	mtr_t*	mtr);		/*!< in: mini-transaction handle */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Increases the space size field of a space. */
 UNIV_INTERN
 void
 fsp_header_inc_size(
 /*================*/
-	ulint	space,	/*!< in: space id */
-	ulint	size_inc,/*!< in: size increment in pages */
-	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+	ulint	space,		/*!< in: space id */
+	ulint	size_inc,	/*!< in: size increment in pages */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Creates a new segment.
 @return the block where the segment header is placed, x-latched, NULL
@@ -222,7 +430,7 @@ fseg_create(
 			will belong to the created segment */
 	ulint	byte_offset, /*!< in: byte offset of the created segment header
 			on the page */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Creates a new segment.
 @return the block where the segment header is placed, x-latched, NULL
@@ -244,7 +452,7 @@ fseg_create_general(
 			the inode and the other for the segment) then there is
 			no need to do the check for this individual
 			operation */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Calculates the number of pages reserved by a segment, and how many pages are
 currently used.
@@ -255,7 +463,7 @@ fseg_n_reserved_pages(
 /*==================*/
 	fseg_header_t*	header,	/*!< in: segment header */
 	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
@@ -339,7 +547,7 @@ fsp_reserve_free_extents(
 	ulint	space,	/*!< in: space id */
 	ulint	n_ext,	/*!< in: number of extents to reserve */
 	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in: mini-transaction */
 /**********************************************************************//**
 This function should be used to get information on how much we still
 will be able to insert new data to the database without running out the
@@ -360,7 +568,18 @@ fseg_free_page(
 	fseg_header_t*	seg_header, /*!< in: segment header */
 	ulint		space,	/*!< in: space id */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Checks if a single page of a segment is free.
+@return	true if free */
+UNIV_INTERN
+bool
+fseg_page_is_free(
+/*==============*/
+	fseg_header_t*	seg_header,	/*!< in: segment header */
+	ulint		space,		/*!< in: space id */
+	ulint		page)		/*!< in: page offset */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************************//**
 Frees part of a segment. This function can be used to free a segment
 by repeatedly calling this function in different mini-transactions.
@@ -375,7 +594,7 @@ fseg_free_step(
 				resides on the first page of the frag list
 				of the segment, this pointer becomes obsolete
 				after the last freeing step */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Frees part of a segment. Differs from fseg_free_step because this function
 leaves the header page unfreed.
@@ -386,7 +605,7 @@ fseg_free_step_not_header(
 /*======================*/
 	fseg_header_t*	header,	/*!< in: segment header which must reside on
 				the first fragment page of the segment */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /***********************************************************************//**
 Checks if a page address is an extent descriptor page address.
 @return	TRUE if a descriptor page */
@@ -431,7 +650,7 @@ ibool
 fseg_validate(
 /*==========*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 #endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*******************************************************************//**
@@ -441,20 +660,85 @@ void
 fseg_print(
 /*=======*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 #endif /* UNIV_BTR_PRINT */
 
 /********************************************************************//**
+Validate the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS.  They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return	true if check ok */
+UNIV_INLINE
+bool
+fsp_flags_is_valid(
+/*===============*/
+	ulint	flags)		/*!< in: tablespace flags */
+	__attribute__((warn_unused_result, const));
+/********************************************************************//**
+Determine if the tablespace is compressed from the tablespace flags.
+@return	TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+	ulint	flags);	/*!< in: tablespace flags */
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page.
+@return	descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset);	/*!< in: page offset */
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return	TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ulint		offset);/*!< in: page offset within extent:
+				0 ... FSP_EXTENT_SIZE - 1 */
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides.
+@return	descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset);	/*!< in: page offset */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags.  A tablespace has only one
+physical page size whether that page is compressed or not.
+@return	compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed.  */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*====================*/
+	ulint	flags);		/*!< in: tablespace flags */
+/********************************************************************//**
 Extract the page size from tablespace flags.
-This feature, storing the page_ssize into the tablespace flags, is added
-to InnoDB 5.6.4.  This is here only to protect against a crash if a newer
-database is opened with this code branch.
 @return	page size of the tablespace in bytes */
 UNIV_INLINE
 ulint
 fsp_flags_get_page_size(
 /*====================*/
-	ulint	flags);	/*!< in: tablespace flags */
+	ulint	flags);		/*!< in: tablespace flags */
 
 #ifndef UNIV_NONINL
 #include "fsp0fsp.ic"
diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic
index c92111a9d89..0d81e817cc9 100644
--- a/storage/xtradb/include/fsp0fsp.ic
+++ b/storage/xtradb/include/fsp0fsp.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -23,6 +23,8 @@ File space management
 Created 12/18/1995 Heikki Tuuri
 *******************************************************/
 
+#ifndef UNIV_INNOCHECKSUM
+
 /***********************************************************************//**
 Checks if a page address is an extent descriptor page address.
 @return	TRUE if a descriptor page */
@@ -37,17 +39,120 @@ fsp_descr_page(
 	ut_ad(ut_is_2pow(zip_size));
 
 	if (!zip_size) {
-		return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
-				     == FSP_XDES_OFFSET));
+		return((page_no & (UNIV_PAGE_SIZE - 1)) == FSP_XDES_OFFSET);
 	}
 
-	return(UNIV_UNLIKELY((page_no & (zip_size - 1)) == FSP_XDES_OFFSET));
+	return((page_no & (zip_size - 1)) == FSP_XDES_OFFSET);
 }
+
+/********************************************************************//**
+Validate the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS.  They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return	true if check ok */
+UNIV_INLINE
+bool
+fsp_flags_is_valid(
+/*===============*/
+	ulint	flags)		/*!< in: tablespace flags */
+{
+	ulint	post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(flags);
+	ulint	zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+	ulint	atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+	ulint	page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+	ulint	unused = FSP_FLAGS_GET_UNUSED(flags);
+
+	DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false););
+
+	/* fsp_flags is zero unless atomic_blobs is set. */
+	/* Make sure there are no bits that we do not know about. */
+	if (unused != 0 || flags == 1) {
+		return(false);
+	} else if (post_antelope) {
+		/* The Antelope row formats REDUNDANT and COMPACT did
+		not use tablespace flags, so this flag and the entire
+		4-byte field is zero for Antelope row formats. */
+
+		if (!atomic_blobs) {
+			return(false);
+		}
+	}
+
+	if (!atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing long fields to be broken into prefix and
+		externally stored parts. */
+
+		if (post_antelope || zip_ssize != 0) {
+			return(false);
+		}
+
+	} else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+		return(false);
+	} else if (page_ssize > UNIV_PAGE_SSIZE_MAX) {
+
+		/* The page size field can be used for any row type, or it may
+		be zero for an original 16k page size.
+		Validate the page shift size is within allowed range. */
+
+		return(false);
+
+	} else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) {
+		return(false);
+	}
+
+#if UNIV_FORMAT_MAX != UNIV_FORMAT_B
+# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations."
+#endif
+
+	/* The DATA_DIR field can be used for any row type so there is
+	nothing here to validate. */
+
+	return(true);
+}
+
+/********************************************************************//**
+Determine if the tablespace is compressed from the tablespace flags.
+@return	TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+	ulint	flags)	/*!< in: tablespace flags */
+{
+	return(FSP_FLAGS_GET_ZIP_SSIZE(flags) != 0);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags.
+@return	compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*===================*/
+	ulint	flags)	/*!< in: tablespace flags */
+{
+	ulint	zip_size = 0;
+	ulint	ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+
+	/* Convert from a 'log2 minus 9' to a page size in bytes. */
+	if (ssize) {
+		zip_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
+
+		ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+	}
+
+	return(zip_size);
+}
+
 /********************************************************************//**
 Extract the page size from tablespace flags.
-This feature, storing the page_ssize into the tablespace flags, is added
-to InnoDB 5.6.4.  This is here only to protect against a crash if a newer
-database is opened with this code branch.
 @return	page size of the tablespace in bytes */
 UNIV_INLINE
 ulint
@@ -60,14 +165,150 @@ fsp_flags_get_page_size(
 
 	/* Convert from a 'log2 minus 9' to a page size in bytes. */
 	if (UNIV_UNLIKELY(ssize)) {
-		page_size = (512 << ssize);
+		page_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
 
-		ut_ad(page_size <= UNIV_PAGE_SIZE);
+		ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
 	} else {
 		/* If the page size was not stored, then it is the
 		original 16k. */
-		page_size = UNIV_PAGE_SIZE;
+		page_size = UNIV_PAGE_SIZE_ORIG;
 	}
 
 	return(page_size);
 }
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Add the page size to the tablespace flags.
+@return	tablespace flags after page size is added */
+UNIV_INLINE
+ulint
+fsp_flags_set_page_size(
+/*====================*/
+	ulint	flags,		/*!< in: tablespace flags */
+	ulint	page_size)	/*!< in: page size in bytes */
+{
+	ulint ssize = 0;
+	ulint shift;
+
+	/* Page size must lie in [UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX] */
+	ut_ad(page_size >= UNIV_PAGE_SIZE_MIN);
+	ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
+
+	if (page_size == UNIV_PAGE_SIZE_ORIG) {
+		ut_ad(0 == FSP_FLAGS_GET_PAGE_SSIZE(flags));
+		return(flags);
+	}
+
+	for (shift = UNIV_PAGE_SIZE_SHIFT_MAX;
+	     shift >= UNIV_PAGE_SIZE_SHIFT_MIN;
+	     shift--) {
+		ulint	mask = (1 << shift);
+		if (page_size & mask) {
+			ut_ad(!(page_size & ~mask));
+			ssize = shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1;
+			break;
+		}
+	}
+
+	ut_ad(ssize);
+	ut_ad(ssize <= UNIV_PAGE_SSIZE_MAX);
+
+	flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize);
+
+	ut_ad(fsp_flags_is_valid(flags));
+
+	return(flags);
+}
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page.
+@return	descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset)		/*!< in: page offset */
+{
+	ut_ad(ut_is_2pow(zip_size));
+
+	if (zip_size == 0) {
+		return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
+		       / FSP_EXTENT_SIZE);
+	} else {
+		return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
+	}
+}
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return	TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ulint		offset)	/*!< in: page offset within extent:
+				0 ... FSP_EXTENT_SIZE - 1 */
+{
+	ut_ad(offset < FSP_EXTENT_SIZE);
+	ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT);
+
+	ulint	index = bit + XDES_BITS_PER_PAGE * offset;
+
+	ulint	bit_index = index % 8;
+	ulint	byte_index = index / 8;
+
+	return(ut_bit_get_nth(
+			mach_read_ulint(descr + XDES_BITMAP + byte_index,
+					MLOG_1BYTE),
+			bit_index));
+}
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides.
+@return	descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset)		/*!< in: page offset */
+{
+#ifndef DOXYGEN /* Doxygen gets confused by these */
+# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET				\
+			   + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)	\
+			   * XDES_SIZE_MAX
+#  error
+# endif
+# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET				\
+			  + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN)	\
+			  * XDES_SIZE_MIN
+#  error
+# endif
+#endif /* !DOXYGEN */
+
+	ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET
+	      + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+	      + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+
+	ut_ad(ut_is_2pow(zip_size));
+
+	if (zip_size == 0) {
+		return(ut_2pow_round(offset, UNIV_PAGE_SIZE));
+	} else {
+		ut_ad(zip_size > XDES_ARR_OFFSET
+		      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+		return(ut_2pow_round(offset, zip_size));
+	}
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/include/fsp0types.h b/storage/xtradb/include/fsp0types.h
index 6e46d647657..94fd908ab0c 100644
--- a/storage/xtradb/include/fsp0types.h
+++ b/storage/xtradb/include/fsp0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -42,7 +42,13 @@ fseg_alloc_free_page) */
 /* @} */
 
 /** File space extent size (one megabyte) in pages */
-#define	FSP_EXTENT_SIZE		(1ULL << (20 - UNIV_PAGE_SIZE_SHIFT))
+#define	FSP_EXTENT_SIZE		(1048576U / UNIV_PAGE_SIZE)
+
+/** File space extent size (one megabyte) in pages for MAX page size */
+#define	FSP_EXTENT_SIZE_MAX	(1048576 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define	FSP_EXTENT_SIZE_MIN	(1048576 / UNIV_PAGE_SIZE_MIN)
 
 /** On a page of any file segment, data may be put starting from this
 offset */
diff --git a/storage/xtradb/include/fts0ast.h b/storage/xtradb/include/fts0ast.h
new file mode 100644
index 00000000000..c0aac6d8e4c
--- /dev/null
+++ b/storage/xtradb/include/fts0ast.h
@@ -0,0 +1,281 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+#include "ha_prototypes.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+	FTS_AST_OPER,				/*!< Operator */
+	FTS_AST_NUMB,				/*!< Number */
+	FTS_AST_TERM,				/*!< Term (or word) */
+	FTS_AST_TEXT,				/*!< Text string */
+	FTS_AST_LIST,				/*!< Expression list */
+	FTS_AST_SUBEXP_LIST			/*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+	FTS_NONE,				/*!< No operator */
+
+	FTS_IGNORE,				/*!< Ignore rows that contain
+						this word */
+
+	FTS_EXIST,				/*!< Include rows that contain
+						this word */
+
+	FTS_NEGATE,				/*!< Include rows that contain
+						this word but rank them
+						lower*/
+
+	FTS_INCR_RATING,			/*!< Increase the rank for this
+						word*/
+
+	FTS_DECR_RATING,			/*!< Decrease the rank for this
+						word*/
+
+	FTS_DISTANCE,				/*!< Proximity distance */
+	FTS_IGNORE_SKIP,			/*!< Transient node operator
+						signifies that this is a
+						FTS_IGNORE node, and ignored in
+						the first pass of
+						fts_ast_visit() */
+	FTS_EXIST_SKIP				/*!< Transient node operator
+						signifies that this is a
+						FTS_EXIST node, and ignored in
+						the first pass of
+						fts_ast_visit() */
+};
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+						/* out: 0 on OK, 1 on error */
+	fts_ast_state_t* state);		/*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_oper_t	oper);			/*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	const char*	ptr);			/*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	const char*	ptr);			/*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_node_t*	expr);			/*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+						/* out: new node */
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr);			/*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node);			/*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+
+void
+fts_ast_term_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance);		/*!< in: the text proximity
+						distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node);			/*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	list,			/*!< in: list node instance */
+	fts_ast_node_t*	node);			/*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node);			/*!< in: ast node to print */
+/********************************************************************
+For tracking node allocations, in case there is an error during parsing.*/
+extern
+void
+fts_ast_state_add_node(
+/*===================*/
+	fts_ast_state_t*state,			/*!< in: ast state instance */
+	fts_ast_node_t*	node);			/*!< in: node to add to state */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state);			/*!< in: state instance
+						to free */
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: FTS operator */
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg,		/*!< in: callback arg */
+	bool*			has_ignore)	/*!< out: whether we encountered
+						and ignored an operator;
+						currently only FTS_IGNORE
+						is ignored */
+	__attribute__((nonnull, warn_unused_result));
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit_sub_exp(
+/*==================*/
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg)		/*!< in: callback arg */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+UNIV_INTERN
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,		/*!< in: query type */
+	const byte*	query,			/*!< in: query string */
+	ulint		query_len)		/*!< in: query string len */
+	__attribute__((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+UNIV_INTERN
+void
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)		/*!< in: lexer instance to
+						free */
+	__attribute__((nonnull));
+
+/* Query term type */
+struct fts_ast_term_t {
+	byte*		ptr;			/*!< Pointer to term string.*/
+	ibool		wildcard;		/*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_t {
+	byte*		ptr;			/*!< Pointer to text string.*/
+	ulint		distance;		/*!< > 0 if proximity distance
+						set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_t {
+	fts_ast_node_t*	head;			/*!< Children list head */
+	fts_ast_node_t*	tail;			/*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_t {
+	fts_ast_type_t	type;			/*!< The type of node */
+	fts_ast_text_t	text;			/*!< Text node */
+	fts_ast_term_t	term;			/*!< Term node */
+	fts_ast_oper_t	oper;			/*!< Operator value */
+	fts_ast_list_t	list;			/*!< Expression list */
+	fts_ast_node_t*	next;			/*!< Link for expr list */
+	fts_ast_node_t*	next_alloc;		/*!< For tracking allocations */
+	bool		visited;		/*!< whether this node is
+						already processed */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_t {
+	mem_heap_t*	heap;			/*!< Heap to use for alloc */
+	fts_ast_node_t*	root;			/*!< If all goes OK, then this
+						will point to the root.*/
+
+	fts_ast_list_t	list;			/*!< List of nodes allocated */
+
+	fts_lexer_t*	lexer;			/*!< Lexer callback + arg */
+	CHARSET_INFO*	charset;		/*!< charset used for
+						tokenization */
+};
+
+#endif /* INNOBASE_FST0AST_H */
diff --git a/storage/xtradb/include/fts0blex.h b/storage/xtradb/include/fts0blex.h
new file mode 100644
index 00000000000..d0e4cae0678
--- /dev/null
+++ b/storage/xtradb/include/fts0blex.h
@@ -0,0 +1,349 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0brealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0bfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0blex_init (yyscan_t* scanner);
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 73 "fts0blex.l"
+
+
+#line 348 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h
new file mode 100644
index 00000000000..f94112ef4d4
--- /dev/null
+++ b/storage/xtradb/include/fts0fts.h
@@ -0,0 +1,1042 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef fts0fts_h
+#define fts0fts_h
+
+#include "univ.i"
+
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID			0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME		"FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME		"FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN	16
+
+/** Doc ID is a 8 byte value */
+#define FTS_DOC_ID_LEN			8
+
+/** The number of fields to sort when we build FT index with
+FIC. Three fields are sorted: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT		3
+
+/** Maximum number of rows in a table, smaller than which, we will
+optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */
+#define MAX_DOC_ID_OPT_VAL		1073741824
+
+/** Document id type. */
+typedef ib_uint64_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT	IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s)	mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s)	mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL		0
+#define FTS_BOOL	1
+#define FTS_SORTED	2
+#define FTS_EXPAND	4
+#define FTS_PROXIMITY	8
+#define FTS_PHRASE	16
+#define FTS_OPT_RANKING	32
+
+#define FTS_INDEX_TABLE_IND_NAME	"FTS_INDEX_TABLE_IND"
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD		10000000
+
+#define FTS_DOC_ID_MAX_STEP		10000
+/** Variable specifying the FTS parallel sort degree */
+extern ulong		fts_sort_pll_degree;
+
+/** Variable specifying the number of words to optimize for each optimize table
+call */
+extern ulong		fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char		fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float 		fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+	FTS_INSERT = 0,
+	FTS_MODIFY,
+	FTS_DELETE,
+	FTS_NOTHING,
+	FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+	FTS_INDEX_TABLE,		/*!< FTS auxiliary table that is
+					specific to a particular FTS index
+					on a table */
+
+	FTS_COMMON_TABLE		/*!< FTS auxiliary table that is common
+					for all FTS index on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+        (fts_table)->type = m_type;				\
+        (fts_table)->table_id = m_table->id;			\
+        (fts_table)->parent = m_table->name;			\
+        (fts_table)->table = m_table;				\
+} while (0);
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+        (fts_table)->type = m_type;				\
+        (fts_table)->table_id = m_index->table->id;		\
+        (fts_table)->parent = m_index->table->name;		\
+        (fts_table)->table = m_index->table;			\
+        (fts_table)->index_id = m_index->id;			\
+} while (0);
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+	trx_t*		trx;		/*!< InnoDB transaction */
+
+	ib_vector_t*	savepoints;	/*!< Active savepoints, must have at
+					least one element, the implied
+					savepoint */
+	ib_vector_t*	last_stmt;	/*!< last_stmt */
+
+	mem_heap_t*	heap;		/*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+	char*		name;		/*!< First entry is always NULL, the
+					default instance. Otherwise the name
+					of the savepoint */
+
+	ib_rbt_t*	tables;		/*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_t {
+	dict_table_t*	table;		/*!< table */
+
+	fts_trx_t*	fts_trx;	/*!< link to parent */
+
+	ib_rbt_t*	rows;		/*!< rows changed; indexed by doc-id,
+					cells are fts_trx_row_t* */
+
+	fts_doc_ids_t*	added_doc_ids;	/*!< list of added doc ids (NULL until
+					the first addition) */
+
+					/*!< for adding doc ids */
+	que_t*		docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+	doc_id_t	doc_id;		/*!< Id of the ins/upd/del document */
+
+	fts_row_state	state;		/*!< state of the row */
+
+	ib_vector_t*	fts_indexes;	/*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+	ib_vector_t*	doc_ids;	/*!< document ids (each element is
+					of type doc_id_t). */
+
+	ib_alloc_t*	self_heap;	/*!< Allocator used to create an
+					instance of this type and the
+					doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** A UTF-16 or UTF-8 string. */
+struct fts_string_t {
+	byte*		f_str;		/*!< string, not necessarily
+					terminated in any way */
+	ulint		f_len;		/*!< Length of the string in bytes */
+	ulint		f_n_char;	/*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	fts_rank_t	rank;		/*!< Rank is between 0 .. 1 */
+
+	byte*		words;		/*!< this contains the words
+					that were queried
+					and found in this document */
+	ulint		words_len;	/*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+	ib_rbt_node_t*	current;	/*!< Current element */
+
+	ib_rbt_t*	rankings_by_id;	/*!< RB tree of type fts_ranking_t
+					indexed by doc id */
+	ib_rbt_t*	rankings_by_rank;/*!< RB tree of type fts_ranking_t
+					indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name, we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+	const char*	parent;		/*!< Parent table name, this is
+					required only for the database
+					name */
+
+	fts_table_type_t
+			type;		/*!< The auxiliary table type */
+
+	table_id_t	table_id;	/*!< The table id */
+
+	index_id_t	index_id;	/*!< The index id */
+
+	const char*	suffix;		/*!< The suffix of the fts auxiliary
+					table name, can be NULL, not used
+					everywhere (yet) */
+	const dict_table_t*
+			table;		/*!< Parent table */
+	CHARSET_INFO*	charset;	/*!< charset info if it is for FTS
+					index auxiliary table */
+};
+
+enum	fts_status {
+	BG_THREAD_STOP = 1,	 	/*!< TRUE if the FTS background thread
+					has finished reading the ADDED table,
+					meaning more items can be added to
+					the table. */
+
+	BG_THREAD_READY = 2,		/*!< TRUE if the FTS background thread
+					is ready */
+
+	ADD_THREAD_STARTED = 4,		/*!< TRUE if the FTS add thread
+					has started */
+
+	ADDED_TABLE_SYNCED = 8,		/*!< TRUE if the ADDED table record is
+					sync-ed after crash recovery */
+
+	TABLE_DICT_LOCKED = 16		/*!< Set if the table has
+					dict_sys->mutex */
+};
+
+typedef	enum fts_status	fts_status_t;
+
+/** The state of the FTS sub system. */
+struct fts_t {
+					/*!< mutex protecting bg_threads* and
+					fts_add_wq. */
+	ib_mutex_t		bg_threads_mutex;
+
+	ulint		bg_threads;	/*!< number of background threads
+					accessing this table */
+
+					/*!< TRUE if background threads running
+					should stop themselves */
+	ulint		fts_status;	/*!< Status bit regarding fts
+					running state */
+
+	ib_wqueue_t*	add_wq;		/*!< Work queue for scheduling jobs
+					for the FTS 'Add' thread, or NULL
+					if the thread has not yet been
+					created. Each work item is a
+					fts_trx_doc_ids_t*. */
+
+	fts_cache_t*	cache;		/*!< FTS memory buffer for this table,
+					or NULL if the table has no FTS
+					index. */
+
+	ulint		doc_col;	/*!< FTS doc id hidden column number
+					in the CLUSTERED index. */
+
+	ib_vector_t*	indexes;	/*!< Vector of FTS indexes, this is
+					mainly for caching purposes. */
+	mem_heap_t*	fts_heap;	/*!< heap for fts_t allocation */
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT               0x1
+#define STOPWORD_OFF                    0x2
+#define STOPWORD_FROM_DEFAULT           0x4
+#define STOPWORD_USER_TABLE             0x8
+
+extern const char*	fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern ulong		fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern ulong		fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern ulong		fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS token size */
+extern ulong		fts_max_token_size;
+
+/** Variable specifying the minimum FTS token size */
+extern ulong		fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool		fts_need_sync;
+
+/** Maximum possible Fulltext word length */
+#define FTS_MAX_WORD_LEN		HA_FT_MAXBYTELEN
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR	HA_FT_MAXCHARLEN
+
+/** Variable specifying the table that has Fulltext index to display its
+content through information schema table */
+extern char*		fts_internal_tbl_name;
+
+#define	fts_que_graph_free(graph)			\
+do {							\
+	mutex_enter(&dict_sys->mutex);			\
+	que_graph_free(graph);				\
+	mutex_exit(&dict_sys->mutex);			\
+} while (0)
+
+/******************************************************************//**
+Create a FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table);			/*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,			/*!< in: table with FTS index */
+	dict_index_t*	index);			/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,	/*!< in: table */
+	doc_id_t*		doc_id)	/*!< out: new document id */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We would do so after each FTS index build or
+table truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+	trx_t*			trx,		/*!< in/out: transaction */
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name, or NULL */
+	doc_id_t		doc_id)		/*!< in: DOC ID to set */
+	__attribute__((nonnull(2)));
+
+/******************************************************************//**
+Create a new document id.
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+dberr_t
+fts_create_doc_id(
+/*==============*/
+	dict_table_t*	table,			/*!< in: row is of this
+						table. */
+	dtuple_t*	row,			/*!< in/out: add doc id
+						value to this row. This is the
+						current row that is being
+						inserted. */
+	mem_heap_t*	heap)			/*!< in: heap */
+	__attribute__((nonnull));
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/******************************************************************//**
+Free a fts_doc_ids_t. */
+UNIV_INTERN
+void
+fts_doc_ids_free(
+/*=============*/
+	fts_doc_ids_t*	doc_ids);		/*!< in: doc_ids to free */
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,			/*!< in: InnoDB transaction */
+	dict_table_t*	table,			/*!< in: table */
+	doc_id_t	doc_id,			/*!< in: doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected
+						(NULL=all) */
+	__attribute__((nonnull(1,2)));
+
+/******************************************************************//**
+Free an FTS trx. */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx);		/*!< in, own: FTS trx */
+
+/******************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been
+called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_common_tables(
+/*=====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const dict_table_t*
+			table,			/*!< in: table with one FTS
+						index */
+	const char*	name,			/*!< in: table name */
+	bool		skip_doc_id_index)	/*!< in: Skip index on doc id */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Wrapper function of fts_create_index_tables_low(), create auxiliary
+tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables(
+/*====================*/
+	trx_t*			trx,		/*!< in: transaction handle */
+	const dict_index_t*	index)		/*!< in: the FTS index
+						instance */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables_low(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const dict_index_t*
+			index,			/*!< in: the FTS index
+						instance */
+	const char*	table_name,		/*!< in: the table name */
+	table_id_t	table_id)		/*!< in: the table id */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap)	/*!< in: temporary memory heap, or NULL */
+	__attribute__((nonnull(1)));
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on the
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_tables(
+/*============*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table)			/*!< in: table has the FTS
+						index */
+	__attribute__((nonnull));
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_commit(
+/*=======*/
+	trx_t*		trx)			/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+FTS Query entry point.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+dberr_t
+fts_query(
+/*======*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index to search */
+	uint		flags,			/*!< in: FTS search mode */
+	const byte*	query,			/*!< in: FTS query */
+	ulint		query_len,		/*!< in: FTS query string len
+						in bytes */
+	fts_result_t**	result)			/*!< out: query result, to be
+						freed by the caller.*/
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+UNIV_INTERN
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,			/*!< in: FTS result structure */
+	doc_id_t	doc_id);		/*!< in: the interested document
+						doc_id */
+
+/******************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+UNIV_INTERN
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result);		/*!< out: result instance
+						to sort.*/
+
+/******************************************************************//**
+FTS Query free result, returned by fts_query(). */
+UNIV_INTERN
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result);		/*!< in: result instance
+						to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	dtuple_t*	row);			/*!< in: row whose FTS doc id we
+						want to extract.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	const rec_t*	rec,			/*!< in: rec */
+	mem_heap_t*	heap);			/*!< in: heap */
+
+/******************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+	dict_table_t*	table,			/*!< in: table */
+	upd_field_t*	ufield,			/*!< out: update node */
+	doc_id_t*	next_doc_id);		/*!< out: buffer for writing */
+
+/******************************************************************//**
+FTS initialize. */
+UNIV_INTERN
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+	dict_table_t*	table,			/*!< in: table with FTS
+						indexes */
+	fts_t*		fts);			/*!< in: fts instance to
+						shutdown */
+
+/******************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+	dict_table_t*	table,			/*!< in: table with FTS
+						indexes */
+	fts_t*		fts);			/*!< in: fts instance to
+						shutdown */
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);			/*!< out: table with FTS
+						indexes */
+
+/**********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+	dict_table_t*   table);			/*!< in/out: table with
+						FTS indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)			/*!< in: table to optimize */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void);
+/*====================*/
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if the optimize queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void);
+/*======================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index)			/*!< in: Index to drop */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);			/*!< in: table to remove */
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void);
+/*==============================*/
+
+/**********************************************************************//**
+Inform optimize to clean up. */
+UNIV_INTERN
+void
+fts_optimize_end(void);
+/*===================*/
+
+/**********************************************************************//**
+Take a FTS savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name)			/*!< in: savepoint name */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Refresh last statement savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx)			/*!< in: transaction */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+	fts_cache_t*	cache);			/*!< in: cache*/
+
+/*********************************************************************//**
+Clear cache. If the shutdown flag is TRUE then the cache can contain
+data that needs to be freed. For regular clear as part of normal
+working we assume the caller has freed all resources. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	ibool		free_words);		/*!< in: TRUE if free
+						in memory word cache. */
+
+/*********************************************************************//**
+Initialize things in cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);			/*!< in: cache */
+
+/*********************************************************************//**
+Rollback to and including savepoint identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback to the last statement savepoint (no savepoint name is taken). */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/***********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void);
+/*==========================*/
+
+/******************************************************************//**
+Since we do a horizontal split on the index table, we need to drop
+all the split tables.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index)			/*!< in: fts instance */
+	__attribute__((nonnull, warn_unused_result));
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end. */
+UNIV_INTERN
+void
+fts_sync_table(
+/*===========*/
+	dict_table_t*	table)			/*!< in: table */
+	__attribute__((nonnull));
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys->mutex is already
+held */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+	fts_table_t*		fts_table,	/*!< in: FTS table */
+	const fts_index_cache_t*index_cache,	/*!< in: FTS index cache */
+	que_t*			graph);		/*!< in: query graph */
+
+/****************************************************************//**
+Get the character set (CHARSET_INFO) of the given FTS index. */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*		index);		/*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*		table);	/*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+        CHARSET_INFO*	cs,			/*!< in: Character set */
+	char*		src,			/*!< in: string to put in
+						lower case */
+	size_t		src_len,		/*!< in: input string length */
+	char*		dst,			/*!< out: buffer for result
+						string */
+	size_t		dst_len);		/*!< in: buffer size */
+
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,		/*!< in: Character set */
+	const byte*	start,			/*!< in: start of text */
+	const byte*	end,			/*!< in: one character past
+						end of text */
+	fts_string_t*	token,			/*!< out: token's text */
+	ulint*		offset);		/*!< out: offset to token,
+						measured as characters from
+						'start' */
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table);		/*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table);			/*!< in: user table */
+
+/******************************************************************//**
+Check whether user supplied stopword table exists and is of
+the right format.
+@return the stopword column charset if qualifies */
+UNIV_INTERN
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+	const char*	stopword_table_name);	/*!< in: Stopword table
+						name */
+/****************************************************************//**
+This function loads specified stopword into FTS cache
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction */
+	const char*	global_stopword_table,	/*!< in: Global stopword table
+						name */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	ibool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	ibool		reload);		/*!< in: Whether it is during
+						reload of FTS table */
+
+/****************************************************************//**
+Create the vector of fts_get_doc_t instances.
+@return vector of fts_get_doc_t instances */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+	fts_cache_t*	cache);			/*!< in: fts cache */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: aux table */
+	fts_doc_ids_t*	doc_ids);		/*!< in: For collecting
+						doc ids */
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There may be documents that were not yet synced to the auxiliary
+tables before the last abnormal server shutdown; we need to bring
+such documents into the FTS cache before any further operations
+@return TRUE if all OK */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,			/*!< in: Table with FTS */
+	ibool		has_cache_lock);	/*!< in: Whether we already
+						have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,			/*!< FTS index to be added */
+	dict_table_t*	table);			/*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx)	/*!< in: Transaction for the drop */
+	__attribute__((nonnull));
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+	dict_table_t*	table,		/*!< in: user Table */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx);		/*!< in: transaction */
+
+/*******************************************************************//**
+Check indexes in the fts->indexes is also present in index cache and
+table->indexes list
+@return TRUE if all indexes match */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table);  /*!< in: Table where indexes are dropped */
+#endif /*!< fts0fts.h */
+
diff --git a/storage/xtradb/include/fts0opt.h b/storage/xtradb/include/fts0opt.h
new file mode 100644
index 00000000000..92eaf8270d2
--- /dev/null
+++ b/storage/xtradb/include/fts0opt.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+                                        /* out: always returns non-NULL */
+        void*           row,		/* in: sel_node_t* */
+        void*           user_arg);	/* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/xtradb/include/fts0pars.h b/storage/xtradb/include/fts0pars.h
new file mode 100644
index 00000000000..50f636944e5
--- /dev/null
+++ b/storage/xtradb/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5.  */
+
+/* Bison interface for Yacc-like parsers in C
+   
+      Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+   
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+   
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 2068 of yacc.c  */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/xtradb/include/fts0priv.h b/storage/xtradb/include/fts0priv.h
new file mode 100644
index 00000000000..c6aca27f6ec
--- /dev/null
+++ b/storage/xtradb/include/fts0priv.h
@@ -0,0 +1,650 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/** The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/* NOTE: RUNNING must be 0 since we
+					insert a hard coded '0' at create
+					time to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microseconds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT		10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT	1000
+
+/** The maximum length of a config table parameter name in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN			64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN		1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE			(64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS	"optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID		"synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD		"last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT		"deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT		"total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME		"optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME		"optimize_end_time"
+
+/** User specified stopword table name */
+#define	FTS_STOPWORD_TABLE_NAME		"stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define	FTS_USE_STOPWORD		"use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE			"table_state"
+
+/** The minimum length of an FTS auxiliary table name's id component
+e.g., For an auxiliary table name
+
+	FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	__attribute__((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Parsed statement */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Construct the name of an ancillary FTS table for the given table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+	const fts_table_t*
+			fts_table)	/*!< in: FTS aux table info */
+	__attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column specification string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: FTS index */
+	pars_info_t*	info,		/*!< in/out: parser info */
+	mem_heap_t*	heap)		/*!< in: memory heap */
+	__attribute__((nonnull, warn_unused_result));
+
+/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether
+we want to get Doc whose ID is equal to or greater or smaller than supplied
+ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+UNIV_INTERN
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index,
+					or NULL */
+	ulint		option,         /*!< in: search option, one of the
+                                        FTS_FETCH_DOC_BY_ID_* values */
+	fts_sql_callback
+			callback,	/*!< in: callback to read
+					records */
+	void*		arg)		/*!< in: callback arg, not NULL */
+	__attribute__((nonnull(6)));
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: fts_doc_t* */
+	__attribute__((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fts_write_node(
+/*===========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: query graph */
+	fts_table_t*	fts_table,	/*!< in: the FTS aux index */
+	fts_string_t*	word,		/*!< in: word in UTF-8 */
+	fts_node_t*	node)		/*!< in: node columns */
+	__attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	fts_doc_t*	result)		/*!< out: if provided, save
+					result tokens here */
+	__attribute__((nonnull(1)));
+
+/*******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	ulint		add_pos,	/*!< in: add this position to all
+					tokens from this tokenization */
+	fts_doc_t*	result)		/*!< out: if provided, save
+					result tokens here */
+	__attribute__((nonnull(1)));
+/******************************************************************//**
+Initialize a document. */
+UNIV_INTERN
+void
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)		/*!< in: doc to initialize */
+	__attribute__((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array.
+@return +ve index if found, -ve index where it should be
+        inserted if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+	fts_update_t*	array,		/*!< in: array to search */
+	int		lower,		/*!< in: lower bound of array */
+	int		upper,		/*!< in: upper bound of array */
+	doc_id_t	doc_id)		/*!< in: doc id to lookup */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)		/*!< in: document */
+	__attribute__((nonnull));
+/******************************************************************//**
+Free fts_optimizer_word_t instance. */
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)		/*!< in: instance to free.*/
+	__attribute__((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+	__attribute__((nonnull));
+/******************************************************************//**
+Create a fts_optimizer_word_t instance.
+@return new instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,		/*!< in: word to initialize */
+	byte*		utf8,		/*!< in: UTF-8 string */
+	ulint		len)		/*!< in: length of string in bytes */
+	__attribute__((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances, we actually compare the
+table id's here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	v1,		/*!< in: id1 */
+	const void*	v2)		/*!< in: id2 */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_commit(
+/*===========*/
+	trx_t*		trx)		/*!< in: transaction */
+	__attribute__((nonnull));
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_rollback(
+/*=============*/
+	trx_t*		trx)		/*!< in: transaction */
+	__attribute__((nonnull));
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id. Don't acquire
+the dict mutex
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+	fts_table_t*	fts_table,	/*!< in: table with FTS index */
+	pars_info_t*	info,		/*!< in: parser info */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	__attribute__((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	__attribute__((nonnull));
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+	const fts_string_t*
+			value)		/*!< in: value to update */
+	__attribute__((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to set in the
+					config table */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: increment config value
+					for this parameter name */
+	ulint		delta)		/*!< in: increment by this much */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_index_value(
+/*=============================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: increment config value
+					for this parameter name */
+	ulint		delta)		/*!< in: increment by this much */
+	__attribute__((nonnull));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	__attribute__((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*
+			index_cache,	/*!< in: cache to search */
+	const fts_string_t*
+			text)		/*!< in: word to search for */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to search */
+	doc_id_t	doc_id)		/*!< in: doc id to search for */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to use */
+	ib_vector_t*	vector);	/*!< in: append to this vector */
+/******************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return true if the thread started else FALSE (i.e timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+	dict_table_t*	table,		/*!< in: table to which the thread
+					is attached */
+	ulint		max_wait);	/*!< in: time in microseconds, if set
+					to 0 then it disables timeout
+					checking */
+#ifdef FTS_DOC_STATS_DEBUG
+/******************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: for this index */
+	ulint*		total)		/*!< out: total words */
+	__attribute__((nonnull, warn_unused_result));
+#endif
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to search */
+	const dict_index_t*
+			index)		/*!< in: index to search for */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return	number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,		/*!< in: a table/index id */
+	char*		str)		/*!< in: buffer to write the id to */
+	__attribute__((nonnull));
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/*!< out: a table id */
+	const char*	str)		/*!< in: buffer to read from */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+	const fts_table_t*
+			fts_table,	/*!< in: FTS Auxiliary table */
+	char*		table_id)	/*!< out: table id, must be at least
+					FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+					long */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the table to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+	dict_table_t*	table)		/*!< in: table to add */
+	__attribute__((nonnull));
+/******************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+	dict_table_t*	table)		/*!< in: table to optimize */
+	__attribute__((nonnull));
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+	const fts_table_t*
+			fts_table)	/*!< in: Auxiliary table type */
+	__attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	fts_node_t*	node,		/*!< in: word node */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	positions)	/*!< in: fts_token_t::positions */
+	__attribute__((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+	const char*		param,		/*!< in: base name of param */
+	const dict_index_t*	index)		/*!< in: index for config */
+	__attribute__((nonnull, malloc, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "fts0priv.ic"
+#endif
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/xtradb/include/fts0priv.ic b/storage/xtradb/include/fts0priv.ic
new file mode 100644
index 00000000000..268bb7e2227
--- /dev/null
+++ b/storage/xtradb/include/fts0priv.ic
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table/index id to the given buffer with sprintf(), which also stores
+the final NUL. Buffer must be at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return	number of characters written, not counting the final NUL */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,		/*!< in: a table/index id */
+	char*		str)		/*!< out: buffer to write the id to */
+{
+	/* FIXME: sprintf() is unbounded here; use ut_snprintf() instead. */
+	return(sprintf(str, UINT64PFx, id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return	TRUE if sscanf() converted exactly one value, else FALSE */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/*!< out: the parsed id */
+	const char*	str)		/*!< in: NUL-terminated string to parse */
+{
+	return(sscanf(str, UINT64PFx, id) == 1);
+}
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances by the id of the table they refer to.
+@return -1 if id1 < id2, 0 if id1 == id2, 1 if id1 > id2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	p1,			/*!< in: fts_trx_table_t** */
+	const void*	p2)			/*!< in: fts_trx_table_t** */
+{
+	const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table;
+	const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+	return((table1->id > table2->id)
+	       ? 1
+	       : (table1->id == table2->id)
+		  ? 0
+		  : -1);
+}
+
+/******************************************************************//**
+Compare a table id with the id of the table held by a fts_trx_table_t.
+@return -1 if id1 < id2, 0 if id1 == id2, 1 if id1 > id2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,			/*!< in: table id (ullint*) */
+	const void*	p2)			/*!< in: fts_trx_table_t** */
+{
+	const ullint* table_id = (const ullint*) p1;
+	const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+	return((*table_id > table2->id)
+	       ? 1
+	       : (*table_id == table2->id)
+		  ? 0
+		  : -1);
+}
diff --git a/storage/xtradb/include/fts0tlex.h b/storage/xtradb/include/fts0tlex.h
new file mode 100644
index 00000000000..f91533803e8
--- /dev/null
+++ b/storage/xtradb/include/fts0tlex.h
@@ -0,0 +1,349 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0trealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0tfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 68 "fts0tlex.l"
+
+
+#line 348 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/xtradb/include/fts0types.h b/storage/xtradb/include/fts0types.h
new file mode 100644
index 00000000000..b714d326487
--- /dev/null
+++ b/storage/xtradb/include/fts0types.h
@@ -0,0 +1,473 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "que0types.h"
+#include "ut0byte.h"
+#include "fut0fut.h"
+#include "ut0rbt.h"
+#include "fts0fts.h"
+
+/** Forward declarations of types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+struct fts_utf8_str_t;
+
+/** Callback types used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Per-document retrieval statistics; see fts_index_cache_t::doc_stats. */
+struct fts_doc_stats_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		word_count;	/*!< Total words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+	fts_index_cache_t*
+			index_cache;	/*!< The index cache instance */
+
+					/** Parsed sql statement */
+	que_t*		get_document_graph;
+	fts_cache_t*	cache;		/*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per-index cache of words etc. */
+struct fts_index_cache_t {
+	dict_index_t*	index;		/*!< The FTS index instance */
+
+	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
+					cells are fts_tokenizer_word_t*. */
+
+	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
+					contained in the memory buffer.
+					Must be in sorted order (ascending).
+					The ideal choice is an rb tree but
+					the rb tree imposes a space overhead
+					that we can do without */
+
+	que_t**		ins_graph;	/*!< Insert query graphs */
+
+	que_t**		sel_graph;	/*!< Select query graphs */
+	CHARSET_INFO*	charset;	/*!< charset */
+};
+
+/** For supporting the tracking of updates on multiple FTS indexes we need
+to track which FTS indexes need to be updated. For INSERT and DELETE we
+update all FTS indexes. */
+struct fts_update_t {
+	doc_id_t	doc_id;		/*!< The doc id affected */
+
+	ib_vector_t*	fts_indexes;	/*!< The FTS indexes that need to be
+					updated. A NULL value means all
+					indexes need to be updated. This
+					vector is not allocated on the heap
+					and so must be freed explicitly,
+					when we are done with it */
+};
+
+/** Stop word control information. */
+struct fts_stopword_t {
+	ulint		status;		/*!< Status of the stopword tree */
+	ib_alloc_t*	heap;		/*!< The memory allocator to use */
+	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
+	CHARSET_INFO*	charset;	/*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+	trx_t*		trx;		/*!< The transaction used for SYNCing
+					the cache to disk */
+	dict_table_t*	table;		/*!< Table with FTS index(es) */
+	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
+	ibool		cache_full;	/*!< flag, when true it indicates that
+					we need to sync the cache to disk */
+	ulint		lower_index;	/*!< the start index of the doc id
+					vector from where to start adding
+					documents to the FTS cache */
+	ulint		upper_index;	/*!< max index of the doc id vector to
+					add to the FTS cache */
+	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should be equal to
+					doc_ids[lower_index] */
+	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
+					noted as full; used to set upper_limit
+					(presumably upper_index, confirm) */
+        ib_time_t	start_time;	/*!< SYNC start time */
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t {
+	rw_lock_t	lock;		/*!< lock protecting all access to the
+					memory buffer. FIXME: this needs to
+					be our new upgrade-capable rw-lock */
+
+	rw_lock_t	init_lock;	/*!< lock used for the cache
+					initialization, it has different
+					SYNC level as above cache lock */
+
+	ib_mutex_t	optimize_lock;	/*!< Lock for OPTIMIZE */
+
+	ib_mutex_t	deleted_lock;	/*!< Lock covering deleted_doc_ids */
+
+	ib_mutex_t	doc_id_lock;	/*!< Lock covering Doc ID */
+
+	ib_vector_t*	deleted_doc_ids;/*!< Array of deleted doc ids, each
+					element is of type fts_update_t */
+
+	ib_vector_t*	indexes;	/*!< We store the stats and inverted
+					index for the individual FTS indexes
+					in this vector. Each element is
+					an instance of fts_index_cache_t */
+
+	ib_vector_t*	get_docs;	/*!< information required to read
+					the document from the table. Elements:
+					fts_doc_t (fts_get_doc_t? confirm) */
+
+	ulint		total_size;	/*!< total size consumed by the ilist
+					field of all nodes. SYNC is run
+					whenever this gets too big */
+	fts_sync_t*	sync;		/*!< sync structure to sync data to
+					disk */
+	ib_alloc_t*	sync_heap;	/*!< The heap allocator, for indexes
+					and deleted_doc_ids, i.e. transient
+					objects, they are recreated after
+					a SYNC is completed */
+
+
+	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
+					which an instance of the cache itself
+					was created. Objects created using
+					this heap will last for the lifetime
+					of the cache */
+
+	doc_id_t	next_doc_id;	/*!< Next doc id */
+
+	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */
+
+	doc_id_t	first_doc_id;	/*!< first doc id since this table
+					was opened */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since
+					last optimized. This variable is
+					covered by deleted_lock */
+
+	ulint		added;		/*!< Number of doc ids added since last
+					optimized. This variable is covered by
+					deleted_lock */
+
+	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */
+	mem_heap_t*	cache_heap;	/*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table; one node owns one ilist. */
+struct fts_node_t {
+	doc_id_t	first_doc_id;	/*!< First document id in ilist. */
+
+	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */
+
+	byte*		ilist;		/*!< Binary list of documents & word
+					positions the token appears in.
+					TODO: For now, these are simply
+					ut_malloc'd, but if testing shows
+					that they waste memory unacceptably, a
+					special memory allocator will have
+					to be written */
+
+	ulint		doc_count;	/*!< Number of doc ids in ilist */
+
+	ulint		ilist_size;	/*!< Used size of ilist in bytes. */
+
+	ulint		ilist_size_alloc;
+					/*!< Allocated size of ilist in
+					bytes */
+};
+
+/** A tokenizer word: the cell type of fts_index_cache_t::words. */
+struct fts_tokenizer_word_t {
+	fts_string_t	text;		/*!< Token text. */
+
+	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
+					of type fts_node_t */
+};
+
+/** Word text plus its array of nodes as on disk in FTS index */
+struct fts_word_t {
+	fts_string_t	text;		/*!< Word value in UTF-8 */
+	ib_vector_t*	nodes;		/*!< Nodes read from disk */
+
+	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+	void*		read_arg;	/*!< Arg for the read_record callback */
+
+	fts_sql_callback
+			read_record;	/*!< Callback for reading index
+					record */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+	ulint		value;		/*!< Character value at which to split;
+					0 ends the fts_index_selector[] table */
+
+	const char*	suffix;		/*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+	fts_string_t	text;		/*!< Document text */
+
+	ibool		found;		/*!< TRUE if the document was found
+					successfully in the database */
+
+	ib_rbt_t*	tokens;		/*!< This is filled when the document
+					is tokenized. Tokens; indexed by
+					fts_string_t*, cells are of type
+					fts_token_t* */
+
+	ib_alloc_t*	self_heap;	/*!< An instance of this type is
+					allocated from this heap along
+					with any objects that have the
+					same lifespan, most notably
+					the vector of token positions */
+	CHARSET_INFO*	charset;	/*!< Document's charset info */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+	fts_string_t	text;		/*!< Token text */
+
+	ib_vector_t*	positions;	/*!< an array of the positions the
+					token is found in; each item is
+					actually a ulint. */
+};
+
+/** Selector table ending in a value == 0 entry; defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two UTF-8 strings. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+						/*!< out:
+						< 0 if p1 < p2,
+						0 if p1 == p2,
+						> 0 if p1 > p2 */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+						/*!< out:
+						< 0 if p1 < p2,
+						0 if p1 equals/prefixes p2,
+						> 0 if p1 > p2 */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if p1 < p2,
+						0 if p1 == p2,
+						> 0 if p1 > p2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if p1 < p2,
+						0 if p1 == p2,
+						> 0 if p1 > p2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_update_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+						/*!< out:
+						< 0 if p1 < p2,
+						0 if p1 == p2,
+						> 0 if p1 > p2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.*/
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+			/*!< out: value decoded */
+	byte**	ptr);	/*!< in/out: ptr to decode from, this ptr is
+			incremented by the number of bytes decoded */
+
+/******************************************************************//**
+Duplicate an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+						/*!< out: nothing; dst->f_str
+						is allocated from heap and
+						NUL-terminated, f_len and
+						f_n_char are copied from src */
+	fts_string_t*		dst,		/*!< out: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap);		/*!< in: heap to use */
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+						/*!< out: length of value
+						 encoded, in bytes */
+	ulint		val);			/*!< in: value to encode */
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes. */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+						/*!< out: length of value
+						encoded, in bytes */
+	ulint		val,			/*!< in: value to encode */
+	byte*		buf);			/*!< out: buffer, must have
+						enough space */
+
+/******************************************************************//**
+Decode a UTF-8 character.
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx          0xxxxxxx
+00000yyy yyxxxxxx          110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits). No length
+is passed in, so the bytes read are not bounds-checked; overlong forms
+are rejected only in UNIV_DEBUG builds (see fts0types.ic).
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+						/*!< out: UTF8_ERROR if *ptr
+						did not point to a valid
+						UTF-8 sequence, or the
+						Unicode code point. */
+	const byte**	ptr);			/*!< in/out: pointer to
+						UTF-8 string. The
+						pointer is advanced to
+						the start of the next
+						character. */
+
+/******************************************************************//**
+Lowercase an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+	fts_string_t*	str);			/*!< in: string */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected);		/*!< in: selected index */
+
+/******************************************************************//**
+Get the number of index selectors. */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void);
+/*=====================*/
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given string.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+	const CHARSET_INFO*	cs,		/*!< in: Charset */
+	const byte*		str,		/*!< in: word string */
+	ulint			len);		/*!< in: string length */
+
+/******************************************************************//**
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+	const CHARSET_INFO*	cs,		/*!< in: Charset */
+	const byte*		str,		/*!< in: string */
+	ulint			len);		/*!< in: string length */
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/xtradb/include/fts0types.ic b/storage/xtradb/include/fts0types.ic
new file mode 100644
index 00000000000..f0dfd023a70
--- /dev/null
+++ b/storage/xtradb/include/fts0types.ic
@@ -0,0 +1,388 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.ic
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+#include <ctype.h>
+
+#include "rem0cmp.h"
+#include "ha_prototypes.h"
+
+extern const ulint UTF8_ERROR;
+
+/* True if b has the bit pattern 10xxxxxx of a UTF-8 continuation byte. */
+#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80)
+
+/******************************************************************//**
+Duplicate an UTF-8 string; the copy lives in heap and is NUL-terminated. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+	fts_string_t*		dst,		/*!< out: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap)		/*!< in: heap to use */
+{
+	byte*	copy = (byte*) mem_heap_alloc(heap, src->f_len + 1);
+
+	memcpy(copy, src->f_str, src->f_len);
+	copy[src->f_len] = 0;	/* the extra byte holds the NUL */
+	dst->f_str = copy;
+	dst->f_len = src->f_len;
+	dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids without narrowing their difference.
+@return < 0 if p1 < p2, 0 if p1 == p2, > 0 if p1 > p2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: fts_trx_row_t 1 */
+	const void*	p2)			/*!< in: fts_trx_row_t 2 */
+{
+	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
+	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
+	/* (int)(id1 - id2) can lose or flip the sign; compare directly. */
+	return((tr1->doc_id > tr2->doc_id) - (tr1->doc_id < tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids without narrowing their difference.
+@return < 0 if p1 < p2, 0 if p1 == p2, > 0 if p1 > p2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: fts_ranking_t 1 */
+	const void*	p2)			/*!< in: fts_ranking_t 2 */
+{
+	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
+	/* (int)(id1 - id2) can lose or flip the sign; compare directly. */
+	return((rk1->doc_id > rk2->doc_id) - (rk1->doc_id < rk2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_update_t doc_ids without narrowing their difference.
+@return < 0 if p1 < p2, 0 if p1 == p2, > 0 if p1 > p2 */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+	const void*	p1,			/*!< in: fts_update_t 1 */
+	const void*	p2)			/*!< in: fts_update_t 2 */
+{
+	const fts_update_t*	up1 = (const fts_update_t*) p1;
+	const fts_update_t*	up2 = (const fts_update_t*) p2;
+	/* (int)(id1 - id2) can lose or flip the sign; compare directly. */
+	return((up1->doc_id > up2->doc_id) - (up1->doc_id < up2->doc_id));
+}
+
+
+/******************************************************************//**
+Lowercase an UTF-8 string; only str->f_str is handed to the helper. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+	fts_string_t*	str)			/*!< in: string */
+{
+	innobase_casedn_str((char*) str->f_str);
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings with cmp_data_data_slow_varchar().
+@return < 0 if p1 < p2, 0 if p1 == p2, > 0 if p1 > p2 */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+	const void*	p1,			/*!< in: key */
+	const void*	p2)			/*!< in: node */
+{
+	const fts_string_t*	key = (const fts_string_t*) p1;
+	const fts_string_t*	node = (const fts_string_t*) p2;
+	const int		order = cmp_data_data_slow_varchar(
+		key->f_str, key->f_len, node->f_str, node->f_len);
+	return(order);
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings over their common length only, so that a
+"key" that equals or is a prefix of the "node" value is a match.
+@return 0 on such a match, 1 if key is longer than an otherwise equal
+node, else the nonzero result of comparing the common-length parts */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+	const void*	p1,			/*!< in: key */
+	const void*	p2)			/*!< in: node */
+{
+	const fts_string_t*	key = (const fts_string_t*) p1;
+	const fts_string_t*	node = (const fts_string_t*) p2;
+
+	/* Look only at as many bytes as both strings have. */
+	const ulint		common = ut_min(key->f_len, node->f_len);
+
+	const int		diff = cmp_data_data_slow_varchar(
+		key->f_str, common, node->f_str, common);
+
+	if (diff != 0) {
+		/* The strings differ within the common length. */
+		return(diff);
+	}
+
+	/* Equal over the common length: the key matches unless it is
+	longer than the node, in which case 1 is returned. */
+
+	return(key->f_len > node->f_len ? 1 : 0);
+}
+
+/******************************************************************//**
+Decode a UTF-8 character.
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx          0xxxxxxx
+00000yyy yyxxxxxx          110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits). Overlong
+forms (ch < min_ch) are rejected only in UNIV_DEBUG builds. There is no
+length argument, so no bounds are checked on the bytes read.
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input.
+@return UTF8_ERROR if *ptr did not point to a valid
+UTF-8 sequence, or the Unicode code point. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+	const byte**	ptr)			/*!< in/out: pointer to
+						UTF-8 string. The
+						pointer is advanced to
+						the start of the next
+						character. */
+{
+	const byte*	p = *ptr;
+	ulint		ch = *p++;
+#ifdef UNIV_DEBUG
+	ulint		min_ch;
+#endif /* UNIV_DEBUG */
+
+	if (UNIV_LIKELY(ch < 0x80)) {
+		/* 0xxxxxxx */
+	} else if (UNIV_UNLIKELY(ch < 0xC0)) {
+		/* A continuation byte cannot start a code. */
+		goto err_exit;
+	} else if (ch < 0xE0) {
+		/* 110yyyyy 10xxxxxx */
+		ch &= 0x1F;
+		ut_d(min_ch = 0x80);
+		goto get1;
+	} else if (ch < 0xF0) {
+		/* 1110zzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x0F;
+		ut_d(min_ch = 0x800);
+		goto get2;
+	} else if (ch < 0xF8) {
+		/* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x07;
+		ut_d(min_ch = 0x10000);
+		goto get3;
+	} else if (ch < 0xFC) {
+		/* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x03;
+		ut_d(min_ch = 0x200000);
+		goto get4;
+	} else if (ch < 0xFE) {
+		/* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ut_d(min_ch = 0x4000000);
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get4:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get3:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get2:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get1:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+
+		/* The following is needed in the 6-byte case
+		when ulint is wider than 32 bits. */
+		ch &= 0xFFFFFFFF;
+
+		/* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
+		and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
+
+		if ( (ch >= 0xD800 && ch <= 0xDFFF)
+#ifdef UNIV_DEBUG
+		     || ch < min_ch
+#endif /* UNIV_DEBUG */
+		     || ch == 0xFFFE || ch == 0xFFFF) {
+
+			ch = UTF8_ERROR;
+		}
+	} else {
+err_exit:
+		ch = UTF8_ERROR;
+	}
+
+	*ptr = p;
+
+	return(ch);
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+        const CHARSET_INFO*	cs,	/*!< in: Character set */
+        const uchar*		p2,	/*!< in: string */
+        const ulint		len2);	/*!< in: length of p2 */
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given word, based on the value
+that innobase_strnxfrm() returns for it.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+	const CHARSET_INFO*	cs,	/*!< in: Charset */
+	const byte*		str,	/*!< in: string */
+	ulint			len)	/*!< in: string length */
+{
+	const ulint	code = innobase_strnxfrm(cs, str, len);
+	ulint		pos;
+
+	/* Walk the selector table up to its terminating zero entry. */
+	for (pos = 0; fts_index_selector[pos].value != 0; ++pos) {
+		const ulint	bound = fts_index_selector[pos].value;
+
+		if (bound == code) {
+			return(pos);
+		}
+
+		if (bound > code) {
+			/* Overshot: fall back to the preceding entry. */
+			return(pos > 0 ? pos - 1 : 0);
+		}
+	}
+
+	ut_ad(pos > 1);
+	return(pos - 1);
+}
+
+/******************************************************************//**
+Select the next FTS auxiliary index for the given string.
+@return the index after an exact selector match, else that of the first
+larger selector, or the number of selectors if none is larger */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+	const CHARSET_INFO*	cs,	/*!< in: Charset */
+	const byte*		str,	/*!< in: string */
+	ulint			len)	/*!< in: string length */
+{
+	const ulint	code = innobase_strnxfrm(cs, str, len);
+	ulint		pos;
+
+	/* Stop at the first selector that is not below code. */
+	for (pos = 0; fts_index_selector[pos].value != 0; ++pos) {
+		const ulint	bound = fts_index_selector[pos].value;
+
+		if (bound == code) {
+			/* Exact hit: the next index is the one after it. */
+			return(pos + 1);
+		}
+
+		if (bound > code) {
+			return(pos);
+		}
+	}
+
+	ut_ad(pos > 0);
+	return(pos);
+}
+
+/******************************************************************//**
+Return the suffix member of fts_index_selector[selected] (no range check). */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected)	/*!< in: selected index */
+{
+	return(fts_index_selector[selected].suffix);
+}
+
+/******************************************************************//**
+Count the fts_index_selector[] entries in front of its terminator.
+@return the number of index selectors */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void)
+/*=====================*/
+{
+	ulint	n_selectors = 0;
+
+	/* FIXME: This is a hack; the table is rescanned on each call
+	until the entry whose value is 0 is reached. */
+	for (; fts_index_selector[n_selectors].value != 0; ++n_selectors) {
+	}
+
+	return(n_selectors);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/xtradb/include/fts0vlc.ic b/storage/xtradb/include/fts0vlc.ic
new file mode 100644
index 00000000000..e79bcf59347
--- /dev/null
+++ b/storage/xtradb/include/fts0vlc.ic
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0vlc.ic
+Full text variable length integer encoding/decoding.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0VLC_IC
+#define INNOBASE_FTS0VLC_IC
+
+#include "fts0types.h"
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme, i.e. the
+number of 7-bit groups needed to hold it, capped at five.
+FIXME: We will need to be able encode 8 bytes value
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+	ulint	val)	/* in: value to encode */
+{
+	ulint	n_bytes;
+
+	/* n_bytes bytes suffice if no bit is set above 7 * n_bytes. */
+	for (n_bytes = 1; n_bytes < 5; ++n_bytes) {
+		if ((val >> (7 * n_bytes)) == 0) {
+			return(n_bytes);
+		}
+	}
+
+	/* Possibly we should care that on 64-bit machines ulint can
+	contain values that we can't encode in 5 bytes, but
+	fts_encode_int doesn't handle them either so it doesn't much
+	matter. */
+
+	return(5);
+}
+
+/******************************************************************//**
+Encode an integer using our VLC scheme, most significant 7-bit group
+first, and return the length in bytes.
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+	ulint	val,	/* in: value to encode */
+	byte*	buf)	/* in: buffer, must have enough space */
+{
+	ulint	len;
+	ulint	shift;
+
+	/* Work out how many bytes the value occupies; the limits are
+	the largest values that fit in 7, 14, 21 and 28 bits. */
+
+	if (val <= 127) {
+		len = 1;
+	} else if (val <= 16383) {
+		len = 2;
+	} else if (val <= 2097151) {
+		len = 3;
+	} else if (val <= 268435455) {
+		len = 4;
+	} else {
+		/* Best to keep the limitations of the 32/64 bit versions
+		identical, at least for the time being. */
+		ut_ad(val <= 4294967295u);
+
+		len = 5;
+	}
+
+	/* The leading group is stored as a plain byte cast, without
+	masking. */
+
+	shift = 7 * (len - 1);
+	*buf = (byte)(val >> shift);
+
+	/* Every following group is masked down to its 7 bits. */
+
+	while (shift > 0) {
+		shift -= 7;
+		*++buf = (byte)((val >> shift) & 0x7F);
+	}
+
+	/* High-bit on means "last byte in the encoded integer"; buf
+	points at the last byte written at this point. */
+
+	*buf |= 0x80;
+
+	return(len);
+}
+
+/******************************************************************//**
+Decode one integer stored with our VLC scheme, advancing *ptr past it.
+@return value decoded */
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+	byte**	ptr)	/* in: ptr to decode from, this ptr is
+			incremented by the number of bytes decoded */
+{
+	byte*	cursor = *ptr;
+	ulint	decoded = 0;
+	byte	group;
+
+	do {
+		group = *cursor++;
+
+		/* Append the 7 payload bits below those read so far. */
+		decoded = (decoded << 7) | (group & 0x7F);
+
+		/* High-bit on means "last byte in the encoded integer". */
+	} while (!(group & 0x80));
+
+	/* Report back to the caller how far we have read. */
+	*ptr = cursor;
+
+	return(decoded);
+}
+
+#endif /* INNOBASE_FTS0VLC_IC */
diff --git a/storage/xtradb/include/fut0fut.h b/storage/xtradb/include/fut0fut.h
index 6a68bfffc72..851cdb44cdf 100644
--- a/storage/xtradb/include/fut0fut.h
+++ b/storage/xtradb/include/fut0fut.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/fut0fut.ic b/storage/xtradb/include/fut0fut.ic
index b881baff13c..15c964df6c7 100644
--- a/storage/xtradb/include/fut0fut.ic
+++ b/storage/xtradb/include/fut0fut.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/fut0lst.h b/storage/xtradb/include/fut0lst.h
index c75efd2aab2..90f9a65d4fa 100644
--- a/storage/xtradb/include/fut0lst.h
+++ b/storage/xtradb/include/fut0lst.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/fut0lst.ic b/storage/xtradb/include/fut0lst.ic
index 74d00dc488e..d18cf21378f 100644
--- a/storage/xtradb/include/fut0lst.ic
+++ b/storage/xtradb/include/fut0lst.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/ha0ha.h b/storage/xtradb/include/ha0ha.h
index afa9152a317..7351b407e8c 100644
--- a/storage/xtradb/include/ha0ha.h
+++ b/storage/xtradb/include/ha0ha.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -45,9 +45,10 @@ ha_search_and_get_data(
 	ulint		fold);	/*!< in: folded value of the searched data */
 /*********************************************************//**
 Looks for an element when we know the pointer to the data and updates
-the pointer to data if found. */
+the pointer to data if found.
+@return TRUE if found */
 UNIV_INTERN
-void
+ibool
 ha_search_and_update_if_found_func(
 /*===============================*/
 	hash_table_t*	table,	/*!< in/out: hash table */
@@ -92,8 +93,12 @@ ha_create_func(
 	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
 				order: this is used in the debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint	n_mutexes);	/*!< in: number of mutexes to protect the
+	ulint	n_mutexes,	/*!< in: number of mutexes to protect the
 				hash table: must be a power of 2, or 0 */
+	ulint	type);		/*!< in: type of data structure for which
+				the memory heap is going to be used e.g.:
+				MEM_HEAP_FOR_BTR_SEARCH or
+				MEM_HEAP_FOR_PAGE_HASH */
 #ifdef UNIV_SYNC_DEBUG
 /** Creates a hash table.
 @return		own: created table
@@ -102,7 +107,7 @@ chosen to be a slightly bigger prime number.
 @param level	in: level of the mutexes in the latching order
 @param n_m	in: number of mutexes to protect the hash table;
 		must be a power of 2, or 0 */
-# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m)
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,level,n_m,type)
 #else /* UNIV_SYNC_DEBUG */
 /** Creates a hash table.
 @return		own: created table
@@ -111,10 +116,18 @@ chosen to be a slightly bigger prime number.
 @param level	in: level of the mutexes in the latching order
 @param n_m	in: number of mutexes to protect the hash table;
 		must be a power of 2, or 0 */
-# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m)
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,n_m,type)
 #endif /* UNIV_SYNC_DEBUG */
 
 /*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table);	/*!< in, own: hash table */
+
+/*************************************************************//**
 Inserts an entry into a hash table. If an entry with the same fold number
 is found, its node is updated to point to the new data, and no new node
 is inserted.
@@ -143,7 +156,10 @@ is inserted.
 @param f	in: folded value of data
 @param b	in: buffer block containing the data
 @param d	in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d)
+# define ha_insert_for_fold(t,f,b,d) 	do {		\
+	ha_insert_for_fold_func(t,f,b,d);		\
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
+} while(0)
 #else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 /**
 Inserts an entry into a hash table. If an entry with the same fold number
@@ -154,7 +170,10 @@ is inserted.
 @param f	in: folded value of data
 @param b	ignored: buffer block containing the data
 @param d	in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d)
+# define ha_insert_for_fold(t,f,b,d)	do {		\
+	ha_insert_for_fold_func(t,f,d);			\
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
+} while (0)
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 
 /*********************************************************//**
@@ -202,10 +221,7 @@ ha_print_info(
 #endif /* !UNIV_HOTBACKUP */
 
 /** The hash table external chain node */
-typedef struct ha_node_struct ha_node_t;
-
-/** The hash table external chain node */
-struct ha_node_struct {
+struct ha_node_t {
 	ha_node_t*	next;	/*!< next chain node or NULL if none */
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	buf_block_t*	block;	/*!< buffer block containing the data, or NULL */
@@ -214,20 +230,33 @@ struct ha_node_struct {
 	ulint		fold;	/*!< fold value for the data */
 };
 
-#ifndef UNIV_HOTBACKUP
-/** Assert that the current thread is holding the mutex protecting a
-hash bucket corresponding to a fold value.
-@param table	in: hash table
-@param fold	in: fold value */
-# define ASSERT_HASH_MUTEX_OWN(table, fold)				\
-	ut_ad(!(table)->mutexes || mutex_own(hash_get_mutex(table, fold)))
-#else /* !UNIV_HOTBACKUP */
-/** Assert that the current thread is holding the mutex protecting a
-hash bucket corresponding to a fold value.
-@param table	in: hash table
-@param fold	in: fold value */
-# define ASSERT_HASH_MUTEX_OWN(table, fold) ((void) 0)
-#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold value */
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold value */
+#else /* UNIV_DEBUG */
+#define hash_assert_can_modify(t, f)
+#define hash_assert_can_search(t, f)
+#endif /* UNIV_DEBUG */
+
 
 #ifndef UNIV_NONINL
 #include "ha0ha.ic"
diff --git a/storage/xtradb/include/ha0ha.ic b/storage/xtradb/include/ha0ha.ic
index 4c69fe63f91..9d0e396e200 100644
--- a/storage/xtradb/include/ha0ha.ic
+++ b/storage/xtradb/include/ha0ha.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -106,6 +106,56 @@ ha_chain_get_first(
 	       hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
 }
 
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value */
+{
+	if (table->type == HASH_TABLE_SYNC_MUTEX) {
+		ut_ad(mutex_own(hash_get_mutex(table, fold)));
+	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+		prio_rw_lock_t* lock = hash_get_lock(table, fold);
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+# endif
+	} else {
+		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+	}
+}
+
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value */
+{
+	if (table->type == HASH_TABLE_SYNC_MUTEX) {
+		ut_ad(mutex_own(hash_get_mutex(table, fold)));
+	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+		prio_rw_lock_t* lock = hash_get_lock(table, fold);
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX)
+		      || rw_lock_own(lock, RW_LOCK_SHARED));
+# endif
+	} else {
+		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+	}
+}
+#endif /* UNIV_DEBUG */
+
 /*************************************************************//**
 Looks for an element in a hash table.
 @return pointer to the data of the first hash table node in chain
@@ -119,10 +169,7 @@ ha_search_and_get_data(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-//	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_search(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_chain_get_first(table, fold);
@@ -152,7 +199,7 @@ ha_search_with_data(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_search(table, fold);
 
 	ut_ad(btr_search_enabled);
 
@@ -184,10 +231,7 @@ ha_search_and_delete_if_found(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-//	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_search_with_data(table, fold, data);
diff --git a/storage/xtradb/include/ha0storage.h b/storage/xtradb/include/ha0storage.h
index 8109646a8e9..0073930b502 100644
--- a/storage/xtradb/include/ha0storage.h
+++ b/storage/xtradb/include/ha0storage.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -39,7 +39,7 @@ constant per ha_storage's lifetime. */
 #define HA_STORAGE_DEFAULT_HASH_CELLS	4096
 
 /** Hash storage */
-typedef struct ha_storage_struct	ha_storage_t;
+struct ha_storage_t;
 
 /*******************************************************************//**
 Creates a hash storage. If any of the parameters is 0, then a default
diff --git a/storage/xtradb/include/ha0storage.ic b/storage/xtradb/include/ha0storage.ic
index 86f2e578090..7150ca045ec 100644
--- a/storage/xtradb/include/ha0storage.ic
+++ b/storage/xtradb/include/ha0storage.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,7 +31,7 @@ Created September 24, 2007 Vasil Dimov
 #include "mem0mem.h"
 
 /** Hash storage for strings */
-struct ha_storage_struct {
+struct ha_storage_t {
 	mem_heap_t*	heap;	/*!< memory heap from which memory is
 				allocated */
 	hash_table_t*	hash;	/*!< hash table used to avoid
@@ -39,9 +39,7 @@ struct ha_storage_struct {
 };
 
 /** Objects of this type are stored in ha_storage_t */
-typedef struct ha_storage_node_struct ha_storage_node_t;
-/** Objects of this type are stored in ha_storage_struct */
-struct ha_storage_node_struct {
+struct ha_storage_node_t {
 	ulint			data_len;/*!< length of the data */
 	const void*		data;	/*!< pointer to data */
 	ha_storage_node_t*	next;	/*!< next node in hash chain */
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
index c804e3dc7af..4599547439e 100644
--- a/storage/xtradb/include/ha_prototypes.h
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,9 +27,21 @@ Created 5/11/2006 Osku Salerma
 #ifndef HA_INNODB_PROTOTYPES_H
 #define HA_INNODB_PROTOTYPES_H
 
+#include "my_dbug.h"
+#include "mysqld_error.h"
+#include "my_compare.h"
+#include "my_sys.h"
+#include "m_string.h"
+#include "debug_sync.h"
+#include "my_base.h"
+
 #include "trx0types.h"
 #include "m_ctype.h" /* CHARSET_INFO */
 
+// Forward declarations
+class Field;
+struct fts_string_t;
+
 /*********************************************************************//**
 Wrapper around MySQL's copy_and_convert function.
 @return	number of bytes copied to 'to' */
@@ -43,7 +55,8 @@ innobase_convert_string(
 	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 	const void*	from,		/*!< in: string to convert */
 	ulint		from_length,	/*!< in: number of bytes to convert */
-	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
+					from */
 	uint*		errors);	/*!< out: number of errors encountered
 					during the conversion */
 
@@ -96,7 +109,7 @@ innobase_convert_name(
 	ulint		buflen,	/*!< in: length of buf, in bytes */
 	const char*	id,	/*!< in: identifier to convert */
 	ulint		idlen,	/*!< in: length of id, in bytes */
-	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 	ibool		table_id);/*!< in: TRUE=id is a table or database name;
 				FALSE=id is an index name */
 
@@ -111,7 +124,19 @@ UNIV_INTERN
 ibool
 thd_is_replication_slave_thread(
 /*============================*/
-	const void*	thd);	/*!< in: thread handle (THD*) */
+	THD*	thd);	/*!< in: thread handle */
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing either a prepare or commit record to the log
+buffer.
+@return the durability property. */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+	const THD* thd)	/*!< in: thread handle */
+	__attribute__((nonnull, warn_unused_result));
 
 /******************************************************************//**
 Returns true if the transaction this thread is processing has edited
@@ -123,7 +148,7 @@ UNIV_INTERN
 ibool
 thd_has_edited_nontrans_tables(
 /*===========================*/
-	void*	thd);	/*!< in: thread handle (THD*) */
+	THD*	thd);	/*!< in: thread handle */
 
 /*************************************************************//**
 Prints info of a THD object (== user session thread) to the given file. */
@@ -132,7 +157,7 @@ void
 innobase_mysql_print_thd(
 /*=====================*/
 	FILE*	f,		/*!< in: output stream */
-	void*	thd,		/*!< in: pointer to a MySQL THD object */
+	THD*	thd,		/*!< in: pointer to a MySQL THD object */
 	uint	max_query_len);	/*!< in: max query length to print, or 0 to
 				   use the default max length */
 
@@ -147,6 +172,23 @@ innobase_mysql_log_notify(
 	ib_uint64_t	write_lsn,	/*!< in: LSN written to log file */
 	ib_uint64_t	flush_lsn);	/*!< in: LSN flushed to disk */
 
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them.
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
@@ -185,6 +227,17 @@ innobase_strcasecmp(
 	const char*	b);	/*!< in: second string to compare */
 
 /******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b);	/*!< in: wildcard string to compare */
+
+/******************************************************************//**
 Strip dir name from a full path name and return only its file name.
 @return file name or "null" if no file name */
 UNIV_INTERN
@@ -196,11 +249,11 @@ innobase_basename(
 /******************************************************************//**
 Returns true if the thread is executing a SELECT statement.
 @return	true if thd is executing SELECT */
-
+UNIV_INTERN
 ibool
 thd_is_select(
 /*==========*/
-	const void*	thd);	/*!< in: thread handle (THD*) */
+	const THD*	thd);	/*!< in: thread handle */
 
 /******************************************************************//**
 Converts an identifier to a table name. */
@@ -222,8 +275,8 @@ innobase_convert_from_id(
 	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 	char*			to,	/*!< out: converted identifier */
 	const char*		from,	/*!< in: identifier to convert */
-	ulint			len);	/*!< in: length of 'to', in bytes; should
-					be at least 3 * strlen(to) + 1 */
+	ulint			len);	/*!< in: length of 'to', in bytes;
+					should be at least 3 * strlen(from) + 1 */
 /******************************************************************//**
 Makes all characters in a NUL-terminated UTF-8 string lower case. */
 UNIV_INTERN
@@ -239,7 +292,7 @@ UNIV_INTERN
 struct charset_info_st*
 innobase_get_charset(
 /*=================*/
-	void*	mysql_thd);	/*!< in: MySQL thread handle */
+	THD*	thd);	/*!< in: MySQL thread handle */
 /**********************************************************************//**
 Determines the current SQL statement.
 @return	SQL statement string */
@@ -247,7 +300,7 @@ UNIV_INTERN
 const char*
 innobase_get_stmt(
 /*==============*/
-	void*	mysql_thd,	/*!< in: MySQL thread handle */
+	THD*	thd,		/*!< in: MySQL thread handle */
 	size_t*	length)		/*!< out: length of the SQL statement */
 	__attribute__((nonnull));
 /******************************************************************//**
@@ -270,8 +323,9 @@ innobase_get_at_most_n_mbchars(
 /*************************************************************//**
 InnoDB index push-down condition check
 @return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
 enum icp_result
-handler_index_cond_check(
+innobase_index_cond(
 /*================*/
 	void*	file)	/*!< in/out: pointer to ha_innobase */
 	__attribute__((nonnull, warn_unused_result));
@@ -279,21 +333,21 @@ handler_index_cond_check(
 Returns true if the thread supports XA,
 global value of innodb_supports_xa if thd is NULL.
 @return	true if thd supports XA */
-
+UNIV_INTERN
 ibool
 thd_supports_xa(
 /*============*/
-	void*	thd);	/*!< in: thread handle (THD*), or NULL to query
+	THD*	thd);	/*!< in: thread handle, or NULL to query
 			the global innodb_supports_xa */
 
 /******************************************************************//**
 Returns the lock wait timeout for the current connection.
 @return	the lock wait timeout, in seconds */
-
+UNIV_INTERN
 ulong
 thd_lock_wait_timeout(
 /*==================*/
-	void*	thd);	/*!< in: thread handle (THD*), or NULL to query
+	THD*	thd);	/*!< in: thread handle, or NULL to query
 			the global innodb_lock_wait_timeout */
 /******************************************************************//**
 Add up the time waited for the lock for the current query. */
@@ -301,11 +355,21 @@ UNIV_INTERN
 void
 thd_set_lock_wait_time(
 /*===================*/
-        void*   thd,	/*!< in: thread handle (THD*) */
-        ulint   value);	/*!< in: time waited for the lock */
-/******************************************************************//**
-*/
+	THD*	thd,	/*!< in/out: thread handle */
+	ulint	value);	/*!< in: time waited for the lock */
 
+/**********************************************************************//**
+Get the current setting of the table_cache_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return	current value of the table_cache_size global parameter */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void);
+/*===============================*/
+
+/******************************************************************//**
+								     */
 ulong
 thd_flush_log_at_trx_commit(
 /*================================*/
@@ -322,17 +386,210 @@ ulint
 innobase_get_lower_case_table_names(void);
 /*=====================================*/
 
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+	THD*	thd);		/*!< in: MySQL thread handle for
+				which to close the connection */
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,	/*!< in: Character set */
+	const byte*	start,		/*!< in: start of text */
+	const byte*	end,		/*!< in: one character past end of
+					text */
+	fts_string_t*	token,		/*!< out: token's text */
+	ulint*		offset);	/*!< out: offset to token,
+					measured as characters from
+					'start' */
 
-/********************************************************************//**
-Returns the merge-sort block size used for the secondary index creation
-for the current connection.
-@return the merge-sort block size, in bytes */
+/******************************************************************//**
+Compares two character strings case-insensitively according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: key */
+	const void*	p2);		/*!< in: node */
 
-ulong
-thd_merge_sort_block_size(
-/*======================*/
-	void* thd); /*!< in: thread handle (THD*), or NULL to query
-			the global merge_sort_block_size */
+/******************************************************************//**
+Compares two character strings according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_string_cmp(
+/*====================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: key */
+	const void*	p2);		/*!< in: node */
+
+/****************************************************************//**
+Get FTS field charset info from the field's prtype
+@return charset info */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number);/*!< in: number of the charset */
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return	true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	THD*	thd);	/*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return	true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	THD*	thd);	/*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), convert a table or
+index name to the MySQL system_charset_info (UTF-8) and quote it if needed.
+The converted name is written to buf; the function returns nothing. */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+	char*		buf,		/*!< out: buffer for converted
+					identifier */
+	ulint		buflen,		/*!< in: length of buf, in bytes */
+	const char*	name,		/*!< in: index or table name
+					to format */
+	ibool		is_index_name)	/*!< in: index name */
+	__attribute__((nonnull));
+
+/** Corresponds to Sql_condition::enum_warning_level. */
+enum ib_log_level_t {
+	IB_LOG_LEVEL_INFO,
+	IB_LOG_LEVEL_WARN,
+	IB_LOG_LEVEL_ERROR,
+	IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Pushes a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_errf(
+/*====*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: warning level */
+	ib_uint32_t	code,		/*!< MySQL error code */
+	const char*	format,		/*!< printf format */
+	...)				/*!< Args */
+	__attribute__((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Pushes a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_senderrf(
+/*========*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: warning level */
+	ib_uint32_t	code,		/*!< MySQL error code */
+	...);				/*!< Args */
+
+/******************************************************************//**
+Write a message to the MySQL log, prefixed with "InnoDB: ".
+Wrapper around sql_print_information() */
+UNIV_INTERN
+void
+ib_logf(
+/*====*/
+	ib_log_level_t	level,		/*!< in: warning level */
+	const char*	format,		/*!< printf format */
+	...)				/*!< Args */
+	__attribute__((format(printf, 2, 3)));
+
+/******************************************************************//**
+Returns the NUL-terminated value of glob_hostname.
+@return	pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname();
+/*=================*/
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+	int	error_code);	/*!< in: MySQL error code */
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+	INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+@return	the next value */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+	ulonglong	current,	/*!< in: Current value */
+	ulonglong	need,		/*!< in: count of values needed */
+	ulonglong	step,		/*!< in: AUTOINC increment step */
+	ulonglong	offset,		/*!< in: AUTOINC offset */
+	ulonglong	max_value)	/*!< in: max value for type */
+	__attribute__((pure, warn_unused_result));
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+UNIV_INTERN
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+	const Field*	field)	/*!< in: MySQL field */
+	__attribute__((nonnull, pure, warn_unused_result));
 
 /**********************************************************************
 Check if the length of the identifier exceeds the maximum allowed.
@@ -365,4 +622,4 @@ innobase_convert_to_filename_charset(
 	ulint           len);   /* in: length of 'to', in bytes */
 
 
-#endif
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/xtradb/include/handler0alter.h b/storage/xtradb/include/handler0alter.h
index 3107fb32881..66b963ae39a 100644
--- a/storage/xtradb/include/handler0alter.h
+++ b/storage/xtradb/include/handler0alter.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,11 +27,34 @@ UNIV_INTERN
 void
 innobase_rec_to_mysql(
 /*==================*/
-	struct TABLE*		table,		/*!< in/out: MySQL table */
-	const rec_t*		rec,		/*!< in: record */
-	const dict_index_t*	index,		/*!< in: index */
-	const ulint*		offsets);	/*!< in: rec_get_offsets(
-						rec, index, ...) */
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(
+					rec, index, ...) */
+	__attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+UNIV_INTERN
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+	__attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+UNIV_INTERN
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+	__attribute__((nonnull));
 
 /*************************************************************//**
 Resets table->record[0]. */
@@ -39,4 +62,53 @@ UNIV_INTERN
 void
 innobase_rec_reset(
 /*===============*/
-	struct TABLE*		table);		/*!< in/out: MySQL table */
+	struct TABLE*		table)		/*!< in/out: MySQL table */
+	__attribute__((nonnull));
+
+/** Generate the next autoinc based on a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+	/**
+	@param thd - the session
+	@param start_value - the lower bound
+	@param max_value - the upper bound (inclusive) */
+	ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+	/**
+	Postfix increment
+	@return the value to insert */
+	ulonglong operator++(int) UNIV_NOTHROW;
+
+	/** Check if the autoinc "sequence" is exhausted.
+	@return true if the sequence is exhausted */
+	bool eof() const UNIV_NOTHROW
+	{
+		return(m_eof);
+	}
+
+	/**
+	@return the next value in the sequence */
+	ulonglong last() const UNIV_NOTHROW
+	{
+		ut_ad(m_next_value > 0);
+
+		return(m_next_value);
+	}
+
+	/** Maximum column value if adding an AUTOINC column else 0. Once
+	we reach the end of the sequence it will be set to ~0. */
+	const ulonglong	m_max_value;
+
+	/** Value of auto_increment_increment */
+	ulong		m_increment;
+
+	/** Value of auto_increment_offset */
+	ulong		m_offset;
+
+	/** Next value in the sequence */
+	ulonglong	m_next_value;
+
+	/** true if no more values left in the sequence */
+	bool		m_eof;
+};
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
index 05b538ed5f5..a6fe4e680a1 100644
--- a/storage/xtradb/include/hash0hash.h
+++ b/storage/xtradb/include/hash0hash.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -30,16 +30,29 @@ Created 5/20/1997 Heikki Tuuri
 #include "mem0mem.h"
 #ifndef UNIV_HOTBACKUP
 # include "sync0sync.h"
+# include "sync0rw.h"
 #endif /* !UNIV_HOTBACKUP */
 
-typedef struct hash_table_struct hash_table_t;
-typedef struct hash_cell_struct hash_cell_t;
+struct hash_table_t;
+struct hash_cell_t;
 
 typedef void*	hash_node_t;
 
 /* Fix Bug #13859: symbol collision between imap/mysql */
 #define hash_create hash0_create
 
+/* Different types of hash_table based on the synchronization
+method used for it. */
+enum hash_table_sync_t {
+	HASH_TABLE_SYNC_NONE = 0,	/*!< Don't use any internal
+					synchronization objects for
+					this hash_table. */
+	HASH_TABLE_SYNC_MUTEX,		/*!< Use mutexes to control
+					access to this hash_table. */
+	HASH_TABLE_SYNC_RW_LOCK		/*!< Use rw_locks to control
+					access to this hash_table. */
+};
+
 /*************************************************************//**
 Creates a hash table with >= n array cells. The actual number
 of cells is chosen to be a prime number slightly bigger than n.
@@ -51,21 +64,29 @@ hash_create(
 	ulint	n);	/*!< in: number of array cells */
 #ifndef UNIV_HOTBACKUP
 /*************************************************************//**
-Creates a mutex array to protect a hash table. */
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depending on the type of
+hash table. */
 UNIV_INTERN
 void
-hash_create_mutexes_func(
-/*=====================*/
-	hash_table_t*	table,		/*!< in: hash table */
+hash_create_sync_obj_func(
+/*======================*/
+	hash_table_t*		table,	/*!< in: hash table */
+	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
+					or HASH_TABLE_SYNC_RW_LOCK */
 #ifdef UNIV_SYNC_DEBUG
-	ulint		sync_level,	/*!< in: latching order level of the
-					mutexes: used in the debug version */
+	ulint			sync_level,/*!< in: latching order level
+					of the mutexes: used in the
+					debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint		n_mutexes);	/*!< in: number of mutexes */
+	ulint			n_sync_obj);/*!< in: number of sync objects,
+					must be a power of 2 */
 #ifdef UNIV_SYNC_DEBUG
-# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n)
+# define hash_create_sync_obj(t, s, n, level)			\
+			hash_create_sync_obj_func(t, s, level, n)
 #else /* UNIV_SYNC_DEBUG */
-# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n)
+# define hash_create_sync_obj(t, s, n, level)			\
+			hash_create_sync_obj_func(t, s, n)
 #endif /* UNIV_SYNC_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
 
@@ -87,11 +108,12 @@ hash_calc_hash(
 	hash_table_t*	table);	/*!< in: hash table */
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
-Assert that the mutex for the table in a hash operation is owned. */
-# define HASH_ASSERT_OWNED(TABLE, FOLD)					\
-ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));
+Assert that the mutex for the table is held */
+# define HASH_ASSERT_OWN(TABLE, FOLD)				\
+	ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX		\
+	      || (mutex_own(hash_get_mutex((TABLE), FOLD))));
 #else /* !UNIV_HOTBACKUP */
-# define HASH_ASSERT_OWNED(TABLE, FOLD)
+# define HASH_ASSERT_OWN(TABLE, FOLD)
 #endif /* !UNIV_HOTBACKUP */
 
 /*******************************************************************//**
@@ -102,7 +124,7 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	(DATA)->NAME = NULL;\
 \
@@ -124,7 +146,7 @@ do {\
 
 #ifdef UNIV_HASH_DEBUG
 # define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
-# define HASH_INVALIDATE(DATA, NAME) DATA->NAME = (void*) -1
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
 #else
 # define HASH_ASSERT_VALID(DATA) do {} while (0)
 # define HASH_INVALIDATE(DATA, NAME) do {} while (0)
@@ -138,7 +160,7 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
 \
@@ -175,7 +197,7 @@ Looks for a struct in a hash table. */
 #define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
 {\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
 	HASH_ASSERT_VALID(DATA);\
@@ -259,7 +281,7 @@ do {\
 \
 	HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
 \
-	top_node111 = (TYPE*)mem_heap_get_top(\
+	top_node111 = (TYPE*) mem_heap_get_top(\
 				hash_get_heap(TABLE, fold111),\
 							sizeof(TYPE));\
 \
@@ -284,11 +306,12 @@ do {\
 		} else {\
 			/* We have to look for the predecessor of the top\
 			node */\
-			node111 = cell111->node;\
+			node111 = static_cast<TYPE*>(cell111->node);\
 \
 			while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
 \
-				node111 = HASH_GET_NEXT(NAME, node111);\
+				node111 = static_cast<TYPE*>(\
+					HASH_GET_NEXT(NAME, node111));\
 			}\
 \
 			/* Now we have the predecessor node */\
@@ -329,12 +352,12 @@ do {\
 } while (0)
 
 /************************************************************//**
-Gets the mutex index for a fold value in a hash table.
-@return	mutex number */
+Gets the sync object index for a fold value in a hash table.
+@return	index */
 UNIV_INLINE
 ulint
-hash_get_mutex_no(
-/*==============*/
+hash_get_sync_obj_index(
+/*====================*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold);	/*!< in: fold */
 /************************************************************//**
@@ -359,21 +382,39 @@ hash_get_heap(
 Gets the nth mutex in a hash table.
 @return	mutex */
 UNIV_INLINE
-mutex_t*
+ib_prio_mutex_t*
 hash_get_nth_mutex(
 /*===============*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		i);	/*!< in: index of the mutex */
 /************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+prio_rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i);	/*!< in: index of the rw_lock */
+/************************************************************//**
 Gets the mutex for a fold value in a hash table.
 @return	mutex */
 UNIV_INLINE
-mutex_t*
+ib_prio_mutex_t*
 hash_get_mutex(
 /*===========*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold);	/*!< in: fold */
 /************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+prio_rw_lock_t*
+hash_get_lock(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
 Reserves the mutex for a fold value in a hash table. */
 UNIV_INTERN
 void
@@ -403,39 +444,127 @@ void
 hash_mutex_exit_all(
 /*================*/
 	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+	hash_table_t*		table,		/*!< in: hash table */
+	ib_prio_mutex_t*	keep_mutex);	/*!< in: mutex to keep */
+/************************************************************//**
+s-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+x-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+unlock an s-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+unlock x-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in lock of a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	prio_rw_lock_t*	keep_lock);	/*!< in: lock to keep */
+
 #else /* !UNIV_HOTBACKUP */
 # define hash_get_heap(table, fold)	((table)->heap)
 # define hash_mutex_enter(table, fold)	((void) 0)
 # define hash_mutex_exit(table, fold)	((void) 0)
+# define hash_mutex_enter_all(table)	((void) 0)
+# define hash_mutex_exit_all(table)	((void) 0)
+# define hash_mutex_exit_all_but(t, m)	((void) 0)
+# define hash_lock_s(t, f)		((void) 0)
+# define hash_lock_x(t, f)		((void) 0)
+# define hash_unlock_s(t, f)		((void) 0)
+# define hash_unlock_x(t, f)		((void) 0)
+# define hash_lock_x_all(t)		((void) 0)
+# define hash_unlock_x_all(t)		((void) 0)
+# define hash_unlock_x_all_but(t, l)	((void) 0)
 #endif /* !UNIV_HOTBACKUP */
 
-struct hash_cell_struct{
+struct hash_cell_t{
 	void*	node;	/*!< hash chain node, NULL if none */
 };
 
 /* The hash table structure */
-struct hash_table_struct {
+struct hash_table_t {
+	enum hash_table_sync_t	type;	/*!< type of hash_table. */
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 # ifndef UNIV_HOTBACKUP
-	ibool		adaptive;/* TRUE if this is the hash table of the
-				adaptive hash index */
+	ibool			adaptive;/* TRUE if this is the hash
+					table of the adaptive hash
+					index */
 # endif /* !UNIV_HOTBACKUP */
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	ulint		n_cells;/* number of cells in the hash table */
-	hash_cell_t*	array;	/*!< pointer to cell array */
+	ulint			n_cells;/* number of cells in the hash table */
+	hash_cell_t*		array;	/*!< pointer to cell array */
 #ifndef UNIV_HOTBACKUP
-	ulint		n_mutexes;/* if mutexes != NULL, then the number of
-				mutexes, must be a power of 2 */
-	mutex_t*	mutexes;/* NULL, or an array of mutexes used to
-				protect segments of the hash table */
-	mem_heap_t**	heaps;	/*!< if this is non-NULL, hash chain nodes for
-				external chaining can be allocated from these
-				memory heaps; there are then n_mutexes many of
-				these heaps */
+	ulint			n_sync_obj;/* if the sync_obj array is
+					non-NULL, the number of mutexes
+					or rw_locks in it, depending on
+					the type.
+					Must be a power of 2 */
+	union {
+		ib_prio_mutex_t*	mutexes;
+					/* NULL, or an array of mutexes
+					used to protect segments of the
+					hash table */
+		prio_rw_lock_t*	rw_locks;/* NULL, or an array of rw_locks
+					used to protect segments of the
+					hash table */
+	} sync_obj;
+
+	mem_heap_t**		heaps;	/*!< if this is non-NULL, hash
+					chain nodes for external chaining
+					can be allocated from these memory
+					heaps; there are then n_sync_obj
+					many of these heaps */
 #endif /* !UNIV_HOTBACKUP */
-	mem_heap_t*	heap;
+	mem_heap_t*		heap;
 #ifdef UNIV_DEBUG
-	ulint		magic_n;
+	ulint			magic_n;
 # define HASH_TABLE_MAGIC_N	76561114
 #endif /* UNIV_DEBUG */
 };
diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic
index 2c708cc594b..e4822538e19 100644
--- a/storage/xtradb/include/hash0hash.ic
+++ b/storage/xtradb/include/hash0hash.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -87,20 +87,21 @@ hash_calc_hash(
 
 #ifndef UNIV_HOTBACKUP
 /************************************************************//**
-Gets the mutex index for a fold value in a hash table.
-@return	mutex number */
+Gets the sync object index for a fold value in a hash table.
+@return	index */
 UNIV_INLINE
 ulint
-hash_get_mutex_no(
-/*==============*/
+hash_get_sync_obj_index(
+/*====================*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold)	/*!< in: fold */
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(ut_is_2pow(table->n_mutexes));
+	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+	ut_ad(ut_is_2pow(table->n_sync_obj));
 	return(ut_2pow_remainder(hash_calc_hash(fold, table),
-				 table->n_mutexes));
+				 table->n_sync_obj));
 }
 
 /************************************************************//**
@@ -115,7 +116,8 @@ hash_get_nth_heap(
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(i < table->n_mutexes);
+	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+	ut_ad(i < table->n_sync_obj);
 
 	return(table->heaps[i]);
 }
@@ -139,7 +141,7 @@ hash_get_heap(
 		return(table->heap);
 	}
 
-	i = hash_get_mutex_no(table, fold);
+	i = hash_get_sync_obj_index(table, fold);
 
 	return(hash_get_nth_heap(table, i));
 }
@@ -148,7 +150,7 @@ hash_get_heap(
 Gets the nth mutex in a hash table.
 @return	mutex */
 UNIV_INLINE
-mutex_t*
+ib_prio_mutex_t*
 hash_get_nth_mutex(
 /*===============*/
 	hash_table_t*	table,	/*!< in: hash table */
@@ -156,16 +158,17 @@ hash_get_nth_mutex(
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(i < table->n_mutexes);
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	ut_ad(i < table->n_sync_obj);
 
-	return(table->mutexes + i);
+	return(table->sync_obj.mutexes + i);
 }
 
 /************************************************************//**
 Gets the mutex for a fold value in a hash table.
 @return	mutex */
 UNIV_INLINE
-mutex_t*
+ib_prio_mutex_t*
 hash_get_mutex(
 /*===========*/
 	hash_table_t*	table,	/*!< in: hash table */
@@ -176,8 +179,47 @@ hash_get_mutex(
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
 
-	i = hash_get_mutex_no(table, fold);
+	i = hash_get_sync_obj_index(table, fold);
 
 	return(hash_get_nth_mutex(table, i));
 }
+
+/************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+prio_rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i)	/*!< in: index of the rw_lock */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(i < table->n_sync_obj);
+
+	return(table->sync_obj.rw_locks + i);
+}
+
+/************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+prio_rw_lock_t*
+hash_get_lock(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ulint	i;
+
+	ut_ad(table);
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	i = hash_get_sync_obj_index(table, fold);
+
+	return(hash_get_nth_lock(table, i));
+}
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/ibuf0ibuf.h b/storage/xtradb/include/ibuf0ibuf.h
index 03ea0629af4..f2e1c80878e 100644
--- a/storage/xtradb/include/ibuf0ibuf.h
+++ b/storage/xtradb/include/ibuf0ibuf.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +35,10 @@ Created 7/19/1997 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 # include "ibuf0types.h"
 
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE	(25)
+
 /* Possible operations buffered in the insert/whatever buffer. See
 ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
 typedef enum {
@@ -98,6 +102,14 @@ void
 ibuf_init_at_db_start(void);
 /*=======================*/
 /*********************************************************************//**
+Updates the max_size value for ibuf. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val);	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+/*********************************************************************//**
 Reads the biggest tablespace id from the high end of the insert buffer
 tree and updates the counter in fil_system. */
 UNIV_INTERN
@@ -364,26 +376,16 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
 UNIV_INTERN
 ulint
-ibuf_contract(
-/*==========*/
-	ibool	sync);	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
-/*********************************************************************//**
-Contracts insert buffer trees by reading pages to the buffer pool.
-@return a lower limit for the combined size in bytes of entries which
-will be merged from ibuf trees to the pages read, 0 if ibuf is
-empty */
-UNIV_INTERN
-ulint
-ibuf_contract_for_n_pages(
-/*======================*/
-	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
-	ulint	n_pages);/*!< in: try to read at least this many pages to
-			the buffer pool and merge the ibuf contents to
-			them */
+ibuf_contract_in_background(
+/*========================*/
+	table_id_t	table_id,	/*!< in: if merge should be done only
+					for a specific table, for all tables
+					this should be 0 */
+	ibool		full);		/*!< in: TRUE if the caller wants to
+					do a full contract based on PCT_IO(100).
+					If FALSE then the size of contract
+					batch is determined based on the
+					current size of the ibuf tree. */
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Parses a redo log record of an ibuf bitmap page init.
@@ -411,9 +413,9 @@ ibuf_count_get(
 #endif
 /******************************************************************//**
 Looks if the insert buffer is empty.
-@return	TRUE if empty */
+@return	true if empty */
 UNIV_INTERN
-ibool
+bool
 ibuf_is_empty(void);
 /*===============*/
 /******************************************************************//**
@@ -455,6 +457,17 @@ ibuf_export_ibuf_status(
 	ulint*	discarded_delete_marks,
 	ulint*	discarded_deletes);
 
+/******************************************************************//**
+Checks the insert buffer bitmaps on IMPORT TABLESPACE.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+ibuf_check_bitmap_on_import(
+/*========================*/
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		space_id)	/*!< in: tablespace identifier */
+	__attribute__((nonnull, warn_unused_result));
+
 #define IBUF_HEADER_PAGE_NO	FSP_IBUF_HEADER_PAGE_NO
 #define IBUF_TREE_ROOT_PAGE_NO	FSP_IBUF_TREE_ROOT_PAGE_NO
 
diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic
index 043d7c472d8..21747fdceac 100644
--- a/storage/xtradb/include/ibuf0ibuf.ic
+++ b/storage/xtradb/include/ibuf0ibuf.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,9 +28,6 @@ Created 7/19/1997 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "buf0lru.h"
 
-/** Counter for ibuf_should_try() */
-extern ulint	ibuf_flush_count;
-
 /** An index page must contain at least UNIV_PAGE_SIZE /
 IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
 buffer inserts to this page.  If there is this much of free space, the
@@ -62,7 +59,7 @@ ibuf_mtr_commit(
 }
 
 /** Insert buffer struct */
-struct ibuf_struct{
+struct ibuf_t{
 	ulint		size;		/*!< current size of the ibuf index
 					tree, in pages */
 	ulint		max_size;	/*!< recommended maximum size of the
@@ -70,10 +67,10 @@ struct ibuf_struct{
 	ulint		seg_size;	/*!< allocated pages of the file
 					segment containing ibuf header and
 					tree */
-	ibool		empty;		/*!< Protected by the page
+	bool		empty;		/*!< Protected by the page
 					latch of the root page of the
 					insert buffer tree
-					(FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). true
 					if and only if the insert
 					buffer tree is empty. */
 	ulint		free_list_len;	/*!< length of the free list */
@@ -127,21 +124,11 @@ ibuf_should_try(
 						a secondary index when we
 						decide */
 {
-	if (ibuf_use != IBUF_USE_NONE
-	    && !dict_index_is_clust(index)
-	    && (ignore_sec_unique || !dict_index_is_unique(index))) {
-
-		ibuf_flush_count++;
-
-		if (ibuf_flush_count % 4 == 0) {
-
-			buf_LRU_try_free_flushed_blocks(NULL);
-		}
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(ibuf_use != IBUF_USE_NONE
+	       && ibuf->max_size != 0
+	       && !dict_index_is_clust(index)
+	       && index->table->quiesce == QUIESCE_NONE
+	       && (ignore_sec_unique || !dict_index_is_unique(index)));
 }
 
 /******************************************************************//**
@@ -174,12 +161,11 @@ ibuf_bitmap_page(
 	ut_ad(ut_is_2pow(zip_size));
 
 	if (!zip_size) {
-		return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
-				     == FSP_IBUF_BITMAP_OFFSET));
+		return((page_no & (UNIV_PAGE_SIZE - 1))
+			== FSP_IBUF_BITMAP_OFFSET);
 	}
 
-	return(UNIV_UNLIKELY((page_no & (zip_size - 1))
-			     == FSP_IBUF_BITMAP_OFFSET));
+	return((page_no & (zip_size - 1)) == FSP_IBUF_BITMAP_OFFSET);
 }
 
 /*********************************************************************//**
@@ -197,7 +183,7 @@ ibuf_index_page_calc_free_bits(
 	ulint	n;
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	if (zip_size) {
 		n = max_ins_size
@@ -232,7 +218,7 @@ ibuf_index_page_calc_free_from_bits(
 	ut_ad(bits < 4);
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	if (zip_size) {
 		if (bits == 3) {
@@ -267,16 +253,24 @@ ibuf_index_page_calc_free_zip(
 	ut_ad(zip_size == buf_block_get_zip_size(block));
 	ut_ad(zip_size);
 
-	max_ins_size = page_get_max_insert_size_after_reorganize(
+	/* Consider the maximum insert size on the uncompressed page
+	without reorganizing the page. We must not assume anything
+	about the compression ratio. If zip_max_ins > max_ins_size and
+	there is 1/4 garbage on the page, recompression after the
+	reorganize could fail, in theory. So, let us guarantee that
+	merging a buffered insert to a compressed page will always
+	succeed without reorganizing or recompressing the page, just
+	by using the page modification log. */
+	max_ins_size = page_get_max_insert_size(
 		buf_block_get_frame(block), 1);
 
 	page_zip = buf_block_get_page_zip(block);
 	zip_max_ins = page_zip_max_ins_size(page_zip,
 					    FALSE/* not clustered */);
 
-	if (UNIV_UNLIKELY(zip_max_ins < 0)) {
+	if (zip_max_ins < 0) {
 		return(0);
-	} else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) {
+	} else if (max_ins_size > (ulint) zip_max_ins) {
 		max_ins_size = (ulint) zip_max_ins;
 	}
 
@@ -345,8 +339,8 @@ ibuf_update_free_bits_if_full(
 	before = ibuf_index_page_calc_free_bits(0, max_ins_size);
 
 	if (max_ins_size >= increase) {
-#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE
-# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE"
+#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX
+# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX"
 #endif
 		after = ibuf_index_page_calc_free_bits(0, max_ins_size
 						       - increase);
diff --git a/storage/xtradb/include/ibuf0types.h b/storage/xtradb/include/ibuf0types.h
index d3e6f9299da..3fdbf078b0b 100644
--- a/storage/xtradb/include/ibuf0types.h
+++ b/storage/xtradb/include/ibuf0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,6 @@ Created 7/29/1997 Heikki Tuuri
 #ifndef ibuf0types_h
 #define ibuf0types_h
 
-typedef	struct ibuf_struct	ibuf_t;
+struct ibuf_t;
 
 #endif
diff --git a/storage/xtradb/include/lock0iter.h b/storage/xtradb/include/lock0iter.h
index ce6f28dc514..0054850b526 100644
--- a/storage/xtradb/include/lock0iter.h
+++ b/storage/xtradb/include/lock0iter.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,13 +29,13 @@ Created July 16, 2007 Vasil Dimov
 #include "univ.i"
 #include "lock0types.h"
 
-typedef struct lock_queue_iterator_struct {
+struct lock_queue_iterator_t {
 	const lock_t*	current_lock;
 	/* In case this is a record lock queue (not table lock queue)
 	then bit_no is the record number within the heap in which the
 	record is stored. */
 	ulint		bit_no;
-} lock_queue_iterator_t;
+};
 
 /*******************************************************************//**
 Initialize lock queue iterator so that it starts to iterate from
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
index 1d3958e0d50..3a3a28ef525 100644
--- a/storage/xtradb/include/lock0lock.h
+++ b/storage/xtradb/include/lock0lock.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,13 +36,13 @@ Created 5/7/1996 Heikki Tuuri
 #include "lock0types.h"
 #include "read0types.h"
 #include "hash0hash.h"
+#include "srv0srv.h"
 #include "ut0vec.h"
 
 #ifdef UNIV_DEBUG
 extern ibool	lock_print_waits;
 #endif /* UNIV_DEBUG */
-/* Buffer for storing information about the most recent deadlock error */
-extern FILE*	lock_latest_err_file;
+
 extern ulint	srv_n_lock_deadlock_count;
 
 /*********************************************************************//**
@@ -66,18 +66,6 @@ void
 lock_sys_close(void);
 /*================*/
 /*********************************************************************//**
-Checks if some transaction has an implicit x-lock on a record in a clustered
-index.
-@return	transaction which has the x-lock, or NULL */
-UNIV_INLINE
-trx_t*
-lock_clust_rec_some_has_impl(
-/*=========================*/
-	const rec_t*		rec,	/*!< in: user record */
-	const dict_index_t*	index,	/*!< in: clustered index */
-	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
-	__attribute__((nonnull, warn_unused_result));
-/*********************************************************************//**
 Gets the heap_no of the smallest user record on a page.
 @return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
 UNIV_INLINE
@@ -272,14 +260,15 @@ lock_rec_restore_from_page_infimum(
 					state; lock bits are reset on
 					the infimum */
 /*********************************************************************//**
-Returns TRUE if there are explicit record locks on a page.
-@return	TRUE if there are explicit record locks on the page */
+Determines if there are explicit record locks on a page.
+@return	an explicit record lock on the page, or NULL if there are none */
 UNIV_INTERN
-ibool
+lock_t*
 lock_rec_expl_exist_on_page(
 /*========================*/
 	ulint	space,	/*!< in: space id */
-	ulint	page_no);/*!< in: page number */
+	ulint	page_no)/*!< in: page number */
+	__attribute__((warn_unused_result));
 /*********************************************************************//**
 Checks if locks of other transactions prevent an immediate insert of
 a record. If they do, first tests if the query thread should anyway
@@ -288,7 +277,7 @@ the query thread to the lock wait state and inserts a waiting request
 for a gap x-lock to the lock queue.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_rec_insert_check_and_lock(
 /*===========================*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is
@@ -298,10 +287,11 @@ lock_rec_insert_check_and_lock(
 	dict_index_t*	index,	/*!< in: index */
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
-	ibool*		inherit);/*!< out: set to TRUE if the new
+	ibool*		inherit)/*!< out: set to TRUE if the new
 				inserted record maybe should inherit
 				LOCK_GAP type locks from the successor
 				record */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Checks if locks of other transactions prevent an immediate modify (update,
 delete mark, or delete unmark) of a clustered index record. If they do,
@@ -311,7 +301,7 @@ lock wait state and inserts a waiting request for a record x-lock to the
 lock queue.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_clust_rec_modify_check_and_lock(
 /*=================================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -321,13 +311,14 @@ lock_clust_rec_modify_check_and_lock(
 					modified */
 	dict_index_t*		index,	/*!< in: clustered index */
 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
-	que_thr_t*		thr);	/*!< in: query thread */
+	que_thr_t*		thr)	/*!< in: query thread */
+	__attribute__((warn_unused_result, nonnull));
 /*********************************************************************//**
 Checks if locks of other transactions prevent an immediate modify
 (delete mark or delete unmark) of a secondary index record.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_sec_rec_modify_check_and_lock(
 /*===============================*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -339,15 +330,17 @@ lock_sec_rec_modify_check_and_lock(
 				clustered index record first: see the
 				comment below */
 	dict_index_t*	index,	/*!< in: secondary index */
-	que_thr_t*	thr,	/*!< in: query thread */
-	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+	que_thr_t*	thr,	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((warn_unused_result, nonnull(2,3,4,6)));
 /*********************************************************************//**
 Like lock_clust_rec_read_check_and_lock(), but reads a
 secondary index record.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-enum db_err
+dberr_t
 lock_sec_rec_read_check_and_lock(
 /*=============================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -377,7 +370,7 @@ lock on the record.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-enum db_err
+dberr_t
 lock_clust_rec_read_check_and_lock(
 /*===============================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -408,7 +401,7 @@ lock_clust_rec_read_check_and_lock() that does not require the parameter
 "offsets".
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_clust_rec_read_check_and_lock_alt(
 /*===================================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -426,13 +419,14 @@ lock_clust_rec_read_check_and_lock_alt(
 					SELECT FOR UPDATE */
 	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
 					LOCK_REC_NOT_GAP */
-	que_thr_t*		thr);	/*!< in: query thread */
+	que_thr_t*		thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Checks that a record is seen in a consistent read.
-@return TRUE if sees, or FALSE if an earlier version of the record
+@return true if sees, or false if an earlier version of the record
 should be retrieved */
 UNIV_INTERN
-ibool
+bool
 lock_clust_rec_cons_read_sees(
 /*==========================*/
 	const rec_t*	rec,	/*!< in: user record which should be read or
@@ -444,33 +438,44 @@ lock_clust_rec_cons_read_sees(
 Checks that a non-clustered index record is seen in a consistent read.
 
 NOTE that a non-clustered index page contains so little information on
-its modifications that also in the case FALSE, the present version of
+its modifications that also in the case false, the present version of
 rec may be the right, but we must check this from the clustered index
 record.
 
-@return TRUE if certainly sees, or FALSE if an earlier version of the
+@return true if certainly sees, or false if an earlier version of the
 clustered index record might be needed */
 UNIV_INTERN
-ulint
+bool
 lock_sec_rec_cons_read_sees(
 /*========================*/
 	const rec_t*		rec,	/*!< in: user record which
 					should be read or passed over
 					by a read cursor */
-	const read_view_t*	view);	/*!< in: consistent read view */
+	const read_view_t*	view)	/*!< in: consistent read view */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Locks the specified database table in the mode given. If the lock cannot
 be granted immediately, the query thread is put to wait.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_table(
 /*=======*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	enum lock_mode	mode,	/*!< in: lock mode */
-	que_thr_t*	thr);	/*!< in: query thread */
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+UNIV_INTERN
+void
+lock_table_ix_resurrect(
+/*====================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx);	/*!< in/out: transaction */
 /*************************************************************//**
 Removes a granted record lock of a transaction from the queue and grants
 locks to other transactions waiting in the queue if they now are entitled
@@ -479,19 +484,21 @@ UNIV_INTERN
 void
 lock_rec_unlock(
 /*============*/
-	trx_t*			trx,	/*!< in: transaction that has
+	trx_t*			trx,	/*!< in/out: transaction that has
 					set a record lock */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record */
 	enum lock_mode		lock_mode);/*!< in: LOCK_S or LOCK_X */
 /*********************************************************************//**
-Releases transaction locks, and releases possible other transactions waiting
-because of these locks. */
+Releases a transaction's locks, and releases possible other transactions
+waiting because of these locks. Change the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
 UNIV_INTERN
 void
-lock_release_off_kernel(
-/*====================*/
-	trx_t*	trx);	/*!< in: transaction */
+lock_trx_release_locks(
+/*===================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+
 /*********************************************************************//**
 Cancels a waiting lock request and releases possible other transactions
 waiting behind it. */
@@ -499,7 +506,7 @@ UNIV_INTERN
 void
 lock_cancel_waiting_and_release(
 /*============================*/
-	lock_t*	lock);	/*!< in: waiting lock request */
+	lock_t*	lock);	/*!< in/out: waiting lock request */
 
 /*********************************************************************//**
 Removes locks on a table to be dropped or truncated.
@@ -573,8 +580,9 @@ UNIV_INTERN
 ibool
 lock_is_table_exclusive(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx);	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	const trx_t*		trx)	/*!< in: transaction */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Checks if a lock request lock1 has to wait for request lock2.
 @return	TRUE if lock1 has to wait for lock2 to be removed */
@@ -588,18 +596,17 @@ lock_has_to_wait(
 				on the same record as in lock1 if the
 				locks are record locks */
 /*********************************************************************//**
-Checks that a transaction id is sensible, i.e., not in the future.
-@return	TRUE if ok */
+Reports that a transaction id is not sensible, i.e., it is in the future. */
 UNIV_INTERN
-ibool
-lock_check_trx_id_sanity(
-/*=====================*/
+void
+lock_report_trx_id_insanity(
+/*========================*/
 	trx_id_t	trx_id,		/*!< in: trx id */
 	const rec_t*	rec,		/*!< in: user record */
-	dict_index_t*	index,		/*!< in: clustered index */
+	dict_index_t*	index,		/*!< in: index */
 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
-	ibool		has_kernel_mutex);/*!< in: TRUE if the caller owns the
-					kernel mutex */
+	trx_id_t	max_trx_id)	/*!< in: trx_sys_get_max_trx_id() */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Prints info of a table lock. */
 UNIV_INTERN
@@ -618,16 +625,19 @@ lock_rec_print(
 	const lock_t*	lock);	/*!< in: record type lock */
 /*********************************************************************//**
 Prints info of locks for all transactions.
-@return FALSE if not able to obtain kernel mutex
-and exits without printing info */
+@return FALSE if not able to obtain lock mutex and exits without
+printing info */
 UNIV_INTERN
 ibool
 lock_print_info_summary(
 /*====================*/
 	FILE*	file,	/*!< in: file where to print */
-	ibool   nowait);/*!< in: whether to wait for the kernel mutex */
-/*************************************************************************
-Prints info of locks for each transaction. */
+	ibool   nowait)	/*!< in: whether to wait for the lock mutex */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
 UNIV_INTERN
 void
 lock_print_info_all_transactions(
@@ -636,27 +646,14 @@ lock_print_info_all_transactions(
 /*********************************************************************//**
 Return approximate number or record locks (bits set in the bitmap) for
 this transaction. Since delete-marked records may be removed, the
-record count will not be precise. */
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
 UNIV_INTERN
 ulint
 lock_number_of_rows_locked(
 /*=======================*/
-	const trx_t*	trx);	/*!< in: transaction */
-/*******************************************************************//**
-Check if a transaction holds any autoinc locks.
-@return TRUE if the transaction holds any AUTOINC locks. */
-UNIV_INTERN
-ibool
-lock_trx_holds_autoinc_locks(
-/*=========================*/
-	const trx_t*	trx);		/*!< in: transaction */
-/*******************************************************************//**
-Release all the transaction's autoinc locks. */
-UNIV_INTERN
-void
-lock_release_autoinc_locks(
-/*=======================*/
-	trx_t*		trx);		/*!< in/out: transaction */
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+	__attribute__((nonnull, warn_unused_result));
 
 /*******************************************************************//**
 Gets the type of a lock. Non-inline version for using outside of the
@@ -752,6 +749,115 @@ ulint
 lock_rec_get_page_no(
 /*=================*/
 	const lock_t*	lock);	/*!< in: lock */
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return	TRUE if locks exist */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+	const dict_table_t*	table);	/*!< in: check if there are any locks
+					held on records in this table or on the
+					table itself */
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the
+				user OS thread */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the
+				user OS thread */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx);			/*!< in/out: transaction */
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: trx lock state */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return	true if ok */
+UNIV_INTERN
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const ulint*	offsets)	/*!< in: rec_get_offsets(rec, index) */
+	__attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return	the strongest lock found on any sys table or NULL for none */
+UNIV_INTERN
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+	const trx_t*	trx)	/*!< in: transaction to check */
+	__attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Check if the transaction holds an exclusive lock on a record.
+@return	whether the locks are held */
+UNIV_INTERN
+bool
+lock_trx_has_rec_x_lock(
+/*====================*/
+	const trx_t*		trx,	/*!< in: transaction to check */
+	const dict_table_t*	table,	/*!< in: table to check */
+	const buf_block_t*	block,	/*!< in: buffer block of the record */
+	ulint			heap_no)/*!< in: record heap number */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
 
 /** Lock modes and types */
 /* @{ */
@@ -815,22 +921,76 @@ lock_rec_get_page_no(
 	((type_mode & (LOCK_CONV_BY_OTHER | LOCK_WAIT)) == LOCK_WAIT)
 
 /** Lock operation struct */
-typedef struct lock_op_struct	lock_op_t;
-/** Lock operation struct */
-struct lock_op_struct{
+struct lock_op_t{
 	dict_table_t*	table;	/*!< table to be locked */
 	enum lock_mode	mode;	/*!< lock mode */
 };
 
 /** The lock system struct */
-struct lock_sys_struct{
-	hash_table_t*	rec_hash;	/*!< hash table of the record locks */
+struct lock_sys_t{
+	ib_mutex_t	mutex;			/*!< Mutex protecting the
+						locks */
+	hash_table_t*	rec_hash;		/*!< hash table of the record
+						locks */
 	ulint		rec_num;
+	ib_mutex_t	wait_mutex;		/*!< Mutex protecting the
+						next two fields */
+	srv_slot_t*	waiting_threads;	/*!< Array of user threads
+						suspended while waiting for
+						locks within InnoDB, protected
+						by the lock_sys->wait_mutex */
+	srv_slot_t*	last_slot;		/*!< highest slot ever used
+						in the waiting_threads array,
+						protected by
+						lock_sys->wait_mutex */
+	ibool		rollback_complete;
+						/*!< TRUE if rollback of all
+						recovered transactions is
+						complete. Protected by
+						lock_sys->mutex */
+
+	ulint		n_lock_max_wait_time;	/*!< Max wait time */
+
+	os_event_t	timeout_event;		/*!< Set to the event that is
+						created in the lock wait monitor
+						thread. A value of 0 means the
+						thread is not active */
+
+	bool		timeout_thread_active;	/*!< True if the timeout thread
+						is running */
 };
 
 /** The lock system */
 extern lock_sys_t*	lock_sys;
 
+/** Test if lock_sys->mutex can be acquired without waiting. */
+#define lock_mutex_enter_nowait() mutex_enter_nowait(&lock_sys->mutex)
+
+/** Test if lock_sys->mutex is owned. */
+#define lock_mutex_own() mutex_own(&lock_sys->mutex)
+
+/** Acquire the lock_sys->mutex. */
+#define lock_mutex_enter() do {			\
+	mutex_enter(&lock_sys->mutex);		\
+} while (0)
+
+/** Release the lock_sys->mutex. */
+#define lock_mutex_exit() do {			\
+	mutex_exit(&lock_sys->mutex);		\
+} while (0)
+
+/** Test if lock_sys->wait_mutex is owned. */
+#define lock_wait_mutex_own() mutex_own(&lock_sys->wait_mutex)
+
+/** Acquire the lock_sys->wait_mutex. */
+#define lock_wait_mutex_enter() do {		\
+	mutex_enter(&lock_sys->wait_mutex);	\
+} while (0)
+
+/** Release the lock_sys->wait_mutex. */
+#define lock_wait_mutex_exit() do {		\
+	mutex_exit(&lock_sys->wait_mutex);	\
+} while (0)
 
 #ifndef UNIV_NONINL
 #include "lock0lock.ic"
diff --git a/storage/xtradb/include/lock0lock.ic b/storage/xtradb/include/lock0lock.ic
index 4e6c0c1b78c..736936954cb 100644
--- a/storage/xtradb/include/lock0lock.ic
+++ b/storage/xtradb/include/lock0lock.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -68,35 +68,6 @@ lock_rec_hash(
 }
 
 /*********************************************************************//**
-Checks if some transaction has an implicit x-lock on a record in a clustered
-index.
-@return	transaction which has the x-lock, or NULL */
-UNIV_INLINE
-trx_t*
-lock_clust_rec_some_has_impl(
-/*=========================*/
-	const rec_t*		rec,	/*!< in: user record */
-	const dict_index_t*	index,	/*!< in: clustered index */
-	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
-{
-	trx_id_t	trx_id;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(dict_index_is_clust(index));
-	ut_ad(page_rec_is_user_rec(rec));
-
-	trx_id = row_get_rec_trx_id(rec, index, offsets);
-
-	if (trx_is_active(trx_id)) {
-		/* The modifying or inserting transaction is active */
-
-		return(trx_get_on_id(trx_id));
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
 Gets the heap_no of the smallest user record on a page.
 @return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
 UNIV_INLINE
diff --git a/storage/xtradb/include/lock0priv.h b/storage/xtradb/include/lock0priv.h
index 491cad95329..e564387ec53 100644
--- a/storage/xtradb/include/lock0priv.h
+++ b/storage/xtradb/include/lock0priv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,9 +40,7 @@ those functions in lock/ */
 #include "ut0lst.h"
 
 /** A table lock */
-typedef struct lock_table_struct	lock_table_t;
-/** A table lock */
-struct lock_table_struct {
+struct lock_table_t {
 	dict_table_t*	table;		/*!< database table in dictionary
 					cache */
 	UT_LIST_NODE_T(lock_t)
@@ -51,9 +49,7 @@ struct lock_table_struct {
 };
 
 /** Record lock for a page */
-typedef struct lock_rec_struct		lock_rec_t;
-/** Record lock for a page */
-struct lock_rec_struct {
+struct lock_rec_t {
 	ulint	space;			/*!< space id */
 	ulint	page_no;		/*!< page number */
 	ulint	n_bits;			/*!< number of bits in the lock
@@ -62,8 +58,8 @@ struct lock_rec_struct {
 					lock struct */
 };
 
-/** Lock struct */
-struct lock_struct {
+/** Lock struct; protected by lock_sys->mutex */
+struct lock_t {
 	trx_t*		trx;		/*!< transaction owning the
 					lock */
 	UT_LIST_NODE_T(lock_t)
@@ -101,6 +97,19 @@ lock_rec_get_prev(
 	const lock_t*	in_lock,/*!< in: record lock */
 	ulint		heap_no);/*!< in: heap number of the record */
 
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return	transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+	__attribute__((nonnull, warn_unused_result));
+
 #ifndef UNIV_NONINL
 #include "lock0priv.ic"
 #endif
diff --git a/storage/xtradb/include/lock0priv.ic b/storage/xtradb/include/lock0priv.ic
index 98b2189680c..6b70dc33d3c 100644
--- a/storage/xtradb/include/lock0priv.ic
+++ b/storage/xtradb/include/lock0priv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,8 +24,8 @@ Created July 16, 2007 Vasil Dimov
 *******************************************************/
 
 /* This file contains only methods which are used in
-lock/lock0* files, other than lock/lock0lock.c.
-I.e. lock/lock0lock.c contains more internal inline
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
 methods but they are used only in that file. */
 
 #ifndef LOCK_MODULE_IMPLEMENTATION
@@ -46,4 +46,22 @@ lock_get_type_low(
 	return(lock->type_mode & LOCK_TYPE_MASK);
 }
 
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index. Delegates to row_get_rec_trx_id() after asserting the preconditions.
+@return	transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ut_ad(dict_index_is_clust(index));	/* index must be clustered */
+	ut_ad(page_rec_is_user_rec(rec));	/* rec must be a user record */
+
+	return(row_get_rec_trx_id(rec, index, offsets));
+}
+
 /* vim: set filetype=c: */
diff --git a/storage/xtradb/include/lock0types.h b/storage/xtradb/include/lock0types.h
index 2eb71e2939f..cf32e72f864 100644
--- a/storage/xtradb/include/lock0types.h
+++ b/storage/xtradb/include/lock0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,8 +27,8 @@ Created 5/7/1996 Heikki Tuuri
 #define lock0types_h
 
 #define lock_t ib_lock_t
-typedef struct lock_struct	lock_t;
-typedef struct lock_sys_struct	lock_sys_t;
+struct lock_t;
+struct lock_sys_t;
 
 /* Basic lock modes */
 enum lock_mode {
@@ -39,7 +39,9 @@ enum lock_mode {
 	LOCK_AUTO_INC,	/* locks the auto-inc counter of a table
 			in an exclusive mode */
 	LOCK_NONE,	/* this is used elsewhere to note consistent read */
-	LOCK_NUM = LOCK_NONE/* number of lock modes */
+	LOCK_NUM = LOCK_NONE, /* number of lock modes */
+	LOCK_NONE_UNSET = 255
 };
 
+
 #endif
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
index 31afe5d8555..bab256e5a65 100644
--- a/storage/xtradb/include/log0log.h
+++ b/storage/xtradb/include/log0log.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2009, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -43,11 +43,14 @@ Created 12/9/1995 Heikki Tuuri
 
 /* Type used for all log sequence number storage and arithmetics */
 typedef	ib_uint64_t		lsn_t;
+#define LSN_MAX			IB_UINT64_MAX
+
+#define LSN_PF			UINT64PF
 
 /** Redo log buffer */
-typedef struct log_struct	log_t;
+struct log_t;
 /** Redo log group */
-typedef struct log_group_struct	log_group_t;
+struct log_group_t;
 
 #ifdef UNIV_DEBUG
 /** Flag: write to log file? */
@@ -59,25 +62,27 @@ extern	ibool	log_debug_writes;
 # define log_do_write TRUE
 #endif /* UNIV_DEBUG */
 
+/** Magic value to use instead of log checksums when they are disabled */
+#define LOG_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+typedef ulint (*log_checksum_func_t)(const byte* log_block);
+
+/** Pointer to the log checksum calculation function. Protected with
+log_sys->mutex. */
+extern log_checksum_func_t log_checksum_algorithm_ptr;
+
 /** Wait modes for log_write_up_to @{ */
 #define LOG_NO_WAIT		91
 #define LOG_WAIT_ONE_GROUP	92
 #define	LOG_WAIT_ALL_GROUPS	93
 /* @} */
-/** Maximum number of log groups in log_group_struct::checkpoint_buf */
+/** Maximum number of log groups in log_group_t::checkpoint_buf */
 #define LOG_MAX_N_GROUPS	32
 
-#ifndef UNIV_HOTBACKUP
-/****************************************************************//**
-Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
-so that we know that the limit has been written to a log checkpoint field
-on disk. */
-UNIV_INTERN
-void
-log_fsp_current_free_limit_set_and_checkpoint(
-/*==========================================*/
-	ulint	limit);	/*!< in: limit to set */
-#endif /* !UNIV_HOTBACKUP */
+#define IB_ARCHIVED_LOGS_PREFIX		"ib_log_archive_"
+#define IB_ARCHIVED_LOGS_PREFIX_LEN	(sizeof(IB_ARCHIVED_LOGS_PREFIX) - 1)
+#define IB_ARCHIVED_LOGS_SERIAL_LEN	20
+
 /*******************************************************************//**
 Calculates where in log files we find a specified lsn.
 @return	log file number */
@@ -101,12 +106,12 @@ Writes to the log the string given. The log must be released with
 log_release.
 @return	end lsn of the log record, zero if did not succeed */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_reserve_and_write_fast(
 /*=======================*/
 	const void*	str,	/*!< in: string */
 	ulint		len,	/*!< in: string length */
-	ib_uint64_t*	start_lsn);/*!< out: start lsn of the log record */
+	lsn_t*		start_lsn);/*!< out: start lsn of the log record */
 /***********************************************************************//**
 Releases the log mutex. */
 UNIV_INLINE
@@ -127,7 +132,7 @@ Locks the log mutex and opens the log for log_write_low. The log must be closed
 with log_close and released with log_release.
 @return start lsn of the log record */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_reserve_and_open(
 /*=================*/
 	ulint	len);	/*!< in: length of data to be catenated */
@@ -135,7 +140,7 @@ log_reserve_and_open(
 Opens the log for log_write_low. The log must be closed with log_close.
 @return	start lsn of the log record */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_open(
 /*=====*/
 	ulint	len);	/*!< in: length of data to be catenated */
@@ -152,14 +157,14 @@ log_write_low(
 Closes the log.
 @return	lsn */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_close(void);
 /*===========*/
 /************************************************************//**
 Gets the current lsn.
 @return	current lsn */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_get_lsn(void);
 /*=============*/
 /************************************************************//**
@@ -174,9 +179,17 @@ Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.
 @return	log group capacity */
 UNIV_INLINE
-ulint
+lsn_t
 log_get_capacity(void);
 /*==================*/
+/****************************************************************//**
+Get log_sys::max_modified_age_async. It is OK to read the value without
+holding log_sys::mutex because it is constant.
+@return	max_modified_age_async */
+UNIV_INLINE
+lsn_t
+log_get_max_modified_age_async(void);
+/*================================*/
 /******************************************************//**
 Initializes the log. */
 UNIV_INTERN
@@ -191,7 +204,7 @@ log_group_init(
 /*===========*/
 	ulint	id,			/*!< in: group id */
 	ulint	n_files,		/*!< in: number of log files */
-	ulint	file_size,		/*!< in: log file size in bytes */
+	lsn_t	file_size,		/*!< in: log file size in bytes */
 	ulint	space_id,		/*!< in: space id of the file space
 					which contains the log files of this
 					group */
@@ -216,14 +229,13 @@ UNIV_INTERN
 void
 log_write_up_to(
 /*============*/
-	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
-				the log should be written,
-				IB_ULONGLONG_MAX if not specified */
-	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
-				or LOG_WAIT_ALL_GROUPS */
-	ibool		flush_to_disk);
-				/*!< in: TRUE if we want the written log
-				also to be flushed to disk */
+	lsn_t	lsn,	/*!< in: log sequence number up to which
+			the log should be written, LSN_MAX if not specified */
+	ulint	wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+			or LOG_WAIT_ALL_GROUPS */
+	ibool	flush_to_disk);
+			/*!< in: TRUE if we want the written log
+			also to be flushed to disk */
 /****************************************************************//**
 Does a syncronous flush of the log buffer to disk. */
 UNIV_INTERN
@@ -240,21 +252,6 @@ void
 log_buffer_sync_in_background(
 /*==========================*/
 	ibool	flush);	/*<! in: flush the logs to disk */
-/****************************************************************//**
-Advances the smallest lsn for which there are unflushed dirty blocks in the
-buffer pool and also may make a new checkpoint. NOTE: this function may only
-be called if the calling thread owns no synchronization objects!
-@return FALSE if there was a flush batch of the same type running,
-which means that we could not start this flush batch */
-UNIV_INTERN
-ibool
-log_preflush_pool_modified_pages(
-/*=============================*/
-	ib_uint64_t	new_oldest,	/*!< in: try to advance
-					oldest_modified_lsn at least
-					to this lsn */
-	ibool		sync);		/*!< in: TRUE if synchronous
-					operation is desired */
 /******************************************************//**
 Makes a checkpoint. Note that this function does not flush dirty
 blocks from the buffer pool: it only checks what is lsn of the oldest
@@ -282,16 +279,16 @@ UNIV_INTERN
 void
 log_make_checkpoint_at(
 /*===================*/
-	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
-					later lsn, if IB_ULONGLONG_MAX, makes
-					a checkpoint at the latest lsn */
-	ibool		write_always);	/*!< in: the function normally checks if
-					the new checkpoint would have a
-					greater lsn than the previous one: if
-					not, then no physical write is done;
-					by setting this parameter TRUE, a
-					physical write will always be made to
-					log files */
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if LSN_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always);	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
 /****************************************************************//**
 Disable checkpoints. This is used when doing a volume snapshot
 to ensure that we don't get checkpoint between snapshoting two
@@ -329,8 +326,7 @@ log_checkpoint_get_nth_group_info(
 /*==============================*/
 	const byte*	buf,	/*!< in: buffer containing checkpoint info */
 	ulint		n,	/*!< in: nth slot */
-	ulint*		file_no,/*!< out: archived file number */
-	ulint*		offset);/*!< out: archived file offset */
+	lsn_t*		file_no);/*!< out: archived file number */
 /******************************************************//**
 Writes checkpoint info to groups. */
 UNIV_INTERN
@@ -386,8 +382,18 @@ void
 log_archived_file_name_gen(
 /*=======================*/
 	char*	buf,	/*!< in: buffer where to write */
+	ulint	buf_len,/*!< in: buffer length */
 	ulint	id,	/*!< in: group id */
-	ulint	file_no);/*!< in: file number */
+	lsn_t	file_no);/*!< in: file number */
+
+UNIV_INTERN
+void
+log_archived_get_offset(
+/*====================*/
+	log_group_t*	group,		/*!< in: log group */
+	lsn_t		file_no,	/*!< in: archive log file number */
+	lsn_t		archived_lsn,	/*!< in: last archived LSN */
+	lsn_t*		offset);	/*!< out: offset within archived file */
 #else /* !UNIV_HOTBACKUP */
 /******************************************************//**
 Writes info to a buffer of a log group when log files are created in
@@ -421,8 +427,8 @@ log_group_read_log_seg(
 	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
 	byte*		buf,		/*!< in: buffer where to read */
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	start_lsn,	/*!< in: read area start */
-	ib_uint64_t	end_lsn,	/*!< in: read area end */
+	lsn_t		start_lsn,	/*!< in: read area start */
+	lsn_t		end_lsn,	/*!< in: read area end */
 	ibool		release_mutex);	/*!< in: whether the log_sys->mutex
 				        should be released before the read */
 /******************************************************//**
@@ -435,7 +441,7 @@ log_group_write_buf(
 	byte*		buf,		/*!< in: buffer */
 	ulint		len,		/*!< in: buffer len; must be divisible
 					by OS_FILE_LOG_BLOCK_SIZE */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
 					be divisible by
 					OS_FILE_LOG_BLOCK_SIZE */
 	ulint		new_data_offset);/*!< in: start offset of new data in
@@ -451,14 +457,14 @@ void
 log_group_set_fields(
 /*=================*/
 	log_group_t*	group,	/*!< in/out: group */
-	ib_uint64_t	lsn);	/*!< in: lsn for which the values should be
+	lsn_t		lsn);	/*!< in: lsn for which the values should be
 				set */
 /******************************************************//**
 Calculates the data capacity of a log group, when the log file headers are not
 included.
 @return	capacity in bytes */
 UNIV_INTERN
-ulint
+lsn_t
 log_group_get_capacity(
 /*===================*/
 	const log_group_t*	group);	/*!< in: log group */
@@ -550,8 +556,8 @@ UNIV_INLINE
 void
 log_block_init(
 /*===========*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn);		/*!< in: lsn within the log block */
 /************************************************************//**
 Initializes a log block in the log buffer in the old, < 3.23.52 format, where
 there was no checksum yet. */
@@ -559,8 +565,8 @@ UNIV_INLINE
 void
 log_block_init_in_old_format(
 /*=========================*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn);		/*!< in: lsn within the log block */
 /************************************************************//**
 Converts a lsn to a log block number.
 @return	log block number, it is > 0 and <= 1G */
@@ -568,7 +574,7 @@ UNIV_INLINE
 ulint
 log_block_convert_lsn_to_no(
 /*========================*/
-	ib_uint64_t	lsn);	/*!< in: lsn of a byte within the block */
+	lsn_t	lsn);	/*!< in: lsn of a byte within the block */
 /******************************************************//**
 Prints info of the log. */
 UNIV_INTERN
@@ -583,20 +589,26 @@ UNIV_INTERN
 ibool
 log_peek_lsn(
 /*=========*/
-	ib_uint64_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
+	lsn_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
 /**********************************************************************//**
 Refreshes the statistics used to print per-second averages. */
 UNIV_INTERN
 void
 log_refresh_stats(void);
 /*===================*/
-/**********************************************************
+/********************************************************//**
+Closes all log groups. */
+UNIV_INTERN
+void
+log_group_close_all(void);
+/*=====================*/
+/********************************************************//**
 Shutdown the log system but do not release all the memory. */
 UNIV_INTERN
 void
 log_shutdown(void);
 /*==============*/
-/**********************************************************
+/********************************************************//**
 Free the log system data structures. */
 UNIV_INTERN
 void
@@ -614,7 +626,7 @@ extern log_t*	log_sys;
 #define LOG_RECOVER	98887331
 
 /* The counting of lsn's starts from this value: this must be non-zero */
-#define LOG_START_LSN		((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+#define LOG_START_LSN		((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
 
 #define LOG_BUFFER_SIZE		(srv_log_buffer_size * UNIV_PAGE_SIZE)
 #define LOG_ARCHIVE_BUF_SIZE	(srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
@@ -661,7 +673,7 @@ extern log_t*	log_sys;
 /* Offsets for a checkpoint field */
 #define LOG_CHECKPOINT_NO		0
 #define LOG_CHECKPOINT_LSN		8
-#define LOG_CHECKPOINT_OFFSET		16
+#define LOG_CHECKPOINT_OFFSET_LOW32	16
 #define LOG_CHECKPOINT_LOG_BUF_SIZE	20
 #define	LOG_CHECKPOINT_ARCHIVED_LSN	24
 #define	LOG_CHECKPOINT_GROUP_ARRAY	32
@@ -675,22 +687,38 @@ extern log_t*	log_sys;
 							+ LOG_MAX_N_GROUPS * 8)
 #define LOG_CHECKPOINT_CHECKSUM_1	LOG_CHECKPOINT_ARRAY_END
 #define LOG_CHECKPOINT_CHECKSUM_2	(4 + LOG_CHECKPOINT_ARRAY_END)
+#if 0
 #define LOG_CHECKPOINT_FSP_FREE_LIMIT	(8 + LOG_CHECKPOINT_ARRAY_END)
-					/* current fsp free limit in
+					/*!< Not used (0);
+					This used to contain the
+					current fsp free limit in
 					tablespace 0, in units of one
-					megabyte; this information is only used
-					by ibbackup to decide if it can
-					truncate unused ends of
-					non-auto-extending data files in space
-					0 */
+					megabyte.
+
+					This information might have been used
+					since ibbackup version 0.35 but
+					before 1.41 to decide if unused ends of
+					non-auto-extending data files
+					in space 0 can be truncated.
+
+					This information was made obsolete
+					by ibbackup --compress. */
 #define LOG_CHECKPOINT_FSP_MAGIC_N	(12 + LOG_CHECKPOINT_ARRAY_END)
-					/* this magic number tells if the
+					/*!< Not used (0);
+					This magic number tells if the
 					checkpoint contains the above field:
 					the field was added to
-					InnoDB-3.23.50 */
-#define LOG_CHECKPOINT_SIZE		(16 + LOG_CHECKPOINT_ARRAY_END)
-
+					InnoDB-3.23.50 and
+					removed from MySQL 5.6 */
 #define LOG_CHECKPOINT_FSP_MAGIC_N_VAL	1441231243
+					/*!< if LOG_CHECKPOINT_FSP_MAGIC_N
+					contains this value, then
+					LOG_CHECKPOINT_FSP_FREE_LIMIT
+					is valid */
+#endif
+#define LOG_CHECKPOINT_OFFSET_HIGH32	(16 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_SIZE		(20 + LOG_CHECKPOINT_ARRAY_END)
+
 
 /* Offsets of a log file header */
 #define LOG_GROUP_ID		0	/* log group number */
@@ -739,19 +767,19 @@ extern log_t*	log_sys;
 
 /** Log group consists of a number of log files, each of the same size; a log
 group is implemented as a space in the sense of the module fil0fil. */
-struct log_group_struct{
+struct log_group_t{
 	/* The following fields are protected by log_sys->mutex */
 	ulint		id;		/*!< log group id */
 	ulint		n_files;	/*!< number of files in the group */
-	ulint		file_size;	/*!< individual log file size in bytes,
+	lsn_t		file_size;	/*!< individual log file size in bytes,
 					including the log file header */
 	ulint		space_id;	/*!< file space which implements the log
 					group */
 	ulint		state;		/*!< LOG_GROUP_OK or
 					LOG_GROUP_CORRUPTED */
-	ib_uint64_t	lsn;		/*!< lsn used to fix coordinates within
+	lsn_t		lsn;		/*!< lsn used to fix coordinates within
 					the log group */
-	ulint		lsn_offset;	/*!< the offset of the above lsn */
+	lsn_t		lsn_offset;	/*!< the offset of the above lsn */
 	ulint		n_pending_writes;/*!< number of currently pending flush
 					writes for this log group */
 	byte**		file_header_bufs_ptr;/*!< unaligned buffers */
@@ -765,22 +793,22 @@ struct log_group_struct{
 	ulint		archive_space_id;/*!< file space which
 					implements the log group
 					archive */
-	ulint		archived_file_no;/*!< file number corresponding to
+	lsn_t		archived_file_no;/*!< file number corresponding to
 					log_sys->archived_lsn */
-	ulint		archived_offset;/*!< file offset corresponding to
+	lsn_t		archived_offset;/*!< file offset corresponding to
 					log_sys->archived_lsn, 0 if we have
 					not yet written to the archive file
 					number archived_file_no */
-	ulint		next_archived_file_no;/*!< during an archive write,
+	lsn_t		next_archived_file_no;/*!< during an archive write,
 					until the write is completed, we
 					store the next value for
 					archived_file_no here: the write
 					completion function then sets the new
 					value to ..._file_no */
-	ulint		next_archived_offset; /*!< like the preceding field */
+	lsn_t		next_archived_offset; /*!< like the preceding field */
 #endif /* UNIV_LOG_ARCHIVE */
 	/*-----------------------------*/
-	ib_uint64_t	scanned_lsn;	/*!< used only in recovery: recovery scan
+	lsn_t		scanned_lsn;	/*!< used only in recovery: recovery scan
 					succeeded up to this lsn in this log
 					group */
 	byte*		checkpoint_buf_ptr;/*!< unaligned checkpoint header */
@@ -791,17 +819,17 @@ struct log_group_struct{
 };
 
 /** Redo log buffer */
-struct log_struct{
+struct log_t{
 	byte		pad[64];	/*!< padding to prevent other memory
 					update hotspots from residing on the
 					same memory cache line */
-	ib_uint64_t	lsn;		/*!< log sequence number */
+	lsn_t		lsn;		/*!< log sequence number */
 	ulint		buf_free;	/*!< first free offset within the log
 					buffer */
 #ifndef UNIV_HOTBACKUP
-	mutex_t		mutex;		/*!< mutex protecting the log */
+	ib_prio_mutex_t		mutex;		/*!< mutex protecting the log */
 
-	mutex_t		log_flush_order_mutex;/*!< mutex to serialize access to
+	ib_mutex_t		log_flush_order_mutex;/*!< mutex to serialize access to
 					the flush list when we are putting
 					dirty blocks in the list. The idea
 					behind this mutex is to be able
@@ -816,12 +844,14 @@ struct log_struct{
 	ulint		max_buf_free;	/*!< recommended maximum value of
 					buf_free, after which the buffer is
 					flushed */
+ #ifdef UNIV_LOG_DEBUG
 	ulint		old_buf_free;	/*!< value of buf free when log was
 					last time opened; only in the debug
 					version */
 	ib_uint64_t	old_lsn;	/*!< value of lsn when log was
 					last time opened; only in the
 					debug version */
+#endif /* UNIV_LOG_DEBUG */
 	ibool		check_flush_or_checkpoint;
 					/*!< this is set to TRUE when there may
 					be need to flush the log buffer, or
@@ -844,13 +874,13 @@ struct log_struct{
 					later; this is advanced when a flush
 					operation is completed to all the log
 					groups */
-	ib_uint64_t	written_to_some_lsn;
+	lsn_t		written_to_some_lsn;
 					/*!< first log sequence number not yet
 					written to any log group; for this to
 					be advanced, it is enough that the
 					write i/o has been completed for any
 					one log group */
-	ib_uint64_t	written_to_all_lsn;
+	lsn_t		written_to_all_lsn;
 					/*!< first log sequence number not yet
 					written to some log group; for this to
 					be advanced, it is enough that the
@@ -866,16 +896,16 @@ struct log_struct{
 					flushed_to_disk_lsn or
 					write_lsn which are always
 					up-to-date and accurate. */
-	ib_uint64_t	write_lsn;	/*!< end lsn for the current running
+	lsn_t		write_lsn;	/*!< end lsn for the current running
 					write */
 	ulint		write_end_offset;/*!< the data in buffer has
 					been written up to this offset
 					when the current write ends:
 					this field will then be copied
 					to buf_next_to_write */
-	ib_uint64_t	current_flush_lsn;/*!< end lsn for the current running
+	lsn_t		current_flush_lsn;/*!< end lsn for the current running
 					write + flush operation */
-	ib_uint64_t	flushed_to_disk_lsn;
+	lsn_t		flushed_to_disk_lsn;
 					/*!< how far we have written the log
 					AND flushed to disk */
 	ulint		n_pending_writes;/*!< number of currently
@@ -912,42 +942,37 @@ struct log_struct{
 	/* @} */
 
 	/** Fields involved in checkpoints @{ */
-	ulint		log_group_capacity; /*!< capacity of the log group; if
+	lsn_t		log_group_capacity; /*!< capacity of the log group; if
 					the checkpoint age exceeds this, it is
 					a serious error because it is possible
 					we will then overwrite log and spoil
 					crash recovery */
-	ulint		max_modified_age_async;
+	lsn_t		max_modified_age_async;
 					/*!< when this recommended
 					value for lsn -
 					buf_pool_get_oldest_modification()
 					is exceeded, we start an
 					asynchronous preflush of pool pages */
-	ulint		max_modified_age_sync;
+	lsn_t		max_modified_age_sync;
 					/*!< when this recommended
 					value for lsn -
 					buf_pool_get_oldest_modification()
 					is exceeded, we start a
 					synchronous preflush of pool pages */
-	ulint		adm_checkpoint_interval;
-					/*!< administrator-specified checkpoint
-					interval in terms of log growth in
-					bytes; the interval actually used by
-					the database can be smaller */
-	ulint		max_checkpoint_age_async;
+	lsn_t		max_checkpoint_age_async;
 					/*!< when this checkpoint age
 					is exceeded we start an
 					asynchronous writing of a new
 					checkpoint */
-	ulint		max_checkpoint_age;
+	lsn_t		max_checkpoint_age;
 					/*!< this is the maximum allowed value
 					for lsn - last_checkpoint_lsn when a
 					new query step is started */
 	ib_uint64_t	next_checkpoint_no;
 					/*!< next checkpoint number */
-	ib_uint64_t	last_checkpoint_lsn;
+	lsn_t		last_checkpoint_lsn;
 					/*!< latest checkpoint lsn */
-	ib_uint64_t	next_checkpoint_lsn;
+	lsn_t		next_checkpoint_lsn;
 					/*!< next checkpoint lsn */
 	ulint		n_pending_checkpoint_writes;
 					/*!< number of currently pending
@@ -965,16 +990,16 @@ struct log_struct{
 	/** Fields involved in archiving @{ */
 	ulint		archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING
 					LOG_ARCH_STOPPED, LOG_ARCH_OFF */
-	ib_uint64_t	archived_lsn;	/*!< archiving has advanced to this
+	lsn_t		archived_lsn;	/*!< archiving has advanced to this
 					lsn */
-	ulint		max_archived_lsn_age_async;
+	lsn_t		max_archived_lsn_age_async;
 					/*!< recommended maximum age of
 					archived_lsn, before we start
 					asynchronous copying to the archive */
-	ulint		max_archived_lsn_age;
+	lsn_t		max_archived_lsn_age;
 					/*!< maximum allowed age for
 					archived_lsn */
-	ib_uint64_t	next_archived_lsn;/*!< during an archive write,
+	lsn_t		next_archived_lsn;/*!< during an archive write,
 					until the write is completed, we
 					store the next value for
 					archived_lsn here: the write
@@ -990,6 +1015,7 @@ struct log_struct{
 					should wait for this without owning
 					the log mutex */
 	ulint		archive_buf_size;/*!< size of archive_buf */
+	byte*		archive_buf_ptr;/*!< unaligned archive_buf */
 	byte*		archive_buf;	/*!< log segment is written to the
 					archive from this buffer */
 	os_event_t	archiving_on;	/*!< if archiving has been stopped,
@@ -997,7 +1023,7 @@ struct log_struct{
 					become signaled */
 	/* @} */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t	tracked_lsn;	/*!< log tracking has advanced to this
+	lsn_t		tracked_lsn;	/*!< log tracking has advanced to this
 					lsn.  Field accessed atomically where
 					64-bit atomic ops are supported,
 					protected by the log sys mutex
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
index 0088df41225..7724d94b51a 100644
--- a/storage/xtradb/include/log0log.ic
+++ b/storage/xtradb/include/log0log.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,9 @@ Created 12/9/1995 Heikki Tuuri
 #include "os0file.h"
 #include "mach0data.h"
 #include "mtr0mtr.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "ut0crc32.h"
 
 #ifdef UNIV_LOG_DEBUG
 /******************************************************//**
@@ -192,13 +195,13 @@ UNIV_INLINE
 ulint
 log_block_convert_lsn_to_no(
 /*========================*/
-	ib_uint64_t	lsn)	/*!< in: lsn of a byte within the block */
+	lsn_t	lsn)	/*!< in: lsn of a byte within the block */
 {
 	return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1);
 }
 
 /************************************************************//**
-Calculates the checksum for a log block.
+Calculates the checksum for a log block using the current algorithm.
 @return	checksum */
 UNIV_INLINE
 ulint
@@ -206,6 +209,17 @@ log_block_calc_checksum(
 /*====================*/
 	const byte*	block)	/*!< in: log block */
 {
+	return(log_checksum_algorithm_ptr(block));
+}
+/************************************************************//**
+Calculates the checksum for a log block using the default InnoDB algorithm.
+@return	checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum_innodb(
+/*===========================*/
+	const byte*	block)	/*!< in: log block */
+{
 	ulint	sum;
 	ulint	sh;
 	ulint	i;
@@ -228,6 +242,30 @@ log_block_calc_checksum(
 }
 
 /************************************************************//**
+Calculates the checksum for a log block using the CRC32 algorithm.
+@return	ut_crc32() of the block minus its last LOG_BLOCK_TRL_SIZE bytes */
+UNIV_INLINE
+ulint
+log_block_calc_checksum_crc32(
+/*==========================*/
+	const byte*	block)	/*!< in: log block */
+{
+	return(ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE));
+}
+
+/************************************************************//**
+Calculates the checksum for a log block using the "no-op" algorithm.
+@return	LOG_NO_CHECKSUM_MAGIC, regardless of the block contents */
+UNIV_INLINE
+ulint
+log_block_calc_checksum_none(
+/*=========================*/
+	const byte*	block)	/*!< in: log block (not read) */
+{
+	return(LOG_NO_CHECKSUM_MAGIC);
+}
+
+/************************************************************//**
 Gets a log block checksum field value.
 @return	checksum */
 UNIV_INLINE
@@ -260,8 +298,8 @@ UNIV_INLINE
 void
 log_block_init(
 /*===========*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn)		/*!< in: lsn within the log block */
 {
 	ulint	no;
 
@@ -282,8 +320,8 @@ UNIV_INLINE
 void
 log_block_init_in_old_format(
 /*=========================*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn)		/*!< in: lsn within the log block */
 {
 	ulint	no;
 
@@ -304,12 +342,12 @@ Writes to the log the string given. The log must be released with
 log_release.
 @return	end lsn of the log record, zero if did not succeed */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_reserve_and_write_fast(
 /*=======================*/
 	const void*	str,	/*!< in: string */
 	ulint		len,	/*!< in: string length */
-	ib_uint64_t*	start_lsn)/*!< out: start lsn of the log record */
+	lsn_t*		start_lsn)/*!< out: start lsn of the log record */
 {
 	ulint		data_len;
 #ifdef UNIV_LOG_LSN_DEBUG
@@ -374,6 +412,9 @@ log_reserve_and_write_fast(
 
 	log_sys->lsn += len;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 #ifdef UNIV_LOG_DEBUG
 	log_check_log_recs(log_sys->buf + log_sys->old_buf_free,
 			   log_sys->buf_free - log_sys->old_buf_free,
@@ -411,11 +452,11 @@ log_release(void)
 Gets the current lsn.
 @return	current lsn */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_get_lsn(void)
 /*=============*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -450,13 +491,25 @@ Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.
 @return	log group capacity */
 UNIV_INLINE
-ulint
+lsn_t
 log_get_capacity(void)
 /*==================*/
 {
 	return(log_sys->log_group_capacity);
 }
 
+/****************************************************************
+Get log_sys::max_modified_age_async. It is OK to read the value without
+holding log_sys::mutex because it is constant.
+@return	max_modified_age_async */
+UNIV_INLINE
+lsn_t
+log_get_max_modified_age_async(void)
+/*================================*/
+{
+	return(log_sys->max_modified_age_async);
+}
+
 /***********************************************************************//**
 Checks if there is need for a log buffer flush or a new checkpoint, and does
 this if yes. Any database operation should call this when it has modified
diff --git a/storage/xtradb/include/log0online.h b/storage/xtradb/include/log0online.h
index a20eef57d7a..1ef4df7d6da 100644
--- a/storage/xtradb/include/log0online.h
+++ b/storage/xtradb/include/log0online.h
@@ -26,6 +26,7 @@ Online database log parsing for changed page tracking
 
 #include "univ.i"
 #include "os0file.h"
+#include "log0log.h"
 
 /** Single bitmap file information */
 typedef struct log_online_bitmap_file_struct log_online_bitmap_file_t;
@@ -109,9 +110,9 @@ ibool
 log_online_bitmap_iterator_init(
 /*============================*/
 	log_bitmap_iterator_t	*i,		/*!<in/out:  iterator */
-	ib_uint64_t		min_lsn,	/*!<in: start LSN for the
+	lsn_t			min_lsn,	/*!<in: start LSN for the
 						iterator */
-	ib_uint64_t		max_lsn);	/*!<in: end LSN for the
+	lsn_t			max_lsn);	/*!<in: end LSN for the
 						iterator */
 
 /*********************************************************************//**
@@ -138,7 +139,7 @@ struct log_online_bitmap_file_struct {
 	char		name[FN_REFLEN];	/*!< Name with full path */
 	os_file_t	file;			/*!< Handle to opened file */
 	ib_uint64_t	size;			/*!< Size of the file */
-	ib_uint64_t	offset;			/*!< Offset of the next read,
+	os_offset_t	offset;			/*!< Offset of the next read,
 						or count of already-read bytes
 						*/
 };
@@ -147,12 +148,12 @@ struct log_online_bitmap_file_struct {
 struct log_online_bitmap_file_range_struct {
 	size_t	count;					/*!< Number of files */
 	/*!< Dynamically-allocated array of info about individual files */
-	struct {
-		char		name[FN_REFLEN];	/*!< Name of a file */
-		ib_uint64_t	start_lsn;		/*!< Starting LSN of
-						        data in	this file */
-		ulong		seq_num;		/*!< Sequence number of
-							this file */
+	struct files_t {
+		char	name[FN_REFLEN];	/*!< Name of a file */
+		lsn_t	start_lsn;		/*!< Starting LSN of data in
+						this file */
+		ulong	seq_num;		/*!< Sequence number of	this
+						file */
 	}	*files;
 };
 
@@ -171,9 +172,9 @@ struct log_bitmap_iterator_struct
 	ib_uint32_t			bit_offset;	/*!< bit offset inside
 							the current bitmap
 							block */
-	ib_uint64_t			start_lsn;	/*!< Start LSN of the
+	lsn_t				start_lsn;	/*!< Start LSN of the
 							current bitmap block */
-	ib_uint64_t			end_lsn;	/*!< End LSN of the
+	lsn_t				end_lsn;	/*!< End LSN of the
 							current bitmap block */
 	ib_uint32_t			space_id;	/*!< Current block
 							space id */
diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h
index ad30f6862c2..a1653c10999 100644
--- a/storage/xtradb/include/log0recv.h
+++ b/storage/xtradb/include/log0recv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -50,7 +50,7 @@ UNIV_INTERN
 ib_uint64_t
 recv_calc_lsn_on_data_add(
 /*======================*/
-	ib_uint64_t	lsn,	/*!< in: old lsn */
+	lsn_t		lsn,	/*!< in: old lsn */
 	ib_uint64_t	len);	/*!< in: this many bytes of data is
 				added, log block headers not included */
 
@@ -62,19 +62,17 @@ Reads the checkpoint info needed in hot backup.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
-recv_read_cp_info_for_backup(
-/*=========================*/
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
 	const byte*	hdr,	/*!< in: buffer containing the log group
 				header */
-	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
-	ulint*		offset,	/*!< out: checkpoint offset in the log group */
-	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
-				1000000000 if the database is running
-				with < version 3.23.50 of InnoDB */
-	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
-	ib_uint64_t*	first_header_lsn);
+	lsn_t*		lsn,	/*!< out: checkpoint lsn */
+	lsn_t*		offset,	/*!< out: checkpoint offset in the log group */
+	lsn_t*		cp_no,	/*!< out: checkpoint number */
+	lsn_t*		first_header_lsn)
 				/*!< out: lsn of of the start of the
 				first log file */
+	__attribute__((nonnull));
 /*******************************************************************//**
 Scans the log segment and n_bytes_scanned is set to the length of valid
 log scanned. */
@@ -84,7 +82,7 @@ recv_scan_log_seg_for_backup(
 /*=========================*/
 	byte*		buf,		/*!< in: buffer containing log data */
 	ulint		buf_len,	/*!< in: data length in that buffer */
-	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+	lsn_t*		scanned_lsn,	/*!< in/out: lsn of buffer start,
 					we return scanned lsn */
 	ulint*		scanned_checkpoint_no,
 					/*!< in/out: 4 lowest bytes of the
@@ -152,18 +150,18 @@ recv_recovery_from_checkpoint_finish should be called later to complete
 the recovery and free the resources used in it.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 recv_recovery_from_checkpoint_start_func(
 /*=====================================*/
 #ifdef UNIV_LOG_ARCHIVE
 	ulint		type,		/*!< in: LOG_CHECKPOINT or
 					LOG_ARCHIVE */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
+	lsn_t		limit_lsn,	/*!< in: recover up to this lsn
 					if possible */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
+	lsn_t		min_flushed_lsn,/*!< in: min flushed lsn from
 					data files */
-	ib_uint64_t	max_flushed_lsn);/*!< in: max flushed lsn from
+	lsn_t		max_flushed_lsn);/*!< in: max flushed lsn from
 					 data files */
 #ifdef UNIV_LOG_ARCHIVE
 /** Wrapper for recv_recovery_from_checkpoint_start_func().
@@ -239,11 +237,11 @@ recv_scan_log_recs(
 	const byte*	buf,		/*!< in: buffer containing a log
 					segment or garbage */
 	ulint		len,		/*!< in: buffer length */
-	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t		start_lsn,	/*!< in: buffer start lsn */
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn);/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn);/*!< out: scanning succeeded up to
 					this lsn */
 /******************************************************//**
 Resets the logs. The contents of log files will be lost! */
@@ -251,18 +249,18 @@ UNIV_INTERN
 void
 recv_reset_logs(
 /*============*/
-	ib_uint64_t	lsn,		/*!< in: reset to this lsn
-					rounded up to be divisible by
-					OS_FILE_LOG_BLOCK_SIZE, after
-					which we add
-					LOG_BLOCK_HDR_SIZE */
 #ifdef UNIV_LOG_ARCHIVE
 	ulint		arch_log_no,	/*!< in: next archived log file number */
-#endif /* UNIV_LOG_ARCHIVE */
-	ibool		new_logs_created);/*!< in: TRUE if resetting logs
+	ibool		new_logs_created,/*!< in: TRUE if resetting logs
 					is done at the log creation;
 					FALSE if it is done after
 					archive recovery */
+#endif /* UNIV_LOG_ARCHIVE */
+	lsn_t		lsn);		/*!< in: reset to this lsn
+					rounded up to be divisible by
+					OS_FILE_LOG_BLOCK_SIZE, after
+					which we add
+					LOG_BLOCK_HDR_SIZE */
 #ifdef UNIV_HOTBACKUP
 /******************************************************//**
 Creates new log files after a backup has been restored. */
@@ -272,8 +270,8 @@ recv_reset_log_files_for_backup(
 /*============================*/
 	const char*	log_dir,	/*!< in: log file directory path */
 	ulint		n_log_files,	/*!< in: number of log files */
-	ulint		log_file_size,	/*!< in: log file size */
-	ib_uint64_t	lsn);		/*!< in: new start lsn, must be
+	lsn_t		log_file_size,	/*!< in: log file size */
+	lsn_t		lsn);		/*!< in: new start lsn, must be
 					divisible by OS_FILE_LOG_BLOCK_SIZE */
 #endif /* UNIV_HOTBACKUP */
 /********************************************************//**
@@ -336,14 +334,14 @@ recv_apply_log_recs_for_backup(void);
 Recovers from archived log files, and also from log files, if they exist.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 recv_recovery_from_archive_start(
 /*=============================*/
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn field from the
+	lsn_t		min_flushed_lsn,/*!< in: min flushed lsn field from the
 					data files */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn if
+	lsn_t		limit_lsn,	/*!< in: recover up to this lsn if
 					possible */
-	ulint		first_log_no);	/*!< in: number of the first archived
+	lsn_t		first_log_no);	/*!< in: number of the first archived
 					log file to use in the recovery; the
 					file will be searched from
 					INNOBASE_LOG_ARCH_DIR specified in
@@ -357,9 +355,7 @@ recv_recovery_from_archive_finish(void);
 #endif /* UNIV_LOG_ARCHIVE */
 
 /** Block of log record data */
-typedef struct recv_data_struct	recv_data_t;
-/** Block of log record data */
-struct recv_data_struct{
+struct recv_data_t{
 	recv_data_t*	next;	/*!< pointer to the next block or NULL */
 				/*!< the log record data is stored physically
 				immediately after this struct, max amount
@@ -367,18 +363,16 @@ struct recv_data_struct{
 };
 
 /** Stored log record struct */
-typedef struct recv_struct	recv_t;
-/** Stored log record struct */
-struct recv_struct{
+struct recv_t{
 	byte		type;	/*!< log record type */
 	ulint		len;	/*!< log record body length in bytes */
 	recv_data_t*	data;	/*!< chain of blocks containing the log record
 				body */
-	ib_uint64_t	start_lsn;/*!< start lsn of the log segment written by
+	lsn_t		start_lsn;/*!< start lsn of the log segment written by
 				the mtr which generated this log record: NOTE
 				that this is not necessarily the start lsn of
 				this log record */
-	ib_uint64_t	end_lsn;/*!< end lsn of the log segment written by
+	lsn_t		end_lsn;/*!< end lsn of the log segment written by
 				the mtr which generated this log record: NOTE
 				that this is not necessarily the end lsn of
 				this log record */
@@ -386,7 +380,7 @@ struct recv_struct{
 			rec_list;/*!< list of log records for this page */
 };
 
-/** States of recv_addr_struct */
+/** States of recv_addr_t */
 enum recv_addr_state {
 	/** not yet processed */
 	RECV_NOT_PROCESSED,
@@ -400,9 +394,7 @@ enum recv_addr_state {
 };
 
 /** Hashed page file address struct */
-typedef struct recv_addr_struct	recv_addr_t;
-/** Hashed page file address struct */
-struct recv_addr_struct{
+struct recv_addr_t{
 	enum recv_addr_state state;
 				/*!< recovery state of the page */
 	unsigned	space:32;/*!< space id */
@@ -413,13 +405,14 @@ struct recv_addr_struct{
 };
 
 /** Recovery system data structure */
-typedef struct recv_sys_struct	recv_sys_t;
-/** Recovery system data structure */
-struct recv_sys_struct{
+struct recv_sys_t{
 #ifndef UNIV_HOTBACKUP
-	mutex_t		mutex;	/*!< mutex protecting the fields apply_log_recs,
+	ib_mutex_t		mutex;	/*!< mutex protecting the fields apply_log_recs,
 				n_addrs, and the state field in each recv_addr
 				struct */
+	ib_mutex_t		writer_mutex;/*!< mutex coordinating
+				flushing between recv_writer_thread and
+				the recovery thread. */
 #endif /* !UNIV_HOTBACKUP */
 	ibool		apply_log_recs;
 				/*!< this is TRUE when log rec application to
@@ -429,7 +422,7 @@ struct recv_sys_struct{
 	ibool		apply_batch_on;
 				/*!< this is TRUE when a log rec application
 				batch is running */
-	ib_uint64_t	lsn;	/*!< log sequence number */
+	lsn_t		lsn;	/*!< log sequence number */
 	ulint		last_log_buf_size;
 				/*!< size of the log buffer when the database
 				last time wrote to the log */
@@ -441,12 +434,12 @@ struct recv_sys_struct{
 				preceding buffer */
 	byte*		buf;	/*!< buffer for parsing log records */
 	ulint		len;	/*!< amount of data in buf */
-	ib_uint64_t	parse_start_lsn;
+	lsn_t		parse_start_lsn;
 				/*!< this is the lsn from which we were able to
 				start parsing log records and adding them to
 				the hash table; zero if a suitable
 				start point not found yet */
-	ib_uint64_t	scanned_lsn;
+	lsn_t		scanned_lsn;
 				/*!< the log data has been scanned up to this
 				lsn */
 	ulint		scanned_checkpoint_no;
@@ -455,10 +448,10 @@ struct recv_sys_struct{
 	ulint		recovered_offset;
 				/*!< start offset of non-parsed log records in
 				buf */
-	ib_uint64_t	recovered_lsn;
+	lsn_t		recovered_lsn;
 				/*!< the log records have been parsed up to
 				this lsn */
-	ib_uint64_t	limit_lsn;/*!< recovery should be made at most
+	lsn_t		limit_lsn;/*!< recovery should be made at most
 				up to this lsn */
 	ibool		found_corrupt_log;
 				/*!< this is set to TRUE if we during log
@@ -475,39 +468,6 @@ struct recv_sys_struct{
 	hash_table_t*	addr_hash;/*!< hash table of file addresses of pages */
 	ulint		n_addrs;/*!< number of not processed hashed file
 				addresses in the hash table */
-
-/* If you modified the following defines at original file,
-   You should also modify them. */
-/* defined in os0file.c */
-#define OS_AIO_MERGE_N_CONSECUTIVE	64
-/* defined in log0recv.c */
-#define RECV_READ_AHEAD_AREA	32
-	time_t		stats_recv_start_time;
-	ulint		stats_recv_turns;
-
-	ulint		stats_read_requested_pages;
-	ulint		stats_read_in_area[RECV_READ_AHEAD_AREA];
-
-	ulint		stats_read_io_pages;
-	ulint		stats_read_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
-	ulint		stats_write_io_pages;
-	ulint		stats_write_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
-
-	ulint		stats_doublewrite_check_pages;
-	ulint		stats_doublewrite_overwrite_pages;
-
-	ulint		stats_recover_pages_with_read;
-	ulint		stats_recover_pages_without_read;
-
-	ulint		stats_log_recs;
-	ulint		stats_log_len_sum;
-
-	ulint		stats_applied_log_recs;
-	ulint		stats_applied_log_len_sum;
-	ulint		stats_pages_already_new;
-
-	ib_uint64_t	stats_oldest_modified_lsn;
-	ib_uint64_t	stats_newest_modified_lsn;
 };
 
 /** The recovery system */
diff --git a/storage/xtradb/include/log0recv.ic b/storage/xtradb/include/log0recv.ic
index 62fd5c18e30..32c28dd03e6 100644
--- a/storage/xtradb/include/log0recv.ic
+++ b/storage/xtradb/include/log0recv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -33,7 +33,7 @@ ibool
 recv_recovery_is_on(void)
 /*=====================*/
 {
-	return(UNIV_UNLIKELY(recv_recovery_on));
+	return(recv_recovery_on);
 }
 
 #ifdef UNIV_LOG_ARCHIVE
diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h
index 81c0866f367..d0087f56aaa 100644
--- a/storage/xtradb/include/mach0data.h
+++ b/storage/xtradb/include/mach0data.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,6 +27,8 @@ Created 11/28/1995 Heikki Tuuri
 #ifndef mach0data_h
 #define mach0data_h
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "univ.i"
 #include "ut0byte.h"
 
@@ -204,7 +206,7 @@ UNIV_INLINE
 void
 mach_write_to_8(
 /*============*/
-	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
 	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
 /********************************************************//**
 The following function is used to fetch data from 8 consecutive
@@ -361,19 +363,53 @@ mach_write_to_2_little_endian(
 /*==========================*/
 	byte*	dest,		/*!< in: where to write */
 	ulint	n);		/*!< in: unsigned long int to write */
-
 /*********************************************************//**
 Convert integral type from storage byte order (big endian) to
 host byte order.
 @return	integer value */
 UNIV_INLINE
-ullint
+ib_uint64_t
 mach_read_int_type(
 /*===============*/
 	const byte*	src,		/*!< in: where to read from */
 	ulint		len,		/*!< in: length of src */
 	ibool		unsigned_type);	/*!< in: signed or unsigned flag */
+/***********************************************************//**
+Convert integral type from host byte order to (big-endian) storage
+byte order. */
+UNIV_INLINE
+void
+mach_write_int_type(
+/*================*/
+	byte*		dest,		/*!< in: where to write */
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	bool		usign);		/*!< in: signed or unsigned flag */
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+	byte*		dest,		/*!< in: where to write */
+	ulonglong	src,		/*!< in: value to write */
+	ulint		len,		/*!< in: length of dest */
+	bool		usign);		/*!< in: signed or unsigned flag */
+
+/********************************************************//**
+Reads a 1, 2 or 4 byte value from the given address.
+@return	value read */
+UNIV_INLINE
+ulint
+mach_read_ulint(
+/*============*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	ulint		type);	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+
 #endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
 
 #ifndef UNIV_NONINL
 #include "mach0data.ic"
diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic
index 238a56577af..27b9f62b552 100644
--- a/storage/xtradb/include/mach0data.ic
+++ b/storage/xtradb/include/mach0data.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,6 +24,8 @@ to the machine format.
 Created 11/28/1995 Heikki Tuuri
 ***********************************************************************/
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "ut0mem.h"
 
 /*******************************************************//**
@@ -38,7 +40,7 @@ mach_write_to_1(
 	ut_ad(b);
 	ut_ad((n | 0xFFUL) <= 0xFFUL);
 
-	b[0] = (byte)n;
+	b[0] = (byte) n;
 }
 
 /********************************************************//**
@@ -72,19 +74,6 @@ mach_write_to_2(
 }
 
 /********************************************************//**
-The following function is used to fetch data from 2 consecutive
-bytes. The most significant byte is at the lowest address.
-@return	ulint integer */
-UNIV_INLINE
-ulint
-mach_read_from_2(
-/*=============*/
-	const byte*	b)	/*!< in: pointer to 2 bytes */
-{
-	return(((ulint)(b[0]) << 8) | (ulint)(b[1]));
-}
-
-/********************************************************//**
 The following function is used to convert a 16-bit data item
 to the canonical format, for fast bytewise equality test
 against memory.
@@ -165,7 +154,22 @@ mach_write_to_4(
 	b[0] = (byte)(n >> 24);
 	b[1] = (byte)(n >> 16);
 	b[2] = (byte)(n >> 8);
-	b[3] = (byte)n;
+	b[3] = (byte) n;
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************//**
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 2 bytes */
+{
+	return(((ulint)(b[0]) << 8) | (ulint)(b[1]));
 }
 
 /********************************************************//**
@@ -186,6 +190,8 @@ mach_read_from_4(
 		);
 }
 
+#ifndef UNIV_INNOCHECKSUM
+
 /*********************************************************//**
 Writes a ulint in a compressed form where the first byte codes the
 length of the stored ulint. We look at the most significant bits of
@@ -280,13 +286,13 @@ UNIV_INLINE
 void
 mach_write_to_8(
 /*============*/
-	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
 	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
 {
 	ut_ad(b);
 
-	mach_write_to_4(b, (ulint) (n >> 32));
-	mach_write_to_4(b + 4, (ulint) n);
+	mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32));
+	mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n);
 }
 
 /********************************************************//**
@@ -550,7 +556,7 @@ mach_double_read(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(double); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -575,7 +581,7 @@ mach_double_write(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(double); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -599,7 +605,7 @@ mach_float_read(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(float); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -624,7 +630,7 @@ mach_float_write(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(float); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -648,7 +654,6 @@ mach_read_from_n_little_endian(
 	ulint	n	= 0;
 	const byte*	ptr;
 
-	ut_ad(buf_size <= sizeof(ulint));
 	ut_ad(buf_size > 0);
 
 	ptr = buf + buf_size;
@@ -736,7 +741,7 @@ Convert integral type from storage byte order (big endian) to
 host byte order.
 @return	integer value */
 UNIV_INLINE
-ullint
+ib_uint64_t
 mach_read_int_type(
 /*===============*/
 	const byte*	src,		/*!< in: where to read from */
@@ -771,4 +776,106 @@ mach_read_int_type(
 
 	return(ret);
 }
+/*********************************************************//**
+Swap byte ordering. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+        byte*           dest,           /*!< out: where to write */
+        const byte*     from,           /*!< in: where to read from */
+        ulint           len)            /*!< in: length in bytes, 1 to 8 */
+{
+        ut_ad(len > 0);
+        ut_ad(len <= 8);
+
+        dest += len;
+
+        switch (len & 0x7) {
+        case 0: *--dest = *from++;
+        case 7: *--dest = *from++;
+        case 6: *--dest = *from++;
+        case 5: *--dest = *from++;
+        case 4: *--dest = *from++;
+        case 3: *--dest = *from++;
+        case 2: *--dest = *from++;
+        case 1: *--dest = *from;
+        }
+}
+
+/*************************************************************
+Convert integral type from host byte order to (big-endian) storage
+byte order. */
+UNIV_INLINE
+void
+mach_write_int_type(
+/*================*/
+	byte*		dest,		/*!< in: where to write */
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	bool		usign)		/*!< in: signed or unsigned flag */
+{
+#ifdef WORDS_BIGENDIAN
+        memcpy(dest, src, len);
+#else
+        mach_swap_byte_order(dest, src, len);
+#endif /* WORDS_BIGENDIAN */
+
+	if (!usign) {
+		*dest ^=  0x80;
+	}
+}
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+	byte*		dest,		/*!< in: where to write */
+	ulonglong	src,		/*!< in: value to write */
+	ulint		len,		/*!< in: length of dest */
+	bool		usign)		/*!< in: signed or unsigned flag */
+{
+	byte*		ptr = reinterpret_cast<byte*>(&src);
+
+	ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+	memcpy(dest, ptr + (sizeof(src) - len), len);
+#else
+	mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len);
+#endif /* WORDS_BIGENDIAN */
+
+	if (!usign) {
+		*dest ^=  0x80;
+	}
+}
+
+/********************************************************//**
+Reads a 1, 2 or 4 byte value from the given address.
+@return	value read */
+UNIV_INLINE
+ulint
+mach_read_ulint(
+/*============*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	ulint		type)	/*!< in: 1,2 or 4 bytes */
+{
+	switch (type) {
+	case 1:
+		return(mach_read_from_1(ptr));
+	case 2:
+		return(mach_read_from_2(ptr));
+	case 4:
+		return(mach_read_from_4(ptr));
+	default:
+		ut_error;
+	}
+
+	return(0);
+}
+
 #endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h
index 1c387706c98..cc339b82910 100644
--- a/storage/xtradb/include/mem0dbg.h
+++ b/storage/xtradb/include/mem0dbg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,8 +31,8 @@ check fields whose sizes are given below */
 # ifndef UNIV_HOTBACKUP
 /* The mutex which protects in the debug version the hash table
 containing the list of live memory heaps, and also the global
-variables in mem0dbg.c. */
-extern mutex_t	mem_hash_mutex;
+variables in mem0dbg.cc. */
+extern ib_mutex_t	mem_hash_mutex;
 # endif /* !UNIV_HOTBACKUP */
 
 #define MEM_FIELD_HEADER_SIZE	ut_calc_align(2 * sizeof(ulint),\
diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic
index 72c63e0a4c4..ec60ed35337 100644
--- a/storage/xtradb/include/mem0dbg.ic
+++ b/storage/xtradb/include/mem0dbg.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h
index 7dff3e7a2b8..c36ef06b554 100644
--- a/storage/xtradb/include/mem0mem.h
+++ b/storage/xtradb/include/mem0mem.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,15 +38,12 @@ Created 6/9/1994 Heikki Tuuri
 
 /* -------------------- MEMORY HEAPS ----------------------------- */
 
-/* The info structure stored at the beginning of a heap block */
-typedef struct mem_block_info_struct mem_block_info_t;
-
 /* A block of a memory heap consists of the info structure
 followed by an area of memory */
-typedef mem_block_info_t	mem_block_t;
+typedef struct mem_block_info_t	mem_block_t;
 
 /* A memory heap is a nonempty linear list of memory blocks */
-typedef mem_block_t	mem_heap_t;
+typedef mem_block_t		mem_heap_t;
 
 /* Types of allocation for memory heaps: DYNAMIC means allocation from the
 dynamic memory pool of the C compiler, BUFFER means allocation from the
@@ -62,6 +59,12 @@ buffer pool; the latter method is used for very big heaps */
 					allocation functions can return
 					NULL. */
 
+/* Different types of heaps in terms of which data structure is using them */
+#define MEM_HEAP_FOR_BTR_SEARCH		(MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_PAGE_HASH		(MEM_HEAP_DYNAMIC)
+#define MEM_HEAP_FOR_RECV_SYS		(MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP		(MEM_HEAP_BUFFER)
+
 /* The following start size is used for the first block in the memory heap if
 the size is not specified, i.e., 0 is given as the parameter in the call of
 create. The standard size is the maximum (payload) size of the blocks used for
@@ -99,16 +102,8 @@ heap creation. */
 Use this macro instead of the corresponding function! Macro for memory
 heap creation. */
 
-#define mem_heap_create_in_buffer(N)	mem_heap_create_func(\
-		(N), MEM_HEAP_BUFFER, __FILE__, __LINE__)
-/**************************************************************//**
-Use this macro instead of the corresponding function! Macro for memory
-heap creation. */
-
-#define mem_heap_create_in_btr_search(N)	mem_heap_create_func(\
-		(N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\
-		__FILE__, __LINE__)
-
+#define mem_heap_create_typed(N, T)	mem_heap_create_func(\
+		(N), (T), __FILE__, __LINE__)
 /**************************************************************//**
 Use this macro instead of the corresponding function! Macro for memory
 heap freeing. */
@@ -221,7 +216,7 @@ mem_heap_get_size(
 Use this macro instead of the corresponding function!
 Macro for memory buffer allocation */
 
-#define mem_zalloc(N)	memset(mem_alloc(N), 0, (N));
+#define mem_zalloc(N)	memset(mem_alloc(N), 0, (N))
 
 #define mem_alloc(N)	mem_alloc_func((N), NULL, __FILE__, __LINE__)
 #define mem_alloc2(N,S)	mem_alloc_func((N), (S), __FILE__, __LINE__)
@@ -320,7 +315,7 @@ mem_heap_dup(
 	ulint		len);	/*!< in: length of data, in bytes */
 
 /****************************************************************//**
-A simple (s)printf replacement that dynamically allocates the space for the
+A simple sprintf replacement that dynamically allocates the space for the
 formatted string from the given heap. This supports a very limited set of
 the printf syntax: types 's' and 'u' and length modifier 'l' (which is
 required for the 'u' type).
@@ -345,9 +340,8 @@ mem_validate_all_blocks(void);
 
 /*#######################################################################*/
 
-/* The info header of a block in a memory heap */
-
-struct mem_block_info_struct {
+/** The info structure stored at the beginning of a heap block */
+struct mem_block_info_t {
 	ulint	magic_n;/* magic number for debugging */
 	char	file_name[8];/* file name where the mem heap was created */
 	ulint	line;	/*!< line number where the mem heap was created */
diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic
index 6b2e35d7387..7f0e128cc40 100644
--- a/storage/xtradb/include/mem0mem.ic
+++ b/storage/xtradb/include/mem0mem.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -193,7 +193,7 @@ mem_heap_alloc(
 
 	free = mem_block_get_free(block);
 
-	buf = (byte*)block + free;
+	buf = (byte*) block + free;
 
 	mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
 
@@ -202,11 +202,11 @@ mem_heap_alloc(
 		       n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
 
 	/* In the debug version write debugging info to the field */
-	mem_field_init((byte*)buf, n);
+	mem_field_init((byte*) buf, n);
 
 	/* Advance buf to point at the storage which will be given to the
 	caller */
-	buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+	buf = (byte*) buf + MEM_FIELD_HEADER_SIZE;
 
 #endif
 	UNIV_MEM_ALLOC(buf, n);
@@ -229,7 +229,7 @@ mem_heap_get_heap_top(
 
 	block = UT_LIST_GET_LAST(heap->base);
 
-	buf = (byte*)block + mem_block_get_free(block);
+	buf = (byte*) block + mem_block_get_free(block);
 
 	return(buf);
 }
@@ -247,16 +247,13 @@ mem_heap_free_heap_top(
 {
 	mem_block_t*	block;
 	mem_block_t*	prev_block;
-#ifdef UNIV_MEM_DEBUG
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
 	ibool		error;
 	ulint		total_size;
 	ulint		size;
-#endif
 
 	ut_ad(mem_heap_check(heap));
 
-#ifdef UNIV_MEM_DEBUG
-
 	/* Validate the heap and get its total allocated size */
 	mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
 				   NULL, NULL);
@@ -272,8 +269,8 @@ mem_heap_free_heap_top(
 	block = UT_LIST_GET_LAST(heap->base);
 
 	while (block != NULL) {
-		if (((byte*)block + mem_block_get_free(block) >= old_top)
-		    && ((byte*)block <= old_top)) {
+		if (((byte*) block + mem_block_get_free(block) >= old_top)
+		    && ((byte*) block <= old_top)) {
 			/* Found the right block */
 
 			break;
@@ -292,22 +289,20 @@ mem_heap_free_heap_top(
 	ut_ad(block);
 
 	/* Set the free field of block */
-	mem_block_set_free(block, old_top - (byte*)block);
+	mem_block_set_free(block, old_top - (byte*) block);
 
-#ifdef UNIV_MEM_DEBUG
 	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
-
+	UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top);
+#if defined UNIV_MEM_DEBUG
 	/* In the debug version erase block from top up */
-	mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+	mem_erase_buf(old_top, (byte*) block + block->len - old_top);
 
 	/* Update allocated memory count */
 	mutex_enter(&mem_hash_mutex);
 	mem_current_allocated_memory -= (total_size - size);
 	mutex_exit(&mem_hash_mutex);
-#else /* UNIV_MEM_DEBUG */
-	UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top);
 #endif /* UNIV_MEM_DEBUG */
-	UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top);
+	UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top);
 
 	/* If free == start, we may free the block if it is not the first
 	one */
@@ -326,7 +321,7 @@ mem_heap_empty(
 /*===========*/
 	mem_heap_t*	heap)	/*!< in: heap to empty */
 {
-	mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap));
+	mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap));
 #ifndef UNIV_HOTBACKUP
 	if (heap->free_block) {
 		mem_heap_free_block_free(heap);
@@ -394,7 +389,7 @@ mem_heap_free_top(
 	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
 
 	/* In the debug version check the consistency, and erase field */
-	mem_field_erase((byte*)block + mem_block_get_free(block), n);
+	mem_field_erase((byte*) block + mem_block_get_free(block), n);
 #endif
 
 	/* If free == start, we may free the block if it is not the first
@@ -529,7 +524,7 @@ mem_alloc_func(
 	first block and thus we can calculate the pointer to the heap from
 	the pointer to the buffer when we free the memory buffer. */
 
-	if (UNIV_LIKELY_NULL(size)) {
+	if (size) {
 		/* Adjust the allocation to the actual size of the
 		memory block. */
 		ulint	m = mem_block_get_len(heap)
@@ -538,12 +533,13 @@ mem_alloc_func(
 		m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
 #endif /* UNIV_MEM_DEBUG */
 		ut_ad(m >= n);
-		*size = n = m;
+		n = m;
+		*size = m;
 	}
 
 	buf = mem_heap_alloc(heap, n);
 
-	ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+	ut_a((byte*) heap == (byte*) buf - MEM_BLOCK_HEADER_SIZE
 	     - MEM_FIELD_HEADER_SIZE);
 	return(buf);
 }
@@ -562,7 +558,7 @@ mem_free_func(
 {
 	mem_heap_t*   heap;
 
-	heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE
+	heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE
 			     - MEM_FIELD_HEADER_SIZE);
 	mem_heap_free_func(heap, file_name, line);
 }
diff --git a/storage/xtradb/include/mem0pool.h b/storage/xtradb/include/mem0pool.h
index 26bac1c814b..a65ba50fdf9 100644
--- a/storage/xtradb/include/mem0pool.h
+++ b/storage/xtradb/include/mem0pool.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -30,17 +30,14 @@ Created 6/9/1994 Heikki Tuuri
 #include "os0file.h"
 #include "ut0lst.h"
 
-/** Memory area header */
-typedef struct mem_area_struct	mem_area_t;
 /** Memory pool */
-typedef struct mem_pool_struct	mem_pool_t;
+struct mem_pool_t;
 
 /** The common memory pool */
 extern mem_pool_t*	mem_comm_pool;
 
 /** Memory area header */
-
-struct mem_area_struct{
+struct mem_area_t{
 	ulint		size_and_free;	/*!< memory area size is obtained by
 					anding with ~MEM_AREA_FREE; area in
 					a free list if ANDing with
@@ -50,7 +47,7 @@ struct mem_area_struct{
 };
 
 /** Each memory area takes this many extra bytes for control information */
-#define MEM_AREA_EXTRA_SIZE	(ut_calc_align(sizeof(struct mem_area_struct),\
+#define MEM_AREA_EXTRA_SIZE	(ut_calc_align(sizeof(struct mem_area_t),\
 			UNIV_MEM_ALIGNMENT))
 
 /********************************************************************//**
diff --git a/storage/xtradb/include/mem0pool.ic b/storage/xtradb/include/mem0pool.ic
index f0e724648a1..f4bafb8ba63 100644
--- a/storage/xtradb/include/mem0pool.ic
+++ b/storage/xtradb/include/mem0pool.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/mtr0log.h b/storage/xtradb/include/mtr0log.h
index 8cccb982b48..18a345d050f 100644
--- a/storage/xtradb/include/mtr0log.h
+++ b/storage/xtradb/include/mtr0log.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,8 +32,8 @@ Created 12/7/1995 Heikki Tuuri
 
 #ifndef UNIV_HOTBACKUP
 /********************************************************//**
-Writes 1 - 4 bytes to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
+Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log if mtr is not NULL. */
 UNIV_INTERN
 void
 mlog_write_ulint(
@@ -43,8 +43,8 @@ mlog_write_ulint(
 	byte	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
 	mtr_t*	mtr);	/*!< in: mini-transaction handle */
 /********************************************************//**
-Writes 8 bytes to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
+Writes 8 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log if mtr is not NULL. */
 UNIV_INTERN
 void
 mlog_write_ull(
@@ -168,7 +168,7 @@ mlog_write_initial_log_record_fast(
 	mtr_t*		mtr);	/*!< in: mtr */
 #else /* !UNIV_HOTBACKUP */
 # define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
-# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte *) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte*) 0)
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************//**
 Parses an initial log record written by mlog_write_initial_log_record.
@@ -217,12 +217,13 @@ UNIV_INTERN
 byte*
 mlog_open_and_write_index(
 /*======================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	const byte*	rec,	/*!< in: index record or page */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	byte		type,	/*!< in: log item type */
-	ulint		size);	/*!< in: requested buffer size in bytes
-				(if 0, calls mlog_close() and returns NULL) */
+	mtr_t*			mtr,	/*!< in: mtr */
+	const byte*		rec,	/*!< in: index record or page */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	byte			type,	/*!< in: log item type */
+	ulint			size);	/*!< in: requested buffer size in bytes
+					(if 0, calls mlog_close() and
+					returns NULL) */
 #endif /* !UNIV_HOTBACKUP */
 
 /********************************************************//**
diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic
index 5ef3f915b94..bc49f655294 100644
--- a/storage/xtradb/include/mtr0log.ic
+++ b/storage/xtradb/include/mtr0log.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,9 +26,11 @@ Created 12/7/1995 Heikki Tuuri
 #include "mach0data.h"
 #include "ut0lst.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "fsp0types.h"
-#include "srv0srv.h"
+#include "btr0types.h"
 #include "trx0sys.h"
+
 /********************************************************//**
 Opens a buffer to mlog. It must be closed with mlog_close.
 @return	buffer, NULL if log mode MTR_LOG_NONE */
@@ -201,10 +203,9 @@ mlog_write_initial_log_record_fast(
 	the doublewrite buffer is located in pages
 	FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
 	system tablespace */
-	if ((space == TRX_SYS_SPACE
-	     || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
-	    && offset >= (ulint)FSP_EXTENT_SIZE && offset < 3 * (ulint)FSP_EXTENT_SIZE) {
-		if (trx_doublewrite_buf_is_being_created) {
+	if (space == TRX_SYS_SPACE
+	    && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
+		if (buf_dblwr_being_created) {
 			/* Do nothing: we only come to this branch in an
 			InnoDB database creation. We do not redo log
 			anything for the doublewrite buffer pages. */
diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h
index 031fccd300c..fd0fb66c464 100644
--- a/storage/xtradb/include/mtr0mtr.h
+++ b/storage/xtradb/include/mtr0mtr.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -39,6 +40,7 @@ Created 11/26/1995 Heikki Tuuri
 #define MTR_LOG_ALL		21	/* default mode: log all operations
 					modifying disk-based data */
 #define	MTR_LOG_NONE		22	/* log no operations */
+#define	MTR_LOG_NO_REDO		23	/* Don't generate REDO */
 /*#define	MTR_LOG_SPACE	23 */	/* log only operations modifying
 					file space page allocation data
 					(operations in fsp0fsp.* ) */
@@ -180,7 +182,11 @@ For 1 - 8 bytes, the flag value must give the length also! @{ */
 #define MLOG_ZIP_WRITE_HEADER	((byte)50)	/*!< write to compressed page
 						header */
 #define MLOG_ZIP_PAGE_COMPRESS	((byte)51)	/*!< compress an index page */
-#define MLOG_BIGGEST_TYPE	((byte)51)	/*!< biggest value (used in
+#define MLOG_ZIP_PAGE_COMPRESS_NO_DATA	((byte)52)/*!< compress an index page
+						without logging its image */
+#define MLOG_ZIP_PAGE_REORGANIZE ((byte)53)	/*!< reorganize a compressed
+						page */
+#define MLOG_BIGGEST_TYPE	((byte)53)	/*!< biggest value (used in
 						assertions) */
 /* @} */
 
@@ -191,6 +197,9 @@ functions).  The page number parameter was originally written as 0. @{ */
 					MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
 /* @} */
 
+/* included here because it needs MLOG_LSN defined */
+#include "log0log.h"
+
 /***************************************************************//**
 Starts a mini-transaction. */
 UNIV_INLINE
@@ -225,7 +234,7 @@ mtr_release_s_latch_at_savepoint(
 /*=============================*/
 	mtr_t*		mtr,		/*!< in: mtr */
 	ulint		savepoint,	/*!< in: savepoint */
-	rw_lock_t*	lock);		/*!< in: latch to release */
+	prio_rw_lock_t*	lock);		/*!< in: latch to release */
 #else /* !UNIV_HOTBACKUP */
 # define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
 #endif /* !UNIV_HOTBACKUP */
@@ -272,7 +281,7 @@ UNIV_INLINE
 void
 mtr_s_lock_func(
 /*============*/
-	rw_lock_t*	lock,	/*!< in: rw-lock */
+	prio_rw_lock_t*	lock,	/*!< in: rw-lock */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line number */
 	mtr_t*		mtr);	/*!< in: mtr */
@@ -283,16 +292,17 @@ UNIV_INLINE
 void
 mtr_x_lock_func(
 /*============*/
-	rw_lock_t*	lock,	/*!< in: rw-lock */
+	prio_rw_lock_t*	lock,	/*!< in: rw-lock */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line number */
 	mtr_t*		mtr);	/*!< in: mtr */
 #endif /* !UNIV_HOTBACKUP */
 
 /***************************************************//**
-Releases an object in the memo stack. */
+Releases an object in the memo stack.
+@return true if released */
 UNIV_INTERN
-void
+bool
 mtr_memo_release(
 /*=============*/
 	mtr_t*	mtr,	/*!< in/out: mini-transaction */
@@ -357,28 +367,27 @@ mtr_memo_push(
 	void*	object,	/*!< in: object */
 	ulint	type);	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
 
-
-/* Type definition of a mini-transaction memo stack slot. */
-typedef	struct mtr_memo_slot_struct	mtr_memo_slot_t;
-struct mtr_memo_slot_struct{
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t{
 	ulint	type;	/*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */
 	void*	object;	/*!< pointer to the object */
 };
 
 /* Mini-transaction handle and buffer */
-struct mtr_struct{
+struct mtr_t{
 #ifdef UNIV_DEBUG
 	ulint		state;	/*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
 #endif
 	dyn_array_t	memo;	/*!< memo stack for locks etc. */
 	dyn_array_t	log;	/*!< mini-transaction log */
-	ibool		inside_ibuf;
+	unsigned	inside_ibuf:1;
 				/*!< TRUE if inside ibuf changes */
-	ibool		modifications;
-				/* TRUE if the mtr made modifications to
-				buffer pool pages */
-	ibool		made_dirty;/*!< TRUE if mtr has made at least
-				   one buffer pool page dirty */
+	unsigned	modifications:1;
+				/*!< TRUE if the mini-transaction
+				modified buffer pool pages */
+	unsigned	made_dirty:1;
+				/*!< TRUE if mtr has made at least
+				one buffer pool page dirty */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */
@@ -387,9 +396,9 @@ struct mtr_struct{
 				this mini-transaction */
 	ulint		log_mode; /* specifies which operations should be
 				logged; default value MTR_LOG_ALL */
-	ib_uint64_t	start_lsn;/* start lsn of the possible log entry for
+	lsn_t		start_lsn;/* start lsn of the possible log entry for
 				this mtr */
-	ib_uint64_t	end_lsn;/* end lsn of the possible log entry for
+	lsn_t		end_lsn;/* end lsn of the possible log entry for
 				this mtr */
 #ifdef UNIV_DEBUG
 	ulint		magic_n;
diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic
index 7b5d268b70f..4fe23c460ab 100644
--- a/storage/xtradb/include/mtr0mtr.ic
+++ b/storage/xtradb/include/mtr0mtr.ic
@@ -39,7 +39,6 @@ mtr_block_dirtied(
 	const buf_block_t*	block)	/*!< in: block being x-fixed */
 	__attribute__((nonnull,warn_unused_result));
 
-
 /***************************************************************//**
 Starts a mini-transaction. */
 UNIV_INLINE
@@ -54,11 +53,11 @@ mtr_start(
 	dyn_array_create(&(mtr->log));
 
 	mtr->log_mode = MTR_LOG_ALL;
-	mtr->modifications = FALSE;
 	mtr->inside_ibuf = FALSE;
+	mtr->modifications = FALSE;
+	mtr->made_dirty = FALSE;
 	mtr->n_log_recs = 0;
 	mtr->n_freed_pages = 0;
-	mtr->made_dirty = FALSE;
 
 	ut_d(mtr->state = MTR_ACTIVE);
 	ut_d(mtr->magic_n = MTR_MAGIC_N);
@@ -77,22 +76,22 @@ mtr_memo_push(
 	dyn_array_t*		memo;
 	mtr_memo_slot_t*	slot;
 
+	ut_ad(object);
+	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+	ut_ad(type <= MTR_MEMO_X_LOCK);
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
 	/* If this mtr has x-fixed a clean page then we set
 	the made_dirty flag. This tells us if we need to
 	grab log_flush_order_mutex at mtr_commit so that we
 	can insert the dirtied page to the flush list. */
 	if (type == MTR_MEMO_PAGE_X_FIX && !mtr->made_dirty) {
 		mtr->made_dirty =
-			mtr_block_dirtied((const buf_block_t *)object);
+			mtr_block_dirtied((const buf_block_t*) object);
 	}
 
-	ut_ad(object);
-	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_X_LOCK);
-	ut_ad(mtr);
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
-	ut_ad(mtr->state == MTR_ACTIVE);
-
 	memo = &(mtr->memo);
 
 	slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot);
@@ -131,7 +130,7 @@ mtr_release_s_latch_at_savepoint(
 /*=============================*/
 	mtr_t*		mtr,		/*!< in: mtr */
 	ulint		savepoint,	/*!< in: savepoint */
-	rw_lock_t*	lock)		/*!< in: latch to release */
+	prio_rw_lock_t*	lock)		/*!< in: latch to release */
 {
 	mtr_memo_slot_t* slot;
 	dyn_array_t*	memo;
@@ -262,7 +261,7 @@ UNIV_INLINE
 void
 mtr_s_lock_func(
 /*============*/
-	rw_lock_t*	lock,	/*!< in: rw-lock */
+	prio_rw_lock_t*	lock,	/*!< in: rw-lock */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line number */
 	mtr_t*		mtr)	/*!< in: mtr */
@@ -281,7 +280,7 @@ UNIV_INLINE
 void
 mtr_x_lock_func(
 /*============*/
-	rw_lock_t*	lock,	/*!< in: rw-lock */
+	prio_rw_lock_t*	lock,	/*!< in: rw-lock */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line number */
 	mtr_t*		mtr)	/*!< in: mtr */
diff --git a/storage/xtradb/include/mtr0types.h b/storage/xtradb/include/mtr0types.h
index eb76c824666..43368c0b726 100644
--- a/storage/xtradb/include/mtr0types.h
+++ b/storage/xtradb/include/mtr0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,6 @@ Created 11/26/1995 Heikki Tuuri
 #ifndef mtr0types_h
 #define mtr0types_h
 
-typedef struct mtr_struct	mtr_t;
+struct mtr_t;
 
 #endif
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index e6c70edbd8f..564b579edc8 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -1,6 +1,6 @@
 /***********************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Percona Inc.
 
 Portions of this file contain modifications contributed and copyrighted
@@ -19,9 +19,9 @@ WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 ***********************************************************************/
 
@@ -45,11 +45,8 @@ Created 10/21/1995 Heikki Tuuri
 #endif
 
 /** File node of a tablespace or the log data space */
-typedef	struct fil_node_struct	fil_node_t;
+struct fil_node_t;
 
-#ifdef UNIV_DO_FLUSH
-extern ibool	os_do_not_call_flush_at_each_write;
-#endif /* UNIV_DO_FLUSH */
 extern ibool	os_has_said_disk_full;
 /** Flag: enable debug printout for asynchronous i/o */
 extern ibool	os_aio_print_debug;
@@ -75,6 +72,8 @@ extern ulint	os_n_pending_writes;
 
 #endif
 
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
 #ifdef __WIN__
 #define SRV_PATH_SEPARATOR	'\\'
 /** File handle */
@@ -107,14 +106,28 @@ log. */
 
 #define OS_FILE_LOG_BLOCK_SIZE		srv_log_block_size
 
-/** Options for file_create @{ */
-#define	OS_FILE_OPEN			51
-#define	OS_FILE_CREATE			52
-#define OS_FILE_OVERWRITE		53
-#define OS_FILE_OPEN_RAW		54
-#define	OS_FILE_CREATE_PATH		55
-#define	OS_FILE_OPEN_RETRY		56	/* for os_file_create() on
-						the first ibdata file */
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+	OS_FILE_OPEN = 51,		/*!< to open an existing file (if
+					doesn't exist, error) */
+	OS_FILE_CREATE,			/*!< to create new file (if
+					exists, error) */
+	OS_FILE_OVERWRITE,		/*!< to create a new file; if it exists,
+					overwrite the old file */
+	OS_FILE_OPEN_RAW,		/*!< to open a raw device or disk
+					partition */
+	OS_FILE_CREATE_PATH,		/*!< to create the directories */
+	OS_FILE_OPEN_RETRY,		/*!< open with retry */
+
+	/** Flags that can be combined with the above values. Please ensure
+	that the above values stay below 128. */
+
+	OS_FILE_ON_ERROR_NO_EXIT = 128,	/*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error,
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+};
 
 #define OS_FILE_READ_ONLY		333
 #define	OS_FILE_READ_WRITE		444
@@ -210,45 +223,62 @@ various file I/O operations with performance schema.
 1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
 used to register file creation, opening, closing and renaming.
 2) register_pfs_file_io_begin() and register_pfs_file_io_end() are
-used to register actual file read, write and flush */
+used to register actual file read, write and flush
+3) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations */
 # define register_pfs_file_open_begin(state, locker, key, op, name,	\
 				      src_file, src_line)		\
 do {									\
-	if (PSI_server) {						\
-		locker = PSI_server->get_thread_file_name_locker(	\
-			state, key, op, name, &locker);			\
-		if (locker) {						\
-			PSI_server->start_file_open_wait(		\
-				locker, src_file, src_line);		\
-		}							\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(start_file_open_wait)(			\
+			locker, src_file, src_line);			\
 	}								\
 } while (0)
 
 # define register_pfs_file_open_end(locker, file)			\
 do {									\
-	if (locker) {							\
-		PSI_server->end_file_open_wait_and_bind_to_descriptor(	\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)(\
 			locker, file);					\
 	}								\
 } while (0)
 
+# define register_pfs_file_close_begin(state, locker, key, op, name,	\
+				      src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(start_file_close_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_end(locker, result)			\
+do {									\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(end_file_close_wait)(			\
+			locker, result);				\
+	}								\
+} while (0)
+
 # define register_pfs_file_io_begin(state, locker, file, count, op,	\
 				    src_file, src_line)			\
 do {									\
-	if (PSI_server) {						\
-		locker = PSI_server->get_thread_file_descriptor_locker(	\
-			state, file, op);				\
-		if (locker) {						\
-			PSI_server->start_file_wait(			\
-				locker, count, src_file, src_line);	\
-		}							\
+	locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)(	\
+		state, file, op);					\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(start_file_wait)(				\
+			locker, count, src_file, src_line);		\
 	}								\
 } while (0)
 
 # define register_pfs_file_io_end(locker, count)			\
 do {									\
-	if (locker) {							\
-		PSI_server->end_file_wait(locker, count);		\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_FILE_CALL(end_file_wait)(locker, count);		\
 	}								\
 } while (0)
 #endif /* UNIV_PFS_IO  */
@@ -286,35 +316,39 @@ The wrapper functions have the prefix of "innodb_". */
 # define os_file_close(file)						\
 	pfs_os_file_close_func(file, __FILE__, __LINE__)
 
-# define os_aio(type, mode, name, file, buf, offset, offset_high,	\
+# define os_aio(type, mode, name, file, buf, offset,			\
 		n, message1, message2, space_id, trx)			\
 	pfs_os_aio_func(type, mode, name, file, buf, offset,		\
-			offset_high, n, message1, message2, space_id, trx,\
-			__FILE__, __LINE__)
+		n, message1, message2, space_id, trx,			\
+		__FILE__, __LINE__)
 
-# define os_file_read(file, buf, offset, offset_high, n)		\
-	pfs_os_file_read_func(file, buf, offset, offset_high, n, NULL,	\
+# define os_file_read(file, buf, offset, n)				\
+	pfs_os_file_read_func(file, buf, offset, n, NULL,		\
 			      __FILE__, __LINE__)
 
-# define os_file_read_trx(file, buf, offset, offset_high, n, trx)	\
-	pfs_os_file_read_func(file, buf, offset, offset_high, n, trx,	\
+# define os_file_read_trx(file, buf, offset, n, trx)			\
+	pfs_os_file_read_func(file, buf, offset, n, trx,		\
 			      __FILE__, __LINE__)
 
-# define os_file_read_no_error_handling(file, buf, offset,		\
-					offset_high, n)			\
-	pfs_os_file_read_no_error_handling_func(file, buf, offset,	\
-						offset_high, n,		\
+# define os_file_read_no_error_handling(file, buf, offset, n)		\
+	pfs_os_file_read_no_error_handling_func(file, buf, offset, n,	\
 						__FILE__, __LINE__)
 
-# define os_file_write(name, file, buf, offset, offset_high, n)		\
-	pfs_os_file_write_func(name, file, buf, offset, offset_high,	\
+# define os_file_write(name, file, buf, offset, n)	\
+	pfs_os_file_write_func(name, file, buf, offset,	\
 			       n, __FILE__, __LINE__)
 
-# define os_file_flush(file, metadata)					\
-	pfs_os_file_flush_func(file, metadata, __FILE__, __LINE__)
+# define os_file_flush(file)						\
+	pfs_os_file_flush_func(file, __FILE__, __LINE__)
 
 # define os_file_rename(key, oldpath, newpath)				\
 	pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name)					\
+	pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name)				\
+	pfs_os_file_delete_if_exists_func(key, name, __FILE__, __LINE__)
 #else /* UNIV_PFS_IO */
 
 /* If UNIV_PFS_IO is not defined, these I/O APIs point
@@ -322,8 +356,8 @@ to original un-instrumented file I/O APIs */
 # define os_file_create(key, name, create, purpose, type, success)	\
 	os_file_create_func(name, create, purpose, type, success)
 
-# define os_file_create_simple(key, name, create, access, success)	\
-	os_file_create_simple_func(name, create, access, success)
+# define os_file_create_simple(key, name, create_mode, access, success)	\
+	os_file_create_simple_func(name, create_mode, access, success)
 
 # define os_file_create_simple_no_error_handling(			\
 		key, name, create_mode, access, success)		\
@@ -332,40 +366,43 @@ to original un-instrumented file I/O APIs */
 
 # define os_file_close(file)	os_file_close_func(file)
 
-# define os_aio(type, mode, name, file, buf, offset, offset_high,	\
-	       n, message1, message2, space_id, trx)			\
-	os_aio_func(type, mode, name, file, buf, offset, offset_high, n,\
+# define os_aio(type, mode, name, file, buf, offset, n, message1,	\
+		message2, space_id, trx)				\
+	os_aio_func(type, mode, name, file, buf, offset, n,		\
 		    message1, message2, space_id, trx)
 
-# define os_file_read(file, buf, offset, offset_high, n)		\
-	os_file_read_func(file, buf, offset, offset_high, n, NULL)
+# define os_file_read(file, buf, offset, n)				\
+	os_file_read_func(file, buf, offset, n, NULL)
 
-# define os_file_read_trx(file, buf, offset, offset_high, n, trx)	\
-	os_file_read_func(file, buf, offset, offset_high, n, trx)
+# define os_file_read_trx(file, buf, offset, n, trx)			\
+	os_file_read_func(file, buf, offset, n, trx)
 
-# define os_file_read_no_error_handling(file, buf, offset,		\
-				       offset_high, n)			\
-	os_file_read_no_error_handling_func(file, buf, offset, offset_high, n)
+# define os_file_read_no_error_handling(file, buf, offset, n)		\
+	os_file_read_no_error_handling_func(file, buf, offset, n)
 
-# define os_file_write(name, file, buf, offset, offset_high, n)		\
-	os_file_write_func(name, file, buf, offset, offset_high, n)
+# define os_file_write(name, file, buf, offset, n)			\
+	os_file_write_func(name, file, buf, offset, n)
 
-# define os_file_flush(file, metadata)	os_file_flush_func(file, metadata)
+# define os_file_flush(file)	os_file_flush_func(file)
 
 # define os_file_rename(key, oldpath, newpath)				\
 	os_file_rename_func(oldpath, newpath)
 
+# define os_file_delete(key, name)	os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name)				\
+	os_file_delete_if_exists_func(name)
+
 #endif /* UNIV_PFS_IO */
 
 /* File types for directory entry data type */
 
-enum os_file_type_enum{
+enum os_file_type_t {
 	OS_FILE_TYPE_UNKNOWN = 0,
 	OS_FILE_TYPE_FILE,			/* regular file */
 	OS_FILE_TYPE_DIR,			/* directory */
 	OS_FILE_TYPE_LINK			/* symbolic link */
 };
-typedef enum os_file_type_enum	  os_file_type_t;
 
 /* Maximum path string length in bytes when referring to tables with in the
 './databasename/tablename.ibd' path format; we can allocate at least 2 buffers
@@ -373,16 +410,18 @@ of this size from the thread stack; that is why this should not be made much
 bigger than 4000 bytes */
 #define OS_FILE_MAX_PATH	4000
 
-/* Struct used in fetching information of a file in a directory */
-struct os_file_stat_struct{
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
 	char		name[OS_FILE_MAX_PATH];	/*!< path to a file */
 	os_file_type_t	type;			/*!< file type */
 	ib_int64_t	size;			/*!< file size */
 	time_t		ctime;			/*!< creation time */
 	time_t		mtime;			/*!< modification time */
 	time_t		atime;			/*!< access time */
+	bool		rw_perm;		/*!< true if can be opened
+						in read-write mode. Only valid
+						if type == OS_FILE_TYPE_FILE */
 };
-typedef struct os_file_stat_struct	os_file_stat_t;
 
 #ifdef __WIN__
 typedef HANDLE	os_file_dir_t;	/*!< directory stream */
@@ -478,13 +517,7 @@ os_file_create_simple_func(
 /*=======================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
@@ -500,15 +533,13 @@ os_file_create_simple_no_error_handling_func(
 /*=========================================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
 				used by a backup program reading the file */
-	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+	__attribute__((nonnull, warn_unused_result));
 /****************************************************************//**
 Tries to disable OS caching on an opened file descriptor. */
 UNIV_INTERN
@@ -532,14 +563,7 @@ os_file_create_func(
 /*================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -548,24 +572,27 @@ os_file_create_func(
 				async i/o or unbuffered i/o: look in the
 				function source code for the exact rules */
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
-	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Deletes a file. The file has to be closed before calling this.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
-os_file_delete(
-/*===========*/
-	const char*	name);	/*!< in: file path as a null-terminated string */
+bool
+os_file_delete_func(
+/*================*/
+	const char*	name);	/*!< in: file path as a null-terminated
+				string */
 
 /***********************************************************************//**
 Deletes a file if it exists. The file has to be closed before calling this.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
-os_file_delete_if_exists(
-/*=====================*/
-	const char*	name);	/*!< in: file path as a null-terminated string */
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+	const char*	name);	/*!< in: file path as a null-terminated
+				string */
 /***********************************************************************//**
 NOTE! Use the corresponding macro os_file_rename(), not directly
 this function!
@@ -606,18 +633,13 @@ pfs_os_file_create_simple_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /****************************************************************//**
 NOTE! Please use the corresponding macro
@@ -634,17 +656,15 @@ pfs_os_file_create_simple_no_error_handling_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode, /*!< in: file create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
 				used by a backup program reading the file */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /****************************************************************//**
 NOTE! Please use the corresponding macro os_file_create(), not directly
@@ -660,14 +680,7 @@ pfs_os_file_create_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: file create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -678,7 +691,8 @@ pfs_os_file_create_func(
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /***********************************************************************//**
 NOTE! Please use the corresponding macro os_file_close(), not directly
@@ -704,10 +718,7 @@ pfs_os_file_read_func(
 /*==================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	trx_t*		trx,
 	const char*	src_file,/*!< in: file name where func invoked */
@@ -726,10 +737,7 @@ pfs_os_file_read_no_error_handling_func(
 /*====================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
@@ -751,10 +759,7 @@ pfs_os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -782,10 +787,7 @@ pfs_os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n,	/*!< in: number of bytes to write */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
@@ -801,7 +803,6 @@ ibool
 pfs_os_file_flush_func(
 /*===================*/
 	os_file_t	file,	/*!< in, own: handle to a file */
-	ibool		metadata,
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
 
@@ -821,6 +822,38 @@ pfs_os_file_rename_func(
 	const char*	newpath,/*!< in: new file path */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+/*====================*/
+	mysql_pfs_key_t	key,	/*!< in: Performance Schema Key */
+	const char*	name,	/*!< in: file path as a null-terminated
+				string */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+/*==============================*/
+	mysql_pfs_key_t	key,	/*!< in: Performance Schema Key */
+	const char*	name,	/*!< in: file path as a null-terminated
+				string */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
 #endif	/* UNIV_PFS_IO */
 
 /***********************************************************************//**
@@ -833,23 +866,13 @@ os_file_close_no_error_handling(
 	os_file_t	file);	/*!< in, own: handle to a file */
 /***********************************************************************//**
 Gets a file size.
-@return	TRUE if success */
+@return	file size, or (os_offset_t) -1 on failure */
 UNIV_INTERN
-ibool
+os_offset_t
 os_file_get_size(
 /*=============*/
-	os_file_t	file,	/*!< in: handle to a file */
-	ulint*		size,	/*!< out: least significant 32 bits of file
-				size */
-	ulint*		size_high);/*!< out: most significant 32 bits of size */
-/***********************************************************************//**
-Gets file size as a 64-bit integer ib_int64_t.
-@return	size in bytes, -1 if error */
-UNIV_INTERN
-ib_int64_t
-os_file_get_size_as_iblonglong(
-/*===========================*/
-	os_file_t	file);	/*!< in: handle to a file */
+	os_file_t	file)	/*!< in: handle to a file */
+	__attribute__((warn_unused_result));
 /***********************************************************************//**
 Write the specified number of zeros to a newly created file.
 @return	TRUE if success */
@@ -860,9 +883,8 @@ os_file_set_size(
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
-	ulint		size,	/*!< in: least significant 32 bits of file
-				size */
-	ulint		size_high);/*!< in: most significant 32 bits of size */
+	os_offset_t	size)	/*!< in: file size */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Truncates a file at its current position.
 @return	TRUE if success */
@@ -887,8 +909,7 @@ UNIV_INTERN
 ibool
 os_file_flush_func(
 /*===============*/
-	os_file_t	file,	/*!< in, own: handle to a file */
-	ibool		metadata);
+	os_file_t	file);	/*!< in, own: handle to a file */
 /***********************************************************************//**
 Retrieves the last error number if an error occurs in a file io function.
 The number should be retrieved before any other OS calls (because they may
@@ -899,7 +920,7 @@ UNIV_INTERN
 ulint
 os_file_get_last_error(
 /*===================*/
-	ibool	report_all_errors);	/*!< in: TRUE if we want an error message
+	bool	report_all_errors);	/*!< in: TRUE if we want an error message
 					printed of all errors */
 /*******************************************************************//**
 NOTE! Use the corresponding macro os_file_read(), not directly this function!
@@ -911,10 +932,7 @@ os_file_read_func(
 /*==============*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	trx_t*		trx);
 /*******************************************************************//**
@@ -940,10 +958,7 @@ os_file_read_no_error_handling_func(
 /*================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n);	/*!< in: number of bytes to read */
 
 /*******************************************************************//**
@@ -959,10 +974,7 @@ os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n);	/*!< in: number of bytes to write */
 /*******************************************************************//**
 Check the existence and type of the given file.
@@ -978,8 +990,8 @@ os_file_status(
 The function os_file_dirname returns a directory component of a
 null-terminated pathname string.  In the usual case, dirname returns
 the string up to, but not including, the final '/', and basename
-is the component following the final '/'.  Trailing '/' charac�
-ters are not counted as part of the pathname.
+is the component following the final '/'.  Trailing '/' characters
+are not counted as part of the pathname.
 
 If path does not contain a slash, dirname returns the string ".".
 
@@ -1008,6 +1020,60 @@ os_file_dirname(
 /*============*/
 	const char*	path);	/*!< in: pathname */
 /****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename.  The old_path is a full path
+name including the extension.  The tablename is in the normal
+form "databasename/tablename".  The new base name is found after
+the forward slash.  Both input strings are null terminated.
+
+This function allocates memory to be returned.  It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return	own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+	const char*	old_path,	/*!< in: pathname */
+	const char*	new_name);	/*!< in: new file name */
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'.  It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided.  The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+This function allocates memory to be returned.  It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+	const char*	data_dir_path,	/*!< in: pathname */
+	const char*	tablename,	/*!< in: tablename */
+	const char*	extention);	/*!< in: file extension; ibd,cfg*/
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return.  The result is used
+to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+	char*	data_dir_path);	/*!< in/out: full path/data_dir_path */
+/****************************************************************//**
 Creates all missing subdirectories along the given path.
 @return	TRUE if call succeeded FALSE otherwise */
 UNIV_INTERN
@@ -1066,10 +1132,7 @@ os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -1198,14 +1261,16 @@ os_aio_all_slots_free(void);
 
 /*******************************************************************//**
 This function returns information about the specified file
-@return	TRUE if stat information found */
+@return	DB_SUCCESS if all OK */
 UNIV_INTERN
-ibool
+dberr_t
 os_file_get_status(
 /*===============*/
-	const char*	path,		/*!< in:	pathname of the file */
-	os_file_stat_t* stat_info);	/*!< information of a file in a
+	const char*	path,		/*!< in: pathname of the file */
+	os_file_stat_t* stat_info,	/*!< information of a file in a
 					directory */
+	bool		check_rw_perm);	/*!< in: for testing whether the
+					file can be opened in RW mode */
 
 #if !defined(UNIV_HOTBACKUP)
 /*********************************************************************//**
diff --git a/storage/xtradb/include/os0file.ic b/storage/xtradb/include/os0file.ic
index 137ce59b62d..25a1397147e 100644
--- a/storage/xtradb/include/os0file.ic
+++ b/storage/xtradb/include/os0file.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,13 +40,7 @@ pfs_os_file_create_simple_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
@@ -88,10 +82,7 @@ pfs_os_file_create_simple_no_error_handling_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode, /*!< in: file create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
@@ -133,14 +124,7 @@ pfs_os_file_create_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: file create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -216,10 +200,7 @@ pfs_os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -245,7 +226,7 @@ pfs_os_aio_func(
 					: PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_aio_func(type, mode, name, file, buf, offset, offset_high,
+	result = os_aio_func(type, mode, name, file, buf, offset,
 			     n, message1, message2, space_id, trx);
 
 	register_pfs_file_io_end(locker, n);
@@ -265,10 +246,7 @@ pfs_os_file_read_func(
 /*==================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	trx_t*		trx,
 	const char*	src_file,/*!< in: file name where func invoked */
@@ -281,7 +259,7 @@ pfs_os_file_read_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_file_read_func(file, buf, offset, offset_high, n, trx);
+	result = os_file_read_func(file, buf, offset, n, trx);
 
 	register_pfs_file_io_end(locker, n);
 
@@ -302,10 +280,7 @@ pfs_os_file_read_no_error_handling_func(
 /*====================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
@@ -317,8 +292,7 @@ pfs_os_file_read_no_error_handling_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_file_read_no_error_handling_func(file, buf, offset,
-						     offset_high, n);
+	result = os_file_read_no_error_handling_func(file, buf, offset, n);
 
 	register_pfs_file_io_end(locker, n);
 
@@ -339,10 +313,7 @@ pfs_os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n,	/*!< in: number of bytes to write */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
@@ -354,7 +325,7 @@ pfs_os_file_write_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_WRITE,
 				   src_file, src_line);
 
-	result = os_file_write_func(name, file, buf, offset, offset_high, n);
+	result = os_file_write_func(name, file, buf, offset, n);
 
 	register_pfs_file_io_end(locker, n);
 
@@ -372,7 +343,6 @@ ibool
 pfs_os_file_flush_func(
 /*===================*/
 	os_file_t	file,	/*!< in, own: handle to a file */
-	ibool		metadata,
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
 {
@@ -382,7 +352,7 @@ pfs_os_file_flush_func(
 
 	register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
 				   src_file, src_line);
-	result = os_file_flush_func(file, metadata);
+	result = os_file_flush_func(file);
 
 	register_pfs_file_io_end(locker, 0);
 
@@ -419,4 +389,64 @@ pfs_os_file_rename_func(
 
 	return(result);
 }
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+/*====================*/
+	mysql_pfs_key_t key,		/*!< in: Performance Schema Key */
+	const char*	name,		/*!< in: file path as a null-terminated
+					string */
+	const char*	src_file,	/*!< in: file name where func invoked */
+	ulint		src_line)	/*!< in: line where the func invoked */
+{
+	bool	result;
+	struct PSI_file_locker*	locker = NULL;
+	PSI_file_locker_state	state;
+
+	register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE,
+				      name, src_file, src_line);
+
+	result = os_file_delete_func(name);
+
+	register_pfs_file_close_end(locker, 0);
+
+	return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+/*==============================*/
+	mysql_pfs_key_t key,		/*!< in: Performance Schema Key */
+	const char*	name,		/*!< in: file path as a null-terminated
+					string */
+	const char*	src_file,	/*!< in: file name where func invoked */
+	ulint		src_line)	/*!< in: line where the func invoked */
+{
+	bool	result;
+	struct PSI_file_locker*	locker = NULL;
+	PSI_file_locker_state	state;
+
+	register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE,
+				      name, src_file, src_line);
+
+	result = os_file_delete_if_exists_func(name);
+
+	register_pfs_file_close_end(locker, 0);
+
+	return(result);
+}
 #endif /* UNIV_PFS_IO */
diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h
index 7cf80217bec..f9e88ff1a28 100644
--- a/storage/xtradb/include/os0proc.h
+++ b/storage/xtradb/include/os0proc.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/os0proc.ic b/storage/xtradb/include/os0proc.ic
index 6d7eb1be37c..506f4f8ce0c 100644
--- a/storage/xtradb/include/os0proc.ic
+++ b/storage/xtradb/include/os0proc.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h
index 60ee5dca08f..51c4530bb5a 100644
--- a/storage/xtradb/include/os0sync.h
+++ b/storage/xtradb/include/os0sync.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,28 +36,37 @@ Created 9/6/1995 Heikki Tuuri
 
 #include "univ.i"
 #include "ut0lst.h"
+#include "sync0types.h"
 
 #ifdef __WIN__
 /** Native event (slow)*/
 typedef HANDLE			os_native_event_t;
 /** Native mutex */
-typedef CRITICAL_SECTION	os_fast_mutex_t;
+typedef CRITICAL_SECTION	fast_mutex_t;
 /** Native condition variable. */
 typedef CONDITION_VARIABLE	os_cond_t;
 #else
 /** Native mutex */
-typedef pthread_mutex_t		os_fast_mutex_t;
+typedef pthread_mutex_t		fast_mutex_t;
 /** Native condition variable */
 typedef pthread_cond_t		os_cond_t;
 #endif
 
-/** Operating system event */
-typedef struct os_event_struct	os_event_struct_t;
+/** Structure that includes Performance Schema Probe pfs_psi
+in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */
+struct os_fast_mutex_t {
+	fast_mutex_t		mutex;	/*!< os_fast_mutex */
+#ifdef UNIV_PFS_MUTEX
+	struct PSI_mutex*	pfs_psi;/*!< The performance schema
+					instrumentation hook */
+#endif
+};
+
 /** Operating system event handle */
-typedef os_event_struct_t*	os_event_t;
+typedef struct os_event*	os_event_t;
 
 /** An asynchronous signal sent between threads */
-struct os_event_struct {
+struct os_event {
 #ifdef __WIN__
 	HANDLE		handle;		/*!< kernel event object, slow,
 					used on older Windows */
@@ -72,7 +81,7 @@ struct os_event_struct {
 					the event becomes signaled */
 	os_cond_t	cond_var;	/*!< condition variable is used in
 					waiting for the event */
-	UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+	UT_LIST_NODE_T(os_event_t) os_event_list;
 					/*!< list of all created events */
 };
 
@@ -82,13 +91,11 @@ struct os_event_struct {
 /** Return value of os_event_wait_time() when the time is exceeded */
 #define OS_SYNC_TIME_EXCEEDED   1
 
-/** Operating system mutex */
-typedef struct os_mutex_struct	os_mutex_str_t;
 /** Operating system mutex handle */
-typedef os_mutex_str_t*		os_mutex_t;
+typedef struct os_mutex_t*	os_ib_mutex_t;
 
 /** Mutex protecting counts and the event and OS 'slow' mutex lists */
-extern os_mutex_t	os_sync_mutex;
+extern os_ib_mutex_t	os_sync_mutex;
 
 /** This is incremented by 1 in os_thread_create and decremented by 1 in
 os_thread_exit */
@@ -117,10 +124,8 @@ explicitly by calling sync_os_reset_event.
 @return	the event handle */
 UNIV_INTERN
 os_event_t
-os_event_create(
-/*============*/
-	const char*	name);	/*!< in: the name of the event, if NULL
-				the event is created without a name */
+os_event_create(void);
+/*==================*/
 /**********************************************************//**
 Sets an event semaphore to the signaled state: lets waiting threads
 proceed. */
@@ -176,7 +181,7 @@ os_event_wait_low(
 					os_event_reset(). */
 
 #define os_event_wait(event) os_event_wait_low(event, 0)
-#define os_event_wait_time(e, t) os_event_wait_time_low(e, t, 0)
+#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0)
 
 /**********************************************************//**
 Waits for an event object until it is in the signaled state or
@@ -195,10 +200,10 @@ os_event_wait_time_low(
 						os_event_reset(). */
 /*********************************************************//**
 Creates an operating system mutex semaphore. Because these are slow, the
-mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible.
 @return	the mutex handle */
 UNIV_INTERN
-os_mutex_t
+os_ib_mutex_t
 os_mutex_create(void);
 /*=================*/
 /**********************************************************//**
@@ -207,21 +212,21 @@ UNIV_INTERN
 void
 os_mutex_enter(
 /*===========*/
-	os_mutex_t	mutex);	/*!< in: mutex to acquire */
+	os_ib_mutex_t	mutex);	/*!< in: mutex to acquire */
 /**********************************************************//**
 Releases ownership of a mutex. */
 UNIV_INTERN
 void
 os_mutex_exit(
 /*==========*/
-	os_mutex_t	mutex);	/*!< in: mutex to release */
+	os_ib_mutex_t	mutex);	/*!< in: mutex to release */
 /**********************************************************//**
 Frees an mutex object. */
 UNIV_INTERN
 void
 os_mutex_free(
 /*==========*/
-	os_mutex_t	mutex);	/*!< in: mutex to free */
+	os_ib_mutex_t	mutex);	/*!< in: mutex to free */
 /**********************************************************//**
 Acquires ownership of a fast mutex. Currently in Windows this is the same
 as os_fast_mutex_lock!
@@ -231,34 +236,119 @@ ulint
 os_fast_mutex_trylock(
 /*==================*/
 	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+
+/**********************************************************************
+The following os_fast_mutex APIs are performance schema instrumented:
+
+os_fast_mutex_init
+os_fast_mutex_lock
+os_fast_mutex_unlock
+os_fast_mutex_free
+
+These mutex APIs will point to corresponding wrapper functions that contain
+the performance schema instrumentation.
+
+NOTE! The following macro should be used in mutex operation, not the
+corresponding function. */
+
+#ifdef UNIV_PFS_MUTEX
+# define os_fast_mutex_init(K, M)			\
+	pfs_os_fast_mutex_init(K, M)
+
+# define os_fast_mutex_lock(M)				\
+	pfs_os_fast_mutex_lock(M, __FILE__, __LINE__)
+
+# define os_fast_mutex_unlock(M)	pfs_os_fast_mutex_unlock(M)
+
+# define os_fast_mutex_free(M)		pfs_os_fast_mutex_free(M)
+
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	os_fast_mutex_t*	fast_mutex);	/*!< out: fast mutex */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+Wrapper function for os_fast_mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in/out: mutex to free */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock. Acquires ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex,	/*!< in/out: mutex to acquire */
+	const char*		file_name,	/*!< in: file name where
+						 locked */
+	ulint			line);		/*!< in: line where locked */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock. Releases ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in/out: mutex to release */
+
+#else /* UNIV_PFS_MUTEX */
+
+# define os_fast_mutex_init(K, M)			\
+	os_fast_mutex_init_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_lock(M)				\
+	os_fast_mutex_lock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_unlock(M)			\
+	os_fast_mutex_unlock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_free(M)				\
+	os_fast_mutex_free_func(&((os_fast_mutex_t*)(M))->mutex)
+#endif /* UNIV_PFS_MUTEX */
+
 /**********************************************************//**
 Releases ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_unlock(
-/*=================*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to release */
+os_fast_mutex_unlock_func(
+/*======================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to release */
 /*********************************************************//**
 Initializes an operating system fast mutex semaphore. */
 UNIV_INTERN
 void
-os_fast_mutex_init(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: fast mutex */
+os_fast_mutex_init_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: fast mutex */
 /**********************************************************//**
 Acquires ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_lock(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+os_fast_mutex_lock_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to acquire */
 /**********************************************************//**
 Frees an mutex object. */
 UNIV_INTERN
 void
-os_fast_mutex_free(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to free */
+os_fast_mutex_free_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to free */
 
 /**********************************************************//**
 Atomic compare-and-swap and increment for InnoDB. */
@@ -311,12 +401,30 @@ amount of increment. */
 # define os_atomic_increment_uint64(ptr, amount) \
 	os_atomic_increment(ptr, amount)
 
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement(ptr, amount) \
+	__sync_sub_and_fetch(ptr, amount)
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_uint64(ptr, amount) \
+	os_atomic_decrement(ptr, amount)
+
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val */
 
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	__sync_lock_test_and_set(ptr, (byte) new_val)
 
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+	__sync_lock_test_and_set(ptr, new_val)
+
 #elif defined(HAVE_IB_SOLARIS_ATOMICS)
 
 # define HAVE_ATOMIC_BUILTINS
@@ -335,15 +443,15 @@ compare to, new_val is the value to swap in. */
 	(atomic_cas_ulong(ptr, old_val, new_val) == old_val)
 
 # define os_compare_and_swap_lint(ptr, old_val, new_val) \
-	((lint)atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
+	((lint) atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
 
 # ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS
 #  if SIZEOF_PTHREAD_T == 4
 #   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	((pthread_t)atomic_cas_32(ptr, old_val, new_val) == old_val)
+	((pthread_t) atomic_cas_32(ptr, old_val, new_val) == old_val)
 #  elif SIZEOF_PTHREAD_T == 8
 #   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	((pthread_t)atomic_cas_64(ptr, old_val, new_val) == old_val)
+	((pthread_t) atomic_cas_64(ptr, old_val, new_val) == old_val)
 #  else
 #   error "SIZEOF_PTHREAD_T != 4 or 8"
 #  endif /* SIZEOF_PTHREAD_T CHECK */
@@ -359,21 +467,36 @@ compare to, new_val is the value to swap in. */
 Returns the resulting value, ptr is pointer to target, amount is the
 amount of increment. */
 
-# define os_atomic_increment_lint(ptr, amount) \
-	atomic_add_long_nv((ulong_t*) ptr, amount)
-
 # define os_atomic_increment_ulint(ptr, amount) \
 	atomic_add_long_nv(ptr, amount)
 
+# define os_atomic_increment_lint(ptr, amount) \
+	os_atomic_increment_ulint((ulong_t*) ptr, amount)
+
 # define os_atomic_increment_uint64(ptr, amount) \
 	atomic_add_64_nv(ptr, amount)
 
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	os_atomic_increment_ulint((ulong_t*) ptr, -(amount))
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	os_atomic_increment_ulint(ptr, -(amount))
+
+# define os_atomic_decrement_uint64(ptr, amount) \
+	os_atomic_increment_uint64(ptr, -(amount))
+
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val */
 
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	atomic_swap_uchar(ptr, new_val)
 
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+	atomic_swap_ulong(ptr, new_val)
+
 #elif defined(HAVE_WINDOWS_ATOMICS)
 
 # define HAVE_ATOMIC_BUILTINS
@@ -382,28 +505,66 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
 #  define HAVE_ATOMIC_BUILTINS_64
 # endif
 
-/* On Windows, use Windows atomics / interlocked */
-# ifdef _WIN64
-#  define win_cmp_and_xchg InterlockedCompareExchange64
-#  define win_xchg_and_add InterlockedExchangeAdd64
-# else /* _WIN64 */
-#  define win_cmp_and_xchg InterlockedCompareExchange
-#  define win_xchg_and_add InterlockedExchangeAdd
-# endif
+/**********************************************************//**
+Atomic compare and exchange of signed integers (both 32 and 64 bit).
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+	volatile lint*	ptr,		/*!< in/out: source/destination */
+	lint		new_val,	/*!< in: exchange value */
+	lint		old_val);	/*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+	volatile lint*	ptr,	/*!< in/out: address of destination */
+	lint		val);	/*!< in: number to be added */
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+	volatile ulint*	ptr,		/*!< in/out: source/destination */
+	ulint		new_val,	/*!< in: exchange value */
+	ulint		old_val);	/*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic compare and exchange of 32 bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+	volatile DWORD*	ptr,		/*!< in/out: source/destination */
+	DWORD		new_val,	/*!< in: exchange value */
+	DWORD		old_val);	/*!< in: value to compare to */
 
 /**********************************************************//**
 Returns true if swapped, ptr is pointer to target, old_val is value to
 compare to, new_val is the value to swap in. */
 
 # define os_compare_and_swap_ulint(ptr, old_val, new_val) \
-	(win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val)
 
 # define os_compare_and_swap_lint(ptr, old_val, new_val) \
-	(win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_lint(ptr, new_val, old_val) == old_val)
 
 /* windows thread objects can always be passed to windows atomic functions */
 # define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	(InterlockedCompareExchange(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val)
+
 # define INNODB_RW_LOCKS_USE_ATOMICS
 # define IB_ATOMICS_STARTUP_MSG \
 	"Mutexes and rw_locks use Windows interlocked functions"
@@ -416,12 +577,27 @@ amount of increment. */
 	(win_xchg_and_add(ptr, amount) + amount)
 
 # define os_atomic_increment_ulint(ptr, amount) \
-	((ulint) (win_xchg_and_add(ptr, amount) + amount))
+	((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount))
 
 # define os_atomic_increment_uint64(ptr, amount)		\
 	((ib_uint64_t) (InterlockedExchangeAdd64(		\
-				 (ib_int64_t*) ptr,		\
-				 (ib_int64_t) amount) + amount))
+				(ib_int64_t*) ptr,		\
+				(ib_int64_t) amount) + amount))
+
+/**********************************************************//**
+Returns the resulting value; ptr points to the target, amount is the amount
+to decrement. Windows has no atomic subtract, so the negated amount is added. */
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	(win_xchg_and_add(ptr, -(lint) (amount)) - (amount))
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	((ulint) (win_xchg_and_add((lint*) ptr, -(lint) (amount)) - (amount)))
+
+# define os_atomic_decrement_uint64(ptr, amount)		\
+	((ib_uint64_t) (InterlockedExchangeAdd64(		\
+				(ib_int64_t*) ptr,		\
+				-(ib_int64_t) (amount)) - (amount)))
 
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val.
@@ -431,10 +607,55 @@ clobbered */
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	((byte) InterlockedExchange(ptr, new_val))
 
+# define os_atomic_test_and_set_ulong(ptr, new_val) \
+	InterlockedExchange(ptr, new_val)
+
 #else
 # define IB_ATOMICS_STARTUP_MSG \
 	"Mutexes and rw_locks use InnoDB's own implementation"
 #endif
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_atomic_inc_ulint(m,v,d)	os_atomic_increment_ulint(v, d)
+#define os_atomic_dec_ulint(m,v,d)	os_atomic_decrement_ulint(v, d)
+#else
+#define os_atomic_inc_ulint(m,v,d)	os_atomic_inc_ulint_func(m, v, d)
+#define os_atomic_dec_ulint(m,v,d)	os_atomic_dec_ulint_func(m, v, d)
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/**********************************************************//**
+Following macros are used to update specified counter atomically
+if HAVE_ATOMIC_BUILTINS defined. Otherwise, use mutex passed in
+for synchronization */
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_increment_counter_by_amount(mutex, counter, amount)	\
+	(void) os_atomic_increment_ulint(&counter, amount)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount)	\
+	(void) os_atomic_increment_ulint(&counter, (-((lint) amount)))
+#else
+#define os_increment_counter_by_amount(mutex, counter, amount)	\
+	do {							\
+		mutex_enter(&(mutex));				\
+		(counter) += (amount);				\
+		mutex_exit(&(mutex));				\
+	} while (0)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount)	\
+	do {							\
+		mutex_enter(&(mutex));				\
+		ut_a((counter) >= (amount)); /* under mutex */	\
+		(counter) -= (amount);				\
+		mutex_exit(&(mutex));				\
+	} while (0)
+#endif  /* HAVE_ATOMIC_BUILTINS */
+
+#define os_inc_counter(mutex, counter)				\
+	os_increment_counter_by_amount(mutex, counter, 1)
+
+#define os_dec_counter(mutex, counter)				\
+	do {							\
+		os_decrement_counter_by_amount(mutex, counter, 1);\
+	} while (0)	/* no trailing ';': the caller supplies it */
 
 #ifndef UNIV_NONINL
 #include "os0sync.ic"
diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic
index 409ff19170a..33c238ceb47 100644
--- a/storage/xtradb/include/os0sync.ic
+++ b/storage/xtradb/include/os0sync.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,14 +36,10 @@ os_fast_mutex_trylock(
 /*==================*/
 	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
 {
-#ifdef __WIN__
-	if (TryEnterCriticalSection(fast_mutex)) {
-
-		return(0);
-	} else {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 
-		return(1);
-	}
+#ifdef __WIN__
+	return(!TryEnterCriticalSection(mutex));
 #else
 	/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
 	so that it returns 0 on success. In the operating system
@@ -51,6 +47,186 @@ os_fast_mutex_trylock(
 	returns 1 on success (but MySQL remaps that to 0), while Linux,
 	FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
 
-	return((ulint) pthread_mutex_trylock(fast_mutex));
+	return((ulint) pthread_mutex_trylock(mutex));
+#endif
+}
+
+#ifdef UNIV_PFS_MUTEX
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	os_fast_mutex_t*	fast_mutex)	/*!< out: fast mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	fast_mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, &fast_mutex->mutex);
+#else
+	fast_mutex->pfs_psi = NULL;
+#endif
+
+	os_fast_mutex_init_func(&fast_mutex->mutex);
+}
+/******************************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+Wrapper function for os_fast_mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex)  /*!< in/out: mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+		PSI_MUTEX_CALL(destroy_mutex)(fast_mutex->pfs_psi);
 #endif
+	fast_mutex->pfs_psi = NULL;
+
+	os_fast_mutex_free_func(&fast_mutex->mutex);
 }
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock_func. Acquires ownership of a fast
+mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex,	/*!< in/out: mutex to acquire */
+	const char*		file_name,	/*!< in: file name where
+						 locked */
+	ulint			line)		/*!< in: line where locked */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+	{
+		PSI_mutex_locker* 	locker;
+		PSI_mutex_locker_state	state;
+
+		locker = PSI_MUTEX_CALL(start_mutex_wait)(&state, fast_mutex->pfs_psi,
+			PSI_MUTEX_LOCK, file_name, line);
+
+		os_fast_mutex_lock_func(&fast_mutex->mutex);
+
+		if (locker != NULL)
+			PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
+	}
+	else
+#endif
+	{
+		os_fast_mutex_lock_func(&fast_mutex->mutex);
+	}
+
+	return;
+}
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock_func. Releases ownership of a
+fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in/out: mutex to release */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+		PSI_MUTEX_CALL(unlock_mutex)(fast_mutex->pfs_psi);
+#endif
+
+	os_fast_mutex_unlock_func(&fast_mutex->mutex);
+}
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+
+/* Use inline functions to make 64 and 32 bit versions of windows atomic
+functions so that typecasts are evaluated at compile time. Take advantage
+that lint is either __int64 or long int and windows atomic functions work
+on __int64 and LONG */
+
+/**********************************************************//**
+Atomic compare and exchange of signed integers (both 32 and 64 bit).
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+	volatile lint*	ptr,		/*!< in/out: source/destination */
+	lint		new_val,	/*!< in: exchange value */
+	lint		old_val)	/*!< in: value to compare to */
+{
+# ifdef _WIN64
+	return(InterlockedCompareExchange64(ptr, new_val, old_val));
+# else
+	return(InterlockedCompareExchange(ptr, new_val, old_val));
+# endif
+}
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+	volatile lint*	ptr,	/*!< in/out: address of destination */
+	lint		val)	/*!< in: number to be added */
+{
+#ifdef _WIN64
+	return(InterlockedExchangeAdd64(ptr, val));
+#else
+	return(InterlockedExchangeAdd(ptr, val));
+#endif
+}
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+	volatile ulint*	ptr,		/*!< in/out: source/destination */
+	ulint		new_val,	/*!< in: exchange value */
+	ulint		old_val)	/*!< in: value to compare to */
+{
+	return((ulint) win_cmp_and_xchg_lint(
+		(volatile lint*) ptr,
+		(lint) new_val,
+		(lint) old_val));
+}
+
+/**********************************************************//**
+Atomic compare and exchange of 32-bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+	volatile DWORD*	ptr,		/*!< in/out: source/destination */
+	DWORD		new_val,	/*!< in: exchange value */
+	DWORD		old_val)	/*!< in: value to compare to */
+{
+	ut_ad(sizeof(DWORD) == sizeof(LONG));	/* We assume this. */
+	return(InterlockedCompareExchange(
+		(volatile LONG*) ptr,
+		(LONG) new_val,
+		(LONG) old_val));
+}
+
+#endif /* HAVE_WINDOWS_ATOMICS */
+
diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h
index e8538247d10..d84eff99519 100644
--- a/storage/xtradb/include/os0thread.h
+++ b/storage/xtradb/include/os0thread.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,13 +29,16 @@ Created 9/8/1995 Heikki Tuuri
 
 #include "univ.i"
 
+#ifdef UNIV_LINUX
+#include <sys/types.h>
+#endif
+
 /* Maximum number of threads which can be created in the program;
 this is also the size of the wait slot array for MySQL threads which
 can wait inside InnoDB */
 
 #define	OS_THREAD_MAX_N		srv_max_n_threads
 
-
 /* Possible fixed priorities for threads */
 #define OS_THREAD_PRIORITY_NONE		100
 #define OS_THREAD_PRIORITY_BACKGROUND	1
@@ -44,15 +47,46 @@ can wait inside InnoDB */
 
 #ifdef __WIN__
 typedef void*			os_thread_t;
-typedef unsigned long		os_thread_id_t;	/*!< In Windows the thread id
+typedef DWORD			os_thread_id_t;	/*!< In Windows the thread id
 						is an unsigned long int */
+typedef os_thread_id_t		os_tid_t;
+extern "C"  {
+typedef LPTHREAD_START_ROUTINE	os_thread_func_t;
+}
+
+/** Macro for specifying a Windows thread start function. */
+#define DECLARE_THREAD(func)	WINAPI func
+
+/** Required to get around a build error on Windows. Even though our functions
+are defined/declared as WINAPI f(LPVOID a); the compiler complains that they
+are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions
+don't access the arguments and don't return any value, we should be safe. */
+#define os_thread_create(f,a,i)	\
+	os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i)
+
 #else
+
 typedef pthread_t		os_thread_t;
 typedef os_thread_t		os_thread_id_t;	/*!< In Unix we use the thread
 						handle itself as the id of
 						the thread */
+#ifdef UNIV_LINUX
+typedef pid_t			os_tid_t;	/*!< An alias for pid_t on
+						Linux, where setpriority()
+						accepts thread id of this type
+						and not pthread_t */
+#else
+typedef os_thread_id_t		os_tid_t;
 #endif
 
+extern "C"  { typedef void*	(*os_thread_func_t)(void*); }
+
+/** Macro for specifying a POSIX thread start function. */
+#define DECLARE_THREAD(func)	func
+#define os_thread_create(f,a,i)	os_thread_create_func(f, a, i)
+
+#endif /* __WIN__ */
+
 /* Define a function pointer type to use in a typecast */
 typedef void* (*os_posix_f_t) (void*);
 
@@ -88,14 +122,10 @@ thread should always use that to exit and not use return() to exit.
 @return	handle to the thread */
 UNIV_INTERN
 os_thread_t
-os_thread_create(
-/*=============*/
-#ifndef __WIN__
-	os_posix_f_t		start_f,
-#else
-	ulint (*start_f)(void*),		/*!< in: pointer to function
+os_thread_create_func(
+/*==================*/
+	os_thread_func_t	func,		/*!< in: pointer to function
 						from which to start */
-#endif
 	void*			arg,		/*!< in: argument to start
 						function */
 	os_thread_id_t*		thread_id);	/*!< out: id of the created
@@ -118,6 +148,15 @@ os_thread_id_t
 os_thread_get_curr_id(void);
 /*========================*/
 /*****************************************************************//**
+Returns the system-specific thread identifier of current thread.  On Linux,
+returns tid.  On other systems currently returns os_thread_get_curr_id().
+
+@return	current thread identifier */
+UNIV_INTERN
+os_tid_t
+os_thread_get_tid(void);
+/*=====================*/
+/*****************************************************************//**
 Advises the os to give up remainder of the thread's time slice. */
 UNIV_INTERN
 void
@@ -130,6 +169,18 @@ void
 os_thread_sleep(
 /*============*/
 	ulint	tm);	/*!< in: time in microseconds */
+/*****************************************************************//**
+Set relative scheduling priority for a given thread on Linux.  Currently a
+no-op on other systems.
+
+@return the actual thread priority after the update */
+UNIV_INTERN
+ulint
+os_thread_set_priority(
+/*===================*/
+	os_tid_t	thread_id,		/*!< in: thread id */
+	ulint		relative_priority);	/*!< in: system-specific
+						priority value */
 
 #ifndef UNIV_NONINL
 #include "os0thread.ic"
diff --git a/storage/xtradb/include/os0thread.ic b/storage/xtradb/include/os0thread.ic
index 5615791c77e..0622d22f2dc 100644
--- a/storage/xtradb/include/os0thread.ic
+++ b/storage/xtradb/include/os0thread.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/page0cur.h b/storage/xtradb/include/page0cur.h
index 5081a1de0ab..b1ad49b4915 100644
--- a/storage/xtradb/include/page0cur.h
+++ b/storage/xtradb/include/page0cur.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -162,6 +162,12 @@ Inserts a record next to page cursor. Returns pointer to inserted record if
 succeed, i.e., enough space available, NULL otherwise. The cursor stays at
 the same logical position, but the physical position may change if it is
 pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INLINE
 rec_t*
@@ -170,14 +176,23 @@ page_cur_tuple_insert(
 	page_cur_t*	cursor,	/*!< in/out: a page cursor */
 	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
 	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	__attribute__((nonnull(1,2,3,4,5), warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /***********************************************************//**
 Inserts a record next to page cursor. Returns pointer to inserted record if
 succeed, i.e., enough space available, NULL otherwise. The cursor stays at
 the same logical position, but the physical position may change if it is
 pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INLINE
 rec_t*
@@ -202,27 +217,38 @@ page_cur_insert_rec_low(
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_t*	rec,	/*!< in: pointer to a physical record */
 	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	__attribute__((nonnull(1,2,3,4), warn_unused_result));
 /***********************************************************//**
 Inserts a record next to page cursor on a compressed and uncompressed
 page. Returns pointer to inserted record if succeed, i.e.,
 enough space available, NULL otherwise.
 The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INTERN
 rec_t*
 page_cur_insert_rec_zip(
 /*====================*/
-	rec_t**		current_rec,/*!< in/out: pointer to current record after
-				which the new record is inserted */
-	buf_block_t*	block,	/*!< in: buffer block of *current_rec */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_t*	rec,	/*!< in: pointer to a physical record */
 	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+	__attribute__((nonnull(1,2,3,4), warn_unused_result));
 /*************************************************************//**
 Copies records from page to a newly created page, from a given record onward,
-including that record. Infimum and supremum records are not copied. */
+including that record. Infimum and supremum records are not copied.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
 UNIV_INTERN
 void
 page_copy_rec_list_end_to_created_page(
@@ -238,10 +264,11 @@ UNIV_INTERN
 void
 page_cur_delete_rec(
 /*================*/
-	page_cur_t*	cursor,	/*!< in/out: a page cursor */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	const ulint*	offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
-	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+	page_cur_t*		cursor,	/*!< in/out: a page cursor */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const ulint*		offsets,/*!< in: rec_get_offsets(
+					cursor->rec, index) */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
 #ifndef UNIV_HOTBACKUP
 /****************************************************************//**
 Searches the right position for a page cursor.
@@ -331,10 +358,24 @@ page_cur_parse_delete_rec(
 	buf_block_t*	block,	/*!< in: page or NULL */
 	dict_index_t*	index,	/*!< in: record descriptor */
 	mtr_t*		mtr);	/*!< in: mtr or NULL */
+/*******************************************************//**
+Removes the record from a leaf page. This function does not log
+any changes. It is used by the IMPORT tablespace functions.
+@return	true if success, i.e., the page did not become too empty */
+UNIV_INTERN
+bool
+page_delete_rec(
+/*============*/
+	const dict_index_t*	index,	/*!< in: The index that the record
+					belongs to */
+	page_cur_t*		pcur,	/*!< in/out: page cursor on record
+					to delete */
+	page_zip_des_t*		page_zip,/*!< in: compressed page descriptor */
+	const ulint*		offsets);/*!< in: offsets for record */
 
 /** Index page cursor */
 
-struct page_cur_struct{
+struct page_cur_t{
 	byte*		rec;	/*!< pointer to a record on page */
 	buf_block_t*	block;	/*!< pointer to the block containing rec */
 };
diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic
index 1903fedf9e5..028d33b17aa 100644
--- a/storage/xtradb/include/page0cur.ic
+++ b/storage/xtradb/include/page0cur.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,6 +27,8 @@ Created 10/4/1994 Heikki Tuuri
 #include "buf0types.h"
 
 #ifdef UNIV_DEBUG
+# include "rem0cmp.h"
+
 /*********************************************************//**
 Gets pointer to the page frame where the cursor is positioned.
 @return	page */
@@ -235,6 +237,12 @@ Inserts a record next to page cursor. Returns pointer to inserted record if
 succeed, i.e., enough space available, NULL otherwise. The cursor stays at
 the same logical position, but the physical position may change if it is
 pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INLINE
 rec_t*
@@ -243,32 +251,36 @@ page_cur_tuple_insert(
 	page_cur_t*	cursor,	/*!< in/out: a page cursor */
 	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
 	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
 	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
 {
-	mem_heap_t*	heap;
-	ulint*		offsets;
 	ulint		size
 		= rec_get_converted_size(index, tuple, n_ext);
 	rec_t*		rec;
 
-	heap = mem_heap_create(size
-			       + (4 + REC_OFFS_HEADER_SIZE
-				  + dtuple_get_n_fields(tuple))
-			       * sizeof *offsets);
-	rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size),
+	if (!*heap) {
+		*heap = mem_heap_create(size
+					+ (4 + REC_OFFS_HEADER_SIZE
+					   + dtuple_get_n_fields(tuple))
+					* sizeof **offsets);
+	}
+
+	rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
 					index, tuple, n_ext);
-	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+	*offsets = rec_get_offsets(
+		rec, index, *offsets, ULINT_UNDEFINED, heap);
 
 	if (buf_block_get_page_zip(cursor->block)) {
-		rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block,
-					      index, rec, offsets, mtr);
+		rec = page_cur_insert_rec_zip(
+			cursor, index, rec, *offsets, mtr);
 	} else {
 		rec = page_cur_insert_rec_low(cursor->rec,
-					      index, rec, offsets, mtr);
+					      index, rec, *offsets, mtr);
 	}
 
-	mem_heap_free(heap);
+	ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
 	return(rec);
 }
 #endif /* !UNIV_HOTBACKUP */
@@ -278,6 +290,12 @@ Inserts a record next to page cursor. Returns pointer to inserted record if
 succeed, i.e., enough space available, NULL otherwise. The cursor stays at
 the same logical position, but the physical position may change if it is
 pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INLINE
 rec_t*
@@ -290,8 +308,8 @@ page_cur_rec_insert(
 	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
 {
 	if (buf_block_get_page_zip(cursor->block)) {
-		return(page_cur_insert_rec_zip(&cursor->rec, cursor->block,
-					       index, rec, offsets, mtr));
+		return(page_cur_insert_rec_zip(
+			       cursor, index, rec, offsets, mtr));
 	} else {
 		return(page_cur_insert_rec_low(cursor->rec,
 					       index, rec, offsets, mtr));
diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h
index ba1ee7a7d11..80181bb5c30 100644
--- a/storage/xtradb/include/page0page.h
+++ b/storage/xtradb/include/page0page.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -518,14 +518,32 @@ page_rec_get_heap_no(
 	const rec_t*	rec);	/*!< in: the physical record */
 /************************************************************//**
 Determine whether the page is a B-tree leaf.
-@return	TRUE if the page is a B-tree leaf */
+@return	true if the page is a B-tree leaf (PAGE_LEVEL = 0) */
 UNIV_INLINE
-ibool
+bool
 page_is_leaf(
 /*=========*/
 	const page_t*	page)	/*!< in: page */
 	__attribute__((pure));
 /************************************************************//**
+Determine whether the page is empty.
+@return	true if the page is empty (PAGE_N_RECS = 0) */
+UNIV_INLINE
+bool
+page_is_empty(
+/*==========*/
+	const page_t*	page)	/*!< in: page */
+	__attribute__((nonnull, pure));
+/************************************************************//**
+Determine whether the page contains garbage.
+@return	true if the page contains garbage (PAGE_GARBAGE is not 0) */
+UNIV_INLINE
+bool
+page_has_garbage(
+/*=============*/
+	const page_t*	page)	/*!< in: page */
+	__attribute__((nonnull, pure));
+/************************************************************//**
 Gets the pointer to the next record on the page.
 @return	pointer to next record */
 UNIV_INLINE
@@ -551,15 +569,25 @@ page_rec_get_next_const(
 /*====================*/
 	const rec_t*	rec);	/*!< in: pointer to record */
 /************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return	pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
 Sets the pointer to the next record on the page. */
 UNIV_INLINE
 void
 page_rec_set_next(
 /*==============*/
-	rec_t*	rec,	/*!< in: pointer to record,
-			must not be page supremum */
-	rec_t*	next);	/*!< in: pointer to next record,
-			must not be page infimum */
+	rec_t*		rec,	/*!< in: pointer to record,
+				must not be page supremum */
+	const rec_t*	next);	/*!< in: pointer to next record,
+				must not be page infimum */
 /************************************************************//**
 Gets the pointer to the previous record.
 @return	pointer to previous record */
@@ -737,11 +765,14 @@ UNIV_INLINE
 void
 page_mem_free(
 /*==========*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	rec_t*		rec,	/*!< in: pointer to the (origin of) record */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	page_t*			page,	/*!< in/out: index page */
+	page_zip_des_t*		page_zip,/*!< in/out: compressed page,
+					 or NULL */
+	rec_t*			rec,	/*!< in: pointer to the (origin of)
+					record */
+	const dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*		offsets);/*!< in: array returned by
+					 rec_get_offsets() */
 /**********************************************************//**
 Create an uncompressed B-tree index page.
 @return	pointer to the page */
@@ -764,11 +795,27 @@ page_create_zip(
 					page is created */
 	dict_index_t*	index,		/*!< in: the index of the page */
 	ulint		level,		/*!< in: the B-tree level of the page */
-	mtr_t*		mtr);		/*!< in: mini-transaction handle */
-
+	trx_id_t	max_trx_id,	/*!< in: PAGE_MAX_TRX_ID */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+UNIV_INTERN
+void
+page_create_empty(
+/*==============*/
+	buf_block_t*	block,	/*!< in/out: B-tree block */
+	dict_index_t*	index,	/*!< in: the index of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull(1,2)));
 /*************************************************************//**
 Differs from page_copy_rec_list_end, because this function does not
-touch the lock table and max trx id on page or compress the page. */
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
 UNIV_INTERN
 void
 page_copy_rec_list_end_no_locks(
@@ -782,6 +829,12 @@ page_copy_rec_list_end_no_locks(
 Copies records from page to new_page, from the given record onward,
 including that record. Infimum and supremum records are not copied.
 The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return pointer to the original successor of the infimum record on
 new_page, or NULL on zip overflow (new_block will be decompressed) */
 UNIV_INTERN
@@ -798,6 +851,12 @@ page_copy_rec_list_end(
 Copies records from page to new_page, up to the given record, NOT
 including that record. Infimum and supremum records are not copied.
 The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return pointer to the original predecessor of the supremum record on
 new_page, or NULL on zip overflow (new_block will be decompressed) */
 UNIV_INTERN
@@ -842,6 +901,12 @@ page_delete_rec_list_start(
 /*************************************************************//**
 Moves record list end to another page. Moved records include
 split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return TRUE on success; FALSE on compression failure (new_block will
 be decompressed) */
 UNIV_INTERN
@@ -857,6 +922,12 @@ page_move_rec_list_end(
 /*************************************************************//**
 Moves record list start to another page. Moved records do not include
 split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	TRUE on success; FALSE on compression failure */
 UNIV_INTERN
 ibool
@@ -1031,7 +1102,6 @@ page_find_rec_with_heap_no(
 /*=======================*/
 	const page_t*	page,	/*!< in: index page */
 	ulint		heap_no);/*!< in: heap number */
-
 #ifdef UNIV_MATERIALIZE
 #undef UNIV_INLINE
 #define UNIV_INLINE  UNIV_INLINE_ORIGINAL
diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic
index 4fe93345ce5..58add015d34 100644
--- a/storage/xtradb/include/page0page.ic
+++ b/storage/xtradb/include/page0page.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -136,7 +136,7 @@ page_header_set_field(
 	ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
 
 	mach_write_to_2(page + PAGE_HEADER + field, val);
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_header(page_zip,
 				      page + PAGE_HEADER + field, 2, NULL);
 	}
@@ -211,7 +211,7 @@ page_header_reset_last_insert(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_LAST_INSERT),
@@ -233,8 +233,7 @@ page_is_comp(
 /*=========*/
 	const page_t*	page)	/*!< in: index page */
 {
-	return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000,
-			   0x8000));
+	return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000);
 }
 
 /************************************************************//**
@@ -267,9 +266,9 @@ page_rec_get_heap_no(
 
 /************************************************************//**
 Determine whether the page is a B-tree leaf.
-@return	TRUE if the page is a B-tree leaf */
+@return	true if the page is a B-tree leaf (PAGE_LEVEL = 0) */
 UNIV_INLINE
-ibool
+bool
 page_is_leaf(
 /*=========*/
 	const page_t*	page)	/*!< in: page */
@@ -281,6 +280,30 @@ page_is_leaf(
 }
 
 /************************************************************//**
+Determine whether the page is empty.
+@return	true if the page is empty (PAGE_N_RECS = 0) */
+UNIV_INLINE
+bool
+page_is_empty(
+/*==========*/
+	const page_t*	page)	/*!< in: page whose PAGE_N_RECS is tested */
+{
+	return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_N_RECS)));
+}
+
+/************************************************************//**
+Determine whether the page contains garbage.
+@return	true if the page contains garbage (PAGE_GARBAGE is not 0) */
+UNIV_INLINE
+bool
+page_has_garbage(
+/*=============*/
+	const page_t*	page)	/*!< in: page whose PAGE_GARBAGE is tested */
+{
+	return(!!*(const uint16*) (page + (PAGE_HEADER + PAGE_GARBAGE)));
+}
+
+/************************************************************//**
 Gets the offset of the first record on the page.
 @return	offset of the first record in record list, relative from page */
 UNIV_INLINE
@@ -348,10 +371,10 @@ page_rec_is_user_rec_low(
 #endif
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM)
-	       && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM)
-	       && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM)
-	       && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM));
+	return(offset != PAGE_NEW_SUPREMUM
+	       && offset != PAGE_NEW_INFIMUM
+	       && offset != PAGE_OLD_INFIMUM
+	       && offset != PAGE_OLD_SUPREMUM);
 }
 
 /************************************************************//**
@@ -366,8 +389,8 @@ page_rec_is_supremum_low(
 	ut_ad(offset >= PAGE_NEW_INFIMUM);
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)
-	       || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM));
+	return(offset == PAGE_NEW_SUPREMUM
+	       || offset == PAGE_OLD_SUPREMUM);
 }
 
 /************************************************************//**
@@ -382,8 +405,7 @@ page_rec_is_infimum_low(
 	ut_ad(offset >= PAGE_NEW_INFIMUM);
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)
-	       || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM));
+	return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
 }
 
 /************************************************************//**
@@ -487,12 +509,14 @@ page_cmp_dtuple_rec_with_match(
 
 	rec_offset = page_offset(rec);
 
-	if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM)
-	    || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) {
+	if (rec_offset == PAGE_NEW_INFIMUM
+	    || rec_offset == PAGE_OLD_INFIMUM) {
+
 		return(1);
-	}
-	if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM)
-	    || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) {
+
+	} else if (rec_offset == PAGE_NEW_SUPREMUM
+		   || rec_offset == PAGE_OLD_SUPREMUM) {
+
 		return(-1);
 	}
 
@@ -734,21 +758,19 @@ page_rec_get_next_low(
 
 	offs = rec_get_next_offs(rec, comp);
 
-	if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) {
+	if (offs >= UNIV_PAGE_SIZE) {
 		fprintf(stderr,
 			"InnoDB: Next record offset is nonsensical %lu"
 			" in record at offset %lu\n"
 			"InnoDB: rec address %p, space id %lu, page %lu\n",
-			(ulong)offs, (ulong) page_offset(rec),
+			(ulong) offs, (ulong) page_offset(rec),
 			(void*) rec,
 			(ulong) page_get_space_id(page),
 			(ulong) page_get_page_no(page));
 		buf_page_print(page, 0, 0);
 
 		ut_error;
-	}
-
-	if (UNIV_UNLIKELY(offs == 0)) {
+	} else if (offs == 0) {
 
 		return(NULL);
 	}
@@ -781,14 +803,38 @@ page_rec_get_next_const(
 }
 
 /************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return	pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+	const rec_t*	rec)	/*!< in: record whose successors are scanned */
+{
+	const rec_t*	r;
+	ulint		page_is_compact = page_rec_is_comp(rec);
+
+	for (r = page_rec_get_next_const(rec);
+	     !page_rec_is_supremum(r)
+	     && rec_get_deleted_flag(r, page_is_compact);
+	     r = page_rec_get_next_const(r)) {
+		/* empty: the loop header skips delete-marked records */
+	}
+
+	return(r);
+}
+
+/************************************************************//**
 Sets the pointer to the next record on the page. */
 UNIV_INLINE
 void
 page_rec_set_next(
 /*==============*/
-	rec_t*	rec,		/*!< in: pointer to record,
+	rec_t*		rec,	/*!< in: pointer to record,
 				must not be page supremum */
-	rec_t*	next)		/*!< in: pointer to next record,
+	const rec_t*	next)	/*!< in: pointer to next record,
 				must not be page infimum */
 {
 	ulint	offs;
@@ -800,11 +846,7 @@ page_rec_set_next(
 	ut_ad(!next || !page_rec_is_infimum(next));
 	ut_ad(!next || page_align(rec) == page_align(next));
 
-	if (UNIV_LIKELY(next != NULL)) {
-		offs = page_offset(next);
-	} else {
-		offs = 0;
-	}
+	offs = next != NULL ? page_offset(next) : 0;
 
 	if (page_rec_is_comp(rec)) {
 		rec_set_next_offs_new(rec, offs);
@@ -979,7 +1021,7 @@ page_get_free_space_of_empty(
 /*=========================*/
 	ulint	comp)		/*!< in: nonzero=compact page layout */
 {
-	if (UNIV_LIKELY(comp)) {
+	if (comp) {
 		return((ulint)(UNIV_PAGE_SIZE
 			       - PAGE_NEW_SUPREMUM_END
 			       - PAGE_DIR
@@ -1094,11 +1136,14 @@ UNIV_INLINE
 void
 page_mem_free(
 /*==========*/
-	page_t*		page,	/*!< in/out: index page */
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	rec_t*		rec,	/*!< in: pointer to the (origin of) record */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	page_t*			page,		/*!< in/out: index page */
+	page_zip_des_t*		page_zip,	/*!< in/out: compressed page,
+						or NULL */
+	rec_t*			rec,		/*!< in: pointer to the
+						(origin of) record */
+	const dict_index_t*	index,		/*!< in: index of rec */
+	const ulint*		offsets)	/*!< in: array returned by
+						rec_get_offsets() */
 {
 	rec_t*		free;
 	ulint		garbage;
@@ -1114,7 +1159,7 @@ page_mem_free(
 	page_header_set_field(page, page_zip, PAGE_GARBAGE,
 			      garbage + rec_offs_size(offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_dir_delete(page_zip, rec, index, offsets, free);
 	} else {
 		page_header_set_field(page, page_zip, PAGE_N_RECS,
diff --git a/storage/xtradb/include/page0types.h b/storage/xtradb/include/page0types.h
index 4e76e52ecfb..95143a4bb44 100644
--- a/storage/xtradb/include/page0types.h
+++ b/storage/xtradb/include/page0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,10 @@ Created 2/2/1994 Heikki Tuuri
 #ifndef page0types_h
 #define page0types_h
 
+using namespace std;
+
+#include <map>
+
 #include "univ.i"
 #include "dict0types.h"
 #include "mtr0types.h"
@@ -35,12 +39,10 @@ Created 2/2/1994 Heikki Tuuri
 /** Type of the index page */
 typedef	byte		page_t;
 /** Index page cursor */
-typedef struct page_cur_struct	page_cur_t;
+struct page_cur_t;
 
 /** Compressed index page */
-typedef byte				page_zip_t;
-/** Compressed page descriptor */
-typedef struct page_zip_des_struct	page_zip_des_t;
+typedef byte		page_zip_t;
 
 /* The following definitions would better belong to page0zip.h,
 but we cannot include page0zip.h from rem0rec.ic, because
@@ -49,25 +51,25 @@ page0*.h includes rem0rec.h and may include rem0rec.ic. */
 /** Number of bits needed for representing different compressed page sizes */
 #define PAGE_ZIP_SSIZE_BITS 3
 
-/** log2 of smallest compressed page size */
-#define PAGE_ZIP_MIN_SIZE_SHIFT	10
-/** Smallest compressed page size */
-#define PAGE_ZIP_MIN_SIZE	(1 << PAGE_ZIP_MIN_SIZE_SHIFT)
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX	\
+	(UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
 
-/** Number of supported compressed page sizes */
-#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
-#define PAGE_ZIP_NUM_SSIZE_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
-#if PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)
-# error "PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)"
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
 #endif
 
 /** Compressed page descriptor */
-struct page_zip_des_struct
+struct page_zip_des_t
 {
 	page_zip_t*	data;		/*!< compressed page data */
 
 #ifdef UNIV_DEBUG
 	unsigned	m_start:16;	/*!< start offset of modification log */
+	bool		m_external;	/*!< Allocated externally, not from the
+					buffer pool */
 #endif /* UNIV_DEBUG */
 	unsigned	m_end:16;	/*!< end offset of modification log */
 	unsigned	m_nonempty:1;	/*!< TRUE if the modification log
@@ -76,13 +78,13 @@ struct page_zip_des_struct
 					columns on the page; the maximum
 					is 744 on a 16 KiB page */
 	unsigned	ssize:PAGE_ZIP_SSIZE_BITS;
-					/*!< 0 or compressed page size;
+					/*!< 0 or compressed page shift size;
 					the size in bytes is
-					PAGE_ZIP_MIN_SIZE << (ssize - 1). */
+					(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
 };
 
 /** Compression statistics for a given page size */
-struct page_zip_stat_struct {
+struct page_zip_stat_t {
 	/** Number of page compressions */
 	ulint		compressed;
 	/** Number of successful page compressions */
@@ -93,13 +95,29 @@ struct page_zip_stat_struct {
 	ib_uint64_t	compressed_usec;
 	/** Duration of page decompressions in microseconds */
 	ib_uint64_t	decompressed_usec;
+	page_zip_stat_t() :
+		/* Initialize members to 0 so that when we do
+		stlmap[key].compressed++ and an element with "key" does not
+		exist, it gets inserted with zeroed members. */
+		compressed(0),
+		compressed_ok(0),
+		decompressed(0),
+		compressed_usec(0),
+		decompressed_usec(0)
+	{ }
 };
 
-/** Compression statistics */
-typedef struct page_zip_stat_struct page_zip_stat_t;
-
-/** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */
-extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+/** Compression statistics types */
+typedef map<index_id_t, page_zip_stat_t>	page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t				page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t		page_zip_stat_per_index;
+extern ib_mutex_t				page_zip_stat_per_index_mutex;
+#ifdef HAVE_PSI_INTERFACE
+extern mysql_pfs_key_t				page_zip_stat_per_index_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
 
 /**********************************************************************//**
 Write the "deleted" flag of a record on a compressed page.  The flag must
diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h
index a33407e78bc..2f9efc4a40c 100644
--- a/storage/xtradb/include/page0zip.h
+++ b/storage/xtradb/include/page0zip.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +12,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,9 +36,20 @@ Created June 2005 by Marko Makela
 #include "page0types.h"
 #include "buf0types.h"
 #include "dict0types.h"
+#include "srv0srv.h"
 #include "trx0types.h"
 #include "mem0mem.h"
 
+/* Compression level to be used by zlib. Settable by user. */
+extern uint	page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL	6
+
+/* Whether or not to log compressed page images to avoid possible
+compression algorithm changes in zlib. */
+extern my_bool	page_zip_log_pages;
+
 /**********************************************************************//**
 Determine the size of a compressed page in bytes.
 @return	size in bytes */
@@ -113,6 +125,7 @@ page_zip_compress(
 				m_start, m_end, m_nonempty */
 	const page_t*	page,	/*!< in: uncompressed page */
 	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
 	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
 	__attribute__((nonnull(1,3)));
 
@@ -336,11 +349,12 @@ UNIV_INTERN
 void
 page_zip_dir_delete(
 /*================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	byte*		rec,	/*!< in: deleted record */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
-	const byte*	free)	/*!< in: previous start of the free list */
+	page_zip_des_t*		page_zip,	/*!< in/out: compressed page */
+	byte*			rec,		/*!< in: deleted record */
+	const dict_index_t*	index,		/*!< in: index of rec */
+	const ulint*		offsets,	/*!< in: rec_get_offsets(rec) */
+	const byte*		free)		/*!< in: previous start of
+						the free list */
 	__attribute__((nonnull(1,2,3,4)));
 
 /**********************************************************************//**
@@ -446,16 +460,63 @@ ulint
 page_zip_calc_checksum(
 /*===================*/
         const void*     data,   /*!< in: compressed page */
-        ulint           size)   /*!< in: size of compressed page */
+        ulint           size,   /*!< in: size of compressed page */
+	srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
 	__attribute__((nonnull));
 
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return	TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+	const void*	data,	/*!< in: compressed page */
+	ulint		size);	/*!< in: size of compressed page */
+/**********************************************************************//**
+Write a log record of compressing an index page without the data on the page. */
+UNIV_INLINE
+void
+page_zip_compress_write_log_no_data(
+/*================================*/
+	ulint		level,	/*!< in: compression level */
+	const page_t*	page,	/*!< in: page that is compressed */
+	dict_index_t*	index,	/*!< in: index */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Parses a log record of compressing an index page without the data.
+@return	end of log record or NULL */
+UNIV_INLINE
+byte*
+page_zip_parse_compress_no_data(
+/*============================*/
+	byte*		ptr,		/*!< in: buffer */
+	byte*		end_ptr,	/*!< in: buffer end */
+	page_t*		page,		/*!< in: uncompressed page */
+	page_zip_des_t*	page_zip,	/*!< out: compressed page */
+	dict_index_t*	index)		/*!< in: index */
+	__attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
 #ifndef UNIV_HOTBACKUP
 /** Check if a pointer to an uncompressed page matches a compressed page.
+When we IMPORT a tablespace the blocks and accompanying frames are allocated
+from outside the buffer pool.
 @param ptr	pointer to an uncompressed page frame
 @param page_zip	compressed page descriptor
 @return		TRUE if ptr and page_zip refer to the same block */
-# define PAGE_ZIP_MATCH(ptr, page_zip)			\
-	(buf_frame_get_page_zip(ptr) == (page_zip))
+# define PAGE_ZIP_MATCH(ptr, page_zip)					\
+	(((page_zip)->m_external					\
+	  && (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data))	\
+	  || buf_frame_get_page_zip(ptr) == (page_zip))
 #else /* !UNIV_HOTBACKUP */
 /** Check if a pointer to an uncompressed page matches a compressed page.
 @param ptr	pointer to an uncompressed page frame
diff --git a/storage/xtradb/include/page0zip.ic b/storage/xtradb/include/page0zip.ic
index e26fa3e3d94..6c7d8cd32c7 100644
--- a/storage/xtradb/include/page0zip.ic
+++ b/storage/xtradb/include/page0zip.ic
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +12,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,6 +30,7 @@ Created June 2005 by Marko Makela
 #endif
 
 #include "page0zip.h"
+#include "mtr0log.h"
 #include "page0page.h"
 
 /* The format of compressed pages is as follows.
@@ -120,13 +122,13 @@ page_zip_get_size(
 {
 	ulint	size;
 
-	if (UNIV_UNLIKELY(!page_zip->ssize)) {
+	if (!page_zip->ssize) {
 		return(0);
 	}
 
-	size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize;
+	size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
 
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
 	return(size);
@@ -174,13 +176,13 @@ page_zip_rec_needs_ext(
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(comp || !zip_size);
 
-#if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE
-	if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) {
+#if UNIV_PAGE_SIZE_MAX > REC_MAX_DATA_SIZE
+	if (rec_size >= REC_MAX_DATA_SIZE) {
 		return(TRUE);
 	}
 #endif
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		ut_ad(comp);
 		/* On a compressed page, there is a two-byte entry in
 		the dense page directory for every record.  But there
@@ -209,7 +211,7 @@ page_zip_simple_validate(
 {
 	ut_ad(page_zip);
 	ut_ad(page_zip->data);
-	ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE);
+	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
 	ut_ad(page_zip_get_size(page_zip)
 	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
 	ut_ad(page_zip->m_start <= page_zip->m_end);
@@ -236,11 +238,11 @@ page_zip_get_trailer_len(
 	ut_ad(page_zip_simple_validate(page_zip));
 	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
 
-	if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) {
+	if (!page_is_leaf(page_zip->data)) {
 		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
 			+ REC_NODE_PTR_SIZE;
 		ut_ad(!page_zip->n_blobs);
-	} else if (UNIV_UNLIKELY(is_clust)) {
+	} else if (is_clust) {
 		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
 			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
 	} else {
@@ -305,7 +307,7 @@ page_zip_available(
 	space needed for identifying the record (encoded heap_no). */
 	length -= REC_N_NEW_EXTRA_BYTES - 2;
 
-	if (UNIV_UNLIKELY(create)) {
+	if (create > 0) {
 		/* When a record is created, a pointer may be added to
 		the dense directory.
 		Likewise, space for the columns that will not be
@@ -316,10 +318,8 @@ page_zip_available(
 		trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
 	}
 
-	return(UNIV_LIKELY(length
-			   + trailer_len
-			   + page_zip->m_end
-			   < page_zip_get_size(page_zip)));
+	return(length + trailer_len + page_zip->m_end
+	       < page_zip_get_size(page_zip));
 }
 
 /**********************************************************************//**
@@ -374,13 +374,82 @@ page_zip_write_header(
 	/* The following would fail in page_cur_insert_rec_zip(). */
 	/* ut_ad(page_zip_validate(page_zip, str - pos)); */
 
-	if (UNIV_LIKELY_NULL(mtr)) {
+	if (mtr) {
 #ifndef UNIV_HOTBACKUP
 		page_zip_write_header_log(str, length, mtr);
 #endif /* !UNIV_HOTBACKUP */
 	}
 }
 
+/**********************************************************************//**
+Write a log record of compressing an index page without the data on the page. */
+UNIV_INLINE
+void
+page_zip_compress_write_log_no_data(
+/*================================*/
+	ulint		level,	/*!< in: compression level */
+	const page_t*	page,	/*!< in: page that is compressed */
+	dict_index_t*	index,	/*!< in: index */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte* log_ptr = mlog_open_and_write_index(
+		mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1);
+
+	if (log_ptr) {
+		mach_write_to_1(log_ptr, level);
+		mlog_close(mtr, log_ptr + 1);
+	}
+}
+
+/**********************************************************************//**
+Parses a log record of compressing an index page without the data.
+@return	end of log record or NULL */
+UNIV_INLINE
+byte*
+page_zip_parse_compress_no_data(
+/*============================*/
+	byte*		ptr,		/*!< in: buffer */
+	byte*		end_ptr,	/*!< in: buffer end */
+	page_t*		page,		/*!< in: uncompressed page */
+	page_zip_des_t*	page_zip,	/*!< out: compressed page */
+	dict_index_t*	index)		/*!< in: index */
+{
+	ulint	level;
+	if (end_ptr == ptr) {
+		return(NULL);
+	}
+
+	level = mach_read_from_1(ptr);
+
+	/* If page compression fails then there must be something wrong
+	because a compress log record is logged only if the compression
+	was successful. Crash in this case. */
+
+	if (page
+	    && !page_zip_compress(page_zip, page, index, level, NULL)) {
+		ut_error;
+	}
+
+	return(ptr + 1);
+}
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+	mutex_enter(&page_zip_stat_per_index_mutex);
+
+	page_zip_stat_per_index.erase(
+		page_zip_stat_per_index.begin(),
+		page_zip_stat_per_index.end());
+
+	mutex_exit(&page_zip_stat_per_index_mutex);
+}
+
 #ifdef UNIV_MATERIALIZE
 # undef UNIV_INLINE
 # define UNIV_INLINE	UNIV_INLINE_ORIGINAL
diff --git a/storage/xtradb/include/pars0grm.h b/storage/xtradb/include/pars0grm.h
index abaffb66c1e..8e725fe9545 100644
--- a/storage/xtradb/include/pars0grm.h
+++ b/storage/xtradb/include/pars0grm.h
@@ -1,29 +1,37 @@
-/*****************************************************************************
+/* A Bison parser, made by GNU Bison 2.3.  */
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
-Foundation, Inc.
+/* Skeleton interface for Bison's Yacc-like parsers in C
 
-As a special exception, when this file is copied by Bison into a
-Bison output file, you may use that output file without restriction.
-This special exception was added by the Free Software Foundation
-in version 1.24 of Bison.
+   Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
 
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
 
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
 
-*****************************************************************************/
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
 
-/* A Bison parser, made by GNU Bison 1.875d.  */
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
 
 /* Tokens.  */
 #ifndef YYTOKENTYPE
@@ -123,9 +131,19 @@ this program; if not, write to the Free Software Foundation, Inc.,
      PARS_LOCK_TOKEN = 347,
      PARS_SHARE_TOKEN = 348,
      PARS_MODE_TOKEN = 349,
-     NEG = 350
+     PARS_LIKE_TOKEN = 350,
+     PARS_LIKE_TOKEN_EXACT = 351,
+     PARS_LIKE_TOKEN_PREFIX = 352,
+     PARS_LIKE_TOKEN_SUFFIX = 353,
+     PARS_LIKE_TOKEN_SUBSTR = 354,
+     PARS_TABLE_NAME_TOKEN = 355,
+     PARS_COMPACT_TOKEN = 356,
+     PARS_BLOCK_SIZE_TOKEN = 357,
+     PARS_BIGINT_TOKEN = 358,
+     NEG = 359
    };
 #endif
+/* Tokens.  */
 #define PARS_INT_LIT 258
 #define PARS_FLOAT_LIT 259
 #define PARS_STR_LIT 260
@@ -218,12 +236,21 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #define PARS_LOCK_TOKEN 347
 #define PARS_SHARE_TOKEN 348
 #define PARS_MODE_TOKEN 349
-#define NEG 350
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
 
 
 
 
-#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
 typedef int YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
 # define YYSTYPE_IS_DECLARED 1
@@ -232,5 +259,3 @@ typedef int YYSTYPE;
 
 extern YYSTYPE yylval;
 
-
-
diff --git a/storage/xtradb/include/pars0opt.h b/storage/xtradb/include/pars0opt.h
index fd6b9726019..1084d644c90 100644
--- a/storage/xtradb/include/pars0opt.h
+++ b/storage/xtradb/include/pars0opt.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/pars0opt.ic b/storage/xtradb/include/pars0opt.ic
index f303fe91d3b..786d911ca3d 100644
--- a/storage/xtradb/include/pars0opt.ic
+++ b/storage/xtradb/include/pars0opt.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/pars0pars.h b/storage/xtradb/include/pars0pars.h
index eb79dcb18c1..65ff7533828 100644
--- a/storage/xtradb/include/pars0pars.h
+++ b/storage/xtradb/include/pars0pars.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,7 +38,7 @@ Created 11/19/1996 Heikki Tuuri
 and varies in type, while 'user_arg' is a user-supplied argument. The
 meaning of the return type also varies. See the individual use cases, e.g.
 the FETCH statement, for details on them. */
-typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg);
+typedef ibool	(*pars_user_func_cb_t)(void* arg, void* user_arg);
 
 /** If the following is set TRUE, the parser will emit debugging
 information */
@@ -74,6 +74,7 @@ extern pars_res_word_t	pars_distinct_token;
 extern pars_res_word_t	pars_binary_token;
 extern pars_res_word_t	pars_blob_token;
 extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_bigint_token;
 extern pars_res_word_t	pars_char_token;
 extern pars_res_word_t	pars_float_token;
 extern pars_res_word_t	pars_update_token;
@@ -105,13 +106,13 @@ pars_sql(
 	pars_info_t*	info,	/*!< in: extra information, or NULL */
 	const char*	str);	/*!< in: SQL string */
 /*************************************************************//**
-Retrieves characters to the lexical analyzer. */
+Retrieves characters to the lexical analyzer.
+@return number of characters copied or 0 on EOF */
 UNIV_INTERN
-void
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
-	int*	result,		/*!< out: number of characters copied or EOF */
 	int	max_size);	/*!< in: maximum number of characters which fit
 				in the buffer */
 /*************************************************************//**
@@ -140,6 +141,17 @@ pars_func(
 /*======*/
 	que_node_t*	res_word,/*!< in: function name reserved word */
 	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return	LIKE match type, presumably a PARS_LIKE_TOKEN_* value (TODO confirm) */
+UNIV_INTERN
+int
+pars_like_rebind(
+/*=============*/
+        sym_node_t*     node,   /* in: The search string node.*/
+        const byte*     ptr,    /* in: literal to (re) bind */
+        ulint           len);   /* in: length of literal to (re) bind*/
 /*********************************************************************//**
 Parses an operator expression.
 @return	own: function node in a query tree */
@@ -397,7 +409,10 @@ pars_create_table(
 	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
 					table */
 	sym_node_t*	column_defs,	/*!< in: list of column names */
-	void*		not_fit_in_memory);/*!< in: a non-NULL pointer means that
+	sym_node_t*	compact,	/* in: non-NULL if COMPACT table. */
+	sym_node_t*	block_size,	/* in: block size (can be NULL) */
+	void*		not_fit_in_memory);
+					/*!< in: a non-NULL pointer means that
 					this is a table which in simulations
 					should be simulated as not fitting
 					in memory; thread is put to sleep
@@ -454,9 +469,10 @@ que_thr_t*
 pars_complete_graph_for_exec(
 /*=========================*/
 	que_node_t*	node,	/*!< in: root node for an incomplete
-				query graph */
+				query graph, or NULL for dummy graph */
 	trx_t*		trx,	/*!< in: transaction handle */
-	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	__attribute__((nonnull(2,3), warn_unused_result));
 
 /****************************************************************//**
 Create parser info struct.
@@ -498,7 +514,76 @@ pars_info_add_str_literal(
 	pars_info_t*	info,		/*!< in: info struct */
 	const char*	name,		/*!< in: name */
 	const char*	str);		/*!< in: string */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/* in: info struct */
+	const char*	name,		/* in: name */
+	const void*	address,	/* in: address */
+	ulint		length,		/* in: length of data */
+	ulint		type,		/* in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype);	/* in: precise type, e.g. DATA_UNSIGNED */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const byte*	str,		/*!< in: string */
+	ulint		str_len);	/*!< in: string length */
+/****************************************************************//**
+Equivalent to:
 
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint32_t*	val);		/*!< in: value */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint64_t*	val);		/*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_bind_function(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: function name */
+	pars_user_func_cb_t	func,	/*!< in: function address */
+	void*			arg);	/*!< in: user-supplied argument */
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_bind_id(
+/*=============*/
+	pars_info_t*		info,	/*!< in: info struct */
+	ibool			copy_name,/* in: make a copy of name if TRUE */
+	const char*		name,	/*!< in: name */
+	const char*		id);	/*!< in: id */
 /****************************************************************//**
 Equivalent to:
 
@@ -532,16 +617,18 @@ pars_info_add_ull_literal(
 	pars_info_t*	info,		/*!< in: info struct */
 	const char*	name,		/*!< in: name */
 	ib_uint64_t	val);		/*!< in: value */
+
 /****************************************************************//**
-Add user function. */
+If the literal value already exists then it is rebound; otherwise
+a new entry is created. */
 UNIV_INTERN
 void
-pars_info_add_function(
-/*===================*/
+pars_info_bind_ull_literal(
+/*=======================*/
 	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name,	/*!< in: function name */
-	pars_user_func_cb_t	func,	/*!< in: function address */
-	void*			arg);	/*!< in: user-supplied argument */
+	const char*		name,	/*!< in: name */
+	const ib_uint64_t*	val)	/*!< in: value */
+	__attribute__((nonnull));
 
 /****************************************************************//**
 Add bound id. */
@@ -554,16 +641,6 @@ pars_info_add_id(
 	const char*	id);		/*!< in: id */
 
 /****************************************************************//**
-Get user function with the given name.
-@return	user func, or NULL if not found */
-UNIV_INTERN
-pars_user_func_t*
-pars_info_get_user_func(
-/*====================*/
-	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name);	/*!< in: function name to find*/
-
-/****************************************************************//**
 Get bound literal with the given name.
 @return	bound literal, or NULL if not found */
 UNIV_INTERN
@@ -591,7 +668,7 @@ pars_lexer_close(void);
 /*==================*/
 
 /** Extra information supplied for pars_sql(). */
-struct pars_info_struct {
+struct pars_info_t {
 	mem_heap_t*	heap;		/*!< our own memory heap */
 
 	ib_vector_t*	funcs;		/*!< user functions, or NUll
@@ -606,39 +683,40 @@ struct pars_info_struct {
 };
 
 /** User-supplied function and argument. */
-struct pars_user_func_struct {
+struct pars_user_func_t {
 	const char*		name;	/*!< function name */
 	pars_user_func_cb_t	func;	/*!< function address */
 	void*			arg;	/*!< user-supplied argument */
 };
 
 /** Bound literal. */
-struct pars_bound_lit_struct {
+struct pars_bound_lit_t {
 	const char*	name;		/*!< name */
 	const void*	address;	/*!< address */
 	ulint		length;		/*!< length of data */
 	ulint		type;		/*!< type, e.g. DATA_FIXBINARY */
 	ulint		prtype;		/*!< precise type, e.g. DATA_UNSIGNED */
+	sym_node_t*	node;		/*!< symbol node */
 };
 
 /** Bound identifier. */
-struct pars_bound_id_struct {
+struct pars_bound_id_t {
 	const char*	name;		/*!< name */
 	const char*	id;		/*!< identifier */
 };
 
 /** Struct used to denote a reserved word in a parsing tree */
-struct pars_res_word_struct{
+struct pars_res_word_t{
 	int	code;	/*!< the token code for the reserved word from
 			pars0grm.h */
 };
 
 /** A predefined function or operator node in a parsing tree; this construct
 is also used for some non-functions like the assignment ':=' */
-struct func_node_struct{
+struct func_node_t{
 	que_common_t	common;	/*!< type: QUE_NODE_FUNC */
 	int		func;	/*!< token code of the function name */
-	ulint		class;	/*!< class of the function */
+	ulint		fclass;	/*!< class of the function */
 	que_node_t*	args;	/*!< argument(s) of the function */
 	UT_LIST_NODE_T(func_node_t) cond_list;
 				/*!< list of comparison conditions; defined
@@ -650,14 +728,14 @@ struct func_node_struct{
 };
 
 /** An order-by node in a select */
-struct order_node_struct{
+struct order_node_t{
 	que_common_t	common;	/*!< type: QUE_NODE_ORDER */
 	sym_node_t*	column;	/*!< order-by column */
 	ibool		asc;	/*!< TRUE if ascending, FALSE if descending */
 };
 
 /** Procedure definition node */
-struct proc_node_struct{
+struct proc_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_PROC */
 	sym_node_t*	proc_id;	/*!< procedure name symbol in the symbol
 					table of this same procedure */
@@ -667,14 +745,14 @@ struct proc_node_struct{
 };
 
 /** elsif-element node */
-struct elsif_node_struct{
+struct elsif_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_ELSIF */
 	que_node_t*	cond;		/*!< if condition */
 	que_node_t*	stat_list;	/*!< statement list */
 };
 
 /** if-statement node */
-struct if_node_struct{
+struct if_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_IF */
 	que_node_t*	cond;		/*!< if condition */
 	que_node_t*	stat_list;	/*!< statement list */
@@ -683,14 +761,14 @@ struct if_node_struct{
 };
 
 /** while-statement node */
-struct while_node_struct{
+struct while_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_WHILE */
 	que_node_t*	cond;		/*!< while condition */
 	que_node_t*	stat_list;	/*!< statement list */
 };
 
 /** for-loop-statement node */
-struct for_node_struct{
+struct for_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_FOR */
 	sym_node_t*	loop_var;	/*!< loop variable: this is the
 					dereferenced symbol from the
@@ -707,24 +785,24 @@ struct for_node_struct{
 };
 
 /** exit statement node */
-struct exit_node_struct{
+struct exit_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_EXIT */
 };
 
 /** return-statement node */
-struct return_node_struct{
+struct return_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_RETURN */
 };
 
 /** Assignment statement node */
-struct assign_node_struct{
+struct assign_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_ASSIGNMENT */
 	sym_node_t*	var;		/*!< variable to set */
 	que_node_t*	val;		/*!< value to assign */
 };
 
 /** Column assignment node */
-struct col_assign_node_struct{
+struct col_assign_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_COL_ASSIGN */
 	sym_node_t*	col;		/*!< column to set */
 	que_node_t*	val;		/*!< value to assign */
diff --git a/storage/xtradb/include/pars0pars.ic b/storage/xtradb/include/pars0pars.ic
index 558d1093bfe..4c88337a265 100644
--- a/storage/xtradb/include/pars0pars.ic
+++ b/storage/xtradb/include/pars0pars.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/pars0sym.h b/storage/xtradb/include/pars0sym.h
index 9241aff3be1..bcf73639228 100644
--- a/storage/xtradb/include/pars0sym.h
+++ b/storage/xtradb/include/pars0sym.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -67,7 +67,7 @@ sym_node_t*
 sym_tab_add_str_lit(
 /*================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
-	byte*		str,		/*!< in: string with no quotes around
+	const byte*	str,		/*!< in: string with no quotes around
 					it */
 	ulint		len);		/*!< in: string length */
 /******************************************************************//**
@@ -80,6 +80,16 @@ sym_tab_add_bound_lit(
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
 	const char*	name,		/*!< in: name of bound literal */
 	ulint*		lit_type);	/*!< out: type of literal (PARS_*_LIT) */
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+                                        /* out: symbol table node */
+        sym_node_t*     node,           /* in: node that is bound to literal*/
+        const void*     address,        /* in: pointer to data */
+        ulint           length);        /* in: length of data */
 /******************************************************************//**
 Adds an SQL null literal to a symbol table.
 @return	symbol table node */
@@ -109,18 +119,21 @@ sym_tab_add_bound_id(
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
 	const char*	name);		/*!< in: name of bound id */
 
-/** Index of sym_node_struct::field_nos corresponding to the clustered index */
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
 #define	SYM_CLUST_FIELD_NO	0
-/** Index of sym_node_struct::field_nos corresponding to a secondary index */
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
 #define	SYM_SEC_FIELD_NO	1
 
 /** Types of a symbol table node */
 enum sym_tab_entry {
+	SYM_UNSET,		/*!< Unset entry. */
 	SYM_VAR = 91,		/*!< declared parameter or local
 				variable of a procedure */
 	SYM_IMPLICIT_VAR,	/*!< storage for a intermediate result
 				of a calculation */
 	SYM_LIT,		/*!< literal */
+	SYM_TABLE_REF_COUNTED,	/*!< database table name, ref counted. Must
+				be closed explicitly. */
 	SYM_TABLE,		/*!< database table name */
 	SYM_COLUMN,		/*!< database table name */
 	SYM_CURSOR,		/*!< named cursor */
@@ -130,7 +143,7 @@ enum sym_tab_entry {
 };
 
 /** Symbol table node */
-struct sym_node_struct{
+struct sym_node_t{
 	que_common_t			common;		/*!< node type:
 							QUE_NODE_SYMBOL */
 	/* NOTE: if the data field in 'common.val' is not NULL and the symbol
@@ -210,10 +223,11 @@ struct sym_node_struct{
 							the symbol table */
 	UT_LIST_NODE_T(sym_node_t)	sym_list;	/*!< list of symbol
 							nodes */
+	sym_node_t*			like_node;	/* LIKE operator node*/
 };
 
 /** Symbol table */
-struct sym_tab_struct{
+struct sym_tab_t{
 	que_t*			query_graph;
 					/*!< query graph generated by the
 					parser */
diff --git a/storage/xtradb/include/pars0sym.ic b/storage/xtradb/include/pars0sym.ic
index ecf014908a9..266c1a6310d 100644
--- a/storage/xtradb/include/pars0sym.ic
+++ b/storage/xtradb/include/pars0sym.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/pars0types.h b/storage/xtradb/include/pars0types.h
index 4f3b2c06db6..47f4b432d20 100644
--- a/storage/xtradb/include/pars0types.h
+++ b/storage/xtradb/include/pars0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,24 +26,24 @@ Created 1/11/1998 Heikki Tuuri
 #ifndef pars0types_h
 #define pars0types_h
 
-typedef struct pars_info_struct		pars_info_t;
-typedef struct pars_user_func_struct	pars_user_func_t;
-typedef struct pars_bound_lit_struct	pars_bound_lit_t;
-typedef struct pars_bound_id_struct	pars_bound_id_t;
-typedef struct sym_node_struct		sym_node_t;
-typedef struct sym_tab_struct		sym_tab_t;
-typedef struct pars_res_word_struct	pars_res_word_t;
-typedef struct func_node_struct		func_node_t;
-typedef struct order_node_struct	order_node_t;
-typedef struct proc_node_struct		proc_node_t;
-typedef struct elsif_node_struct	elsif_node_t;
-typedef struct if_node_struct		if_node_t;
-typedef struct while_node_struct	while_node_t;
-typedef struct for_node_struct		for_node_t;
-typedef struct exit_node_struct		exit_node_t;
-typedef struct return_node_struct	return_node_t;
-typedef struct assign_node_struct	assign_node_t;
-typedef struct col_assign_node_struct	col_assign_node_t;
+struct pars_info_t;
+struct pars_user_func_t;
+struct pars_bound_lit_t;
+struct pars_bound_id_t;
+struct sym_node_t;
+struct sym_tab_t;
+struct pars_res_word_t;
+struct func_node_t;
+struct order_node_t;
+struct proc_node_t;
+struct elsif_node_t;
+struct if_node_t;
+struct while_node_t;
+struct for_node_t;
+struct exit_node_t;
+struct return_node_t;
+struct assign_node_t;
+struct col_assign_node_t;
 
 typedef UT_LIST_BASE_NODE_T(sym_node_t)	sym_node_list_t;
 
diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h
index 8de221580fd..e5b2a1ba3fc 100644
--- a/storage/xtradb/include/que0que.h
+++ b/storage/xtradb/include/que0que.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,6 +29,7 @@ Created 5/27/1996 Heikki Tuuri
 #include "univ.i"
 #include "data0data.h"
 #include "dict0types.h"
+#include "btr0sea.h"
 #include "trx0trx.h"
 #include "trx0roll.h"
 #include "srv0srv.h"
@@ -41,14 +42,9 @@ Created 5/27/1996 Heikki Tuuri
 of SQL execution in the UNIV_SQL_DEBUG version */
 extern ibool	que_trace_on;
 
-/***********************************************************************//**
-Adds a query graph to the session's list of graphs. */
-UNIV_INTERN
-void
-que_graph_publish(
-/*==============*/
-	que_t*	graph,	/*!< in: graph */
-	sess_t*	sess);	/*!< in: session */
+/** Mutex protecting the query threads. */
+extern ib_mutex_t	que_thr_mutex;
+
 /***********************************************************************//**
 Creates a query graph fork node.
 @return	own: fork node */
@@ -114,8 +110,8 @@ que_graph_free(
 			afterwards! */
 /**********************************************************************//**
 Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
-to be reserved.
+conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
+has to be reserved.
 @return	TRUE if stopped */
 UNIV_INTERN
 ibool
@@ -143,7 +139,7 @@ que_thr_stop_for_mysql_no_error(
 /**********************************************************************//**
 A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
 query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.c, but the lock has already
+it was put to the lock wait state in lock0lock.cc, but the lock has already
 been granted or the transaction chosen as a victim in deadlock resolution. */
 UNIV_INTERN
 void
@@ -158,44 +154,17 @@ que_run_threads(
 /*============*/
 	que_thr_t*	thr);	/*!< in: query thread */
 /**********************************************************************//**
-After signal handling is finished, returns control to a query graph error
-handling routine. (Currently, just returns the control to the root of the
-graph so that the graph can communicate an error message to the client.) */
-UNIV_INTERN
-void
-que_fork_error_handle(
-/*==================*/
-	trx_t*	trx,	/*!< in: trx */
-	que_t*	fork);	/*!< in: query graph which was run before signal
-			handling started, NULL not allowed */
-/**********************************************************************//**
-Moves a suspended query thread to the QUE_THR_RUNNING state and releases
-a single worker thread to execute it. This function should be used to end
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a worker thread to execute it. This function should be used to end
 the wait state of a query thread waiting for a lock or a stored procedure
-completion. */
+completion.
+@return query thread to wake up, or NULL */
 UNIV_INTERN
-void
-que_thr_end_wait(
-/*=============*/
-	que_thr_t*	thr,		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/**********************************************************************//**
-Same as que_thr_end_wait, but no parameter next_thr available. */
-UNIV_INTERN
-void
-que_thr_end_wait_no_next_thr(
-/*=========================*/
-	que_thr_t*	thr);		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+	trx_t*		trx);		/*!< in: transaction in the
+					QUE_THR_LOCK_WAIT state */
 /**********************************************************************//**
 Starts execution of a command in a query fork. Picks a query thread which
 is not in the QUE_THR_RUNNING state and moves it to that state. If none
@@ -296,6 +265,14 @@ que_node_list_add_last(
 /*===================*/
 	que_node_t*	node_list,	/*!< in: node list, or NULL */
 	que_node_t*	node);		/*!< in: node */
+/*************************************************************************
+Gets the last node of a query graph node list. */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+					/* out: last node in the list */
+	que_node_t*	node_list);	/* in: node list; must not be NULL */
 /*********************************************************************//**
 Gets a query graph node list length.
 @return	length, for NULL list 0 */
@@ -308,7 +285,7 @@ que_node_list_get_len(
 Checks if graph, trx, or session is in a state where the query thread should
 be stopped.
 @return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the kernel mutex, then another peek with the mutex
+without reserving the trx_t::mutex, then another peek with the mutex
 reserved is necessary before deciding the actual stopping */
 UNIV_INLINE
 ibool
@@ -334,7 +311,7 @@ que_node_print_info(
 Evaluate the given SQL
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 que_eval_sql(
 /*=========*/
 	pars_info_t*	info,	/*!< in: info struct, or NULL */
@@ -344,33 +321,50 @@ que_eval_sql(
 				dict_sys->mutex around call to pars_sql. */
 	trx_t*		trx);	/*!< in: trx */
 
-/* Query graph query thread node: the fields are protected by the kernel
-mutex with the exceptions named below */
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+	que_fork_t*	fork,		/*!< in: a query fork */
+	que_thr_t*	thr);		/*!< in: current pos */
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void);
+/*==========*/
 
-struct que_thr_struct{
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void);
+/*===========*/
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
+
+struct que_thr_t{
 	que_common_t	common;		/*!< type: QUE_NODE_THR */
 	ulint		magic_n;	/*!< magic number to catch memory
 					corruption */
 	que_node_t*	child;		/*!< graph child node */
 	que_t*		graph;		/*!< graph where this node belongs */
+	ulint		state;		/*!< state of the query thread */
 	ibool		is_active;	/*!< TRUE if the thread has been set
 					to the run state in
 					que_thr_move_to_run_state, but not
 					deactivated in
 					que_thr_dec_reference_count */
-	ulint		state;		/*!< state of the query thread */
-	UT_LIST_NODE_T(que_thr_t)
-			thrs;		/*!< list of thread nodes of the fork
-					node */
-	UT_LIST_NODE_T(que_thr_t)
-			trx_thrs;	/*!< lists of threads in wait list of
-					the trx */
-	UT_LIST_NODE_T(que_thr_t)
-			queue;		/*!< list of runnable thread nodes in
-					the server task queue */
 	/*------------------------------*/
 	/* The following fields are private to the OS thread executing the
-	query thread, and are not protected by the kernel mutex: */
+	query thread, and are not protected by any mutex: */
 
 	que_node_t*	run_node;	/*!< pointer to the node where the
 					subgraph down from this node is
@@ -381,6 +375,21 @@ struct que_thr_struct{
 					thus far */
 	ulint		lock_state;	/*!< lock state of thread (table or
 					row) */
+	struct srv_slot_t*
+			slot;		/* The thread slot in the wait
+					array in srv_sys_t */
+	/*------------------------------*/
+	/* The following fields are links for the various lists that
+	this type can be on. */
+	UT_LIST_NODE_T(que_thr_t)
+			thrs;		/*!< list of thread nodes of the fork
+					node */
+	UT_LIST_NODE_T(que_thr_t)
+			trx_thrs;	/*!< lists of threads in wait list of
+					the trx */
+	UT_LIST_NODE_T(que_thr_t)
+			queue;		/*!< list of runnable thread nodes in
+					the server task queue */
 	ulint		fk_cascade_depth; /*!< maximum cascading call depth
 					supported for foreign key constraint
 					related delete/updates */
@@ -389,8 +398,8 @@ struct que_thr_struct{
 #define QUE_THR_MAGIC_N		8476583
 #define QUE_THR_MAGIC_FREED	123461526
 
-/* Query graph fork node: its fields are protected by the kernel mutex */
-struct que_fork_struct{
+/* Query graph fork node: its fields are protected by the query thread mutex */
+struct que_fork_t{
 	que_common_t	common;		/*!< type: QUE_NODE_FORK */
 	que_t*		graph;		/*!< query graph of this node */
 	ulint		fork_type;	/*!< fork type */
@@ -492,8 +501,6 @@ struct que_fork_struct{
 #define QUE_NODE_CALL		31
 #define QUE_NODE_EXIT		32
 
-#define QUE_NODE_INSERT_STATS	34
-
 /* Query thread states */
 #define QUE_THR_RUNNING		1
 #define QUE_THR_PROCEDURE_WAIT	2
@@ -504,7 +511,6 @@ struct que_fork_struct{
 					thread has done its task */
 #define QUE_THR_COMMAND_WAIT	4
 #define QUE_THR_LOCK_WAIT	5
-#define QUE_THR_SIG_REPLY_WAIT	6
 #define QUE_THR_SUSPENDED	7
 #define QUE_THR_ERROR		8
 
@@ -518,7 +524,6 @@ struct que_fork_struct{
 #define QUE_CUR_START		2
 #define	QUE_CUR_END		3
 
-
 #ifndef UNIV_NONINL
 #include "que0que.ic"
 #endif
diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic
index 2de679e3894..eff5a86d958 100644
--- a/storage/xtradb/include/que0que.ic
+++ b/storage/xtradb/include/que0que.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -88,7 +88,7 @@ que_node_get_type(
 {
 	ut_ad(node);
 
-	return(((que_common_t*)node)->type);
+	return(((que_common_t*) node)->type);
 }
 
 /***********************************************************************//**
@@ -101,7 +101,7 @@ que_node_get_val(
 {
 	ut_ad(node);
 
-	return(&(((que_common_t*)node)->val));
+	return(&(((que_common_t*) node)->val));
 }
 
 /***********************************************************************//**
@@ -115,7 +115,7 @@ que_node_get_val_buf_size(
 {
 	ut_ad(node);
 
-	return(((que_common_t*)node)->val_buf_size);
+	return(((que_common_t*) node)->val_buf_size);
 }
 
 /***********************************************************************//**
@@ -129,7 +129,7 @@ que_node_set_val_buf_size(
 {
 	ut_ad(node);
 
-	((que_common_t*)node)->val_buf_size = size;
+	((que_common_t*) node)->val_buf_size = size;
 }
 
 /***********************************************************************//**
@@ -143,7 +143,7 @@ que_node_set_parent(
 {
 	ut_ad(node);
 
-	((que_common_t*)node)->parent = parent;
+	((que_common_t*) node)->parent = parent;
 }
 
 /***********************************************************************//**
@@ -192,6 +192,28 @@ que_node_list_add_last(
 	return(node_list);
 }
 
+/*************************************************************************
+Gets the last node of a query graph node list. The list is not modified. */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+					/* out: last node in the list */
+	que_node_t*	node_list)	/* in: node list; must not be NULL */
+{
+	que_common_t*	node;
+
+	ut_a(node_list != NULL);
+
+	node = (que_common_t*) node_list;
+
+	/* Follow the brother links until the node that has no successor */
+	while (node->brother != NULL) {
+		node = (que_common_t*) node->brother;
+	}
+
+	return(node);
+}
 /*********************************************************************//**
 Gets the next list node in a list of query graph nodes.
 @return	next node in a list of nodes */
@@ -201,7 +223,7 @@ que_node_get_next(
 /*==============*/
 	que_node_t*	node)	/*!< in: node in a list */
 {
-	return(((que_common_t*)node)->brother);
+	return(((que_common_t*) node)->brother);
 }
 
 /*********************************************************************//**
@@ -236,14 +258,14 @@ que_node_get_parent(
 /*================*/
 	que_node_t*	node)	/*!< in: node */
 {
-	return(((que_common_t*)node)->parent);
+	return(((que_common_t*) node)->parent);
 }
 
 /**********************************************************************//**
 Checks if graph, trx, or session is in a state where the query thread should
 be stopped.
 @return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the kernel mutex, then another peek with the mutex
+without reserving the trx mutex, then another peek with the mutex
 reserved is necessary before deciding the actual stopping */
 UNIV_INLINE
 ibool
@@ -258,9 +280,9 @@ que_thr_peek_stop(
 	trx = graph->trx;
 
 	if (graph->state != QUE_FORK_ACTIVE
-	    || trx->que_state == TRX_QUE_LOCK_WAIT
-	    || (UT_LIST_GET_LEN(trx->signals) > 0
-		&& trx->que_state == TRX_QUE_RUNNING)) {
+	    || trx->lock.que_state == TRX_QUE_LOCK_WAIT
+	    || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
+		&& trx->lock.que_state != TRX_QUE_RUNNING)) {
 
 		return(TRUE);
 	}
diff --git a/storage/xtradb/include/que0types.h b/storage/xtradb/include/que0types.h
index 69fb0557d8b..0f11cad301a 100644
--- a/storage/xtradb/include/que0types.h
+++ b/storage/xtradb/include/que0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,18 +32,15 @@ Created 5/27/1996 Heikki Tuuri
 /* Pseudotype for all graph nodes */
 typedef void	que_node_t;
 
-typedef struct que_fork_struct	que_fork_t;
-
 /* Query graph root is a fork node */
-typedef	que_fork_t	que_t;
+typedef	struct que_fork_t	que_t;
 
-typedef struct que_thr_struct		que_thr_t;
-typedef struct que_common_struct	que_common_t;
+struct que_thr_t;
 
 /* Common struct at the beginning of each query graph node; the name of this
 substruct must be 'common' */
 
-struct que_common_struct{
+struct que_common_t{
 	ulint		type;	/*!< query node type */
 	que_node_t*	parent;	/*!< back pointer to parent node, or NULL */
 	que_node_t*	brother;/* pointer to a possible brother node */
diff --git a/storage/xtradb/include/read0i_s.h b/storage/xtradb/include/read0i_s.h
new file mode 100644
index 00000000000..11b63affe09
--- /dev/null
+++ b/storage/xtradb/include/read0i_s.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2010-2012, Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef read0i_s_h
+#define read0i_s_h
+
+#include <trx0types.h>
+
+struct i_s_xtradb_read_view_struct {
+	undo_no_t	undo_no;/*!< 0, or, if the view type is
+				VIEW_HIGH_GRANULARITY, the
+				transaction undo_no when this high-granularity
+				consistent read view was created */
+	trx_id_t	low_limit_no;
+				/*!< The view does not need to see the undo
+				logs for transactions whose transaction number
+				is strictly smaller (<) than this value: they
+				can be removed in purge if not needed by other
+				views */
+	trx_id_t	low_limit_id;
+				/*!< The read should not see any transaction
+				with trx id >= this value. In other words,
+				this is the "high water mark". */
+	trx_id_t	up_limit_id;
+				/*!< The read should see all trx ids which
+				are strictly smaller (<) than this value.
+				In other words,
+				this is the "low water mark". */
+};
+
+typedef struct i_s_xtradb_read_view_struct i_s_xtradb_read_view_t;
+
+UNIV_INTERN
+i_s_xtradb_read_view_t*
+read_fill_i_s_xtradb_read_view(i_s_xtradb_read_view_t *rv);
+
+
+#endif /* read0i_s_h */
diff --git a/storage/xtradb/include/read0read.h b/storage/xtradb/include/read0read.h
index c6ba9557d32..e17d49b1321 100644
--- a/storage/xtradb/include/read0read.h
+++ b/storage/xtradb/include/read0read.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,6 +31,7 @@ Created 2/16/1997 Heikki Tuuri
 
 #include "ut0byte.h"
 #include "ut0lst.h"
+#include "btr0types.h"
 #include "trx0trx.h"
 #include "trx0sys.h"
 #include "read0types.h"
@@ -45,10 +46,8 @@ read_view_open_now(
 /*===============*/
 	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
 					transaction, or 0 used in purge */
-	read_view_t*	view,		/*!< in: current read view or NULL if it
-					doesn't exist yet */
-	ibool		exclude_self);	/*!< in: TRUE, if cr_trx_id should be
-					excluded from the resulting view */
+	read_view_t*&	view);		/*!< in,out: pre-allocated view array or
+					NULL if a new one needs to be created */
 
 /*********************************************************************//**
 Makes a copy of the oldest existing read view, or opens a new. The view
@@ -56,26 +55,29 @@ must be closed with ..._close.
 @return	own: read view struct */
 UNIV_INTERN
 read_view_t*
-read_view_oldest_copy_or_open_new(
-/*==============================*/
-	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
-					transaction, or 0 used in purge */
-	read_view_t*	view);		/*!< in: pre-allocated view array or
+read_view_purge_open(
+/*=================*/
+	read_view_t*&	clone_view,	/*!< in,out: pre-allocated view that
+					will be used to clone the oldest view,
+					if one exists */
+	read_view_t*&	view);		/*!< in,out: pre-allocated view array or
 					NULL if a new one needs to be created */
 /*********************************************************************//**
-Closes a read view. */
-UNIV_INTERN
+Remove a read view from the trx_sys->view_list. */
+UNIV_INLINE
 void
-read_view_close(
-/*============*/
-	read_view_t*	view);	/*!< in: read view */
+read_view_remove(
+/*=============*/
+	read_view_t*	view,		/*!< in: read view, can be 0 */
+	bool		own_mutex);	/*!< in: true if caller owns the
+					trx_sys_t::mutex */
 /*********************************************************************//**
 Frees memory allocated by a read view. */
 UNIV_INTERN
 void
 read_view_free(
 /*===========*/
-	read_view_t*	view);	/*< in: read view */
+	read_view_t*&	view);	/*!< in,out: read view */
 /*********************************************************************//**
 Closes a consistent read view for MySQL. This function is called at an SQL
 statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
@@ -86,20 +88,21 @@ read_view_close_for_mysql(
 	trx_t*	trx);	/*!< in: trx which has a read view */
 /*********************************************************************//**
 Checks if a read view sees the specified transaction.
-@return	TRUE if sees */
+@return	true if sees */
 UNIV_INLINE
-ibool
+bool
 read_view_sees_trx_id(
 /*==================*/
 	const read_view_t*	view,	/*!< in: read view */
-	trx_id_t		trx_id);/*!< in: trx id */
+	trx_id_t		trx_id)	/*!< in: trx id */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
-Prints a read view to stderr. */
+Prints a read view to the given file. */
 UNIV_INTERN
 void
 read_view_print(
 /*============*/
-	FILE*			file,
+	FILE*			file,	/*!< in: file to print to */
 	const read_view_t*	view);	/*!< in: read view */
 /*********************************************************************//**
 Create a consistent cursor view for mysql to be used in cursors. In this
@@ -133,7 +136,7 @@ read_cursor_set_for_mysql(
 /** Read view lists the trx ids of those transactions for which a consistent
 read should not see the modifications to the database. */
 
-struct read_view_struct{
+struct read_view_t{
 	ulint		type;	/*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */
 	undo_no_t	undo_no;/*!< 0 or if type is
 				VIEW_HIGH_GRANULARITY
@@ -160,14 +163,14 @@ struct read_view_struct{
 				/*!< Maximum number of cells in the trx_ids
 				array */
 	trx_id_t*	descriptors;
-				/*!< Array of trx descriptors which the read
-				should not see: typically, these are the active
-				transactions at the time when the read is
-				serialized, except the reading transaction
+				/*!< Additional trx ids which the read should
+				not see: typically, these are the read-write
+				active transactions at the time when the read
+				is serialized, except the reading transaction
 				itself; the trx ids in this array are in a
-				descending order. These trx_ids should be
-				between the "low" and "high" water marks, that
-				is, up_limit_id and low_limit_id. */
+				ascending order. These trx_ids should be
+				between the "low" and "high" water marks,
+				that is, up_limit_id and low_limit_id. */
 	trx_id_t	creator_trx_id;
 				/*!< trx id of creating transaction, or
 				0 used in purge */
@@ -191,7 +194,7 @@ struct read_view_struct{
 cursors. This struct holds both heap where consistent read view
 is allocated and pointer to a read view. */
 
-struct cursor_view_struct{
+struct cursor_view_t{
 	mem_heap_t*	heap;
 				/*!< Memory heap for the cursor view */
 	read_view_t*	read_view;
diff --git a/storage/xtradb/include/read0read.ic b/storage/xtradb/include/read0read.ic
index 62c47e05b9d..66bef8866c9 100644
--- a/storage/xtradb/include/read0read.ic
+++ b/storage/xtradb/include/read0read.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,50 +23,66 @@ Cursor read
 Created 2/16/1997 Heikki Tuuri
 *******************************************************/
 
+#include "trx0sys.h"
+
+#ifdef UNIV_DEBUG
 /*********************************************************************//**
-Gets the nth trx id in a read view.
+Validates a read view object. */
+static
+bool
+read_view_validate(
+/*===============*/
+	const read_view_t*	view)	/*!< in: view to validate */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+	ut_ad(view->max_descr >= view->n_descr);
+	ut_ad(view->descriptors == NULL || view->max_descr > 0);
 
-Upstream code stores array of trx_ids in the descending order. Percona Server
-keeps it in the ascending order for performance reasons. Let us keep the
-semantics.
+	/* Check that the view->descriptors array is in ascending order. */
+	for (ulint i = 1; i < view->n_descr; ++i) {
 
-@return	trx id */
-UNIV_INLINE
-trx_id_t
-read_view_get_nth_trx_id(
-/*=====================*/
-	const read_view_t*	view,	/*!< in: read view */
-	ulint			n)	/*!< in: position */
-{
-	ut_ad(n < view->n_descr);
+		ut_a(view->descriptors[i] > view->descriptors[i - 1]);
+	}
 
-	return(view->descriptors[view->n_descr - 1 - n]);
+	return(true);
 }
 
-/*********************************************************************//**
-Sets the nth trx id in a read view.
+/** Functor to validate the view list. */
+struct	ViewCheck {
 
-Upstream code stores array of trx_ids in the descending order. Percona Server
-keeps it in the ascending order for performance reasons. Let us keep the
-semantics. */
-UNIV_INLINE
-void
-read_view_set_nth_trx_id(
-/*=====================*/
-	read_view_t*	view,	/*!< in: read view */
-	ulint		n,	/*!< in: position */
-	trx_id_t	trx_id)	/*!< in: trx id to set */
+	ViewCheck() : m_prev_view(0) { }
+
+	void	operator()(const read_view_t* view)
+	{
+		ut_a(m_prev_view == NULL
+		     || m_prev_view->low_limit_no >= view->low_limit_no);
+
+		m_prev_view = view;
+	}
+
+	const read_view_t*	m_prev_view;
+};
+
+/*********************************************************************//**
+Validates a read view list. */
+static
+bool
+read_view_list_validate(void)
+/*=========================*/
 {
-	ut_ad(n < view->n_descr);
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	view->descriptors[view->n_descr - 1 - n] = trx_id;
+	ut_list_map(trx_sys->view_list, &read_view_t::view_list, ViewCheck());
+
+	return(true);
 }
+#endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
 Checks if a read view sees the specified transaction.
-@return	TRUE if sees */
+@return	true if sees */
 UNIV_INLINE
-ibool
+bool
 read_view_sees_trx_id(
 /*==================*/
 	const read_view_t*	view,	/*!< in: read view */
@@ -74,12 +90,10 @@ read_view_sees_trx_id(
 {
 	if (trx_id < view->up_limit_id) {
 
-		return(TRUE);
-	}
-
-	if (trx_id >= view->low_limit_id) {
+		return(true);
+	} else if (trx_id >= view->low_limit_id) {
 
-		return(FALSE);
+		return(false);
 	}
 
 	/* Do a binary search over this view's descriptors array */
@@ -87,3 +101,31 @@ read_view_sees_trx_id(
 	return(trx_find_descriptor(view->descriptors, view->n_descr,
 				   trx_id) == NULL);
 }
+
+/*********************************************************************//**
+Remove a read view from the trx_sys->view_list. */
+UNIV_INLINE
+void
+read_view_remove(
+/*=============*/
+	read_view_t*	view,		/*!< in: read view, can be 0 */
+	bool		own_mutex)	/*!< in: true if caller owns the
+					trx_sys_t::mutex */
+{
+	if (view != 0) {
+		if (!own_mutex) {
+			mutex_enter(&trx_sys->mutex);
+		}
+
+		ut_ad(read_view_validate(view));
+
+		UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+
+		ut_ad(read_view_list_validate());
+
+		if (!own_mutex) {
+			mutex_exit(&trx_sys->mutex);
+		}
+	}
+}
+
diff --git a/storage/xtradb/include/read0types.h b/storage/xtradb/include/read0types.h
index 4bb9618448b..969f4ebb637 100644
--- a/storage/xtradb/include/read0types.h
+++ b/storage/xtradb/include/read0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,7 +26,7 @@ Created 2/16/1997 Heikki Tuuri
 #ifndef read0types_h
 #define read0types_h
 
-typedef struct read_view_struct	read_view_t;
-typedef struct cursor_view_struct	cursor_view_t;
+struct read_view_t;
+struct cursor_view_t;
 
 #endif
diff --git a/storage/xtradb/include/rem0cmp.h b/storage/xtradb/include/rem0cmp.h
index c5ef0d5438a..cb3c85ac2c8 100644
--- a/storage/xtradb/include/rem0cmp.h
+++ b/storage/xtradb/include/rem0cmp.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -75,6 +75,63 @@ cmp_data_data_slow(
 	const byte*	data2,	/*!< in: data field (== a pointer to a memory
 				buffer) */
 	ulint		len2);	/*!< in: data field length or UNIV_SQL_NULL */
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type to be VARCHAR.
+@return	1, 0, -1, if lhs is greater, equal, less than rhs, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		lhs_len,/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		rhs_len);/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
 /*************************************************************//**
 This function is used to compare two dfields where at least the first
 has its data type field set.
@@ -99,21 +156,28 @@ respectively, when only the common first fields are compared, or until
 the first externally stored field in rec */
 UNIV_INTERN
 int
-cmp_dtuple_rec_with_match(
-/*======================*/
+cmp_dtuple_rec_with_match_low(
+/*==========================*/
 	const dtuple_t*	dtuple,	/*!< in: data tuple */
 	const rec_t*	rec,	/*!< in: physical record which differs from
 				dtuple in some of the common fields, or which
 				has an equal number or more fields than
 				dtuple */
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint*		matched_fields, /*!< in/out: number of already completely
+	ulint		n_cmp,	/*!< in: number of fields to compare */
+	ulint*		matched_fields,
+				/*!< in/out: number of already completely
 				matched fields; when function returns,
 				contains the value for current comparison */
-	ulint*		matched_bytes); /*!< in/out: number of already matched
+	ulint*		matched_bytes)
+				/*!< in/out: number of already matched
 				bytes within the first field not completely
 				matched; when function returns, contains the
 				value for current comparison */
+	__attribute__((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields,bytes)	\
+	cmp_dtuple_rec_with_match_low(					\
+		tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields,bytes)
 /**************************************************************//**
 Compares a data tuple to a physical record.
 @see cmp_dtuple_rec_with_match
@@ -139,7 +203,9 @@ cmp_dtuple_is_prefix_of_rec(
 /*************************************************************//**
 Compare two physical records that contain the same number of columns,
 none of which are stored externally.
-@return	1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+@retval 1 if rec1 (including non-ordering columns) is greater than rec2
+@retval -1 if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
 UNIV_INTERN
 int
 cmp_rec_rec_simple(
@@ -149,8 +215,10 @@ cmp_rec_rec_simple(
 	const ulint*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
 	const ulint*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
 	const dict_index_t*	index,	/*!< in: data dictionary index */
-	ibool*			null_eq);/*!< out: set to TRUE if
-					found matching null values */
+	struct TABLE*		table)	/*!< in: MySQL table, for reporting
+					duplicate key value if applicable,
+					or NULL */
+	__attribute__((nonnull(1,2,3,4), warn_unused_result));
 /*************************************************************//**
 This function is used to compare two physical records. Only the common
 first fields are compared, and if an externally stored field is
@@ -192,6 +260,39 @@ cmp_rec_rec(
 	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
 	dict_index_t*	index);	/*!< in: data dictionary index */
 
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
 
 #ifndef UNIV_NONINL
 #include "rem0cmp.ic"
diff --git a/storage/xtradb/include/rem0cmp.ic b/storage/xtradb/include/rem0cmp.ic
index 22db4b0cd47..67a2dcacba1 100644
--- a/storage/xtradb/include/rem0cmp.ic
+++ b/storage/xtradb/include/rem0cmp.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,6 +43,60 @@ cmp_data_data(
 	return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2));
 }
 
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_prefix(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_prefix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_suffix(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_suffix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_substr(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_substr(data1, len1, data2, len2));
+}
 /*************************************************************//**
 This function is used to compare two dfields where at least the first
 has its data type field set.
@@ -68,6 +122,47 @@ cmp_dfield_dfield(
 			     dfield_get_len(dfield2)));
 }
 
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*       dfield1,/* in: data field; must have type field set */
+	dfield_t*       dfield2)/* in: data field */
+{
+	ut_ad(dfield_check_typed(dfield1));
+
+	return(cmp_data_data_like_suffix(
+		(byte*) dfield_get_data(dfield1),
+		dfield_get_len(dfield1),
+		(byte*) dfield_get_data(dfield2),
+		dfield_get_len(dfield2)));
+}
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*       dfield1,/* in: data field; must have type field set */
+	dfield_t*       dfield2)/* in: data field */
+{
+	ut_ad(dfield_check_typed(dfield1));
+
+	return(cmp_data_data_like_substr(
+		(byte*) dfield_get_data(dfield1),
+		dfield_get_len(dfield1),
+		(byte*) dfield_get_data(dfield2),
+		dfield_get_len(dfield2)));
+}
 /*************************************************************//**
 This function is used to compare two physical records. Only the common
 first fields are compared.
diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h
index 9dd96f609ea..2a84aee7a6f 100644
--- a/storage/xtradb/include/rem0rec.h
+++ b/storage/xtradb/include/rem0rec.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -54,7 +54,7 @@ in addition to the data and the offsets */
 #define REC_STATUS_INFIMUM	2
 #define REC_STATUS_SUPREMUM	3
 
-/* The following four constants are needed in page0zip.c in order to
+/* The following four constants are needed in page0zip.cc in order to
 efficiently compress and decompress pages. */
 
 /* The offset of heap_no in a compact record */
@@ -66,6 +66,15 @@ The status is stored in the low-order bits. */
 /* Length of a B-tree node pointer, in bytes */
 #define REC_NODE_PTR_SIZE	4
 
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+#define REC_1BYTE_SQL_NULL_MASK	0x80UL
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+#define REC_2BYTE_SQL_NULL_MASK	0x8000UL
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+#define REC_2BYTE_EXTERN_MASK	0x4000UL
+
 #ifdef UNIV_DEBUG
 /* Length of the rec_get_offsets() header */
 # define REC_OFFS_HEADER_SIZE	4
@@ -88,7 +97,8 @@ const rec_t*
 rec_get_next_ptr_const(
 /*===================*/
 	const rec_t*	rec,	/*!< in: physical record */
-	ulint		comp);	/*!< in: nonzero=compact page format */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to get the pointer of the next chained record
 on the same page.
@@ -98,7 +108,8 @@ rec_t*
 rec_get_next_ptr(
 /*=============*/
 	rec_t*	rec,	/*!< in: physical record */
-	ulint	comp);	/*!< in: nonzero=compact page format */
+	ulint	comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to get the offset of the
 next chained record on the same page.
@@ -108,7 +119,8 @@ ulint
 rec_get_next_offs(
 /*==============*/
 	const rec_t*	rec,	/*!< in: physical record */
-	ulint		comp);	/*!< in: nonzero=compact page format */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the next record offset field
 of an old-style record. */
@@ -117,7 +129,8 @@ void
 rec_set_next_offs_old(
 /*==================*/
 	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	next);	/*!< in: offset of the next record */
+	ulint	next)	/*!< in: offset of the next record */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to set the next record offset field
 of a new-style record. */
@@ -126,7 +139,8 @@ void
 rec_set_next_offs_new(
 /*==================*/
 	rec_t*	rec,	/*!< in/out: new-style physical record */
-	ulint	next);	/*!< in: offset of the next record */
+	ulint	next)	/*!< in: offset of the next record */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to get the number of fields
 in an old-style record.
@@ -135,7 +149,8 @@ UNIV_INLINE
 ulint
 rec_get_n_fields_old(
 /*=================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to get the number of fields
 in a record.
@@ -145,7 +160,8 @@ ulint
 rec_get_n_fields(
 /*=============*/
 	const rec_t*		rec,	/*!< in: physical record */
-	const dict_index_t*	index);	/*!< in: record descriptor */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to get the number of records owned by the
 previous directory record.
@@ -154,7 +170,8 @@ UNIV_INLINE
 ulint
 rec_get_n_owned_old(
 /*================*/
-	const rec_t*	rec);	/*!< in: old-style physical record */
+	const rec_t*	rec)	/*!< in: old-style physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the number of owned records. */
 UNIV_INLINE
@@ -162,7 +179,8 @@ void
 rec_set_n_owned_old(
 /*================*/
 	rec_t*	rec,		/*!< in: old-style physical record */
-	ulint	n_owned);	/*!< in: the number of owned */
+	ulint	n_owned)	/*!< in: the number of owned */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to get the number of records owned by the
 previous directory record.
@@ -171,7 +189,8 @@ UNIV_INLINE
 ulint
 rec_get_n_owned_new(
 /*================*/
-	const rec_t*	rec);	/*!< in: new-style physical record */
+	const rec_t*	rec)	/*!< in: new-style physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the number of owned records. */
 UNIV_INLINE
@@ -180,7 +199,8 @@ rec_set_n_owned_new(
 /*================*/
 	rec_t*		rec,	/*!< in/out: new-style physical record */
 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		n_owned);/*!< in: the number of owned */
+	ulint		n_owned)/*!< in: the number of owned */
+	__attribute__((nonnull(1)));
 /******************************************************//**
 The following function is used to retrieve the info bits of
 a record.
@@ -190,7 +210,8 @@ ulint
 rec_get_info_bits(
 /*==============*/
 	const rec_t*	rec,	/*!< in: physical record */
-	ulint		comp);	/*!< in: nonzero=compact page format */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the info bits of a record. */
 UNIV_INLINE
@@ -198,7 +219,8 @@ void
 rec_set_info_bits_old(
 /*==================*/
 	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	bits);	/*!< in: info bits */
+	ulint	bits)	/*!< in: info bits */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to set the info bits of a record. */
 UNIV_INLINE
@@ -206,7 +228,8 @@ void
 rec_set_info_bits_new(
 /*==================*/
 	rec_t*	rec,	/*!< in/out: new-style physical record */
-	ulint	bits);	/*!< in: info bits */
+	ulint	bits)	/*!< in: info bits */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function retrieves the status bits of a new-style record.
 @return	status bits */
@@ -214,7 +237,8 @@ UNIV_INLINE
 ulint
 rec_get_status(
 /*===========*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 
 /******************************************************//**
 The following function is used to set the status bits of a new-style record. */
@@ -223,7 +247,8 @@ void
 rec_set_status(
 /*===========*/
 	rec_t*	rec,	/*!< in/out: physical record */
-	ulint	bits);	/*!< in: info bits */
+	ulint	bits)	/*!< in: info bits */
+	__attribute__((nonnull));
 
 /******************************************************//**
 The following function is used to retrieve the info and status
@@ -234,7 +259,8 @@ ulint
 rec_get_info_and_status_bits(
 /*=========================*/
 	const rec_t*	rec,	/*!< in: physical record */
-	ulint		comp);	/*!< in: nonzero=compact page format */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the info and status
 bits of a record.  (Only compact records have status bits.) */
@@ -243,7 +269,8 @@ void
 rec_set_info_and_status_bits(
 /*=========================*/
 	rec_t*	rec,	/*!< in/out: compact physical record */
-	ulint	bits);	/*!< in: info bits */
+	ulint	bits)	/*!< in: info bits */
+	__attribute__((nonnull));
 
 /******************************************************//**
 The following function tells if record is delete marked.
@@ -253,7 +280,8 @@ ulint
 rec_get_deleted_flag(
 /*=================*/
 	const rec_t*	rec,	/*!< in: physical record */
-	ulint		comp);	/*!< in: nonzero=compact page format */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the deleted bit. */
 UNIV_INLINE
@@ -261,7 +289,8 @@ void
 rec_set_deleted_flag_old(
 /*=====================*/
 	rec_t*	rec,	/*!< in: old-style physical record */
-	ulint	flag);	/*!< in: nonzero if delete marked */
+	ulint	flag)	/*!< in: nonzero if delete marked */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to set the deleted bit. */
 UNIV_INLINE
@@ -270,7 +299,8 @@ rec_set_deleted_flag_new(
 /*=====================*/
 	rec_t*		rec,	/*!< in/out: new-style physical record */
 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
-	ulint		flag);	/*!< in: nonzero if delete marked */
+	ulint		flag)	/*!< in: nonzero if delete marked */
+	__attribute__((nonnull(1)));
 /******************************************************//**
 The following function tells if a new-style record is a node pointer.
 @return	TRUE if node pointer */
@@ -278,7 +308,8 @@ UNIV_INLINE
 ibool
 rec_get_node_ptr_flag(
 /*==================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to get the order number
 of an old-style record in the heap of the index page.
@@ -287,7 +318,8 @@ UNIV_INLINE
 ulint
 rec_get_heap_no_old(
 /*================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the heap number
 field in an old-style record. */
@@ -296,7 +328,8 @@ void
 rec_set_heap_no_old(
 /*================*/
 	rec_t*	rec,	/*!< in: physical record */
-	ulint	heap_no);/*!< in: the heap number */
+	ulint	heap_no)/*!< in: the heap number */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to get the order number
 of a new-style record in the heap of the index page.
@@ -305,7 +338,8 @@ UNIV_INLINE
 ulint
 rec_get_heap_no_new(
 /*================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 The following function is used to set the heap number
 field in a new-style record. */
@@ -314,7 +348,8 @@ void
 rec_set_heap_no_new(
 /*================*/
 	rec_t*	rec,	/*!< in/out: physical record */
-	ulint	heap_no);/*!< in: the heap number */
+	ulint	heap_no)/*!< in: the heap number */
+	__attribute__((nonnull));
 /******************************************************//**
 The following function is used to test whether the data offsets
 in the record are stored in one-byte or two-byte format.
@@ -323,7 +358,57 @@ UNIV_INLINE
 ibool
 rec_get_1byte_offs_flag(
 /*====================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+	rec_t*	rec,	/*!< in: physical record */
+	ibool	flag)	/*!< in: TRUE if 1byte form */
+	__attribute__((nonnull));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return	offset of the end of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	__attribute__((nonnull, pure, warn_unused_result));
 
 /******************************************************//**
 Determine how many of the first n columns in a compact
@@ -333,9 +418,10 @@ UNIV_INTERN
 ulint
 rec_get_n_extern_new(
 /*=================*/
-	const rec_t*	rec,	/*!< in: compact physical record */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	ulint		n);	/*!< in: number of columns to scan */
+	const rec_t*		rec,	/*!< in: compact physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			n)	/*!< in: number of columns to scan */
+	__attribute__((nonnull, warn_unused_result));
 
 /******************************************************//**
 The following function determines the offsets to each field
@@ -356,7 +442,8 @@ rec_get_offsets_func(
 					 (ULINT_UNDEFINED if all fields) */
 	mem_heap_t**		heap,	/*!< in/out: memory heap */
 	const char*		file,	/*!< in: file name where called */
-	ulint			line);	/*!< in: line number where called */
+	ulint			line)	/*!< in: line number where called */
+	__attribute__((nonnull(1,2,5,6),warn_unused_result));
 
 #define rec_get_offsets(rec,index,offsets,n,heap)	\
 	rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__)
@@ -375,9 +462,10 @@ rec_get_offsets_reverse(
 	const dict_index_t*	index,	/*!< in: record descriptor */
 	ulint			node_ptr,/*!< in: nonzero=node pointer,
 					0=leaf node */
-	ulint*			offsets);/*!< in/out: array consisting of
+	ulint*			offsets)/*!< in/out: array consisting of
 					offsets[0] allocated elements */
-
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
 /************************************************************//**
 Validates offsets returned by rec_get_offsets().
 @return	TRUE if valid */
@@ -387,9 +475,9 @@ rec_offs_validate(
 /*==============*/
 	const rec_t*		rec,	/*!< in: record or NULL */
 	const dict_index_t*	index,	/*!< in: record descriptor or NULL */
-	const ulint*		offsets);/*!< in: array returned by
+	const ulint*		offsets)/*!< in: array returned by
 					rec_get_offsets() */
-#ifdef UNIV_DEBUG
+	__attribute__((nonnull(3), warn_unused_result));
 /************************************************************//**
 Updates debug data in offsets, in order to avoid bogus
 rec_offs_validate() failures. */
@@ -399,8 +487,9 @@ rec_offs_make_valid(
 /*================*/
 	const rec_t*		rec,	/*!< in: record */
 	const dict_index_t*	index,	/*!< in: record descriptor */
-	ulint*			offsets);/*!< in: array returned by
+	ulint*			offsets)/*!< in: array returned by
 					rec_get_offsets() */
+	__attribute__((nonnull));
 #else
 # define rec_offs_make_valid(rec, index, offsets) ((void) 0)
 #endif /* UNIV_DEBUG */
@@ -415,8 +504,9 @@ rec_get_nth_field_offs_old(
 /*=======================*/
 	const rec_t*	rec,	/*!< in: record */
 	ulint		n,	/*!< in: index of the field */
-	ulint*		len);	/*!< out: length of the field; UNIV_SQL_NULL
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
 				if SQL null */
+	__attribute__((nonnull));
 #define rec_get_nth_field_old(rec, n, len) \
 ((rec) + rec_get_nth_field_offs_old(rec, n, len))
 /************************************************************//**
@@ -429,7 +519,8 @@ ulint
 rec_get_nth_field_size(
 /*===================*/
 	const rec_t*	rec,	/*!< in: record */
-	ulint		n);	/*!< in: index of the field */
+	ulint		n)	/*!< in: index of the field */
+	__attribute__((nonnull, pure, warn_unused_result));
 /************************************************************//**
 The following function is used to get an offset to the nth
 data field in a record.
@@ -440,8 +531,9 @@ rec_get_nth_field_offs(
 /*===================*/
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
 	ulint		n,	/*!< in: index of the field */
-	ulint*		len);	/*!< out: length of the field; UNIV_SQL_NULL
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
 				if SQL null */
+	__attribute__((nonnull));
 #define rec_get_nth_field(rec, offsets, n, len) \
 ((rec) + rec_get_nth_field_offs(offsets, n, len))
 /******************************************************//**
@@ -452,7 +544,8 @@ UNIV_INLINE
 ulint
 rec_offs_comp(
 /*==========*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 Determine if the offsets are for a record containing
 externally stored columns.
@@ -461,8 +554,8 @@ UNIV_INLINE
 ulint
 rec_offs_any_extern(
 /*================*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
-#ifdef UNIV_BLOB_NULL_DEBUG
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 Determine if the offsets are for a record containing null BLOB pointers.
 @return	first field containing a null BLOB pointer, or NULL if none found */
@@ -472,8 +565,7 @@ rec_offs_any_null_extern(
 /*=====================*/
 	const rec_t*	rec,		/*!< in: record */
 	const ulint*	offsets)	/*!< in: rec_get_offsets(rec) */
-	__attribute__((nonnull, warn_unused_result));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 Returns nonzero if the extern bit is set in nth field of rec.
 @return	nonzero if externally stored */
@@ -482,7 +574,8 @@ ulint
 rec_offs_nth_extern(
 /*================*/
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		n);	/*!< in: nth field */
+	ulint		n)	/*!< in: nth field */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 Returns nonzero if the SQL NULL bit is set in nth field of rec.
 @return	nonzero if SQL NULL */
@@ -491,7 +584,8 @@ ulint
 rec_offs_nth_sql_null(
 /*==================*/
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		n);	/*!< in: nth field */
+	ulint		n)	/*!< in: nth field */
+	__attribute__((nonnull, pure, warn_unused_result));
 /******************************************************//**
 Gets the physical size of a field.
 @return	length of field */
@@ -500,7 +594,8 @@ ulint
 rec_offs_nth_size(
 /*==============*/
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint		n);	/*!< in: nth field */
+	ulint		n)	/*!< in: nth field */
+	__attribute__((nonnull, pure, warn_unused_result));
 
 /******************************************************//**
 Returns the number of extern bits set in a record.
@@ -509,7 +604,8 @@ UNIV_INLINE
 ulint
 rec_offs_n_extern(
 /*==============*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /***********************************************************//**
 This is used to modify the value of an already existing field in a record.
 The previous value must have exactly the same size as the new value. If len
@@ -524,7 +620,12 @@ rec_set_nth_field(
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
 	ulint		n,	/*!< in: index number of the field */
 	const void*	data,	/*!< in: pointer to the data if not SQL null */
-	ulint		len);	/*!< in: length of the data or UNIV_SQL_NULL */
+	ulint		len)	/*!< in: length of the data or UNIV_SQL_NULL.
+				If not SQL null, must have the same
+				length as the previous value.
+				If SQL null, previous value must be
+				SQL null. */
+	__attribute__((nonnull(1,2)));
 /**********************************************************//**
 The following function returns the data size of an old-style physical
 record, that is the sum of field lengths. SQL null fields
@@ -535,7 +636,8 @@ UNIV_INLINE
 ulint
 rec_get_data_size_old(
 /*==================*/
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 The following function returns the number of allocated elements
 for an array of offsets.
@@ -544,7 +646,8 @@ UNIV_INLINE
 ulint
 rec_offs_get_n_alloc(
 /*=================*/
-	const ulint*	offsets);/*!< in: array for rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array for rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 The following function sets the number of allocated elements
 for an array of offsets. */
@@ -554,7 +657,8 @@ rec_offs_set_n_alloc(
 /*=================*/
 	ulint*	offsets,	/*!< out: array for rec_get_offsets(),
 				must be allocated */
-	ulint	n_alloc);	/*!< in: number of elements */
+	ulint	n_alloc)	/*!< in: number of elements */
+	__attribute__((nonnull));
 #define rec_offs_init(offsets) \
 	rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
 /**********************************************************//**
@@ -564,7 +668,8 @@ UNIV_INLINE
 ulint
 rec_offs_n_fields(
 /*==============*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 The following function returns the data size of a physical
 record, that is the sum of field lengths. SQL null fields
@@ -575,7 +680,8 @@ UNIV_INLINE
 ulint
 rec_offs_data_size(
 /*===============*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 Returns the total size of record minus data size of record.
 The value returned by the function is the distance from record
@@ -585,7 +691,8 @@ UNIV_INLINE
 ulint
 rec_offs_extra_size(
 /*================*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 Returns the total size of a physical record.
 @return	size */
@@ -593,7 +700,8 @@ UNIV_INLINE
 ulint
 rec_offs_size(
 /*==========*/
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 #ifdef UNIV_DEBUG
 /**********************************************************//**
 Returns a pointer to the start of the record.
@@ -603,7 +711,8 @@ byte*
 rec_get_start(
 /*==========*/
 	const rec_t*	rec,	/*!< in: pointer to record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 /**********************************************************//**
 Returns a pointer to the end of the record.
 @return	pointer to end */
@@ -612,7 +721,8 @@ byte*
 rec_get_end(
 /*========*/
 	const rec_t*	rec,	/*!< in: pointer to record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
 #else /* UNIV_DEBUG */
 # define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
 # define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
@@ -683,7 +793,8 @@ rec_copy_prefix_to_buf(
 	byte**			buf,		/*!< in/out: memory buffer
 						for the copied prefix,
 						or NULL */
-	ulint*			buf_size);	/*!< in/out: buffer size */
+	ulint*			buf_size)	/*!< in/out: buffer size */
+	__attribute__((nonnull));
 /************************************************************//**
 Folds a prefix of a physical record to a ulint.
 @return	the folded value */
@@ -699,7 +810,7 @@ rec_fold(
 	ulint		n_bytes,	/*!< in: number of bytes to fold
 					in an incomplete last field */
 	index_id_t	tree_id)	/*!< in: index tree id */
-	__attribute__((pure));
+	__attribute__((nonnull, pure, warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************//**
 Builds a physical record out of a data tuple and
@@ -713,8 +824,9 @@ rec_convert_dtuple_to_rec(
 					physical record */
 	const dict_index_t*	index,	/*!< in: record descriptor */
 	const dtuple_t*		dtuple,	/*!< in: data tuple */
-	ulint			n_ext);	/*!< in: number of
+	ulint			n_ext)	/*!< in: number of
 					externally stored columns */
+	__attribute__((nonnull, warn_unused_result));
 /**********************************************************//**
 Returns the extra size of an old-style physical record if we know its
 data size and number of fields.
@@ -726,7 +838,7 @@ rec_get_converted_extra_size(
 	ulint	data_size,	/*!< in: data size */
 	ulint	n_fields,	/*!< in: number of fields */
 	ulint	n_ext)		/*!< in: number of externally stored columns */
-		__attribute__((const));
+	__attribute__((const));
 /**********************************************************//**
 Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
 @return	total size */
@@ -737,7 +849,8 @@ rec_get_converted_size_comp_prefix(
 	const dict_index_t*	index,	/*!< in: record descriptor */
 	const dfield_t*		fields,	/*!< in: array of data fields */
 	ulint			n_fields,/*!< in: number of data fields */
-	ulint*			extra);	/*!< out: extra size */
+	ulint*			extra)	/*!< out: extra size */
+	__attribute__((warn_unused_result, nonnull(1,2)));
 /**********************************************************//**
 Determines the size of a data tuple in ROW_FORMAT=COMPACT.
 @return	total size */
@@ -752,7 +865,8 @@ rec_get_converted_size_comp(
 	ulint			status,	/*!< in: status bits of the record */
 	const dfield_t*		fields,	/*!< in: array of data fields */
 	ulint			n_fields,/*!< in: number of data fields */
-	ulint*			extra);	/*!< out: extra size */
+	ulint*			extra)	/*!< out: extra size */
+	__attribute__((nonnull(1,3)));
 /**********************************************************//**
 The following function returns the size of a data tuple when converted to
 a physical record.
@@ -763,7 +877,8 @@ rec_get_converted_size(
 /*===================*/
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const dtuple_t*	dtuple,	/*!< in: data tuple */
-	ulint		n_ext);	/*!< in: number of externally stored columns */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	__attribute__((warn_unused_result, nonnull));
 #ifndef UNIV_HOTBACKUP
 /**************************************************************//**
 Copies the first n fields of a physical record to a data tuple.
@@ -777,7 +892,8 @@ rec_copy_prefix_to_dtuple(
 	const dict_index_t*	index,		/*!< in: record descriptor */
 	ulint			n_fields,	/*!< in: number of fields
 						to copy */
-	mem_heap_t*		heap);		/*!< in: memory heap */
+	mem_heap_t*		heap)		/*!< in: memory heap */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /***************************************************************//**
 Validates the consistency of a physical record.
@@ -787,7 +903,8 @@ ibool
 rec_validate(
 /*=========*/
 	const rec_t*	rec,	/*!< in: physical record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull));
 /***************************************************************//**
 Prints an old-style physical record. */
 UNIV_INTERN
@@ -795,7 +912,8 @@ void
 rec_print_old(
 /*==========*/
 	FILE*		file,	/*!< in: file where to print */
-	const rec_t*	rec);	/*!< in: physical record */
+	const rec_t*	rec)	/*!< in: physical record */
+	__attribute__((nonnull));
 #ifndef UNIV_HOTBACKUP
 /***************************************************************//**
 Prints a physical record in ROW_FORMAT=COMPACT.  Ignores the
@@ -806,7 +924,8 @@ rec_print_comp(
 /*===========*/
 	FILE*		file,	/*!< in: file where to print */
 	const rec_t*	rec,	/*!< in: physical record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull));
 /***************************************************************//**
 Prints a physical record. */
 UNIV_INTERN
@@ -815,7 +934,8 @@ rec_print_new(
 /*==========*/
 	FILE*		file,	/*!< in: file where to print */
 	const rec_t*	rec,	/*!< in: physical record */
-	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull));
 /***************************************************************//**
 Prints a physical record. */
 UNIV_INTERN
@@ -824,7 +944,21 @@ rec_print(
 /*======*/
 	FILE*			file,	/*!< in: file where to print */
 	const rec_t*		rec,	/*!< in: physical record */
-	const dict_index_t*	index);	/*!< in: record descriptor */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	__attribute__((nonnull));
+
+# ifdef UNIV_DEBUG
+/************************************************************//**
+Reads the DB_TRX_ID of a clustered index record.
+@return	the value of DB_TRX_ID */
+UNIV_INTERN
+trx_id_t
+rec_get_trx_id(
+/*===========*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index)	/*!< in: clustered index */
+	__attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
 #endif /* UNIV_HOTBACKUP */
 
 /* Maximum lengths for the data in a physical record if the offsets
diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic
index b14366312e0..a539320dd2a 100644
--- a/storage/xtradb/include/rem0rec.ic
+++ b/storage/xtradb/include/rem0rec.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -103,7 +103,7 @@ and the shift needed to obtain each bit-field of the record. */
 
 #define REC_OLD_HEAP_NO		5
 #define REC_HEAP_NO_MASK	0xFFF8UL
-#if 0 /* defined in rem0rec.h for use of page0zip.c */
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
 #define REC_NEW_HEAP_NO		4
 #define	REC_HEAP_NO_SHIFT	3
 #endif
@@ -118,17 +118,6 @@ and the shift needed to obtain each bit-field of the record. */
 #define	REC_INFO_BITS_MASK	0xF0UL
 #define REC_INFO_BITS_SHIFT	0
 
-/* The following masks are used to filter the SQL null bit from
-one-byte and two-byte offsets */
-
-#define REC_1BYTE_SQL_NULL_MASK	0x80UL
-#define REC_2BYTE_SQL_NULL_MASK	0x8000UL
-
-/* In a 2-byte offset the second most significant bit denotes
-a field stored to another page: */
-
-#define REC_2BYTE_EXTERN_MASK	0x4000UL
-
 #if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
 		^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
 		^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
@@ -264,13 +253,13 @@ rec_get_next_ptr_const(
 
 	field_value = mach_read_from_2(rec - REC_NEXT);
 
-	if (UNIV_UNLIKELY(field_value == 0)) {
+	if (field_value == 0) {
 
 		return(NULL);
 	}
 
-	if (UNIV_LIKELY(comp != 0)) {
-#if UNIV_PAGE_SIZE <= 32768
+	if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
 		/* Note that for 64 KiB pages, field_value can 'wrap around'
 		and the debug assertion is not valid */
 
@@ -313,7 +302,7 @@ rec_get_next_ptr(
 	rec_t*	rec,	/*!< in: physical record */
 	ulint	comp)	/*!< in: nonzero=compact page format */
 {
-	return((rec_t*) rec_get_next_ptr_const(rec, comp));
+	return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp)));
 }
 
 /******************************************************//**
@@ -337,8 +326,8 @@ rec_get_next_offs(
 
 	field_value = mach_read_from_2(rec - REC_NEXT);
 
-	if (UNIV_LIKELY(comp != 0)) {
-#if UNIV_PAGE_SIZE <= 32768
+	if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
 		/* Note that for 64 KiB pages, field_value can 'wrap around'
 		and the debug assertion is not valid */
 
@@ -354,7 +343,7 @@ rec_get_next_offs(
 		      + ut_align_offset(rec, UNIV_PAGE_SIZE)
 		      < UNIV_PAGE_SIZE);
 #endif
-		if (UNIV_UNLIKELY(field_value == 0)) {
+		if (field_value == 0) {
 
 			return(0);
 		}
@@ -410,7 +399,7 @@ rec_set_next_offs_new(
 	ut_ad(rec);
 	ut_ad(UNIV_PAGE_SIZE > next);
 
-	if (UNIV_UNLIKELY(!next)) {
+	if (!next) {
 		field_value = 0;
 	} else {
 		/* The following two statements calculate
@@ -418,7 +407,7 @@ rec_set_next_offs_new(
 		as a non-negative number */
 
 		field_value = (ulint)
-			((lint) next 
+			((lint) next
 			 - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE));
 		field_value &= REC_NEXT_MASK;
 	}
@@ -572,9 +561,7 @@ rec_set_n_owned_new(
 {
 	rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
 			    REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
-	if (UNIV_LIKELY_NULL(page_zip)
-	    && UNIV_LIKELY(rec_get_status(rec)
-			   != REC_STATUS_SUPREMUM)) {
+	if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) {
 		page_zip_rec_set_owned(page_zip, rec, n_owned);
 	}
 }
@@ -648,7 +635,7 @@ rec_get_info_and_status_bits(
 & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
 # error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
 #endif
-	if (UNIV_LIKELY(comp != 0)) {
+	if (comp) {
 		bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec);
 	} else {
 		bits = rec_get_info_bits(rec, FALSE);
@@ -684,16 +671,14 @@ rec_get_deleted_flag(
 	const rec_t*	rec,	/*!< in: physical record */
 	ulint		comp)	/*!< in: nonzero=compact page format */
 {
-	if (UNIV_LIKELY(comp != 0)) {
-		return(UNIV_UNLIKELY(
-			       rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
-						   REC_INFO_DELETED_FLAG,
-						   REC_INFO_BITS_SHIFT)));
+	if (comp) {
+		return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
 	} else {
-		return(UNIV_UNLIKELY(
-			       rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
-						   REC_INFO_DELETED_FLAG,
-						   REC_INFO_BITS_SHIFT)));
+		return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
 	}
 }
 
@@ -741,7 +726,7 @@ rec_set_deleted_flag_new(
 
 	rec_set_info_bits_new(rec, val);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_rec_set_deleted(page_zip, rec, flag);
 	}
 }
@@ -887,6 +872,20 @@ rec_2_get_field_end_info(
 	return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
 }
 
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK);
+}
+
 /* Get the base address of offsets.  The extra_size is stored at
 this position, and following positions hold the end offsets of
 the fields. */
@@ -1041,7 +1040,7 @@ rec_get_nth_field_offs(
 	ut_ad(n < rec_offs_n_fields(offsets));
 	ut_ad(len);
 
-	if (UNIV_UNLIKELY(n == 0)) {
+	if (n == 0) {
 		offs = 0;
 	} else {
 		offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
@@ -1085,10 +1084,9 @@ rec_offs_any_extern(
 	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
-	return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL));
+	return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL);
 }
 
-#ifdef UNIV_BLOB_NULL_DEBUG
 /******************************************************//**
 Determine if the offsets are for a record containing null BLOB pointers.
 @return	first field containing a null BLOB pointer, or NULL if none found */
@@ -1124,7 +1122,6 @@ rec_offs_any_null_extern(
 
 	return(NULL);
 }
-#endif /* UNIV_BLOB_NULL_DEBUG */
 
 /******************************************************//**
 Returns nonzero if the extern bit is set in nth field of rec.
@@ -1138,8 +1135,7 @@ rec_offs_nth_extern(
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
 	ut_ad(n < rec_offs_n_fields(offsets));
-	return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
-			     & REC_OFFS_EXTERNAL));
+	return(rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL);
 }
 
 /******************************************************//**
@@ -1154,8 +1150,7 @@ rec_offs_nth_sql_null(
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
 	ut_ad(n < rec_offs_n_fields(offsets));
-	return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
-			     & REC_OFFS_SQL_NULL));
+	return(rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL);
 }
 
 /******************************************************//**
@@ -1394,7 +1389,7 @@ rec_set_nth_field(
 	ut_ad(rec);
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
 
-	if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+	if (len == UNIV_SQL_NULL) {
 		if (!rec_offs_nth_sql_null(offsets, n)) {
 			ut_a(!rec_offs_comp(offsets));
 			rec_set_nth_field_sql_null(rec, n);
@@ -1513,7 +1508,7 @@ rec_get_end(
 	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
 {
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	return((rec_t*) rec + rec_offs_data_size(offsets));
+	return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
 }
 
 /**********************************************************//**
@@ -1527,7 +1522,7 @@ rec_get_start(
 	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
 {
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	return((rec_t*) rec - rec_offs_extra_size(offsets));
+	return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
 }
 #endif /* UNIV_DEBUG */
 
@@ -1546,7 +1541,7 @@ rec_copy(
 	ulint	data_len;
 
 	ut_ad(rec && buf);
-	ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
 	ut_ad(rec_validate(rec, offsets));
 
 	extra_len = rec_offs_extra_size(offsets);
@@ -1554,7 +1549,7 @@ rec_copy(
 
 	ut_memcpy(buf, rec - extra_len, extra_len + data_len);
 
-	return((byte*)buf + extra_len);
+	return((byte*) buf + extra_len);
 }
 
 /**********************************************************//**
@@ -1596,7 +1591,7 @@ rec_get_converted_size(
 	ut_ad(dtuple);
 	ut_ad(dtuple_check_typed(dtuple));
 
-	ut_ad(index->type & DICT_UNIVERSAL
+	ut_ad(dict_index_is_univ(index)
 	      || dtuple_get_n_fields(dtuple)
 	      == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
 		   == REC_STATUS_NODE_PTR)
@@ -1616,6 +1611,41 @@ rec_get_converted_size(
 	extra_size = rec_get_converted_extra_size(
 		data_size, dtuple_get_n_fields(dtuple), n_ext);
 
+#if 0
+	/* This code is inactive since it may be the wrong place to add
+	in the size of node pointers used in parent pages AND it is not
+	currently needed since ha_innobase::max_supported_key_length()
+	ensures that the key size limit for each page size is well below
+	the actual limit ((free space on page / 4) - record overhead).
+	But those limits will need to be raised when InnoDB can
+	support multiple page sizes.  At that time, we will need
+	to consider the node pointer on these universal btrees. */
+
+	if (dict_index_is_univ(index)) {
+		/* This is for the insert buffer B-tree.
+		All fields in the leaf tuple ascend to the
+		parent node plus the child page pointer. */
+
+		/* ibuf cannot contain externally stored fields */
+		ut_ad(n_ext == 0);
+
+		/* Add the data pointer and recompute extra_size
+		based on one more field. */
+		data_size += REC_NODE_PTR_SIZE;
+		extra_size = rec_get_converted_extra_size(
+			data_size,
+			dtuple_get_n_fields(dtuple) + 1,
+			0);
+
+		/* Be sure dtuple->n_fields has this node ptr
+		accounted for.  This function should correspond to
+		what rec_convert_dtuple_to_rec() needs in storage.
+		In optimistic insert or update-not-in-place, we will
+		have to ensure that if the record is converted to a
+		node pointer, it will not become too large.*/
+	}
+#endif
+
 	return(data_size + extra_size);
 }
 
diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h
index 248ce27eee3..f8133f77466 100644
--- a/storage/xtradb/include/rem0types.h
+++ b/storage/xtradb/include/rem0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -34,6 +34,15 @@ typedef byte	rec_t;
 #define REC_MAX_HEAP_NO		(2 * 8192 - 1)
 #define REC_MAX_N_OWNED		(16 - 1)
 
+/* Maximum number of user defined fields/columns. The reserved columns
+are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
+We need "* 2" because mlog_parse_index() creates a dummy table object
+possibly, with some of the system columns in it, and then adds the 3
+system columns (again) using dict_table_add_system_columns(). The problem
+is that mlog_parse_index() cannot recognize the system columns by
+just having n_fields, n_uniq and the lengths of the columns. */
+#define REC_MAX_N_USER_FIELDS	(REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
+
 /* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
 indexed field length (or indexed prefix length) for indexes on tables of
 ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format.
@@ -45,10 +54,21 @@ This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
 files would be at risk! */
 #define REC_ANTELOPE_MAX_INDEX_COL_LEN		768
 
-/** Maximum indexed field length for table format DICT_TF_FORMAT_ZIP and
+/** Maximum indexed field length for table format UNIV_FORMAT_B and
 beyond.
 This (3072) is the maximum index row length allowed, so we cannot create index
 prefix column longer than that. */
 #define REC_VERSION_56_MAX_INDEX_COL_LEN	3072
 
+/** Innodb row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+	REC_FORMAT_REDUNDANT	= 0,	/*!< REDUNDANT row format */
+	REC_FORMAT_COMPACT	= 1,	/*!< COMPACT row format */
+	REC_FORMAT_COMPRESSED	= 2,	/*!< COMPRESSED row format */
+	REC_FORMAT_DYNAMIC	= 3	/*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
 #endif
diff --git a/storage/xtradb/include/row0ext.h b/storage/xtradb/include/row0ext.h
index 71c7b6ecce4..a098e2f9b29 100644
--- a/storage/xtradb/include/row0ext.h
+++ b/storage/xtradb/include/row0ext.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -84,7 +84,7 @@ row_ext_lookup(
 					DICT_MAX_FIELD_LEN_BY_FORMAT() */
 
 /** Prefixes of externally stored columns */
-struct row_ext_struct{
+struct row_ext_t{
 	ulint		n_ext;	/*!< number of externally stored columns */
 	const ulint*	ext;	/*!< col_no's of externally stored columns */
 	byte*		buf;	/*!< backing store of the column prefix cache */
diff --git a/storage/xtradb/include/row0ext.ic b/storage/xtradb/include/row0ext.ic
index 56e71d9a968..39e150d91d5 100644
--- a/storage/xtradb/include/row0ext.ic
+++ b/storage/xtradb/include/row0ext.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -48,7 +48,7 @@ row_ext_lookup_ith(
 	ut_ad(*len <= ext->max_len);
 	ut_ad(ext->max_len > 0);
 
-	if (UNIV_UNLIKELY(*len == 0)) {
+	if (*len == 0) {
 		/* The BLOB could not be fetched to the cache. */
 		return(field_ref_zero);
 	} else {
diff --git a/storage/xtradb/include/row0ftsort.h b/storage/xtradb/include/row0ftsort.h
new file mode 100644
index 00000000000..4a486450efc
--- /dev/null
+++ b/storage/xtradb/include/row0ftsort.h
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "row0mysql.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+#include "row0merge.h"
+
+/** This structure defines the information that the scan thread fetches
+and puts on the linked list for the parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item     fts_doc_item_t;
+
+/** One document (its text field and Doc ID), linked into a doc item list */
+struct fts_doc_item {
+	dfield_t*	field;		/*!< field contains document string */
+	doc_id_t	doc_id;		/*!< document ID */
+	UT_LIST_NODE_T(fts_doc_item_t)	doc_list;
+					/*!< list of doc items */
+};
+
+/** This defines the list type that scan thread would feed the parallel
+tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t)     fts_doc_list_t;
+
+#define FTS_NUM_AUX_INDEX	6
+#define FTS_PLL_MERGE		1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+	row_merge_dup_t*	dup;		/*!< descriptor of FTS index */
+	dict_table_t*		new_table;	/*!< source table */
+	trx_t*			trx;		/*!< transaction */
+	fts_psort_t*		all_info;	/*!< all parallel sort info */
+	os_event_t		sort_event;	/*!< sort event */
+	os_event_t		merge_event;	/*!< merge event */
+	ibool			opt_doc_id_size;/*!< whether to use 4 bytes
+						instead of 8 bytes integer to
+						store Doc ID during sort, if
+						Doc ID will not be big enough
+						to use 8 bytes value */
+};
+
+struct fts_psort_t {
+	ulint			psort_id;	/*!< Parallel sort ID */
+	row_merge_buf_t*	merge_buf[FTS_NUM_AUX_INDEX];
+						/*!< sort buffer */
+	merge_file_t*		merge_file[FTS_NUM_AUX_INDEX];
+						/*!< sort file */
+	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
+						/*!< buffer to write to file */
+	row_merge_block_t*	block_alloc[FTS_NUM_AUX_INDEX];
+						/*!< the allocated buffer */
+	ulint			child_status;	/*!< child thread status */
+	ulint			state;		/*!< child thread state */
+	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
+	fts_psort_common_t*	psort_common;	/*!< ptr to all psort info */
+	os_thread_t		thread_hdl;	/*!< thread handler */
+};
+
+/** Structure stores information from string tokenization operation */
+struct fts_tokenize_ctx {
+	ulint			processed_len;  /*!< processed string length */
+	ulint			init_pos;       /*!< doc start position */
+	ulint			buf_used;       /*!< the sort buffer (ID) when
+						tokenization stops, which could
+						be due to sort buffer full */
+	ulint			rows_added[FTS_NUM_AUX_INDEX];
+						/*!< number of rows added for
+						each FTS index partition */
+	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
+	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
+						/*!< in: sort field */
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+	trx_t*		trx;		/*!< Transaction used for insertion */
+	que_t**		ins_graph;	/*!< insert graph */
+	fts_table_t	fts_table;	/*!< auxiliary table */
+	CHARSET_INFO*	charset;	/*!< charset info */
+	mem_heap_t*	heap;		/*!< heap */
+	ibool		opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
+					integer for Doc ID */
+};
+
+typedef struct fts_psort_insert	fts_psort_insert_t;
+
+
+/** status bit used for communication between parent and child thread */
+#define FTS_PARENT_COMPLETE	1
+#define FTS_CHILD_COMPLETE	1
+#define FTS_CHILD_EXITING	2
+
+/** Print some debug information */
+#define	FTSORT_PRINT
+
+#ifdef	FTSORT_PRINT
+#define	DEBUG_FTS_SORT_PRINT(str)		\
+	do {					\
+		ut_print_timestamp(stderr);	\
+		fprintf(stderr, str);		\
+	} while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif	/* FTSORT_PRINT */
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID
+3) Word's position in original 'doc'.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*		index,	/*!< in: Original FTS index
+					based on which this sort index
+					is created */
+	const dict_table_t*	table,	/*!< in: table that FTS index
+					is being created on */
+	ibool*			opt_doc_id_size);
+					/*!< out: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+
+/********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	row_merge_dup_t*	dup,	/*!< in,own: descriptor of
+					FTS index being created */
+	const dict_table_t*	new_table,/*!< in: table where indexes are
+					created */
+	ibool			opt_doc_id_size,
+					/*!< in: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+	fts_psort_t**		psort,	/*!< out: parallel sort info to be
+					instantiated */
+	fts_psort_t**		merge)	/*!< out: parallel merge info
+					to be instantiated */
+	__attribute__((nonnull));
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close
+temporary merge sort files */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info);	/*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+	void*		arg);		/*!< in: psort_info for the thread */
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+	void*		arg);		/*!< in: parallel merge info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info);	/*!< in: parallel sort info */
+/********************************************************************//**
+Read sorted FTS data files and insert data tuples to auxiliary tables.
+This function returns no value. */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+	fts_psort_insert_t*
+			ins_ctx,        /*!< in: insert context */
+	fts_tokenizer_word_t* word,	/*!< in: last processed
+					tokenized word */
+	ib_vector_t*	positions,	/*!< in: word position */
+	doc_id_t*	in_doc_id,	/*!< in: last item doc id */
+	dtuple_t*	dtuple);	/*!< in: entry to insert */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+UNIV_INTERN
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+	int		propogated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	 mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index);		/*!< in: FTS index */
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*	index,		/*!< in: index */
+	dict_table_t*	table,		/*!< in: new table */
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	ulint		id)		/*!< in: which auxiliary table's data
+					to insert to */
+	__attribute__((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/xtradb/include/row0import.h b/storage/xtradb/include/row0import.h
new file mode 100644
index 00000000000..aa46fdb7c27
--- /dev/null
+++ b/storage/xtradb/include/row0import.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.h
+Header file for import tablespace functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0import_h
+#define row0import_h
+
+#include "univ.i"
+#include "db0err.h"
+#include "dict0types.h"
+
+// Forward declarations
+struct trx_t;
+struct dict_table_t;
+struct row_prebuilt_t;
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_import_for_mysql(
+/*=================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct
+						in MySQL */
+	__attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Update the DICT_TF2_DISCARDED flag in SYS_TABLES.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_import_update_discarded_flag(
+/*=============================*/
+	trx_t*		trx,			/*!< in/out: transaction that
+						covers the update */
+	table_id_t	table_id,		/*!< in: Table for which we want
+						to set the root table->flags2 */
+	bool		discarded,		/*!< in: set MIX_LEN column bit
+						to discarded, if true */
+	bool		dict_locked)		/*!< in: Set to true if the
+						caller already owns the
+						dict_sys_t:: mutex. */
+	__attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Update the (space, root page) of a table's indexes from the values
+in the data dictionary.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_import_update_index_root(
+/*=========================*/
+	trx_t*			trx,		/*!< in/out: transaction that
+						covers the update */
+	const dict_table_t*	table,		/*!< in: Table for which we want
+						to set the root page_no */
+	bool			reset,		/*!< in: if true then set to
+						FIL_NULL */
+	bool			dict_locked)	/*!< in: Set to true if the
+						caller already owns the
+						dict_sys_t:: mutex. */
+	__attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_NONINL
+#include "row0import.ic"
+#endif
+
+#endif /* row0import_h */
diff --git a/storage/xtradb/include/row0import.ic b/storage/xtradb/include/row0import.ic
new file mode 100644
index 00000000000..c5bbab49f6f
--- /dev/null
+++ b/storage/xtradb/include/row0import.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.ic
+
+Import tablespace inline functions.
+
+Created 2012-02-08 Sunny Bains
+*******************************************************/
diff --git a/storage/xtradb/include/row0ins.h b/storage/xtradb/include/row0ins.h
index 1da3ef48a81..2a892d2f5df 100644
--- a/storage/xtradb/include/row0ins.h
+++ b/storage/xtradb/include/row0ins.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,7 +40,7 @@ the caller must have a shared latch on dict_foreign_key_check_lock.
 @return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
 DB_ROW_IS_REFERENCED */
 UNIV_INTERN
-ulint
+dberr_t
 row_ins_check_foreign_constraint(
 /*=============================*/
 	ibool		check_ref,/*!< in: TRUE If we want to check that
@@ -52,7 +52,8 @@ row_ins_check_foreign_constraint(
 	dict_table_t*	table,	/*!< in: if check_ref is TRUE, then the foreign
 				table, else the referenced table */
 	dtuple_t*	entry,	/*!< in: index entry for index */
-	que_thr_t*	thr);	/*!< in: query thread */
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Creates an insert node struct.
 @return	own: insert node struct */
@@ -74,21 +75,110 @@ ins_node_set_new_row(
 	ins_node_t*	node,	/*!< in: insert node */
 	dtuple_t*	row);	/*!< in: new row (or first row) for the node */
 /***************************************************************//**
-Inserts an index entry to index. Tries first optimistic, then pessimistic
-descent down the tree. If the entry matches enough to a delete marked record,
-performs the insert by updating or delete unmarking the delete marked
-record.
-@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete marked record is then
+updated to an existing record, and we must write an undo log record on
+the delete marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
 UNIV_INTERN
-ulint
-row_ins_index_entry(
-/*================*/
-	dict_index_t*	index,	/*!< in: index */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		n_uniq,	/*!< in: 0 or index->n_uniq */
 	dtuple_t*	entry,	/*!< in/out: index entry to insert */
 	ulint		n_ext,	/*!< in: number of externally stored columns */
-	ibool		foreign,/*!< in: TRUE=check foreign key constraints
-				(foreign=FALSE only during CREATE INDEX) */
-	que_thr_t*	thr);	/*!< in: query thread */
+	que_thr_t*	thr)	/*!< in: query thread or NULL -- TODO confirm: nonnull below forbids NULL */
+	__attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: secondary index */
+	mem_heap_t*	offsets_heap,
+				/*!< in/out: memory heap that can be emptied */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	trx_id_t	trx_id,	/*!< in: PAGE_MAX_TRX_ID during
+				row_log_table_apply(), or 0 */
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Tries to insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+row_ins_index_entry_big_rec_func(
+/*=============================*/
+	const dtuple_t*		entry,	/*!< in: index entry to insert */
+	const big_rec_t*	big_rec,/*!< in: externally stored fields */
+	ulint*			offsets,/*!< in/out: rec offsets */
+	mem_heap_t**		heap,	/*!< in/out: memory heap */
+	dict_index_t*		index,	/*!< in: index */
+	const char*		file,	/*!< in: file name of caller */
+#ifndef DBUG_OFF
+	const void*		thd,	/*!< in: connection, or NULL */
+#endif /* DBUG_OFF */
+	ulint			line)	/*!< in: line number of caller */
+	__attribute__((nonnull(1,2,3,4,5,6), warn_unused_result));
+#ifdef DBUG_OFF
+# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \
+	row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,line)
+#else /* DBUG_OFF */
+# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \
+	row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,thd,line)
+#endif /* DBUG_OFF */
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	__attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+	dict_index_t*	index,	/*!< in: secondary index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************//**
 Inserts a row to a table. This is a high-level function used in
 SQL execution graphs.
@@ -98,17 +188,10 @@ que_thr_t*
 row_ins_step(
 /*=========*/
 	que_thr_t*	thr);	/*!< in: query thread */
-/***********************************************************//**
-Creates an entry template for each index of a table. */
-UNIV_INTERN
-void
-ins_node_create_entry_list(
-/*=======================*/
-	ins_node_t*	node);	/*!< in: row insert node */
 
 /* Insert node structure */
 
-struct ins_node_struct{
+struct ins_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_INSERT */
 	ulint		ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
 	dtuple_t*	row;	/*!< row to insert */
diff --git a/storage/xtradb/include/row0ins.ic b/storage/xtradb/include/row0ins.ic
index 6e96e9fd675..9c191d869a2 100644
--- a/storage/xtradb/include/row0ins.ic
+++ b/storage/xtradb/include/row0ins.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0log.h b/storage/xtradb/include/row0log.h
new file mode 100644
index 00000000000..41dac63963d
--- /dev/null
+++ b/storage/xtradb/include/row0log.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#ifndef row0log_h
+#define row0log_h
+
+#include "univ.i"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@retval true if success, false if not */
+UNIV_INTERN
+bool
+row_log_allocate(
+/*=============*/
+	dict_index_t*	index,	/*!< in/out: index */
+	dict_table_t*	table,	/*!< in/out: new table being rebuilt,
+				or NULL when creating a secondary index */
+	bool		same_pk,/*!< in: whether the definition of the
+				PRIMARY KEY has remained the same */
+	const dtuple_t*	add_cols,
+				/*!< in: default values of
+				added columns, or NULL */
+	const ulint*	col_map)/*!< in: mapping of old column
+				numbers to new ones, or NULL if !table */
+	__attribute__((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+UNIV_INTERN
+void
+row_log_free(
+/*=========*/
+	row_log_t*&	log)	/*!< in,own: row log */
+	__attribute__((nonnull));
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*==============*/
+	dict_index_t*	index)	/*!< in/out: index (x-latched) */
+	__attribute__((nonnull));
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval	true if the operation was logged or can be ignored
+@retval	false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t* tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+UNIV_INTERN
+void
+row_log_online_op(
+/*==============*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t*	tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+	UNIV_COLD __attribute__((nonnull));
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+UNIV_INTERN
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	bool		purge,	/*!< in: true=purging BLOBs */
+	trx_id_t	trx_id)	/*!< in: DB_TRX_ID of the record before
+				it was deleted */
+	UNIV_COLD __attribute__((nonnull));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+UNIV_INTERN
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk)	/*!< in: row_log_table_get_pk()
+				before the update */
+	UNIV_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+UNIV_INTERN
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index),
+				or NULL */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated */
+	UNIV_COLD __attribute__((nonnull(1,2,4), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+UNIV_INTERN
+void
+row_log_table_insert(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec,index) */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_free(
+/*====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_alloc(
+/*=====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Apply the row_log_table log to a table upon completing rebuild.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_table_apply(
+/*================*/
+	que_thr_t*	thr,	/*!< in: query graph */
+	dict_table_t*	old_table,
+				/*!< in: old table */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+UNIV_INTERN
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, must be locked */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Merge the row log to the index upon completing index creation.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_apply(
+/*==========*/
+	trx_t*		trx,	/*!< in: transaction (for checking if
+				the operation was interrupted) */
+	dict_index_t*	index,	/*!< in/out: secondary index */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+	__attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "row0log.ic"
+#endif
+
+#endif /* row0log.h */
diff --git a/storage/xtradb/include/row0log.ic b/storage/xtradb/include/row0log.ic
new file mode 100644
index 00000000000..b0f37dbd8e7
--- /dev/null
+++ b/storage/xtradb/include/row0log.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.ic
+Modification log for online index creation and online table rebuild
+
+Created 2012-10-18 Marko Makela
+*******************************************************/
+
+#include "dict0dict.h"
+
+/******************************************************//**
+Flag online creation of a secondary index as aborted and free its row log. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*===============*/
+	dict_index_t*	index)	/*!< in/out: secondary index (x-latched) */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(!dict_index_is_clust(index));
+	dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	row_log_free(index->online_log);
+}
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval	true if the operation was logged, or can be ignored (creation aborted)
+@retval	false if not being created online (ONLINE_INDEX_COMPLETE) */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t* tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+	      || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	switch (dict_index_get_online_status(index)) {
+	case ONLINE_INDEX_COMPLETE:
+		/* This is a normal index. Do not log anything.
+		The caller must perform the operation on the
+		index tree directly. */
+		return(false);
+	case ONLINE_INDEX_CREATION:
+		/* The index is being created online. Log the
+		operation. */
+		row_log_online_op(index, tuple, trx_id);
+		break;
+	case ONLINE_INDEX_ABORTED:
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		/* Online creation of this index was aborted.
+		Do not log the operation; the true return value
+		tells the caller to skip the operation. */
+		break;
+	}
+
+	return(true);
+}
diff --git a/storage/xtradb/include/row0merge.h b/storage/xtradb/include/row0merge.h
index 22786fd7e49..390c0ce038b 100644
--- a/storage/xtradb/include/row0merge.h
+++ b/storage/xtradb/include/row0merge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,111 +38,210 @@ Created 13/06/2005 Jan Lindstrom
 #include "btr0types.h"
 #include "row0mysql.h"
 #include "lock0types.h"
+#include "srv0srv.h"
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2. */
+typedef byte	row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t.  Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+	dfield_t*	fields;		/*!< data fields */
+};
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_t {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	mtuple_t*	tuples;		/*!< array of data tuples */
+	mtuple_t*	tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+	int		fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
 
 /** Index field definition */
-struct merge_index_field_struct {
+struct index_field_t {
+	ulint		col_no;		/*!< column offset */
 	ulint		prefix_len;	/*!< column prefix length, or 0
 					if indexing the whole column */
-	const char*	field_name;	/*!< field name */
 };
 
-/** Index field definition */
-typedef struct merge_index_field_struct merge_index_field_t;
-
 /** Definition of an index being created */
-struct merge_index_def_struct {
-	const char*		name;		/*!< index name */
-	ulint			ind_type;	/*!< 0, DICT_UNIQUE,
-						or DICT_CLUSTERED */
-	ulint			n_fields;	/*!< number of fields
-						in index */
-	merge_index_field_t*	fields;		/*!< field definitions */
+struct index_def_t {
+	const char*	name;		/*!< index name */
+	ulint		ind_type;	/*!< 0, DICT_UNIQUE,
+					or DICT_CLUSTERED */
+	ulint		key_number;	/*!< MySQL key number,
+					or ULINT_UNDEFINED if none */
+	ulint		n_fields;	/*!< number of fields in index */
+	index_field_t*	fields;		/*!< field definitions */
 };
 
-/** Definition of an index being created */
-typedef struct merge_index_def_struct merge_index_def_t;
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+	dict_index_t*		index;	/*!< index being sorted */
+	struct TABLE*		table;	/*!< MySQL table object */
+	const ulint*		col_map;/*!< mapping of column numbers
+					in table to the rebuilt table
+					(index->table), or NULL if not
+					rebuilding table */
+	ulint			n_dup;	/*!< number of duplicates */
+};
 
+/*************************************************************//**
+Report a duplicate key. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Sets an exclusive lock on a table, for the duration of creating indexes.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_merge_lock_table(
 /*=================*/
 	trx_t*		trx,		/*!< in/out: transaction */
 	dict_table_t*	table,		/*!< in: table to lock */
-	enum lock_mode	mode);		/*!< in: LOCK_X or LOCK_S */
+	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
-Drop an index from the InnoDB system tables.  The data dictionary must
-have been locked exclusively by the caller, because the transaction
-will not be committed. */
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
 UNIV_INTERN
 void
-row_merge_drop_index(
-/*=================*/
-	dict_index_t*	index,	/*!< in: index to be removed */
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx);	/*!< in: transaction handle */
+row_merge_drop_indexes_dict(
+/*========================*/
+	trx_t*		trx,	/*!< in/out: dictionary transaction */
+	table_id_t	table_id)/*!< in: table identifier */
+	__attribute__((nonnull));
 /*********************************************************************//**
-Drop those indexes which were created before an error occurred when
-building an index.  The data dictionary must have been locked
-exclusively by the caller, because the transaction will not be
-committed. */
+Drop those indexes which were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
 UNIV_INTERN
 void
 row_merge_drop_indexes(
 /*===================*/
-	trx_t*		trx,		/*!< in: transaction */
-	dict_table_t*	table,		/*!< in: table containing the indexes */
-	dict_index_t**	index,		/*!< in: indexes to drop */
-	ulint		num_created);	/*!< in: number of elements in index[] */
+	trx_t*		trx,	/*!< in/out: transaction */
+	dict_table_t*	table,	/*!< in/out: table containing the indexes */
+	ibool		locked)	/*!< in: TRUE=table locked,
+				FALSE=may need to do a lazy drop */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Drop all partially created indexes during crash recovery. */
 UNIV_INTERN
 void
 row_merge_drop_temp_indexes(void);
 /*=============================*/
+
+/*********************************************************************//**
+Creates a temporary merge file and, if UNIV_PFS_IO is defined, registers
+the file descriptor with Performance Schema.
+@return File descriptor */
+UNIV_INTERN
+int
+row_merge_file_create_low(void)
+/*===========================*/
+	__attribute__((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file, and de-register the file from Performance Schema
+if UNIV_PFS_IO is defined. */
+UNIV_INTERN
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	int		fd);	/*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Provide a new pathname for a table that is being renamed if it belongs to
+a file-per-table tablespace.  The caller is responsible for freeing the
+memory allocated for the return value.
+@return	new pathname of tablespace file, or NULL if space = 0 */
+UNIV_INTERN
+char*
+row_make_new_pathname(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table to be renamed */
+	const char*	new_name);	/*!< in: new name */
 /*********************************************************************//**
 Rename the tables in the data dictionary.  The data dictionary must
 have been locked exclusively by the caller, because the transaction
 will not be committed.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
-row_merge_rename_tables(
-/*====================*/
+dberr_t
+row_merge_rename_tables_dict(
+/*=========================*/
 	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
 					tmp_name */
 	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
 					old_table->name */
 	const char*	tmp_name,	/*!< in: new name for old_table */
-	trx_t*		trx);		/*!< in: transaction handle */
+	trx_t*		trx)		/*!< in/out: dictionary transaction */
+	__attribute__((nonnull, warn_unused_result));
 
 /*********************************************************************//**
-Create a temporary table for creating a primary key, using the definition
-of an existing table.
-@return	table, or NULL on error */
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return	DB_SUCCESS if all OK */
 UNIV_INTERN
-dict_table_t*
-row_merge_create_temporary_table(
-/*=============================*/
-	const char*		table_name,	/*!< in: new table name */
-	const merge_index_def_t*index_def,	/*!< in: the index definition
-						of the primary key */
-	const dict_table_t*	table,		/*!< in: old table definition */
-	trx_t*			trx);		/*!< in/out: transaction
-						(sets error_state) */
-/*********************************************************************//**
-Rename the temporary indexes in the dictionary to permanent ones.  The
-data dictionary must have been locked exclusively by the caller,
-because the transaction will not be committed.
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
 @return	DB_SUCCESS if all OK */
 UNIV_INTERN
-ulint
-row_merge_rename_indexes(
-/*=====================*/
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
 	trx_t*		trx,		/*!< in/out: transaction */
-	dict_table_t*	table);		/*!< in/out: table with new indexes */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Create the index and load in to the dictionary.
 @return	index, or NULL on error */
@@ -152,7 +251,7 @@ row_merge_create_index(
 /*===================*/
 	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
 	dict_table_t*		table,	/*!< in: the index is on this table */
-	const merge_index_def_t*index_def);
+	const index_def_t*	index_def);
 					/*!< in: the index definition */
 /*********************************************************************//**
 Check if a transaction can use an index.
@@ -164,23 +263,25 @@ row_merge_is_index_usable(
 	const trx_t*		trx,	/*!< in: transaction */
 	const dict_index_t*	index);	/*!< in: index to check */
 /*********************************************************************//**
-If there are views that refer to the old table name then we "attach" to
-the new instance of the table else we drop it immediately.
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 row_merge_drop_table(
 /*=================*/
 	trx_t*		trx,		/*!< in: transaction */
-	dict_table_t*	table);		/*!< in: table instance to drop */
-
+	dict_table_t*	table)		/*!< in: table instance to drop */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Build indexes on a table by reading a clustered index,
 creating a temporary file containing index entries, merge sorting
 these index entries and inserting sorted index entries to indexes.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 row_merge_build_indexes(
 /*====================*/
 	trx_t*		trx,		/*!< in: transaction */
@@ -189,9 +290,141 @@ row_merge_build_indexes(
 	dict_table_t*	new_table,	/*!< in: table where indexes are
 					created; identical to old_table
 					unless creating a PRIMARY KEY */
+	bool		online,		/*!< in: true if creating indexes
+					online */
 	dict_index_t**	indexes,	/*!< in: indexes to be created */
+	const ulint*	key_numbers,	/*!< in: MySQL key numbers */
 	ulint		n_indexes,	/*!< in: size of indexes[] */
-	struct TABLE*	table);		/*!< in/out: MySQL table, for
+	struct TABLE*	table,		/*!< in/out: MySQL table, for
 					reporting erroneous key value
 					if applicable */
+	const dtuple_t*	add_cols,	/*!< in: default values of
+					added columns, or NULL */
+	const ulint*	col_map,	/*!< in: mapping of old column
+					numbers to new ones, or NULL
+					if old_table == new_table */
+	ulint		add_autoinc,	/*!< in: number of added
+					AUTO_INCREMENT column, or
+					ULINT_UNDEFINED if none is added */
+	ib_sequence_t&	sequence)	/*!< in/out: autoinc sequence */
+	__attribute__((nonnull(1,2,3,5,6,8), warn_unused_result));
+/********************************************************************//**
+Write a buffer to a block. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+	const merge_file_t*	of,	/*!< in: output file */
+	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
+	__attribute__((nonnull));
+/********************************************************************//**
+Sort a buffer. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	__attribute__((nonnull(1)));
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
+	const void*	buf);	/*!< in: data */
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+	__attribute__((warn_unused_result, nonnull));
+/*********************************************************************//**
+Create a merge file.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create(
+/*==================*/
+	merge_file_t*	merge_file)	/*!< out: merge file structure */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,	/*!< in: transaction */
+	const row_merge_dup_t*	dup,	/*!< in: descriptor of
+					index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd)	/*!< in/out: temporary file handle */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+	__attribute__((warn_unused_result, nonnull, malloc));
+/*********************************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Destroy a merge file. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+	__attribute__((nonnull));
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf);	/*!< out: data */
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	ulint*			offsets)/*!< out: offsets of mrec */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* row0merge.h */
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
index 35378bf3302..cd37a2f69bb 100644
--- a/storage/xtradb/include/row0mysql.h
+++ b/storage/xtradb/include/row0mysql.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,9 +36,12 @@ Created 9/17/2000 Heikki Tuuri
 #include "btr0pcur.h"
 #include "trx0types.h"
 
+// Forward declaration
+struct SysIndexCallback;
+
 extern ibool row_rollback_on_timeout;
 
-typedef struct row_prebuilt_struct row_prebuilt_t;
+struct row_prebuilt_t;
 
 /*******************************************************************//**
 Frees the blob heap in prebuilt when no longer needed. */
@@ -116,7 +119,7 @@ row_mysql_pad_col(
 /**************************************************************//**
 Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
 The counterpart of this function is row_sel_field_store_in_mysql_format() in
-row0sel.c.
+row0sel.cc.
 @return	up to which byte we used buf in the conversion */
 UNIV_INTERN
 byte*
@@ -127,7 +130,10 @@ row_mysql_store_col_in_innobase_format(
 					this function is called! */
 	byte*		buf,		/*!< in/out: buffer for a converted
 					integer value; this must be at least
-					col_len long then! */
+					col_len long then! NOTE that dfield
+					may also get a pointer to 'buf',
+					therefore do not discard this as long
+					as dfield is used! */
 	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
 					a MySQL row, FALSE if from a MySQL
 					key value;
@@ -149,18 +155,19 @@ row_mysql_store_col_in_innobase_format(
 	ulint		comp);		/*!< in: nonzero=compact format */
 /****************************************************************//**
 Handles user errors and lock waits detected by the database engine.
-@return TRUE if it was a lock wait and we should continue running the
+@return true if it was a lock wait and we should continue running the
 query thread */
 UNIV_INTERN
-ibool
+bool
 row_mysql_handle_errors(
 /*====================*/
-	ulint*		new_err,/*!< out: possible new error encountered in
+	dberr_t*	new_err,/*!< out: possible new error encountered in
 				rollback, or the old error which was
 				during the function entry */
 	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t*	thr,	/*!< in: query thread */
-	trx_savept_t*	savept);/*!< in: savepoint */
+	que_thr_t*	thr,	/*!< in: query thread, or NULL */
+	trx_savept_t*	savept)	/*!< in: savepoint, or NULL */
+	__attribute__((nonnull(1,2)));
 /********************************************************************//**
 Create a prebuilt struct for a MySQL table handle.
 @return	own: a prebuilt struct */
@@ -190,15 +197,6 @@ row_update_prebuilt_trx(
 					in MySQL handle */
 	trx_t*		trx);		/*!< in: transaction handle */
 /*********************************************************************//**
-Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
-function should be called at the the end of an SQL statement, by the
-connection thread that owns the transaction (trx->mysql_thd). */
-UNIV_INTERN
-void
-row_unlock_table_autoinc_for_mysql(
-/*===============================*/
-	trx_t*	trx);			/*!< in/out: transaction */
-/*********************************************************************//**
 Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
 AUTO_INC lock gives exclusive access to the auto-inc counter of the
 table. The lock is reserved only for the duration of an SQL statement.
@@ -206,16 +204,17 @@ It is not compatible with another AUTO_INC or exclusive lock on the
 table.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_lock_table_autoinc_for_mysql(
 /*=============================*/
-	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in the MySQL
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in the MySQL
 					table handle */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Sets a table lock on the table mentioned in prebuilt.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_lock_table_for_mysql(
 /*=====================*/
 	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct in the MySQL
@@ -224,19 +223,20 @@ row_lock_table_for_mysql(
 					if prebuilt->table should be
 					locked as
 					prebuilt->select_lock_type */
-	ulint		mode);		/*!< in: lock mode of table
+	ulint		mode)		/*!< in: lock mode of table
 					(ignored if table==NULL) */
-
+	__attribute__((nonnull(1)));
 /*********************************************************************//**
 Does an insert for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_insert_for_mysql(
 /*=================*/
 	byte*		mysql_rec,	/*!< in: row in the MySQL format */
-	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
 					handle */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Builds a dummy query graph used in selects. */
 UNIV_INTERN
@@ -269,13 +269,14 @@ row_table_got_default_clust_index(
 Does an update or delete of a row for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_update_for_mysql(
 /*=================*/
 	byte*		mysql_rec,	/*!< in: the row to be updated, in
 					the MySQL format */
-	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
 					handle */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
 session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
@@ -284,19 +285,31 @@ initialized prebuilt->new_rec_locks to store the information which new
 record locks really were set. This function removes a newly set
 clustered index record lock under prebuilt->pcur or
 prebuilt->clust_pcur.  Thus, this implements a 'mini-rollback' that
-releases the latest clustered index record lock we set.
-@return error code or DB_SUCCESS */
+releases the latest clustered index record lock we set. */
 UNIV_INTERN
-int
+void
 row_unlock_for_mysql(
 /*=================*/
 	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct in MySQL
 					handle */
-	ibool		has_latches_on_recs);/*!< in: TRUE if called
+	ibool		has_latches_on_recs)/*!< in: TRUE if called
 					so that we have the latches on
 					the records under pcur and
 					clust_pcur, and we do not need
 					to reposition the cursors. */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return true if temporary table */
+UNIV_INTERN
+bool
+row_is_mysql_tmp_table_name(
+/*========================*/
+	const char*	name) __attribute__((warn_unused_result));
+				/*!< in: table name in the form
+				'database/tablename' */
+
 /*********************************************************************//**
 Creates an query graph node of 'update' type to be used in the MySQL
 interface.
@@ -311,13 +324,14 @@ row_create_update_node_for_mysql(
 Does a cascaded delete or set null in a foreign key operation.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_update_cascade_for_mysql(
 /*=========================*/
 	que_thr_t*	thr,	/*!< in: query thread */
 	upd_node_t*	node,	/*!< in: update node used in the cascade
 				or set null operation */
-	dict_table_t*	table);	/*!< in: table where we do the operation */
+	dict_table_t*	table)	/*!< in: table where we do the operation */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Locks the data dictionary exclusively for performing a table create or other
 data dictionary modification operation. */
@@ -361,49 +375,38 @@ Creates a table for MySQL. If the name of the table ends in
 one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
 "innodb_table_monitor", then this will also start the printing of monitor
 output by the master thread. If the table name ends in "innodb_mem_validate",
-InnoDB will try to invoke mem_validate().
+InnoDB will try to invoke mem_validate(). On failure the transaction will
+be rolled back.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_create_table_for_mysql(
 /*=======================*/
-	dict_table_t*	table,		/*!< in, own: table definition
-					(will be freed) */
-	trx_t*		trx);		/*!< in: transaction handle */
+	dict_table_t*	table,	/*!< in, own: table definition
+				(will be freed, or on DB_SUCCESS
+				added to the data dictionary cache) */
+	trx_t*		trx,	/*!< in/out: transaction */
+	bool		commit)	/*!< in: if true, commit the transaction */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Does an index creation operation for MySQL. TODO: currently failure
 to create an index results in dropping the whole table! This is no problem
 currently as all indexes must be created at the same time as the table.
 @return	error number or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_create_index_for_mysql(
 /*=======================*/
 	dict_index_t*	index,		/*!< in, own: index definition
 					(will be freed) */
 	trx_t*		trx,		/*!< in: transaction handle */
-	const ulint*	field_lengths); /*!< in: if not NULL, must contain
+	const ulint*	field_lengths)	/*!< in: if not NULL, must contain
 					dict_index_get_n_fields(index)
 					actual field lengths for the
 					index columns, which are
 					then checked for not being too
 					large. */
-/*********************************************************************//**
-*/
-UNIV_INTERN
-int
-row_insert_stats_for_mysql(
-/*=======================*/
-	dict_index_t*	index,
-	trx_t*		trx);
-/*********************************************************************//**
-*/
-UNIV_INTERN
-int
-row_delete_stats_for_mysql(
-/*=======================*/
-	dict_index_t*	index,
-	trx_t*		trx);
+	__attribute__((nonnull(1,2), warn_unused_result));
 /*********************************************************************//**
 Scans a table create SQL string and adds to the data dictionary
 the foreign key constraints declared in the string. This function
@@ -413,7 +416,7 @@ bot participating tables. The indexes are allowed to contain more
 fields than mentioned in the constraint.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_table_add_foreign_constraints(
 /*==============================*/
 	trx_t*		trx,		/*!< in: transaction */
@@ -426,12 +429,12 @@ row_table_add_foreign_constraints(
 	const char*	name,		/*!< in: table full name in the
 					normalized form
 					database_name/table_name */
-	ibool		reject_fks);	/*!< in: if TRUE, fail with error
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
 					code DB_CANNOT_ADD_CONSTRAINT if
 					any foreign keys are found. */
-
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
-The master thread in srv0srv.c calls this regularly to drop tables which
+The master thread in srv0srv.cc calls this regularly to drop tables which
 we must drop in background after queries to them have ended. Such lazy
 dropping of tables is needed in ALTER TABLE on Unix.
 @return	how many tables dropped + remaining tables in list */
@@ -448,14 +451,28 @@ ulint
 row_get_background_drop_list_len_low(void);
 /*======================================*/
 /*********************************************************************//**
+Sets a table lock (LOCK_X or LOCK_S, as given by mode) on a table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode,		/*!< in: LOCK_X or LOCK_S */
+	const char*	op_info)	/*!< in: string for trx->op_info */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
 Truncates a table for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_truncate_table_for_mysql(
 /*=========================*/
 	dict_table_t*	table,	/*!< in: table handle */
-	trx_t*		trx);	/*!< in: transaction handle */
+	trx_t*		trx)	/*!< in: transaction handle */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Drops a table for MySQL.  If the name of the dropped table ends in
 one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
@@ -465,12 +482,16 @@ by the transaction, the transaction will be committed.  Otherwise, the
 data dictionary will remain locked.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_drop_table_for_mysql(
 /*=====================*/
 	const char*	name,	/*!< in: table name */
-	trx_t*		trx,	/*!< in: transaction handle */
-	ibool		drop_db);/*!< in: TRUE=dropping whole database */
+	trx_t*		trx,	/*!< in: dictionary transaction handle */
+	bool		drop_db,/*!< in: true=dropping whole database */
+	bool		nonatomic = true)
+				/*!< in: whether it is permitted
+				to release and reacquire dict_operation_lock */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Drop all temporary tables during crash recovery. */
 UNIV_INTERN
@@ -484,73 +505,102 @@ means that this function deletes the .ibd file and assigns a new table id for
 the table. Also the flag table->ibd_file_missing is set TRUE.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_discard_tablespace_for_mysql(
 /*=============================*/
 	const char*	name,	/*!< in: table name */
-	trx_t*		trx);	/*!< in: transaction handle */
+	trx_t*		trx)	/*!< in: transaction handle */
+	__attribute__((nonnull, warn_unused_result));
 /*****************************************************************//**
 Imports a tablespace. The space id in the .ibd file must match the space id
 of the table in the data dictionary.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_import_tablespace_for_mysql(
 /*============================*/
-	const char*	name,	/*!< in: table name */
-	trx_t*		trx);	/*!< in: transaction handle */
+	dict_table_t*	table,		/*!< in/out: table */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL */
+        __attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Drops a database for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_drop_database_for_mysql(
 /*========================*/
 	const char*	name,	/*!< in: database name which ends to '/' */
-	trx_t*		trx);	/*!< in: transaction handle */
+	trx_t*		trx)	/*!< in: transaction handle */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Renames a table for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_rename_table_for_mysql(
 /*=======================*/
 	const char*	old_name,	/*!< in: old table name */
 	const char*	new_name,	/*!< in: new table name */
-	trx_t*		trx,		/*!< in: transaction handle */
-	ibool		commit);	/*!< in: if TRUE then commit trx */
+	trx_t*		trx,		/*!< in/out: transaction */
+	bool		commit)		/*!< in: whether to commit trx */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Checks that the index contains entries in an ascending order, unique
 constraint is not broken, and calculates the number of index entries
 in the read view of the current transaction.
-@return	DB_SUCCESS if ok */
+@return true if ok */
 UNIV_INTERN
-ulint
+bool
 row_check_index_for_mysql(
 /*======================*/
 	row_prebuilt_t*		prebuilt,	/*!< in: prebuilt struct
 						in MySQL handle */
 	const dict_index_t*	index,		/*!< in: index */
-	ulint*			n_rows);	/*!< out: number of entries
+	ulint*			n_rows)		/*!< out: number of entries
 						seen in the consistent read */
-
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Determines if a table is a magic monitor table.
-@return	TRUE if monitor table */
+@return	true if monitor table */
 UNIV_INTERN
-ibool
+bool
 row_is_magic_monitor_table(
 /*=======================*/
-	const char*	table_name);	/*!< in: name of the table, in the
+	const char*	table_name)	/*!< in: name of the table, in the
 					form database/table_name */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void);
+/*================*/
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void);
+/*=================*/
+
+/*********************************************************************//**
+Reassigns the table identifier of a table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_table_id_reassign(
+/*========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx,	/*!< in/out: transaction */
+	table_id_t*	new_id) /*!< out: new table id */
+        __attribute__((nonnull, warn_unused_result));
 
 /* A struct describing a place for an individual column in the MySQL
 row format which is presented to the table handler in ha_innobase.
 This template struct is used to speed up row transformations between
 Innobase and MySQL. */
 
-typedef struct mysql_row_templ_struct mysql_row_templ_t;
-struct mysql_row_templ_struct {
+struct mysql_row_templ_t {
 	ulint	col_no;			/*!< column number of the column */
 	ulint	rec_field_no;		/*!< field number of the column in an
 					Innobase record in the current index;
@@ -606,7 +656,7 @@ struct mysql_row_templ_struct {
 
 handle used within MySQL; these are used to save CPU time. */
 
-struct row_prebuilt_struct {
+struct row_prebuilt_t {
 	ulint		magic_n;	/*!< this magic number is set to
 					ROW_PREBUILT_ALLOCATED when created,
 					or ROW_PREBUILT_FREED when the
@@ -691,8 +741,11 @@ struct row_prebuilt_struct {
 					columns in the table */
 	upd_node_t*	upd_node;	/*!< Innobase SQL update node used
 					to perform updates and deletes */
+	trx_id_t	trx_id;		/*!< The table->def_trx_id when
+					ins_graph was built */
 	que_fork_t*	ins_graph;	/*!< Innobase SQL query graph used
-					in inserts */
+					in inserts. Will be rebuilt on
+					trx_id or n_indexes mismatch. */
 	que_fork_t*	upd_graph;	/*!< Innobase SQL query graph used
 					in updates or deletes */
 	btr_pcur_t	pcur;		/*!< persistent cursor used in selects
@@ -707,6 +760,12 @@ struct row_prebuilt_struct {
 					generated, the row id of the
 					last row fetched is stored
 					here */
+	doc_id_t	fts_doc_id;	/*!< if the table has an FTS index on
+					it then we fetch the doc_id.
+					FTS-FIXME: Currently we fetch it always
+					but in the future we must only fetch
+					it when FTS columns are being
+					updated */
 	dtuple_t*	clust_ref;	/*!< prebuilt dtuple used in
 					sel/upd/del */
 	ulint		select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
@@ -783,6 +842,7 @@ struct row_prebuilt_struct {
 					to this heap */
 	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
 					version is built in consistent read */
+	bool		in_fts_query;	/*!< Whether we are in an FTS query */
 	/*----------------------*/
 	ulonglong	autoinc_last_value;
 					/*!< last value of AUTO-INC interval */
@@ -793,7 +853,7 @@ struct row_prebuilt_struct {
 	ulonglong	autoinc_offset; /*!< The offset passed to
 					get_auto_increment() by MySQL. Required
 					to calculate the next value */
-	ulint		autoinc_error;	/*!< The actual error code encountered
+	dberr_t		autoinc_error;	/*!< The actual error code encountered
 					while trying to init or read the
 					autoinc value from the table. We
 					store it here so that we can return
@@ -808,6 +868,20 @@ struct row_prebuilt_struct {
 	/*----------------------*/
 	ulint		magic_n2;	/*!< this should be the same as
 					magic_n */
+	/*----------------------*/
+	unsigned	innodb_api:1;	/*!< whether this is an InnoDB API
+					query */
+	const rec_t*	innodb_api_rec;	/*!< InnoDB API search result */
+};
+
+/** Callback for row_mysql_sys_index_iterate() */
+struct SysIndexCallback {
+	virtual ~SysIndexCallback() { }
+
+	/** Callback method
+	@param mtr - current mini transaction
+	@param pcur - persistent cursor. */
+	virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0;
 };
 
 #define ROW_PREBUILT_FETCH_MAGIC_N	465765687
@@ -831,4 +905,4 @@ struct row_prebuilt_struct {
 #include "row0mysql.ic"
 #endif
 
-#endif
+#endif /* row0mysql.h */
diff --git a/storage/xtradb/include/row0mysql.ic b/storage/xtradb/include/row0mysql.ic
index 878523528b2..2eb60898c46 100644
--- a/storage/xtradb/include/row0mysql.ic
+++ b/storage/xtradb/include/row0mysql.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2001, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0purge.h b/storage/xtradb/include/row0purge.h
index fa9c9291d5d..93dcf9cf49b 100644
--- a/storage/xtradb/include/row0purge.h
+++ b/storage/xtradb/include/row0purge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -34,6 +34,8 @@ Created 3/14/1997 Heikki Tuuri
 #include "trx0types.h"
 #include "que0types.h"
 #include "row0types.h"
+#include "row0purge.h"
+#include "ut0vec.h"
 
 /********************************************************************//**
 Creates a purge node to a query graph.
@@ -42,8 +44,10 @@ UNIV_INTERN
 purge_node_t*
 row_purge_node_create(
 /*==================*/
-	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
-	mem_heap_t*	heap);	/*!< in: memory heap where created */
+	que_thr_t*	parent,		/*!< in: parent node, i.e., a
+					thr node */
+	mem_heap_t*	heap)		/*!< in: memory heap where created */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************//**
 Determines if it is possible to remove a secondary index entry.
 Removal is possible if the secondary index entry does not refer to any
@@ -53,19 +57,20 @@ is newer than the purge view.
 NOTE: This function should only be called by the purge thread, only
 while holding a latch on the leaf page of the secondary index entry
 (or keeping the buffer pool watch on the page).  It is possible that
-this function first returns TRUE and then FALSE, if a user transaction
+this function first returns true and then false, if a user transaction
 inserts a record that the secondary index entry would refer to.
 However, in that case, the user transaction would also re-insert the
 secondary index entry after purge has removed it and released the leaf
 page latch.
-@return	TRUE if the secondary index record can be purged */
+@return	true if the secondary index record can be purged */
 UNIV_INTERN
-ibool
+bool
 row_purge_poss_sec(
 /*===============*/
 	purge_node_t*	node,	/*!< in/out: row purge node */
 	dict_index_t*	index,	/*!< in: secondary index */
-	const dtuple_t*	entry);	/*!< in: secondary index entry */
+	const dtuple_t*	entry)	/*!< in: secondary index entry */
+	__attribute__((nonnull, warn_unused_result));
 /***************************************************************
 Does the purge operation for a single undo log record. This is a high-level
 function used in an SQL execution graph.
@@ -74,29 +79,26 @@ UNIV_INTERN
 que_thr_t*
 row_purge_step(
 /*===========*/
-	que_thr_t*	thr);	/*!< in: query thread */
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
 
 /* Purge node structure */
 
-struct purge_node_struct{
+struct purge_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_PURGE */
 	/*----------------------*/
 	/* Local storage for this graph node */
 	roll_ptr_t	roll_ptr;/* roll pointer to undo log record */
-	trx_undo_rec_t*	undo_rec;/* undo log record */
-	trx_undo_inf_t*	reservation;/* reservation for the undo log record in
-				the purge array */
+	ib_vector_t*    undo_recs;/*!< Undo recs to purge */
+
 	undo_no_t	undo_no;/* undo number of the record */
+
 	ulint		rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
 				... */
-	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
-				clustered index record */
-	ibool		found_clust;/* TRUE if the clustered index record
-				determined by ref was found in the clustered
-				index, and we were able to position pcur on
-				it */
 	dict_table_t*	table;	/*!< table where purge is done */
+
 	ulint		cmpl_info;/* compiler analysis info of an update */
+
 	upd_t*		update;	/*!< update vector for a clustered index
 				record */
 	dtuple_t*	ref;	/*!< NULL, or row reference to the next row to
@@ -109,6 +111,14 @@ struct purge_node_struct{
 	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
 				row; this must be emptied after a successful
 				purge of a row */
+	ibool		found_clust;/* TRUE if the clustered index record
+				determined by ref was found in the clustered
+				index, and we were able to position pcur on
+				it */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	ibool		done;	/* Debug flag */
+
 };
 
 #ifndef UNIV_NONINL
diff --git a/storage/xtradb/include/row0purge.ic b/storage/xtradb/include/row0purge.ic
index 6465c2ca971..700106d1048 100644
--- a/storage/xtradb/include/row0purge.ic
+++ b/storage/xtradb/include/row0purge.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0quiesce.h b/storage/xtradb/include/row0quiesce.h
new file mode 100644
index 00000000000..1d6d11291b8
--- /dev/null
+++ b/storage/xtradb/include/row0quiesce.h
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.h
+
+Header file for tablespace quiesce functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0quiesce_h
+#define row0quiesce_h
+
+#include "univ.i"
+#include "dict0types.h"
+
+struct trx_t;
+
+/** The version number of the export meta-data text file. */
+#define IB_EXPORT_CFG_VERSION_V1	0x1UL
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+UNIV_INTERN
+void
+row_quiesce_table_start(
+/*====================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+        __attribute__((nonnull));
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	ib_quiesce_t	state,		/*!< in: quiesce state to set */
+	trx_t*		trx)		/*!< in/out: transaction */
+        __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+UNIV_INTERN
+void
+row_quiesce_table_complete(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+        __attribute__((nonnull));
+
+#ifndef UNIV_NONINL
+#include "row0quiesce.ic"
+#endif
+
+#endif /* row0quiesce_h */
diff --git a/storage/xtradb/include/row0quiesce.ic b/storage/xtradb/include/row0quiesce.ic
new file mode 100644
index 00000000000..f570a6aed05
--- /dev/null
+++ b/storage/xtradb/include/row0quiesce.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.ic
+
+Quiesce a tablespace.
+
+Created 2012-02-08 Sunny Bains
+*******************************************************/
+
diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h
index bf135217bd0..a4e5e0dd2fa 100644
--- a/storage/xtradb/include/row0row.h
+++ b/storage/xtradb/include/row0row.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -73,20 +73,41 @@ row_get_rec_roll_ptr(
 /*****************************************************************//**
 When an insert or purge to a table is performed, this function builds
 the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	dict_index_t*		index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory for the index entry
+					is allocated */
+	__attribute__((warn_unused_result, nonnull(1,3,4)));
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
 @return index entry which should be inserted or purged, or NULL if the
 externally stored columns in the clustered index record are
 unavailable and ext != NULL */
-UNIV_INTERN
+UNIV_INLINE
 dtuple_t*
 row_build_index_entry(
 /*==================*/
-	const dtuple_t*	row,	/*!< in: row which should be
-				inserted or purged */
-	row_ext_t*	ext,	/*!< in: externally stored column prefixes,
-				or NULL */
-	dict_index_t*	index,	/*!< in: index on the table */
-	mem_heap_t*	heap);	/*!< in: memory heap from which the memory for
-				the index entry is allocated */
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	dict_index_t*		index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory for the index entry
+					is allocated */
+	__attribute__((warn_unused_result, nonnull(1,3,4)));
 /*******************************************************************//**
 An inverse function to row_build_index_entry. Builds a row from a
 record in a clustered index.
@@ -124,11 +145,17 @@ row_build(
 					consulted instead; the user
 					columns in this table should be
 					the same columns as in index->table */
+	const dtuple_t*		add_cols,
+					/*!< in: default values of
+					added columns, or NULL */
+	const ulint*		col_map,/*!< in: mapping of old column
+					numbers to new ones, or NULL */
 	row_ext_t**		ext,	/*!< out, own: cache of
 					externally stored column
 					prefixes, or NULL */
-	mem_heap_t*		heap);	/*!< in: memory heap from which
+	mem_heap_t*		heap)	/*!< in: memory heap from which
 					the memory needed is allocated */
+	__attribute__((nonnull(2,3,9)));
 /*******************************************************************//**
 Converts an index record to a typed data tuple.
 @return index entry built; does not set info_bits, and the data fields
@@ -142,37 +169,25 @@ row_rec_to_index_entry_low(
 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
 	ulint*			n_ext,	/*!< out: number of externally
 					stored columns */
-	mem_heap_t*		heap);	/*!< in: memory heap from which
+	mem_heap_t*		heap)	/*!< in: memory heap from which
 					the memory needed is allocated */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Converts an index record to a typed data tuple. NOTE that externally
 stored (often big) fields are NOT copied to heap.
-@return	own: index entry built; see the NOTE below! */
+@return	own: index entry built */
 UNIV_INTERN
 dtuple_t*
 row_rec_to_index_entry(
 /*===================*/
-	ulint			type,	/*!< in: ROW_COPY_DATA, or
-					ROW_COPY_POINTERS: the former
-					copies also the data fields to
-					heap as the latter only places
-					pointers to data fields on the
-					index page */
-	const rec_t*		rec,	/*!< in: record in the index;
-					NOTE: in the case
-					ROW_COPY_POINTERS the data
-					fields in the row will point
-					directly into this record,
-					therefore, the buffer page of
-					this record must be at least
-					s-latched and the latch held
-					as long as the dtuple is used! */
+	const rec_t*		rec,	/*!< in: record in the index */
 	const dict_index_t*	index,	/*!< in: index */
-	ulint*			offsets,/*!< in/out: rec_get_offsets(rec) */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec) */
 	ulint*			n_ext,	/*!< out: number of externally
 					stored columns */
-	mem_heap_t*		heap);	/*!< in: memory heap from which
+	mem_heap_t*		heap)	/*!< in: memory heap from which
 					the memory needed is allocated */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Builds from a secondary index record a row reference with which we can
 search the clustered index record.
@@ -193,8 +208,9 @@ row_build_row_ref(
 				the buffer page of this record must be
 				at least s-latched and the latch held
 				as long as the row reference is used! */
-	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
 				needed is allocated */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Builds from a secondary index record a row reference with which we can
 search the clustered index record. */
@@ -215,7 +231,8 @@ row_build_row_ref_in_tuple(
 	const dict_index_t*	index,	/*!< in: secondary index */
 	ulint*			offsets,/*!< in: rec_get_offsets(rec, index)
 					or NULL */
-	trx_t*			trx);	/*!< in: transaction */
+	trx_t*			trx)	/*!< in: transaction or NULL */
+	__attribute__((nonnull(1,2,3)));
 /*******************************************************************//**
 Builds from a secondary index record a row reference with which we can
 search the clustered index record. */
@@ -245,7 +262,8 @@ row_search_on_row_ref(
 	ulint			mode,	/*!< in: BTR_MODIFY_LEAF, ... */
 	const dict_table_t*	table,	/*!< in: table */
 	const dtuple_t*		ref,	/*!< in: row reference */
-	mtr_t*			mtr);	/*!< in/out: mtr */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+	__attribute__((nonnull, warn_unused_result));
 /*********************************************************************//**
 Fetches the clustered index record for a secondary index record. The latches
 on the secondary index record are preserved.
@@ -258,7 +276,8 @@ row_get_clust_rec(
 	const rec_t*	rec,	/*!< in: record in a secondary index */
 	dict_index_t*	index,	/*!< in: secondary index */
 	dict_index_t**	clust_index,/*!< out: clustered index */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull, warn_unused_result));
 
 /** Result of row_search_index_entry */
 enum row_search_result {
@@ -285,8 +304,8 @@ row_search_index_entry(
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
 	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor, which must
 				be closed by the caller */
-	mtr_t*		mtr);	/*!< in: mtr */
-
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull, warn_unused_result));
 
 #define ROW_COPY_DATA		1
 #define ROW_COPY_POINTERS	2
@@ -294,10 +313,7 @@ row_search_index_entry(
 /* The allowed latching order of index records is the following:
 (1) a secondary index record ->
 (2) the clustered index record ->
-(3) rollback segment data for the clustered index record.
-
-No new latches may be obtained while the kernel mutex is reserved.
-However, the kernel mutex can be reserved while latches are owned. */
+(3) rollback segment data for the clustered index record. */
 
 /*******************************************************************//**
 Formats the raw data in "data" (in InnoDB on-disk format) using
@@ -316,8 +332,9 @@ row_raw_format(
 						in bytes */
 	const dict_field_t*	dict_field,	/*!< in: index field */
 	char*			buf,		/*!< out: output buffer */
-	ulint			buf_size);	/*!< in: output buffer size
+	ulint			buf_size)	/*!< in: output buffer size
 						in bytes */
+	__attribute__((nonnull, warn_unused_result));
 
 #ifndef UNIV_NONINL
 #include "row0row.ic"
diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic
index 831c2339d96..ac62422be1f 100644
--- a/storage/xtradb/include/row0row.ic
+++ b/storage/xtradb/include/row0row.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -104,6 +104,33 @@ row_get_rec_roll_ptr(
 	return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
 }
 
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	dict_index_t*		index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory for the index entry
+					is allocated */
+{
+	dtuple_t*	entry;
+
+	ut_ad(dtuple_check_typed(row));
+	entry = row_build_index_entry_low(row, ext, index, heap);
+	ut_ad(!entry || dtuple_check_typed(entry));
+	return(entry);
+}
+
 /*******************************************************************//**
 Builds from a secondary index record a row reference with which we can
 search the clustered index record. */
diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h
index 830615effc2..c8be80f89d9 100644
--- a/storage/xtradb/include/row0sel.h
+++ b/storage/xtradb/include/row0sel.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -148,7 +148,7 @@ position and fetch next or fetch prev must not be tried to the cursor!
 @return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
 DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */
 UNIV_INTERN
-ulint
+dberr_t
 row_search_for_mysql(
 /*=================*/
 	byte*		buf,		/*!< in/out: buffer for the fetched
@@ -163,11 +163,12 @@ row_search_for_mysql(
 					'mode' */
 	ulint		match_mode,	/*!< in: 0 or ROW_SEL_EXACT or
 					ROW_SEL_EXACT_PREFIX */
-	ulint		direction);	/*!< in: 0 or ROW_SEL_NEXT or
+	ulint		direction)	/*!< in: 0 or ROW_SEL_NEXT or
 					ROW_SEL_PREV; NOTE: if this is != 0,
 					then prebuilt must have a pcur
 					with stored position! In opening of a
 					cursor 'direction' should be 0. */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Checks if MySQL at the moment is allowed for this table to retrieve a
 consistent read result, or store it to the query cache.
@@ -183,15 +184,16 @@ row_search_check_if_query_cache_permitted(
 Read the max AUTOINC value from an index.
 @return	DB_SUCCESS if all OK else error code */
 UNIV_INTERN
-ulint
+dberr_t
 row_search_max_autoinc(
 /*===================*/
 	dict_index_t*	index,		/*!< in: index to search */
 	const char*	col_name,	/*!< in: autoinc column name */
-	ib_uint64_t*	value);		/*!< out: AUTOINC value read */
+	ib_uint64_t*	value)		/*!< out: AUTOINC value read */
+	__attribute__((nonnull, warn_unused_result));
 
 /** A structure for caching column values for prefetched rows */
-struct sel_buf_struct{
+struct sel_buf_t{
 	byte*		data;	/*!< data, or NULL; if not NULL, this field
 				has allocated memory which must be explicitly
 				freed; can be != NULL even when len is
@@ -204,7 +206,7 @@ struct sel_buf_struct{
 };
 
 /** Query plan */
-struct plan_struct{
+struct plan_t{
 	dict_table_t*	table;		/*!< table struct in the dictionary
 					cache */
 	dict_index_t*	index;		/*!< table index used in the search */
@@ -290,7 +292,7 @@ enum sel_node_state {
 };
 
 /** Select statement node */
-struct sel_node_struct{
+struct sel_node_t{
 	que_common_t	common;		/*!< node type: QUE_NODE_SELECT */
 	enum sel_node_state
 			state;	/*!< node state */
@@ -343,7 +345,7 @@ struct sel_node_struct{
 };
 
 /** Fetch statement node */
-struct fetch_node_struct{
+struct fetch_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_FETCH */
 	sel_node_t*	cursor_def;	/*!< cursor definition */
 	sym_node_t*	into_list;	/*!< variables to set */
@@ -370,7 +372,7 @@ enum open_node_op {
 };
 
 /** Open or close cursor statement node */
-struct open_node_struct{
+struct open_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_OPEN */
 	enum open_node_op
 			op_type;	/*!< operation type: open or
@@ -379,7 +381,7 @@ struct open_node_struct{
 };
 
 /** Row printf statement node */
-struct row_printf_node_struct{
+struct row_printf_node_t{
 	que_common_t	common;		/*!< type: QUE_NODE_ROW_PRINTF */
 	sel_node_t*	sel_node;	/*!< select */
 };
diff --git a/storage/xtradb/include/row0sel.ic b/storage/xtradb/include/row0sel.ic
index 03c30e80dfe..d83a3448832 100644
--- a/storage/xtradb/include/row0sel.ic
+++ b/storage/xtradb/include/row0sel.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -92,7 +92,7 @@ open_step(
 		}
 	}
 
-	if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) {
+	if (err != DB_SUCCESS) {
 		/* SQL error detected */
 		fprintf(stderr, "SQL error %lu\n", (ulong) err);
 
diff --git a/storage/xtradb/include/row0types.h b/storage/xtradb/include/row0types.h
index b40094d05d6..52c89cb01fa 100644
--- a/storage/xtradb/include/row0types.h
+++ b/storage/xtradb/include/row0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,32 +26,28 @@ Created 12/27/1996 Heikki Tuuri
 #ifndef row0types_h
 #define row0types_h
 
-typedef struct plan_struct plan_t;
+struct plan_t;
 
-typedef	struct upd_struct upd_t;
+struct upd_t;
+struct upd_field_t;
+struct upd_node_t;
+struct del_node_t;
+struct ins_node_t;
+struct sel_node_t;
+struct open_node_t;
+struct fetch_node_t;
 
-typedef struct upd_field_struct upd_field_t;
+struct row_printf_node_t;
+struct sel_buf_t;
 
-typedef	struct upd_node_struct upd_node_t;
+struct undo_node_t;
 
-typedef	struct del_node_struct del_node_t;
+struct purge_node_t;
 
-typedef	struct ins_node_struct ins_node_t;
+struct row_ext_t;
 
-typedef struct sel_node_struct	sel_node_t;
-
-typedef struct open_node_struct	open_node_t;
-
-typedef struct fetch_node_struct fetch_node_t;
-
-typedef struct row_printf_node_struct	row_printf_node_t;
-typedef struct sel_buf_struct	sel_buf_t;
-
-typedef	struct undo_node_struct undo_node_t;
-
-typedef	struct purge_node_struct purge_node_t;
-
-typedef struct row_ext_struct row_ext_t;
+/** Buffer for logging modifications during online index creation */
+struct row_log_t;
 
 /* MySQL data types */
 struct TABLE;
diff --git a/storage/xtradb/include/row0uins.h b/storage/xtradb/include/row0uins.h
index 6809c6d9317..ebf4881208a 100644
--- a/storage/xtradb/include/row0uins.h
+++ b/storage/xtradb/include/row0uins.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -42,11 +42,11 @@ if it figures out that an index record will be removed in the purge
 anyway, it will remove it in the rollback.
 @return	DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_undo_ins(
 /*=========*/
-	undo_node_t*	node);	/*!< in: row undo node */
-
+	undo_node_t*	node)	/*!< in: row undo node */
+	__attribute__((nonnull, warn_unused_result));
 #ifndef UNIV_NONINL
 #include "row0uins.ic"
 #endif
diff --git a/storage/xtradb/include/row0uins.ic b/storage/xtradb/include/row0uins.ic
index fb8a335191d..54da2e49874 100644
--- a/storage/xtradb/include/row0uins.ic
+++ b/storage/xtradb/include/row0uins.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0umod.h b/storage/xtradb/include/row0umod.h
index aca35ce2170..f89d5a334fc 100644
--- a/storage/xtradb/include/row0umod.h
+++ b/storage/xtradb/include/row0umod.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,12 +38,12 @@ Created 2/27/1997 Heikki Tuuri
 Undoes a modify operation on a row of a table.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 row_undo_mod(
 /*=========*/
 	undo_node_t*	node,	/*!< in: row undo node */
-	que_thr_t*	thr);	/*!< in: query thread */
-
+	que_thr_t*	thr)	/*!< in: query thread */
+	__attribute__((nonnull, warn_unused_result));
 
 #ifndef UNIV_NONINL
 #include "row0umod.ic"
diff --git a/storage/xtradb/include/row0umod.ic b/storage/xtradb/include/row0umod.ic
index dd9e217fa20..00a8cd86e01 100644
--- a/storage/xtradb/include/row0umod.ic
+++ b/storage/xtradb/include/row0umod.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0undo.h b/storage/xtradb/include/row0undo.h
index d783c94a110..5dddfb4eae1 100644
--- a/storage/xtradb/include/row0undo.h
+++ b/storage/xtradb/include/row0undo.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -95,7 +95,7 @@ enum undo_exec {
 };
 
 /** Undo node structure */
-struct undo_node_struct{
+struct undo_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_UNDO */
 	enum undo_exec	state;	/*!< node execution state */
 	trx_t*		trx;	/*!< trx for which undo is done */
diff --git a/storage/xtradb/include/row0undo.ic b/storage/xtradb/include/row0undo.ic
index 21723c88ecb..b97ffca590e 100644
--- a/storage/xtradb/include/row0undo.ic
+++ b/storage/xtradb/include/row0undo.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h
index 16c069d5ae8..27dedeb65a7 100644
--- a/storage/xtradb/include/row0upd.h
+++ b/storage/xtradb/include/row0upd.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -101,7 +101,7 @@ byte*
 row_upd_write_sys_vals_to_log(
 /*==========================*/
 	dict_index_t*	index,	/*!< in: clustered index */
-	trx_t*		trx,	/*!< in: transaction */
+	trx_id_t	trx_id,	/*!< in: transaction id */
 	roll_ptr_t	roll_ptr,/*!< in: roll ptr of the undo log record */
 	byte*		log_ptr,/*!< pointer to a buffer of size > 20 opened
 				in mlog */
@@ -118,8 +118,9 @@ row_upd_rec_sys_fields(
 				uncompressed part will be updated, or NULL */
 	dict_index_t*	index,	/*!< in: clustered index */
 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	trx_t*		trx,	/*!< in: transaction */
-	roll_ptr_t	roll_ptr);/*!< in: roll ptr of the undo log record */
+	const trx_t*	trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr);/*!< in: roll ptr of the undo log record,
+				  can be 0 during IMPORT */
 /*********************************************************************//**
 Sets the trx id or roll ptr field of a clustered index entry. */
 UNIV_INTERN
@@ -165,6 +166,15 @@ row_upd_changes_field_size_or_external(
 	dict_index_t*	index,	/*!< in: index */
 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	const upd_t*	update);/*!< in: update vector */
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+UNIV_INTERN
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+	const upd_t*	update)	/*!< in: update vector */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /***********************************************************//**
 Replaces the new column values stored in the update vector to the
@@ -192,11 +202,12 @@ UNIV_INTERN
 upd_t*
 row_upd_build_sec_rec_difference_binary(
 /*====================================*/
+	const rec_t*	rec,	/*!< in: secondary index record */
 	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	const dtuple_t*	entry,	/*!< in: entry to insert */
-	const rec_t*	rec,	/*!< in: secondary index record */
-	trx_t*		trx,	/*!< in: transaction */
-	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	__attribute__((warn_unused_result, nonnull));
 /***************************************************************//**
 Builds an update vector from those fields, excluding the roll ptr and
 trx id fields, which in an index entry differ from a record that has
@@ -204,14 +215,19 @@ the equal ordering fields. NOTE: we compare the fields as binary strings!
 @return own: update vector of differing fields, excluding roll ptr and
 trx id */
 UNIV_INTERN
-upd_t*
+const upd_t*
 row_upd_build_difference_binary(
 /*============================*/
 	dict_index_t*	index,	/*!< in: clustered index */
 	const dtuple_t*	entry,	/*!< in: entry to insert */
 	const rec_t*	rec,	/*!< in: clustered index record */
-	trx_t*		trx,	/*!< in: transaction */
-	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index), or NULL */
+	bool		no_sys,	/*!< in: skip the system columns
+				DB_TRX_ID and DB_ROLL_PTR */
+	trx_t*		trx,	/*!< in: transaction (for diagnostics),
+				or NULL */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	__attribute__((nonnull(1,2,3,7), warn_unused_result));
 /***********************************************************//**
 Replaces the new column values stored in the update vector to the index entry
 given. */
@@ -304,6 +320,26 @@ row_upd_changes_ord_field_binary_func(
 	row_upd_changes_ord_field_binary_func(index,update,row,ext)
 #endif /* UNIV_DEBUG */
 /***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if an FTS indexed column is updated,
+else ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field);	/*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether Doc ID column is affected */
+UNIV_INTERN
+bool
+row_upd_changes_doc_id(
+/*===================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+	__attribute__((nonnull, warn_unused_result));
+/***********************************************************//**
 Checks if an update vector changes an ordering field of an index record.
 This function is fast if the update vector is short or the number of ordering
 fields in the index is small. Otherwise, this can be quadratic.
@@ -366,10 +402,10 @@ row_upd_index_parse(
 
 
 /* Update vector field */
-struct upd_field_struct{
+struct upd_field_t{
 	unsigned	field_no:16;	/*!< field number in an index, usually
 					the clustered index, but in updating
-					a secondary index record in btr0cur.c
+					a secondary index record in btr0cur.cc
 					this is the position in the secondary
 					index */
 #ifndef UNIV_HOTBACKUP
@@ -385,7 +421,7 @@ struct upd_field_struct{
 };
 
 /* Update vector structure */
-struct upd_struct{
+struct upd_t{
 	ulint		info_bits;	/*!< new value of info bits to record;
 					default is 0 */
 	ulint		n_fields;	/*!< number of update fields */
@@ -396,7 +432,7 @@ struct upd_struct{
 /* Update node structure which also implements the delete operation
 of a row */
 
-struct upd_node_struct{
+struct upd_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_UPDATE */
 	ibool		is_delete;/* TRUE if delete, FALSE if update */
 	ibool		searched_update;
diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic
index 9b699455665..618a77fa4bf 100644
--- a/storage/xtradb/include/row0upd.ic
+++ b/storage/xtradb/include/row0upd.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -44,12 +44,11 @@ upd_create(
 {
 	upd_t*	update;
 
-	update = (upd_t*) mem_heap_alloc(heap, sizeof(upd_t));
+	update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t));
 
-	update->info_bits = 0;
 	update->n_fields = n;
 	update->fields = (upd_field_t*)
-		mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+		mem_heap_zalloc(heap, sizeof(upd_field_t) * n);
 
 	return(update);
 }
@@ -103,7 +102,7 @@ upd_field_set_field_no(
 	upd_field->field_no = field_no;
 	upd_field->orig_len = 0;
 
-	if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) {
+	if (field_no >= dict_index_get_n_fields(index)) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to access field %lu in ",
 			(ulong) field_no);
@@ -111,6 +110,7 @@ upd_field_set_field_no(
 		fprintf(stderr, "\n"
 			"InnoDB: but index only has %lu fields\n",
 			(ulong) dict_index_get_n_fields(index));
+		ut_ad(0);
 	}
 
 	dict_col_copy_type(dict_index_get_nth_col(index, field_no),
@@ -152,13 +152,14 @@ row_upd_rec_sys_fields(
 				uncompressed part will be updated, or NULL */
 	dict_index_t*	index,	/*!< in: clustered index */
 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
-	trx_t*		trx,	/*!< in: transaction */
-	roll_ptr_t	roll_ptr)/*!< in: roll ptr of the undo log record */
+	const trx_t*	trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr)/*!< in: roll ptr of the undo log record,
+				 can be 0 during IMPORT */
 {
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ulint	pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
 		page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
 						   pos, trx->id, roll_ptr);
@@ -172,8 +173,14 @@ row_upd_rec_sys_fields(
 #if DATA_TRX_ID + 1 != DATA_ROLL_PTR
 # error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
 #endif
-		ut_ad(lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
-					       rec, index, offsets, FALSE));
+		/* During IMPORT the trx id in the record can be in the
+		future, if the .ibd file is being imported from another
+		instance. During IMPORT roll_ptr will be 0. */
+		ut_ad(roll_ptr == 0
+		      || lock_check_trx_id_sanity(
+			      trx_read_trx_id(rec + offset),
+			      rec, index, offsets));
+
 		trx_write_trx_id(rec + offset, trx->id);
 		trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
 	}
diff --git a/storage/xtradb/include/row0vers.h b/storage/xtradb/include/row0vers.h
index 48d5fc43fd1..1df5b4d3e98 100644
--- a/storage/xtradb/include/row0vers.h
+++ b/storage/xtradb/include/row0vers.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -37,13 +37,15 @@ Created 2/6/1997 Heikki Tuuri
 
 /*****************************************************************//**
 Finds out if an active transaction has inserted or modified a secondary
-index record. NOTE: the kernel mutex is temporarily released in this
-function!
-@return NULL if committed, else the active transaction */
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
 UNIV_INTERN
-trx_t*
-row_vers_impl_x_locked_off_kernel(
-/*==============================*/
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
 	const rec_t*	rec,	/*!< in: record in a secondary index */
 	dict_index_t*	index,	/*!< in: the secondary index */
 	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
@@ -85,7 +87,7 @@ read should see. We assume that the trx id stored in rec is such that
 the consistent read should not see rec in its present version.
 @return	DB_SUCCESS or DB_MISSING_HISTORY */
 UNIV_INTERN
-ulint
+dberr_t
 row_vers_build_for_consistent_read(
 /*===============================*/
 	const rec_t*	rec,	/*!< in: record in a clustered index; the
@@ -104,16 +106,17 @@ row_vers_build_for_consistent_read(
 				*old_vers is allocated; memory for possible
 				intermediate versions is allocated and freed
 				locally within the function */
-	rec_t**		old_vers);/*!< out, own: old version, or NULL if the
-				record does not exist in the view, that is,
+	rec_t**		old_vers)/*!< out, own: old version, or NULL
+				if the history is missing or the record
+				does not exist in the view, that is,
 				it was freshly inserted afterwards */
+	__attribute__((nonnull(1,2,3,4,5,6,7)));
 
 /*****************************************************************//**
 Constructs the last committed version of a clustered index record,
-which should be seen by a semi-consistent read.
-@return	DB_SUCCESS or DB_MISSING_HISTORY */
+which should be seen by a semi-consistent read. */
 UNIV_INTERN
-ulint
+void
 row_vers_build_for_semi_consistent_read(
 /*====================================*/
 	const rec_t*	rec,	/*!< in: record in a clustered index; the
@@ -130,9 +133,10 @@ row_vers_build_for_semi_consistent_read(
 				*old_vers is allocated; memory for possible
 				intermediate versions is allocated and freed
 				locally within the function */
-	const rec_t**	old_vers);/*!< out: rec, old version, or NULL if the
+	const rec_t**	old_vers)/*!< out: rec, old version, or NULL if the
 				record does not exist in the view, that is,
 				it was freshly inserted afterwards */
+	__attribute__((nonnull(1,2,3,4,5)));
 
 
 #ifndef UNIV_NONINL
diff --git a/storage/xtradb/include/row0vers.ic b/storage/xtradb/include/row0vers.ic
index 2687d1a9e15..ef43a55bf70 100644
--- a/storage/xtradb/include/row0vers.ic
+++ b/storage/xtradb/include/row0vers.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/srv0conc.h b/storage/xtradb/include/srv0conc.h
new file mode 100644
index 00000000000..9aee1b17bf0
--- /dev/null
+++ b/storage/xtradb/include/srv0conc.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0conc.h
+
+InnoDB concurrency manager header file
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#ifndef srv_conc_h
+#define srv_conc_h
+
+/** We are prepared for a situation in which this many threads are waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+extern	ulint	srv_max_n_threads;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. Value of 0 will disable the concurrency check. */
+
+extern ulong	srv_thread_concurrency;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void);
+/*===============*/
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void);
+/*===============*/
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx);		/*!< in: transaction object associated
+				with the thread */
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx);		/*!< in: transaction object associated with
+				the thread */
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx);		/*!< in: transaction object associated with
+				the thread */
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void);
+/*==============================*/
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void);
+/*==============================*/
+
+#endif /* srv_conc_h */
diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h
new file mode 100644
index 00000000000..209894833a0
--- /dev/null
+++ b/storage/xtradb/include/srv0mon.h
@@ -0,0 +1,892 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009	Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type */
+typedef	ib_int64_t			mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined,
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into above two structure for its dynamic
+and static information.
+Developers who intend to add new counters need to
+fill in counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+	ib_time_t	mon_start_time;	/*!< Start time of monitoring */
+	ib_time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	ib_time_t	mon_reset_time;	/*!< Time the counter was reset */
+	mon_type_t	mon_value;	/*!< Current counter Value */
+	mon_type_t	mon_max_value;	/*!< Current Max value */
+	mon_type_t	mon_min_value;	/*!< Current Min value */
+	mon_type_t	mon_value_reset;/*!< value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/*!< Whether monitor is still running */
+};
+
+/** Possible values for the "monitor_type" field of "struct monitor_info_t".
+The non-zero values are distinct powers of two. */
+enum monitor_type_t {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period. Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+/** Sentinel values: a counter's minimum starts out as the largest
+mon_type_t (ib_int64_t) value, its maximum as the complement of that */
+#define	MIN_RESERVED		((mon_type_t) (IB_UINT64_MAX >> 1))
+#define	MAX_RESERVED		(~MIN_RESERVED)
+
+/** This enumeration defines internal monitor identifier used internally
+to identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detail
+information for each monitor counter */
+
+enum monitor_id_t {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+	MONITOR_TABLE_CLOSE,
+	MONITOR_TABLE_REFERENCE,
+	MONITOR_OVLD_META_MEM_POOL,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters. */
+	MONITOR_MODULE_BUFFER,
+	MONITOR_OVLD_BUFFER_POOL_SIZE,
+	MONITOR_OVLD_BUF_POOL_READS,
+	MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+	MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+	MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+	MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+	MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+	MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+	MONITOR_OVLD_BUF_POOL_BYTES_DATA,
+	MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+	MONITOR_OVLD_BUF_POOL_BYTES_DIRTY,
+	MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+	MONITOR_OVLD_PAGE_CREATED,
+	MONITOR_OVLD_PAGES_WRITTEN,
+	MONITOR_OVLD_PAGES_READ,
+	MONITOR_OVLD_BYTE_READ,
+	MONITOR_OVLD_BYTE_WRITTEN,
+	MONITOR_FLUSH_BATCH_SCANNED,
+	MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+	MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+	MONITOR_FLUSH_HP_RESCAN,
+	MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	MONITOR_FLUSH_BATCH_COUNT,
+	MONITOR_FLUSH_BATCH_PAGES,
+	MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	MONITOR_FLUSH_NEIGHBOR_COUNT,
+	MONITOR_FLUSH_NEIGHBOR_PAGES,
+	MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
+	MONITOR_FLUSH_AVG_PAGE_RATE,
+	MONITOR_FLUSH_LSN_AVG_RATE,
+	MONITOR_FLUSH_PCT_FOR_DIRTY,
+	MONITOR_FLUSH_PCT_FOR_LSN,
+	MONITOR_FLUSH_SYNC_WAITS,
+	MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	MONITOR_FLUSH_ADAPTIVE_COUNT,
+	MONITOR_FLUSH_ADAPTIVE_PAGES,
+	MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	MONITOR_FLUSH_SYNC_COUNT,
+	MONITOR_FLUSH_SYNC_PAGES,
+	MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	MONITOR_FLUSH_BACKGROUND_COUNT,
+	MONITOR_FLUSH_BACKGROUND_PAGES,
+	MONITOR_LRU_BATCH_SCANNED,
+	MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+	MONITOR_LRU_BATCH_TOTAL_PAGE,
+	MONITOR_LRU_BATCH_COUNT,
+	MONITOR_LRU_BATCH_PAGES,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+	MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+	MONITOR_LRU_GET_FREE_SEARCH,
+	MONITOR_LRU_SEARCH_SCANNED,
+	MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+	/* Buffer Page I/O specific counters. */
+	MONITOR_MODULE_BUF_PAGE,
+	MONITOR_INDEX_LEAF_PAGE_READ,
+	MONITOR_INDEX_NON_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+	MONITOR_UNDO_LOG_PAGE_READ,
+	MONITOR_INODE_PAGE_READ,
+	MONITOR_IBUF_FREELIST_PAGE_READ,
+	MONITOR_IBUF_BITMAP_PAGE_READ,
+	MONITOR_SYSTEM_PAGE_READ,
+	MONITOR_TRX_SYSTEM_PAGE_READ,
+	MONITOR_FSP_HDR_PAGE_READ,
+	MONITOR_XDES_PAGE_READ,
+	MONITOR_BLOB_PAGE_READ,
+	MONITOR_ZBLOB_PAGE_READ,
+	MONITOR_ZBLOB2_PAGE_READ,
+	MONITOR_OTHER_PAGE_READ,
+	MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_UNDO_LOG_PAGE_WRITTEN,
+	MONITOR_INODE_PAGE_WRITTEN,
+	MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+	MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+	MONITOR_SYSTEM_PAGE_WRITTEN,
+	MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+	MONITOR_FSP_HDR_PAGE_WRITTEN,
+	MONITOR_XDES_PAGE_WRITTEN,
+	MONITOR_BLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB2_PAGE_WRITTEN,
+	MONITOR_OTHER_PAGE_WRITTEN,
+
+	/* OS level counters (I/O) */
+	MONITOR_MODULE_OS,
+	MONITOR_OVLD_OS_FILE_READ,
+	MONITOR_OVLD_OS_FILE_WRITE,
+	MONITOR_OVLD_OS_FSYNC,
+	MONITOR_OS_PENDING_READS,
+	MONITOR_OS_PENDING_WRITES,
+	MONITOR_OVLD_OS_LOG_WRITTEN,
+	MONITOR_OVLD_OS_LOG_FSYNC,
+	MONITOR_OVLD_OS_LOG_PENDING_FSYNC,
+	MONITOR_OVLD_OS_LOG_PENDING_WRITES,
+
+	/* Transaction related counters */
+	MONITOR_MODULE_TRX,
+	MONITOR_TRX_RW_COMMIT,
+	MONITOR_TRX_RO_COMMIT,
+	MONITOR_TRX_NL_RO_COMMIT,
+	MONITOR_TRX_COMMIT_UNDO,
+	MONITOR_TRX_ROLLBACK,
+	MONITOR_TRX_ROLLBACK_SAVEPOINT,
+	MONITOR_TRX_ROLLBACK_ACTIVE,
+	MONITOR_TRX_ACTIVE,
+	MONITOR_RSEG_HISTORY_LEN,
+	MONITOR_NUM_UNDO_SLOT_USED,
+	MONITOR_NUM_UNDO_SLOT_CACHED,
+	MONITOR_RSEG_CUR_SIZE,
+
+	/* Purge related counters */
+	MONITOR_MODULE_PURGE,
+	MONITOR_N_DEL_ROW_PURGE,
+	MONITOR_N_UPD_EXIST_EXTERN,
+	MONITOR_PURGE_INVOKED,
+	MONITOR_PURGE_N_PAGE_HANDLED,
+	MONITOR_DML_PURGE_DELAY,
+	MONITOR_PURGE_STOP_COUNT,
+	MONITOR_PURGE_RESUME_COUNT,
+
+	/* Recovery related counters */
+	MONITOR_MODULE_RECOVERY,
+	MONITOR_NUM_CHECKPOINT,
+	MONITOR_OVLD_LSN_FLUSHDISK,
+	MONITOR_OVLD_LSN_CHECKPOINT,
+	MONITOR_OVLD_LSN_CURRENT,
+	MONITOR_LSN_CHECKPOINT_AGE,
+	MONITOR_OVLD_BUF_OLDEST_LSN,
+	MONITOR_OVLD_MAX_AGE_ASYNC,
+	MONITOR_OVLD_MAX_AGE_SYNC,
+	MONITOR_PENDING_LOG_WRITE,
+	MONITOR_PENDING_CHECKPOINT_WRITE,
+	MONITOR_LOG_IO,
+	MONITOR_OVLD_LOG_WAITS,
+	MONITOR_OVLD_LOG_WRITE_REQUEST,
+	MONITOR_OVLD_LOG_WRITES,
+
+	/* Page Manager related counters */
+	MONITOR_MODULE_PAGE,
+	MONITOR_PAGE_COMPRESS,
+	MONITOR_PAGE_DECOMPRESS,
+	MONITOR_PAD_INCREMENTS,
+	MONITOR_PAD_DECREMENTS,
+
+	/* Index related counters */
+	MONITOR_MODULE_INDEX,
+	MONITOR_INDEX_SPLIT,
+	MONITOR_INDEX_MERGE,
+
+	/* Adaptive Hash Index related counters */
+	MONITOR_MODULE_ADAPTIVE_HASH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+	MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+	MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+	MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+
+	/* Tablespace related counters */
+	MONITOR_MODULE_FIL_SYSTEM,
+	MONITOR_OVLD_N_FILE_OPENED,
+
+	/* InnoDB Change Buffer related counters */
+	MONITOR_MODULE_IBUF_SYSTEM,
+	MONITOR_OVLD_IBUF_MERGE_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+	MONITOR_SRV_IBUF_MERGE_MICROSECOND,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_MEM_VALIDATE_MICROSECOND,
+	MONITOR_SRV_PURGE_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_CHECKPOINT_MICROSECOND,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+	MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_S_OS_WAITS,
+	MONITOR_OVLD_RWLOCK_X_OS_WAITS,
+
+	/* Data DML related counters */
+	MONITOR_MODULE_DML_STATS,
+	MONITOR_OLVD_ROW_READ,
+	MONITOR_OLVD_ROW_INSERTED,
+	MONITOR_OLVD_ROW_DELETED,
+	MONITOR_OLVD_ROW_UPDTATED,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_INDEX,
+	MONITOR_BACKGROUND_DROP_TABLE,
+	MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of counter that
+					related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for the
+srv_mon_set_module_control() and srv_mon_process_existing_counter()
+functions, used to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off */
+extern ulint		monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) /
+					NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] |=	\
+			((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+#define MONITOR_OFF(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &=	\
+			~((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &	\
+			((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic monitor counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field)			\
+		(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+		MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+		MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+		(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+		MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define MONITOR_SET_OFF(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+		(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE	\
+		 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+		 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)						\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Increment a monitor counter under mutex protection.
+Use MONITOR_INC if appropriate mutex protection already exists.
+@param mutex	mutex to acquire and release; must not be owned by the caller
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_MUTEX_INC(mutex, monitor)				\
+	ut_ad(!mutex_own(mutex));					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		mutex_enter(mutex);					\
+		if (++MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor); \
+		}							\
+		mutex_exit(mutex);					\
+	}
+/** Decrement a monitor counter under mutex protection.
+Use MONITOR_DEC if appropriate mutex protection already exists.
+@param mutex	mutex to acquire and release; must not be owned by the caller
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_MUTEX_DEC(mutex, monitor)				\
+	ut_ad(!mutex_own(mutex));					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		mutex_enter(mutex);					\
+		if (--MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor); \
+		}							\
+		mutex_exit(mutex);					\
+	}
+
+#if defined HAVE_ATOMIC_BUILTINS_64
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor)					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		mon_type_t	value;					\
+		value = (mon_type_t) os_atomic_increment_uint64(	\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor),	 1);	\
+		/* Signed compare, so the MAX_RESERVED sentinel is	\
+		beaten. Racy max update is tolerated for performance. */\
+		if (value > MONITOR_MAX_VALUE(monitor)) {		\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor)					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		mon_type_t	value;					\
+		value = (mon_type_t) os_atomic_decrement_uint64(	\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor), 1);	\
+		/* Signed compare, so values below zero lower the min.	\
+		Racy min update is tolerated for performance. */	\
+		if (value < MONITOR_MIN_VALUE(monitor)) {		\
+			MONITOR_MIN_VALUE(monitor) = value;		\
+		}							\
+	}
+# define srv_mon_create() ((void) 0)
+# define srv_mon_free() ((void) 0)
+#else /* HAVE_ATOMIC_BUILTINS_64 */
+/** Mutex protecting atomic operations on platforms that lack
+built-in operations for atomic memory access */
+extern ib_mutex_t	monitor_mutex;
+/****************************************************************//**
+Initialize the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_create(void);
+/*================*/
+/****************************************************************//**
+Close the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_free(void);
+/*==============*/
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor) MONITOR_MUTEX_INC(&monitor_mutex, monitor)
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor) MONITOR_MUTEX_DEC(&monitor_mutex, monitor)
+#endif /* HAVE_ATOMIC_BUILTINS_64 */
+
+#define	MONITOR_DEC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#ifdef UNIV_DEBUG_VALGRIND
+# define MONITOR_CHECK_DEFINED(value) do {	\
+	mon_type_t m = value;			\
+	UNIV_MEM_ASSERT_RW(&m, sizeof m);	\
+} while (0)
+#else /* UNIV_DEBUG_VALGRIND */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#define	MONITOR_INC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+/** Subtract "value" from a counter that is on; ut_ad checks counter >= value */
+#define	MONITOR_DEC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which may already have been checked for the module group as a whole */
+#define	MONITOR_INC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+#define	MONITOR_DEC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+/** Directly set a monitor counter's value */
+#define	MONITOR_SET(monitor, value)					\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Add the time elapsed since "value" to the monitor counter. "Now" is read
+with ut_time_us(), presumably microseconds (see the macro name).
+@param monitor	monitor to update for the time difference
+@param value	start time; an lvalue, set to "now" when the monitor is on */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ullint	old_time = (value);				\
+		value = ut_time_us(NULL);				\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see it is on or off to decide
+whether to do the update.
+@param monitor		the main monitor counter to update. It accounts for
+			the accumulative value for the counter.
+@param monitor_n_calls	counter that counts number of times this macro is
+			called
+@param monitor_per_call	counter that records the current and max value of
+			each incremental value
+@param value		incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values such as log sequence number are monotonically increasing
+numbers, and do not need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
+
+/** The following four macros define the operations necessary to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to mon_start_value field of monitor
+counters */
+#define MONITOR_SAVE_START(monitor, value) do {				\
+	MONITOR_CHECK_DEFINED(value);					\
+	(MONITOR_START_VALUE(monitor) =					\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Save the current counter value to the mon_last_value field and
+add it to the mon_start_value field of the monitor counter */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return	Pointer to corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return	corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR then turn on all monitor counters.
+This function is declared void; it reports no
+status back to the caller. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduct the actual value. */
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return	max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return	min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+UNIV_INTERN
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#ifndef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+#else /* !UNIV_HOTBACKUP */
+# define MONITOR_INC(x)		((void) 0)
+# define MONITOR_DEC(x)		((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/srv0mon.ic b/storage/xtradb/include/srv0mon.ic
new file mode 100644
index 00000000000..17411d77a8b
--- /dev/null
+++ b/storage/xtradb/include/srv0mon.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010	Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+Returns the maximum counter value seen since the monitor was started,
+updating the stored MONITOR_MAX_VALUE_START(monitor) as a side effect.
+@return	max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+		/* MONITOR_MAX_VALUE_START still equals MAX_RESERVED,
+		i.e. it has not been initialized yet: seed it with
+		the max count currently in MONITOR_MAX_VALUE */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor);
+
+	} else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+		   && (MONITOR_MAX_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		      > MONITOR_MAX_VALUE_START(monitor))) {
+
+		/* MONITOR_MAX_VALUE is the max since the last reset,
+		so the reset value is added before comparing; when
+		the sum exceeds MONITOR_MAX_VALUE_START, raise
+		MONITOR_MAX_VALUE_START to this new max value */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Returns the minimum counter value seen since the monitor was started,
+updating the stored MONITOR_MIN_VALUE_START(monitor) as a side effect.
+@return	min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+		/* MONITOR_MIN_VALUE_START still equals MIN_RESERVED,
+		i.e. it has not been initialized yet: seed it with
+		the min count currently in MONITOR_MIN_VALUE */
+		MONITOR_MIN_VALUE_START(monitor) =
+				MONITOR_MIN_VALUE(monitor);
+
+	} else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+		   && (MONITOR_MIN_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		       < MONITOR_MIN_VALUE_START(monitor))) {
+
+		/* MONITOR_MIN_VALUE is the min since the last reset,
+		so the reset value is added before comparing; when
+		the sum is below MONITOR_MIN_VALUE_START, lower
+		MONITOR_MIN_VALUE_START to this new min value */
+		MONITOR_MIN_VALUE_START(monitor) =
+			MONITOR_MIN_VALUE(monitor)
+                        + MONITOR_VALUE_RESET(monitor);
+        }
+
+	return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Resets all values of a monitor counter, unless the monitor is still on */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	/* A monitor that is on is left untouched; only report on stderr. */
+	if (MONITOR_IS_ON(monitor)) {
+		fprintf(stderr, "InnoDB: Cannot reset all values for "
+			"monitor counter %s while it is on. Please "
+			"turn it off and retry. \n",
+			srv_mon_get_name(monitor));
+	} else {
+		MONITOR_RESET_ALL(monitor);
+	}
+}
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index 330e3c412ae..d278782daa8 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2008, 2009, Google Inc.
 Copyright (c) 2009, Percona Inc.
 
@@ -26,8 +26,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,31 +43,119 @@ Created 10/10/1995 Heikki Tuuri
 
 #include "univ.i"
 #ifndef UNIV_HOTBACKUP
+#include "log0log.h"
 #include "sync0sync.h"
 #include "os0sync.h"
 #include "que0types.h"
 #include "trx0types.h"
+#include "srv0conc.h"
+#include "buf0checksum.h"
+#include "ut0counter.h"
+
+/** Global counters used inside InnoDB (see the global srv_stats). */
+struct srv_stats_t {
+	typedef ib_counter_t<lsn_t, 1, single_indexer_t> lsn_ctr_1_t;
+	typedef ib_counter_t<ulint, 1, single_indexer_t> ulint_ctr_1_t;
+	typedef ib_counter_t<lint, 1, single_indexer_t> lint_ctr_1_t;
+	typedef ib_counter_t<ulint, 64> ulint_ctr_64_t;
+	typedef ib_counter_t<ib_int64_t, 1, single_indexer_t> ib_int64_ctr_1_t;
+
+	/** Amount of data written in total (in bytes) */
+	ulint_ctr_1_t		data_written;
+
+	/** Number of log write requests done */
+	ulint_ctr_1_t		log_write_requests;
+
+	/** Number of physical writes to the log performed */
+	ulint_ctr_1_t		log_writes;
+
+	/** Amount of data written to the log files in bytes */
+	lsn_ctr_1_t		os_log_written;
+
+	/** Number of writes being done to the log files */
+	lint_ctr_1_t		os_log_pending_writes;
+
+	/** Incremented when there is not enough space in the log
+	buffer and it has to be flushed */
+	ulint_ctr_1_t		log_waits;
+
+	/** Number of times the doublewrite buffer was flushed */
+	ulint_ctr_1_t		dblwr_writes;
+
+	/** Number of pages that have been flushed to the
+	doublewrite buffer */
+	ulint_ctr_1_t		dblwr_pages_written;
+
+	/** Number of write requests issued */
+	ulint_ctr_1_t		buf_pool_write_requests;
+
+	/** Number of times we had to wait for a free page in the
+	buffer pool. It happens when the buffer pool is full and we
+	need to make a flush, in order to be able to read or create a page. */
+	ulint_ctr_1_t		buf_pool_wait_free;
+
+	/** Number of pages that were written from the buffer
+	pool to the disk */
+	ulint_ctr_1_t		buf_pool_flushed;
+
+	/** Number of buffer pool reads that led to the reading of
+	a disk page */
+	ulint_ctr_1_t		buf_pool_reads;
+
+	/** Amount of data read in total (in bytes) */
+	ulint_ctr_1_t		data_read;
+
+	/** Wait time of database locks */
+	ib_int64_ctr_1_t	n_lock_wait_time;
+
+	/** Number of database lock waits */
+	ulint_ctr_1_t		n_lock_wait_count;
+
+	/** Number of threads currently waiting on database locks */
+	lint_ctr_1_t		n_lock_wait_current_count;
+
+	/** Number of rows read */
+	ulint_ctr_64_t		n_rows_read;
+
+	/** Number of rows updated */
+	ulint_ctr_64_t		n_rows_updated;
+
+	/** Number of rows deleted */
+	ulint_ctr_64_t		n_rows_deleted;
+
+	/** Number of rows inserted */
+	ulint_ctr_64_t		n_rows_inserted;
+	/** Presumably counts detected lock deadlocks -- TODO confirm */
+	ulint_ctr_1_t		lock_deadlock_count;
+	/** Presumably the longest lock wait seen -- TODO confirm units */
+	ulint_ctr_1_t		n_lock_max_wait_time;
+};
 
 extern const char*	srv_main_thread_op_info;
 
 /** Prefix used by MySQL to indicate pre-5.1 table name encoding */
-extern const char	srv_mysql50_table_name_prefix[9];
-
-/* When this event is set the lock timeout and InnoDB monitor
-thread starts running */
-extern os_event_t	srv_lock_timeout_thread_event;
+extern const char	srv_mysql50_table_name_prefix[10];
 
 /* The monitor thread waits on this event. */
 extern os_event_t	srv_monitor_event;
 
-/* The lock timeout thread waits on this event. */
-extern os_event_t	srv_timeout_event;
-
 /* The error monitor thread waits on this event. */
 extern os_event_t	srv_error_event;
 
-/* This event is set at shutdown to wakeup threads from sleep */
-extern os_event_t	srv_shutdown_event;
+/** The buffer pool dump/load thread waits on this event. */
+extern os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*		srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char		srv_buffer_pool_dump_at_shutdown;
+extern char		srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char		srv_disable_sort_file_cache;
 
 /* This event is set on checkpoint completion to wake the redo log parser
 thread */
@@ -82,87 +170,112 @@ at a time */
 #define SRV_AUTO_EXTEND_INCREMENT	\
 	(srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
 
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+extern ib_mutex_t	srv_monitor_file_mutex;
+
 /* prototypes for new functions added to ha_innodb.cc */
 ibool	innobase_get_slow_log();
 
-/* Mutex for locking srv_monitor_file */
-extern mutex_t	srv_monitor_file_mutex;
 /* Temporary file for innodb monitor output */
 extern FILE*	srv_monitor_file;
-/* Mutex for locking srv_dict_tmpfile.
+/* Mutex for locking srv_dict_tmpfile. Only created if !srv_read_only_mode.
 This mutex has a very high rank; threads reserving it should not
 be holding any InnoDB latches. */
-extern mutex_t	srv_dict_tmpfile_mutex;
+extern ib_mutex_t	srv_dict_tmpfile_mutex;
 /* Temporary file for output from the data dictionary */
 extern FILE*	srv_dict_tmpfile;
-/* Mutex for locking srv_misc_tmpfile.
+/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
 This mutex has a very low rank; threads reserving it should not
 acquire any further latches or sleep before releasing this one. */
-extern mutex_t	srv_misc_tmpfile_mutex;
+extern ib_mutex_t	srv_misc_tmpfile_mutex;
 /* Temporary file for miscellanous diagnostic output */
 extern FILE*	srv_misc_tmpfile;
 
 /* Server parameters which are read from the initfile */
 
 extern char*	srv_data_home;
+
 #ifdef UNIV_LOG_ARCHIVE
 extern char*	srv_arch_dir;
 #endif /* UNIV_LOG_ARCHIVE */
 
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
 /** store to its own file each table created by an user; data
 dictionary tables are in the system tablespace 0 */
-#ifndef UNIV_HOTBACKUP
 extern my_bool	srv_file_per_table;
-#else
-extern ibool	srv_file_per_table;
-#endif /* UNIV_HOTBACKUP */
+/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */
+extern	ulong	srv_thread_sleep_delay;
+#if defined(HAVE_ATOMIC_BUILTINS)
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/
+extern	ulong	srv_adaptive_max_sleep_delay;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
 /** The file format to use on new *.ibd files. */
 extern ulint	srv_file_format;
 /** Whether to check file format during startup.  A value of
-DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+UNIV_FORMAT_MAX + 1 means no checking i.e. FALSE.  The default is to
 set it to the highest format we support. */
 extern ulint	srv_max_file_format_at_startup;
 /** Place locks to records only i.e. do not use next-key locking except
 on duplicate key checking and foreign key checking */
 extern ibool	srv_locks_unsafe_for_binlog;
 
+/** Sort buffer size in index creation */
+extern ulong	srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long	srv_online_max_size;
+
 /* If this flag is TRUE, then we will use the native aio of the
 OS (provided we compiled Innobase with it in), otherwise we will
 use simulated aio we build below with threads.
 Currently we support native aio on windows and linux */
 extern my_bool	srv_use_native_aio;
-#endif /* !UNIV_HOTBACKUP */
 #ifdef __WIN__
 extern ibool	srv_use_native_conditions;
-#endif
+#endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern ulong	srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern ulint	srv_undo_tablespaces_open;
+
+/* The number of undo segments to use */
+extern ulong	srv_undo_logs;
+
 extern ulint	srv_n_data_files;
 extern char**	srv_data_file_names;
 extern ulint*	srv_data_file_sizes;
 extern ulint*	srv_data_file_is_raw_partition;
 
-extern char*	srv_doublewrite_file;
-
-extern ibool	srv_recovery_stats;
-
 extern my_bool		srv_track_changed_pages;
-extern ib_uint64_t	srv_max_bitmap_file_size;
+extern ulonglong	srv_max_bitmap_file_size;
 
 extern
 ulonglong       srv_max_changed_pages;
 
 extern ibool	srv_auto_extend_last_data_file;
 extern ulint	srv_last_file_size_max;
-extern char**	srv_log_group_home_dirs;
+extern char*	srv_log_group_home_dir;
 #ifndef UNIV_HOTBACKUP
 extern ulong	srv_auto_extend_increment;
 
 extern ibool	srv_created_new_raw;
 
-extern ulint	srv_n_log_groups;
-extern ulint	srv_n_log_files;
-extern ulint	srv_log_file_size;
+/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */
+#define SRV_N_LOG_FILES_MAX 100
+extern ulong	srv_n_log_files;
+extern ib_uint64_t	srv_log_file_size;
+extern ib_uint64_t	srv_log_file_size_requested;
 extern ulint	srv_log_buffer_size;
-//extern ulong	srv_flush_log_at_trx_commit;
+extern uint	srv_flush_log_at_timeout;
 extern char	srv_use_global_flush_log_at_trx_commit;
 extern char	srv_adaptive_flushing;
 
@@ -185,12 +298,55 @@ extern ibool	srv_use_sys_malloc;
 extern ulint	srv_buf_pool_size;	/*!< requested size in bytes */
 extern my_bool	srv_buf_pool_populate;	/*!< virtual page preallocation */
 extern ulint    srv_buf_pool_instances; /*!< requested number of buffer pool instances */
+extern ulong	srv_n_page_hash_locks;	/*!< number of locks to
+					protect buf_pool->page_hash */
+extern ulong	srv_LRU_scan_depth;	/*!< Scan depth for LRU
+					flush batch */
+extern ulong	srv_flush_neighbors;	/*!< whether or not to flush
+					neighbors of a block */
 extern ulint	srv_buf_pool_old_size;	/*!< previously requested size */
 extern ulint	srv_buf_pool_curr_size;	/*!< current size in bytes */
 extern ulint	srv_mem_pool_size;
 extern ulint	srv_lock_table_size;
 
-extern ibool	srv_thread_concurrency_timer_based;
+extern ulong	srv_foreground_preflush;/*!< Query thread preflush algorithm */
+
+extern ulint	srv_cleaner_max_lru_time;/*!< the maximum time limit for a
+					single LRU tail flush iteration by the
+					page cleaner thread */
+
+extern ulint	srv_cleaner_max_flush_time;/*!< the maximum time limit for a
+					single flush list flush iteration by
+					the page cleaner thread */
+
+extern ulint	srv_cleaner_flush_chunk_size;
+					/*!< page cleaner flush list flush
+					batches are further divided into this
+					chunk size  */
+
+extern ulint	srv_cleaner_lru_chunk_size;
+					/*!< page cleaner LRU list flush
+					batches are further divided into this
+					chunk size  */
+
+extern ulint	srv_cleaner_free_list_lwm;/*!< if free list length is lower
+					than this percentage of
+					srv_LRU_scan_depth, page cleaner LRU
+					flushes will issue flush batches to the
+					same instance in a row  */
+
+extern my_bool	srv_cleaner_eviction_factor;
+					/*!< if TRUE, the page cleaner uses
+					evicted instead of flushed page
+					counts for its heuristics */
+
+extern ulong	srv_cleaner_lsn_age_factor;
+					/*!< page cleaner LSN age factor
+					formula option */
+
+extern ulong	srv_empty_free_list_algorithm;
+					/*!< Empty free list for a query thread
+					handling algorithm option */
 
 extern ulint	srv_n_file_io_threads;
 extern my_bool	srv_random_read_ahead;
@@ -200,10 +356,16 @@ extern ulint	srv_n_write_io_threads;
 
 /* Number of IO operations per second the server can do */
 extern ulong    srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT	(~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT		(~0UL)
+extern ulong    srv_max_io_capacity;
 /* Returns the number of IO operations that is X percent of the
 capacity. PCT_IO(5) -> returns the number of IO operations that
 is 5% of the max where max is srv_io_capacity.  */
-#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0)))
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0)))
 
 /* The "innodb_stats_method" setting, decides how InnoDB is going
 to treat NULL value when collecting statistics. It is not defined
@@ -222,63 +384,57 @@ extern ulint	srv_win_file_flush_method;
 
 extern ulint	srv_max_n_open_files;
 
-extern ulint	srv_max_dirty_pages_pct;
-
-extern ulint	srv_force_recovery;
-extern ulong	srv_thread_concurrency;
+extern ulong	srv_max_dirty_pages_pct;
+extern ulong	srv_max_dirty_pages_pct_lwm;
 
-extern ulint	srv_max_n_threads;
+extern ulong	srv_adaptive_flushing_lwm;
+extern ulong	srv_flushing_avg_loops;
 
-extern lint	srv_conc_n_threads;
+extern ulong	srv_force_recovery;
+#ifndef DBUG_OFF
+extern ulong	srv_force_recovery_crash;
+#endif /* !DBUG_OFF */
 
-extern ulint	srv_fast_shutdown;	 /* If this is 1, do not do a
-					 purge and index buffer merge.
-					 If this 2, do not even flush the
-					 buffer pool to data files at the
-					 shutdown: we effectively 'crash'
-					 InnoDB (but lose no committed
-					 transactions). */
+extern ulint	srv_fast_shutdown;	/*!< If this is 1, do not do a
+					purge and index buffer merge.
+					If this 2, do not even flush the
+					buffer pool to data files at the
+					shutdown: we effectively 'crash'
+					InnoDB (but lose no committed
+					transactions). */
 extern ibool	srv_innodb_status;
 
-extern unsigned long long	srv_stats_sample_pages;
-extern ulint	srv_stats_auto_update;
-extern ulint	srv_stats_update_need_lock;
-extern ibool	srv_use_sys_stats_table;
-#ifdef UNIV_DEBUG
-extern ulong	srv_sys_stats_root_page;
-#endif
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern my_bool			srv_stats_persistent;
+extern unsigned long long	srv_stats_persistent_sample_pages;
+extern my_bool			srv_stats_auto_recalc;
 
 extern ibool	srv_use_doublewrite_buf;
+extern ulong	srv_doublewrite_batch_size;
 extern ibool	srv_use_atomic_writes;
 #ifdef HAVE_POSIX_FALLOCATE
 extern ibool	srv_use_posix_fallocate;
 #endif
+extern ulong	srv_checksum_algorithm;
 
-extern ibool	srv_use_checksums;
-extern ibool	srv_fast_checksum;
+extern ulong	srv_log_arch_expire_sec;
 
 extern ulong	srv_max_buf_pool_modified_pct;
 extern ulong	srv_max_purge_lag;
+extern ulong	srv_max_purge_lag_delay;
 
 extern ulong	srv_replication_delay;
 
-extern long long	srv_ibuf_max_size;
-extern ulong	srv_ibuf_active_contract;
-extern ulong	srv_ibuf_accel_rate;
-extern ulint	srv_checkpoint_age_target;
-extern ulong	srv_flush_neighbor_pages;
-extern ulint	srv_deprecated_enable_unsafe_group_commit;
-extern ulong	srv_read_ahead;
-extern ulong	srv_adaptive_flushing_method;
+extern my_bool  srv_use_stacktrace;
 
-extern ulong	srv_expand_import;
 extern ulong	srv_pass_corrupt_table;
 
-extern my_bool  srv_use_stacktrace;
+extern ulong	srv_log_checksum_algorithm;
 
 /* Helper macro to support srv_pass_corrupt_table checks. If 'cond' is FALSE,
 execute 'code' if srv_pass_corrupt_table is non-zero, or trigger a fatal error
-otherwise. The break statement in 'code' will obviously not work as expected. */
+otherwise. The break statement in 'code' will obviously not work as
+expected. */
 
 #define SRV_CORRUPT_TABLE_CHECK(cond,code)		\
 	do {						\
@@ -291,14 +447,8 @@ otherwise. The break statement in 'code' will obviously not work as expected. */
 		}					\
 	} while(0)
 
-extern ulint	srv_dict_size_limit;
 /*-------------------------------------------*/
 
-extern ulint	srv_n_rows_inserted;
-extern ulint	srv_n_rows_updated;
-extern ulint	srv_n_rows_deleted;
-extern ulint	srv_n_rows_read;
-
 extern ulint	srv_read_views_memory;
 extern ulint	srv_descriptors_memory;
 
@@ -306,12 +456,21 @@ extern ibool	srv_print_innodb_monitor;
 extern ibool	srv_print_innodb_lock_monitor;
 extern ibool	srv_print_innodb_tablespace_monitor;
 extern ibool	srv_print_verbose_log;
+#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \
+	"Using innodb_table_monitor is deprecated and it may be removed " \
+	"in future releases. Please use the InnoDB INFORMATION_SCHEMA " \
+	"tables instead, see " REFMAN "innodb-i_s-tables.html"
 extern ibool	srv_print_innodb_table_monitor;
 
-extern ibool	srv_lock_timeout_active;
 extern ibool	srv_monitor_active;
 extern ibool	srv_error_monitor_active;
 
+/* TRUE during the lifetime of the buffer pool dump/load thread */
+extern ibool	srv_buf_dump_thread_active;
+
+/* TRUE during the lifetime of the stats thread */
+extern ibool	srv_dict_stats_thread_active;
+
 extern ulong	srv_n_spin_wait_rounds;
 extern ulong	srv_n_free_tickets_to_enter;
 extern ulong	srv_thread_sleep_delay;
@@ -319,6 +478,7 @@ extern ulong	srv_spin_wait_delay;
 extern ibool	srv_priority_boost;
 
 extern ulint	srv_truncated_status_writes;
+extern ulint	srv_available_undo_logs;
 
 extern	ulint	srv_mem_pool_size;
 extern	ulint	srv_lock_table_size;
@@ -337,113 +497,97 @@ extern	ibool	srv_print_latch_waits;
 # define srv_print_latch_waits		FALSE
 #endif /* UNIV_DEBUG */
 
-extern ulint	srv_activity_count;
-extern ulint	srv_fatal_semaphore_wait_threshold;
-#define SRV_SEMAPHORE_WAIT_EXTENSION	7200
-extern ulint	srv_dml_needed_delay;
-extern long long	srv_kill_idle_transaction;
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+extern my_bool	srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 #ifdef UNIV_DEBUG
 extern my_bool	srv_purge_view_update_only_debug;
 #endif /* UNIV_DEBUG */
 
-extern mutex_t*	kernel_mutex_temp;/* mutex protecting the server, trx structs,
-				query threads, and lock table: we allocate
-				it from dynamic memory to get it to the
-				same DRAM page as other hotspot semaphores */
-#define kernel_mutex (*kernel_mutex_temp)
+extern ulint	srv_fatal_semaphore_wait_threshold;
+#define SRV_SEMAPHORE_WAIT_EXTENSION	7200
+extern ulint	srv_dml_needed_delay;
+extern long long	srv_kill_idle_transaction;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+extern ib_mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
 
 #define SRV_MAX_N_IO_THREADS	130
 
+#define SRV_MAX_N_PURGE_THREADS 32
+
 /* Array of English strings describing the current state of an
 i/o handler thread */
 extern const char* srv_io_thread_op_info[];
 extern const char* srv_io_thread_function[];
 
-/* the number of the log write requests done */
-extern ulint srv_log_write_requests;
+/* The tid of the cleaner thread */
+extern os_tid_t	srv_cleaner_tid;
 
-/* the number of physical writes to the log performed */
-extern ulint srv_log_writes;
+/* The tids of the purge threads */
+extern os_tid_t srv_purge_tids[];
 
-/* amount of data written to the log files in bytes */
-extern ulint srv_os_log_written;
+/* The tids of the I/O threads */
+extern os_tid_t	srv_io_tids[];
 
-/* amount of writes being done to the log files */
-extern ulint srv_os_log_pending_writes;
+/* The tid of the master thread */
+extern os_tid_t	srv_master_tid;
 
-/* we increase this counter, when there we don't have enough space in the
-log buffer and have to flush it */
-extern ulint srv_log_waits;
+/* The relative scheduling priority of the cleaner thread */
+extern ulint	srv_sched_priority_cleaner;
 
-/* the number of purge threads to use from the worker pool (currently 0 or 1) */
-extern ulong srv_n_purge_threads;
+/* The relative scheduling priority of the purge threads */
+extern ulint	srv_sched_priority_purge;
 
-/* the number of pages to purge in one batch */
-extern ulong srv_purge_batch_size;
+/* The relative scheduling priority of the I/O threads */
+extern ulint	srv_sched_priority_io;
 
-/* the number of rollback segments to use */
-extern ulong srv_rollback_segments;
+/* The relative scheduling priority of the master thread */
+extern ulint	srv_sched_priority_master;
 
-/* variable that counts amount of data read in total (in bytes) */
-extern ulint srv_data_read;
+/* The relative priority of the purge coordinator and worker threads.  */
+extern my_bool srv_purge_thread_priority;
 
-/* here we count the amount of data written in total (in bytes) */
-extern ulint srv_data_written;
+/* The relative priority of the I/O threads.  */
+extern my_bool srv_io_thread_priority;
 
-/* this variable counts the amount of times, when the doublewrite buffer
-was flushed */
-extern ulint srv_dblwr_writes;
+/* The relative priority of the cleaner thread.  */
+extern my_bool srv_cleaner_thread_priority;
 
-/* here we store the number of pages that have been flushed to the
-doublewrite buffer */
-extern ulint srv_dblwr_pages_written;
+/* The relative priority of the master thread.  */
+extern my_bool srv_master_thread_priority;
 
-/* in this variable we store the number of write requests issued */
-extern ulint srv_buf_pool_write_requests;
+/* the number of purge threads to use (see SRV_MAX_N_PURGE_THREADS) */
+extern ulong srv_n_purge_threads;
 
-/* here we store the number of times when we had to wait for a free page
-in the buffer pool. It happens when the buffer pool is full and we need
-to make a flush, in order to be able to read or create a page. */
-extern ulint srv_buf_pool_wait_free;
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
 
-/* variable to count the number of pages that were written from the
-buffer pool to disk */
-extern ulint srv_buf_pool_flushed;
+/* the number of sync wait arrays */
+extern ulong srv_sync_array_size;
 
-extern ulint buf_lru_flush_page_count;
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
 
-/** Number of buffer pool reads that led to the
-reading of a disk page */
-extern ulint srv_buf_pool_reads;
+extern my_bool	srv_cmp_per_index_enabled;
 
-/** Time in seconds between automatic buffer pool dumps */
-extern uint srv_auto_lru_dump;
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
 
-/** Whether startup should be blocked until buffer pool is fully restored */
-extern ibool srv_blocking_lru_restore;
+/** Global counters */
+extern srv_stats_t	srv_stats;
 
 /** When TRUE, fake change transcations take S rather than X row locks.
 When FALSE, row locks are not taken at all. */
 extern my_bool srv_fake_changes_locks;
 
-/** print all user-level transactions deadlocks to mysqld stderr */
-extern my_bool srv_print_all_deadlocks;
-
-/** Status variables to be passed to MySQL */
-typedef struct export_var_struct export_struc;
-
-/** Status variables to be passed to MySQL */
-extern export_struc export_vars;
-
-/** The server system */
-typedef struct srv_sys_struct	srv_sys_t;
-
-/** The server system */
-extern srv_sys_t*	srv_sys;
 
 # ifdef UNIV_PFS_THREAD
 /* Keys to register InnoDB threads with performance schema */
+extern mysql_pfs_key_t	buf_page_cleaner_thread_key;
 extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
 extern mysql_pfs_key_t	io_handler_thread_key;
 extern mysql_pfs_key_t	srv_lock_timeout_thread_key;
@@ -451,26 +595,21 @@ extern mysql_pfs_key_t	srv_error_monitor_thread_key;
 extern mysql_pfs_key_t	srv_monitor_thread_key;
 extern mysql_pfs_key_t	srv_master_thread_key;
 extern mysql_pfs_key_t	srv_purge_thread_key;
+extern mysql_pfs_key_t	recv_writer_thread_key;
 extern mysql_pfs_key_t	srv_log_tracking_thread_key;
 
 /* This macro register the current thread and its key with performance
 schema */
 #  define pfs_register_thread(key)			\
 do {								\
-	if (PSI_server) {					\
-		struct PSI_thread* psi = PSI_server->new_thread(key, NULL, 0);\
-		if (psi) {					\
-			PSI_server->set_thread(psi);		\
-		}						\
-	}							\
+	struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\
+	PSI_THREAD_CALL(set_thread)(psi);			\
 } while (0)
 
 /* This macro delist the current thread from performance schema */
 #  define pfs_delete_thread()				\
 do {								\
-	if (PSI_server) {					\
-		PSI_server->delete_current_thread();		\
-	}							\
+	PSI_THREAD_CALL(delete_current_thread)();		\
 } while (0)
 # endif /* UNIV_PFS_THREAD */
 
@@ -494,8 +633,22 @@ enum {
 				after writing to log files */
 	SRV_UNIX_NOSYNC,	/*!< do not flush after writing */
 	SRV_UNIX_O_DIRECT,	/*!< invoke os_file_set_nocache() on
-				data files */
-	SRV_UNIX_ALL_O_DIRECT	/* new method for examination: logfile also open O_DIRECT */
+				data files. This implies using
+				non-buffered IO but still using fsync,
+				the reason for which is that some FS
+				do not flush meta-data when
+				unbuffered IO happens */
+	SRV_UNIX_O_DIRECT_NO_FSYNC,
+				/*!< do not use fsync() when using
+				direct IO i.e.: it can be set to avoid
+				the fsync() call that we make when
+				using SRV_UNIX_O_DIRECT. However, in
+				this case user/DBA should be sure about
+				the integrity of the meta-data */
+	SRV_UNIX_ALL_O_DIRECT   /*!< similar to O_DIRECT, invokes
+				os_file_set_nocache() on data and log files.
+				This implies using non-buffered IO but still
+				using fsync for data but not log files. */
 };
 
 /** Alternatives for file i/o in Windows */
@@ -544,17 +697,19 @@ typedef enum srv_stats_method_name_enum		srv_stats_method_name_t;
 #ifndef UNIV_HOTBACKUP
 /** Types of threads existing in the system. */
 enum srv_thread_type {
-	SRV_WORKER = 0,	/**< threads serving parallelized queries and
-			queries released from lock wait */
-	SRV_MASTER	/**< the master thread, (whose type number must
-			be biggest) */
+	SRV_NONE,			/*!< None */
+	SRV_WORKER,			/*!< threads serving parallelized
+					queries and queries released from
+					lock wait */
+	SRV_PURGE,			/*!< Purge coordinator thread */
+	SRV_MASTER			/*!< the master thread, (whose type
+					number must be biggest) */
 };
 
 /*********************************************************************//**
-Boots Innobase server.
-@return	DB_SUCCESS or error code */
+Boots Innobase server. */
 UNIV_INTERN
-ulint
+void
 srv_boot(void);
 /*==========*/
 /*********************************************************************//**
@@ -577,21 +732,6 @@ void
 srv_general_init(void);
 /*==================*/
 /*********************************************************************//**
-Gets the number of threads in the system.
-@return	sum of srv_n_threads[] */
-UNIV_INTERN
-ulint
-srv_get_n_threads(void);
-/*===================*/
-/*********************************************************************//**
-Check whether thread type has reserved a slot.
-@return	slot number or UNDEFINED if not found*/
-UNIV_INTERN
-ulint
-srv_thread_has_reserved_slot(
-/*=========================*/
-	enum srv_thread_type	type);	/*!< in: thread type to check */
-/*********************************************************************//**
 Sets the info describing an i/o thread current state. */
 UNIV_INTERN
 void
@@ -601,31 +741,21 @@ srv_set_io_thread_op_info(
 	const char*	str);	/*!< in: constant char string describing the
 				state */
 /*********************************************************************//**
-Releases threads of the type given from suspension in the thread table.
-NOTE! The server mutex has to be reserved by the caller!
-@return number of threads released: this may be less than n if not
-enough threads were suspended at the moment */
-UNIV_INTERN
-ulint
-srv_release_threads(
-/*================*/
-	enum srv_thread_type	type,	/*!< in: thread type */
-	ulint			n);	/*!< in: number of threads to release */
-/*********************************************************************//**
-The master thread controlling the server.
-@return	a dummy parameter */
+Resets the info describing an i/o thread current state. */
 UNIV_INTERN
-os_thread_ret_t
-srv_master_thread(
-/*==============*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
+void
+srv_reset_io_thread_op_info();
+/*=========================*/
 /*******************************************************************//**
-Wakes up the purge thread if it's not already awake. */
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping).  Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the srv_sys_t::mutex, for
+performance reasons). */
 UNIV_INTERN
 void
-srv_wake_purge_thread(void);
-/*=======================*/
+srv_wake_purge_thread_if_not_active(void);
+/*=====================================*/
 /*******************************************************************//**
 Tells the Innobase server that there has been activity in the database
 and wakes up the master thread if it is suspended (not sleeping). Used
@@ -642,174 +772,184 @@ UNIV_INTERN
 void
 srv_wake_master_thread(void);
 /*========================*/
-/*******************************************************************//**
-Tells the purge thread that there has been activity in the database
-and wakes up the purge thread if it is suspended (not sleeping).  Note
-that there is a small chance that the purge thread stays suspended
-(we do not protect our operation with the kernel mutex, for
-performace reasons). */
+/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+@return a dummy value */
+extern "C"
 UNIV_INTERN
-void
-srv_wake_purge_thread_if_not_active(void);
-/*=====================================*/
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+os_thread_ret_t
+DECLARE_THREAD(srv_redo_log_follow_thread)(
+/*=======================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
 UNIV_INTERN
-void
-srv_conc_enter_innodb(
-/*==================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t::mutex */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end);	/*!< out: file position of the end of
+				the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
 UNIV_INTERN
 void
-srv_conc_force_enter_innodb(
+srv_export_innodb_status(void);
+/*==========================*/
+/*************************************************************//**
+Removes old archived transaction log files.
+Both parameters cannot be provided at the same time.
+@return DB_SUCCESS on success, otherwise DB_ERROR */
+UNIV_INTERN
+dberr_t
+purge_archived_logs(
+	time_t	before_date,		/*!< in: all files modified
+					before timestamp should be removed */
+	lsn_t	before_lsn);		/*!< in: files with this lsn in name
+					and earlier should be removed */
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void);
 /*========================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
 UNIV_INTERN
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This must be called when a thread exits InnoDB. */
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count);	/*!< old activity count */
+/******************************************************************//**
+Increment the server activity counter. */
 UNIV_INTERN
 void
-srv_conc_exit_innodb(
-/*=================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/***************************************************************//**
-Puts a MySQL OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
 UNIV_INTERN
 void
-srv_suspend_mysql_thread(
+srv_que_task_enqueue_low(
 /*=====================*/
-	que_thr_t*	thr);	/*!< in: query thread associated with the MySQL
-				OS thread */
-/********************************************************************//**
-Releases a MySQL OS thread waiting for a lock to be released, if the
-thread is already suspended. */
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/**********************************************************************//**
+Check whether any background thread is active. If so, return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
 UNIV_INTERN
-void
-srv_release_mysql_thread_if_suspended(
-/*==================================*/
-	que_thr_t*	thr);	/*!< in: query thread associated with the
-				MySQL OS thread	 */
+enum srv_thread_type
+srv_get_active_thread_type(void);
+/*============================*/
+
+extern "C" {
+
 /*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
+A thread which prints the info output by various InnoDB monitors.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_lock_timeout_thread(
-/*====================*/
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
+
 /*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
+The master thread controlling the server.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_monitor_thread(
-/*===============*/
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
+
 /*************************************************************************
 A thread which prints warnings about semaphore waits which have lasted
 too long. These can be used to track bugs which cause hangs.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_error_monitor_thread(
-/*=====================*/
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
+
 /*********************************************************************//**
-A thread which restores the buffer pool from a dump file on startup and does
-periodic buffer pool dumps.
+Purge coordinator thread that schedules the purge tasks.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_LRU_dump_restore_thread(
-/*====================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
-/******************************************************************//**
-A thread which follows the redo log and outputs the changed page bitmap.
-@return a dummy value */
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_redo_log_follow_thread(
-/*=======================*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
-/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor.
-@return FALSE if not all information printed
-due to failure to obtain necessary mutex */
-UNIV_INTERN
-ibool
-srv_printf_innodb_monitor(
-/*======================*/
-	FILE*	file,		/*!< in: output stream */
-	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
-	ulint*	trx_start,	/*!< out: file position of the start of
-				the list of active transactions */
-	ulint*	trx_end);	/*!< out: file position of the end of
-				the list of active transactions */
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
+} /* extern "C" */
 
-/******************************************************************//**
-Function to pass InnoDB status variables to MySQL */
+/**********************************************************************//**
+Get count of tasks in the queue.
+@return number of tasks in queue  */
 UNIV_INTERN
-void
-srv_export_innodb_status(void);
-/*==========================*/
+ulint
+srv_get_task_queue_length(void);
+/*===========================*/
 
 /*********************************************************************//**
-Asynchronous purge thread.
-@return	a dummy parameter */
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
 UNIV_INTERN
-os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused))); /*!< in: a dummy parameter
-					      required by os_thread_create */
+ulint
+srv_release_threads(
+/*================*/
+	enum srv_thread_type	type,	/*!< in: thread type */
+	ulint			n);	/*!< in: number of threads to release */
 
 /**********************************************************************//**
-Enqueues a task to server task queue and releases a worker thread, if there
-is a suspended one. */
+Check whether any background threads are active. If so print which thread
+is active. Send the threads wakeup signal.
+@return name of thread that is active or NULL */
 UNIV_INTERN
-void
-srv_que_task_enqueue_low(
-/*=====================*/
-	que_thr_t*	thr);	/*!< in: query thread */
+const char*
+srv_any_background_threads_are_active(void);
+/*=======================================*/
 
 /**********************************************************************//**
-Check whether any background thread is active. If so, return the thread
-type.
-@return ULINT_UNDEFINED if all are are suspended or have exited, thread
-type if any are still active. */
+Wakeup the purge threads. */
 UNIV_INTERN
-ulint
-srv_get_active_thread_type(void);
-/*============================*/
+void
+srv_purge_wakeup(void);
+/*==================*/
 
 /** Status variables to be passed to MySQL */
-struct export_var_struct{
-	ulint innodb_adaptive_hash_cells;
-	ulint innodb_adaptive_hash_heap_buffers;
+struct export_var_t{
 	ulint innodb_adaptive_hash_hash_searches;
 	ulint innodb_adaptive_hash_non_hash_searches;
 	ulint innodb_background_log_sync;
@@ -821,7 +961,8 @@ struct export_var_struct{
 	ulint innodb_data_writes;		/*!< I/O write requests */
 	ulint innodb_data_written;		/*!< Data bytes written */
 	ulint innodb_data_reads;		/*!< I/O read requests */
-	ulint innodb_dict_tables;
+	char  innodb_buffer_pool_dump_status[512];/*!< Buf pool dump status */
+	char  innodb_buffer_pool_load_status[512];/*!< Buf pool load status */
 	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
 	ulint innodb_buffer_pool_pages_data;	/*!< Data pages */
 	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
@@ -846,7 +987,6 @@ struct export_var_struct{
 	ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
 	ulint innodb_checkpoint_age;
 	ulint innodb_checkpoint_max_age;
-	ulint innodb_checkpoint_target_age;
 	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
 	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
 	ulint innodb_deadlocks;
@@ -865,14 +1005,12 @@ struct export_var_struct{
 	ulint innodb_log_waits;			/*!< srv_log_waits */
 	ulint innodb_log_write_requests;	/*!< srv_log_write_requests */
 	ulint innodb_log_writes;		/*!< srv_log_writes */
-	ib_int64_t innodb_lsn_current;
-	ib_int64_t innodb_lsn_flushed;
-	ib_int64_t innodb_lsn_last_checkpoint;
-	ulint innodb_master_thread_1_second_loops;
-	ulint innodb_master_thread_10_second_loops;
-	ulint innodb_master_thread_background_loops;
-	ulint innodb_master_thread_main_flush_loops;
-	ulint innodb_master_thread_sleeps;
+	lsn_t innodb_os_log_written;		/*!< srv_os_log_written */
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	ulint innodb_master_thread_active_loops;/*!< srv_main_active_loops */
+	ulint innodb_master_thread_idle_loops;	/*!< srv_main_idle_loops */
 	ib_int64_t innodb_max_trx_id;
 	ulint innodb_mem_adaptive_hash;
 	ulint innodb_mem_dictionary;
@@ -881,7 +1019,6 @@ struct export_var_struct{
 	ib_int64_t innodb_mutex_spin_rounds;
 	ib_int64_t innodb_mutex_spin_waits;
 	ib_int64_t innodb_oldest_view_low_limit_trx_id;
-	ulint innodb_os_log_written;		/*!< srv_os_log_written */
 	ulint innodb_os_log_fsyncs;		/*!< fil_n_log_flushes */
 	ulint innodb_os_log_pending_writes;	/*!< srv_os_log_pending_writes */
 	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
@@ -905,7 +1042,9 @@ struct export_var_struct{
 	ulint innodb_rows_inserted;		/*!< srv_n_rows_inserted */
 	ulint innodb_rows_updated;		/*!< srv_n_rows_updated */
 	ulint innodb_rows_deleted;		/*!< srv_n_rows_deleted */
+	ulint innodb_num_open_files;		/*!< fil_n_file_opened */
 	ulint innodb_truncated_status_writes;	/*!< srv_truncated_status_writes */
+	ulint innodb_available_undo_logs;       /*!< srv_available_undo_logs */
 	ulint innodb_read_views_memory;		/*!< srv_read_views_memory */
 	ulint innodb_descriptors_memory;	/*!< srv_descriptors_memory */
 	ib_int64_t innodb_s_lock_os_waits;
@@ -915,32 +1054,43 @@ struct export_var_struct{
 	ib_int64_t innodb_x_lock_spin_rounds;
 	ib_int64_t innodb_x_lock_spin_waits;
 #ifdef UNIV_DEBUG
-	ulint innodb_purge_trx_id_age;		/*!< max_trx_id - purged trx_id */
+	ulint innodb_purge_trx_id_age;		/*!< rw_max_trx_id - purged trx_id */
 	ulint innodb_purge_view_trx_id_age;	/*!< rw_max_trx_id
 						- purged view's min trx_id */
 #endif /* UNIV_DEBUG */
 };
 
-/** Thread slot in the thread table */
-typedef struct srv_slot_struct	srv_slot_t;
-
-/** Thread table is an array of slots */
-typedef srv_slot_t	srv_table_t;
-
-/** The server system struct */
-struct srv_sys_struct{
-	srv_table_t*	threads;	/*!< server thread table */
-	UT_LIST_BASE_NODE_T(que_thr_t)
-			tasks;		/*!< task queue */
+/** Thread slot in the thread table.  */
+struct srv_slot_t{
+	srv_thread_type type;			/*!< thread type: user,
+						utility etc. */
+	ibool		in_use;			/*!< TRUE if this slot
+						is in use */
+	ibool		suspended;		/*!< TRUE if the thread is
+						waiting for the event of this
+						slot */
+	ib_time_t	suspend_time;		/*!< time when the thread was
+						suspended. Initialized by
+						lock_wait_table_reserve_slot()
+						for lock wait */
+	ulong		wait_timeout;		/*!< wait time that if exceeded
+						the thread will be timed out.
+						Initialized by
+						lock_wait_table_reserve_slot()
+						for lock wait */
+	os_event_t	event;			/*!< event used in suspending
+						the thread when it has nothing
+						to do */
+	que_thr_t*	thr;			/*!< suspended query thread
+						(only used for user threads) */
 };
 
-extern ulint	srv_n_threads_active[];
 #else /* !UNIV_HOTBACKUP */
 # define srv_use_adaptive_hash_indexes		FALSE
-# define srv_use_checksums			TRUE
 # define srv_use_native_aio			FALSE
 # define srv_force_recovery			0UL
 # define srv_set_io_thread_op_info(t,info)	((void) 0)
+# define srv_reset_io_thread_op_info()		((void) 0)
 # define srv_is_being_started			0
 # define srv_win_file_flush_method		SRV_WIN_IO_UNBUFFERED
 # define srv_unix_file_flush_method		SRV_UNIX_O_DSYNC
diff --git a/storage/xtradb/include/srv0srv.ic b/storage/xtradb/include/srv0srv.ic
index 19ba62cc3c2..53405c06f97 100644
--- a/storage/xtradb/include/srv0srv.ic
+++ b/storage/xtradb/include/srv0srv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/srv0start.h b/storage/xtradb/include/srv0start.h
index ffbb0dafa5d..40d502f4459 100644
--- a/storage/xtradb/include/srv0start.h
+++ b/storage/xtradb/include/srv0start.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,8 +27,15 @@ Created 10/10/1995 Heikki Tuuri
 #define srv0start_h
 
 #include "univ.i"
+#include "log0log.h"
 #include "ut0byte.h"
 
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR	'\\'
+#else
+#define SRV_PATH_SEPARATOR	'/'
+#endif
+
 /*********************************************************************//**
 Normalizes a directory path for Windows: converts slashes to backslashes. */
 UNIV_INTERN
@@ -46,15 +53,6 @@ srv_parse_data_file_paths_and_sizes(
 /*================================*/
 	char*	str);	/*!< in/out: the data file path string */
 /*********************************************************************//**
-Reads log group home directories from a character string given in
-the .cnf file.
-@return	TRUE if ok, FALSE on parse error */
-UNIV_INTERN
-ibool
-srv_parse_log_group_home_dirs(
-/*==========================*/
-	char*	str);	/*!< in/out: character string */
-/*********************************************************************//**
 Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
 and srv_parse_log_group_home_dirs(). */
 UNIV_INTERN
@@ -76,20 +74,54 @@ Starts Innobase and creates a new database if database files
 are not found and the user wants.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-int
+dberr_t
 innobase_start_or_create_for_mysql(void);
 /*====================================*/
 /****************************************************************//**
 Shuts down the Innobase database.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-int
+dberr_t
 innobase_shutdown_for_mysql(void);
+
+/********************************************************************
+Signal all per-table background threads to shutdown, and wait for them to do
+so. */
+UNIV_INTERN
+void
+srv_shutdown_table_bg_threads(void);
 /*=============================*/
+
+/*************************************************************//**
+Copy the file path component of the physical file to parameter. It will
+copy up to and including the terminating path separator.
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer
+	is smaller than the path to be copied. */
+UNIV_INTERN
+ulint
+srv_path_copy(
+/*==========*/
+	char*		dest,		/*!< out: destination buffer */
+	ulint		dest_len,	/*!< in: max bytes to copy */
+	const char*	basedir,	/*!< in: base directory */
+	const char*	table_name)	/*!< in: source table name */
+	__attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Get the meta-data filename from the table name. */
+UNIV_INTERN
+void
+srv_get_meta_data_filename(
+/*======================*/
+	dict_table_t*	table,		/*!< in: table */
+	char*			filename,	/*!< out: filename */
+	ulint			max_len)	/*!< in: filename max length */
+	__attribute__((nonnull));
+
 /** Log sequence number at shutdown */
-extern	ib_uint64_t	srv_shutdown_lsn;
+extern	lsn_t	srv_shutdown_lsn;
 /** Log sequence number immediately after startup */
-extern	ib_uint64_t	srv_start_lsn;
+extern	lsn_t	srv_start_lsn;
 
 #ifdef HAVE_DARWIN_THREADS
 /** TRUE if the F_FULLFSYNC option is available */
@@ -113,6 +145,11 @@ enum srv_shutdown_state {
 	SRV_SHUTDOWN_NONE = 0,	/*!< Database running normally */
 	SRV_SHUTDOWN_CLEANUP,	/*!< Cleaning up in
 				logs_empty_and_mark_files_at_shutdown() */
+	SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the
+				purge threads must have completed their
+				work. Once we enter this phase the
+				page_cleaner can clean up the buffer
+				pool and exit */
 	SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
 				the buffer pool can be freed: flush
 				all file spaces and close all files */
@@ -127,7 +164,4 @@ extern	enum srv_shutdown_state	srv_shutdown_state;
 /** Log 'spaces' have id's >= this */
 #define SRV_LOG_SPACE_FIRST_ID		0xFFFFFFF0UL
 
-/** reserved for extra system tables */
-#define SRV_EXTRA_SYS_SPACE_FIRST_ID	0xFFFFFFE0UL
-
 #endif
diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h
index 4bce9435577..bb4d1037a62 100644
--- a/storage/xtradb/include/sync0arr.h
+++ b/storage/xtradb/include/sync0arr.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,36 +32,10 @@ Created 9/5/1995 Heikki Tuuri
 #include "os0thread.h"
 
 /** Synchronization wait array cell */
-typedef struct sync_cell_struct		sync_cell_t;
+struct sync_cell_t;
 /** Synchronization wait array */
-typedef struct sync_array_struct	sync_array_t;
-
-/** Parameters for sync_array_create() @{ */
-#define SYNC_ARRAY_OS_MUTEX	1	/*!< protected by os_mutex_t */
-#define SYNC_ARRAY_MUTEX	2	/*!< protected by mutex_t */
-/* @} */
-
-/*******************************************************************//**
-Creates a synchronization wait array. It is protected by a mutex
-which is automatically reserved when the functions operating on it
-are called.
-@return	own: created wait array */
-UNIV_INTERN
-sync_array_t*
-sync_array_create(
-/*==============*/
-	ulint	n_cells,	/*!< in: number of cells in the array
-				to create */
-	ulint	protection);	/*!< in: either SYNC_ARRAY_OS_MUTEX or
-				SYNC_ARRAY_MUTEX: determines the type
-				of mutex protecting the data structure */
-/******************************************************************//**
-Frees the resources in a wait array. */
-UNIV_INTERN
-void
-sync_array_free(
-/*============*/
-	sync_array_t*	arr);	/*!< in, own: sync wait array */
+struct sync_array_t;
+
 /******************************************************************//**
 Reserves a wait array cell for waiting for an object.
 The event of the cell is reset to nonsignalled state. */
@@ -99,9 +73,9 @@ sync_array_free_cell(
 Note that one of the wait objects was signalled. */
 UNIV_INTERN
 void
-sync_array_object_signalled(
-/*========================*/
-	sync_array_t*	arr);	/*!< in: wait array */
+sync_array_object_signalled(void);
+/*=============================*/
+
 /**********************************************************************//**
 If the wakeup algorithm does not work perfectly at semaphore relases,
 this function will do the waking (see the comment in mutex_exit). This
@@ -132,11 +106,30 @@ sync_array_validate(
 Prints info of the wait array. */
 UNIV_INTERN
 void
-sync_array_print_info(
+sync_array_print(
+/*=============*/
+	FILE*		file);	/*!< in: file where to print */
+
+/**********************************************************************//**
+Create the primary system wait array(s), they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+	ulint		n_threads);	/*!< in: Number of slots to create */
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void);
 /*==================*/
-	FILE*		file,	/*!< in: file where to print */
-	sync_array_t*	arr);	/*!< in: wait array */
 
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void);
+/*================*/
 
 #ifndef UNIV_NONINL
 #include "sync0arr.ic"
diff --git a/storage/xtradb/include/sync0arr.ic b/storage/xtradb/include/sync0arr.ic
index b49dce34017..0114a1ff5a2 100644
--- a/storage/xtradb/include/sync0arr.ic
+++ b/storage/xtradb/include/sync0arr.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,4 +24,3 @@ Inline code
 
 Created 9/5/1995 Heikki Tuuri
 *******************************************************/
-
diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h
index 414d7ea43dc..ace3a0993c8 100644
--- a/storage/xtradb/include/sync0rw.h
+++ b/storage/xtradb/include/sync0rw.h
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,6 +36,7 @@ Created 9/11/1995 Heikki Tuuri
 #include "univ.i"
 #ifndef UNIV_HOTBACKUP
 #include "ut0lst.h"
+#include "ut0counter.h"
 #include "sync0sync.h"
 #include "os0sync.h"
 
@@ -44,6 +45,43 @@ in MySQL: */
 #undef rw_lock_t
 #endif /* !UNIV_HOTBACKUP */
 
+/** Counters for RW locks. */
+struct rw_lock_stats_t {
+	typedef ib_counter_t<ib_int64_t, IB_N_SLOTS> ib_int64_counter_t;
+
+	/** number of spin waits on rw-latches,
+	resulted during shared (read) locks */
+	ib_int64_counter_t	rw_s_spin_wait_count;
+
+	/** number of spin loop rounds on rw-latches,
+	resulted during shared (read) locks */
+	ib_int64_counter_t	rw_s_spin_round_count;
+
+	/** number of OS waits on rw-latches,
+	resulted during shared (read) locks */
+	ib_int64_counter_t	rw_s_os_wait_count;
+
+	/** number of unlocks (that unlock shared locks),
+	set only when UNIV_SYNC_PERF_STAT is defined */
+	ib_int64_counter_t	rw_s_exit_count;
+
+	/** number of spin waits on rw-latches,
+	resulted during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_spin_wait_count;
+
+	/** number of spin loop rounds on rw-latches,
+	resulted during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_spin_round_count;
+
+	/** number of OS waits on rw-latches,
+	resulted during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_os_wait_count;
+
+	/** number of unlocks (that unlock exclusive locks),
+	set only when UNIV_SYNC_PERF_STAT is defined */
+	ib_int64_counter_t	rw_x_exit_count;
+};
+
 /* Latch types; these are used also in btr0btr.h: keep the numerical values
 smaller than 30 and the order of the numerical values like below! */
 #define RW_S_LATCH	1
@@ -57,22 +95,23 @@ of concurrent read locks before the rw_lock breaks. The current value of
 0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
 #define X_LOCK_DECR		0x00100000
 
-typedef struct rw_lock_struct		rw_lock_t;
+struct rw_lock_t;
+struct prio_rw_lock_t;
 #ifdef UNIV_SYNC_DEBUG
-typedef struct rw_lock_debug_struct	rw_lock_debug_t;
+struct rw_lock_debug_t;
 #endif /* UNIV_SYNC_DEBUG */
 
 typedef UT_LIST_BASE_NODE_T(rw_lock_t)	rw_lock_list_t;
 
 extern rw_lock_list_t	rw_lock_list;
-extern mutex_t		rw_lock_list_mutex;
+extern ib_mutex_t		rw_lock_list_mutex;
 
 #ifdef UNIV_SYNC_DEBUG
 /* The global mutex which protects debug info lists of all rw-locks.
 To modify the debug info list of an rw-lock, this mutex has to be
 
 acquired in addition to the mutex protecting the lock. */
-extern mutex_t		rw_lock_debug_mutex;
+extern ib_mutex_t		rw_lock_debug_mutex;
 extern os_event_t	rw_lock_debug_event;	/*!< If deadlock detection does
 					not get immediately the mutex it
 					may wait for this event */
@@ -80,30 +119,8 @@ extern ibool		rw_lock_debug_waiters;	/*!< This is set to TRUE, if
 					there may be waiters for the event */
 #endif /* UNIV_SYNC_DEBUG */
 
-/** number of spin waits on rw-latches,
-resulted during exclusive (write) locks */
-extern	ib_int64_t	rw_s_spin_wait_count;
-/** number of spin loop rounds on rw-latches,
-resulted during exclusive (write) locks */
-extern	ib_int64_t	rw_s_spin_round_count;
-/** number of unlocks (that unlock shared locks),
-set only when UNIV_SYNC_PERF_STAT is defined */
-extern	ib_int64_t	rw_s_exit_count;
-/** number of OS waits on rw-latches,
-resulted during shared (read) locks */
-extern	ib_int64_t	rw_s_os_wait_count;
-/** number of spin waits on rw-latches,
-resulted during shared (read) locks */
-extern	ib_int64_t	rw_x_spin_wait_count;
-/** number of spin loop rounds on rw-latches,
-resulted during shared (read) locks */
-extern	ib_int64_t	rw_x_spin_round_count;
-/** number of OS waits on rw-latches,
-resulted during exclusive (write) locks */
-extern	ib_int64_t	rw_x_os_wait_count;
-/** number of unlocks (that unlock exclusive locks),
-set only when UNIV_SYNC_PERF_STAT is defined */
-extern	ib_int64_t	rw_x_exit_count;
+/** Counters for RW locks. */
+extern rw_lock_stats_t	rw_lock_stats;
 
 #ifdef UNIV_PFS_RWLOCK
 /* Following are rwlock keys used to register with MySQL
@@ -112,18 +129,22 @@ performance schema */
 extern	mysql_pfs_key_t	archive_lock_key;
 # endif /* UNIV_LOG_ARCHIVE */
 extern	mysql_pfs_key_t btr_search_latch_key;
-extern	mysql_pfs_key_t	buf_pool_page_hash_key;
 extern	mysql_pfs_key_t	buf_block_lock_key;
 # ifdef UNIV_SYNC_DEBUG
 extern	mysql_pfs_key_t	buf_block_debug_latch_key;
 # endif /* UNIV_SYNC_DEBUG */
 extern	mysql_pfs_key_t	dict_operation_lock_key;
-extern	mysql_pfs_key_t	fil_space_latch_key;
 extern	mysql_pfs_key_t	checkpoint_lock_key;
+extern	mysql_pfs_key_t	fil_space_latch_key;
+extern	mysql_pfs_key_t	fts_cache_rw_lock_key;
+extern	mysql_pfs_key_t	fts_cache_init_rw_lock_key;
 extern	mysql_pfs_key_t	trx_i_s_cache_lock_key;
 extern	mysql_pfs_key_t	trx_purge_latch_key;
 extern	mysql_pfs_key_t	index_tree_rw_lock_key;
+extern	mysql_pfs_key_t	index_online_log_key;
 extern	mysql_pfs_key_t	dict_table_stats_latch_key;
+extern  mysql_pfs_key_t trx_sys_rw_lock_key;
+extern  mysql_pfs_key_t hash_table_rw_lock_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 
@@ -279,6 +300,24 @@ rw_lock_create_func(
 #endif /* UNIV_DEBUG */
 	const char*	cmutex_name);	/*!< in: mutex name */
 /******************************************************************//**
+Creates, or rather, initializes a priority rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+	prio_rw_lock_t*	lock,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline,		/*!< in: file line where created */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name);	/*!< in: mutex name */
+/******************************************************************//**
 Calling this function is obligatory only if the memory buffer containing
 the rw-lock is freed. Removes an rw-lock object from the global list. The
 rw-lock is checked to be in the non-locked state. */
@@ -287,6 +326,15 @@ void
 rw_lock_free_func(
 /*==============*/
 	rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the priority rw-lock is freed. Removes an rw-lock object from the global list.
+The rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free_func(
+/*==============*/
+	prio_rw_lock_t*	lock);	/*!< in: rw-lock */
 #ifdef UNIV_DEBUG
 /******************************************************************//**
 Checks that the rw-lock has been initialized and that there are no
@@ -297,6 +345,15 @@ ibool
 rw_lock_validate(
 /*=============*/
 	rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Checks that the priority rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return	TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+	prio_rw_lock_t*	lock);	/*!< in: rw-lock */
 #endif /* UNIV_DEBUG */
 /******************************************************************//**
 Low-level function which tries to lock an rw-lock in s-mode. Performs no
@@ -329,6 +386,22 @@ rw_lock_s_lock_func(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
 /******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock a priority rw-lock in shared
+mode for the current thread, using the relative thread priority.  If the
+rw-lock is locked in exclusive mode, or there is an exclusive lock request
+waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
 NOTE! Use the corresponding macro, not directly this function! Lock an
 rw-lock in exclusive mode for the current thread if the lock can be
 obtained immediately.
@@ -353,6 +426,17 @@ rw_lock_s_unlock_func(
 	rw_lock_t*	lock);	/*!< in/out: rw-lock */
 
 /******************************************************************//**
+Releases a shared mode priority lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	prio_rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
 NOTE! Use the corresponding macro, not directly this function! Lock an
 rw-lock in exclusive mode for the current thread. If the rw-lock is locked
 in shared or exclusive mode, or there is an exclusive lock request waiting,
@@ -365,7 +449,30 @@ UNIV_INTERN
 void
 rw_lock_x_lock_func(
 /*================*/
-	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	rw_lock_t*      lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line,	/*!< in: line where requested */
+	bool		priority_lock = false,
+				/*!< in: whether the lock is a priority lock */
+	bool		high_priority = false);
+				/*!< in: whether we are acquiring a priority
+				lock with high priority */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock a priority
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
 	ulint		pass,	/*!< in: pass value; != 0, if the lock will
 				be passed to another thread to unlock */
 	const char*	file_name,/*!< in: file name where lock requested */
@@ -381,30 +488,17 @@ rw_lock_x_unlock_func(
 				been passed to another thread to unlock */
 #endif
 	rw_lock_t*	lock);	/*!< in/out: rw-lock */
-
-
-/******************************************************************//**
-Low-level function which locks an rw-lock in s-mode when we know that it
-is possible and none else is currently accessing the rw-lock structure.
-Then we can do the locking without reserving the mutex. */
-UNIV_INLINE
-void
-rw_lock_s_lock_direct(
-/*==================*/
-	rw_lock_t*	lock,		/*!< in/out: rw-lock */
-	const char*	file_name,	/*!< in: file name where requested */
-	ulint		line);		/*!< in: line where lock requested */
 /******************************************************************//**
-Low-level function which locks an rw-lock in x-mode when we know that it
-is not locked and none else is currently accessing the rw-lock structure.
-Then we can do the locking without reserving the mutex. */
+Releases an exclusive mode priority lock. */
 UNIV_INLINE
 void
-rw_lock_x_lock_direct(
+rw_lock_x_unlock_func(
 /*==================*/
-	rw_lock_t*	lock,		/*!< in/out: rw-lock */
-	const char*	file_name,	/*!< in: file name where requested */
-	ulint		line);		/*!< in: line where lock requested */
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	prio_rw_lock_t*	lock);	/*!< in/out: rw-lock */
 /******************************************************************//**
 This function is used in the insert buffer to move the ownership of an
 x-latch on a buffer frame to the current thread. The x-latch was set by
@@ -420,22 +514,6 @@ rw_lock_x_lock_move_ownership(
 	rw_lock_t*	lock);	/*!< in: lock which was x-locked in the
 				buffer read */
 /******************************************************************//**
-Releases a shared mode lock when we know there are no waiters and none
-else will access the lock during the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_s_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock);	/*!< in/out: rw-lock */
-/******************************************************************//**
-Releases an exclusive mode lock when we know there are no waiters, and
-none else will access the lock durint the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_x_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock);	/*!< in/out: rw-lock */
-/******************************************************************//**
 Returns the value of writer_count for the lock. Does not reserve the lock
 mutex, so the caller must be sure it is not changed during the call.
 @return	value of writer_count */
@@ -444,6 +522,15 @@ ulint
 rw_lock_get_x_lock_count(
 /*=====================*/
 	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Returns the value of writer_count for the priority lock. Does not reserve the
+lock mutex, so the caller must be sure it is not changed during the call.
+@return	value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+	const prio_rw_lock_t*	lock);	/*!< in: rw-lock */
 /********************************************************************//**
 Check if there are threads waiting for the rw-lock.
 @return	1 if waiters, 0 otherwise */
@@ -452,6 +539,14 @@ ulint
 rw_lock_get_waiters(
 /*================*/
 	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/********************************************************************//**
+Check if there are threads waiting for the priority rw-lock.
+@return	1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+	const prio_rw_lock_t*	lock);	/*!< in: rw-lock */
 /******************************************************************//**
 Returns the write-status of the lock - this function made more sense
 with the old rw_lock implementation.
@@ -462,6 +557,15 @@ rw_lock_get_writer(
 /*===============*/
 	const rw_lock_t*	lock);	/*!< in: rw-lock */
 /******************************************************************//**
+Returns the write-status of the priority lock - this function made more sense
+with the old rw_lock implementation.
+@return	RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+	const prio_rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
 Returns the number of readers.
 @return	number of readers */
 UNIV_INLINE
@@ -470,6 +574,14 @@ rw_lock_get_reader_count(
 /*=====================*/
 	const rw_lock_t*	lock);	/*!< in: rw-lock */
 /******************************************************************//**
+Returns the number of readers.
+@return	number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	const prio_rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
 Decrements lock_word the specified amount if it is greater than 0.
 This is used by both s_lock and x_lock operations.
 @return	TRUE if decr occurs */
@@ -516,6 +628,17 @@ rw_lock_own(
 	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
 					RW_LOCK_EX */
 	__attribute__((warn_unused_result));
+/******************************************************************//**
+Checks if the thread has locked the priority rw-lock in the specified mode,
+with the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+	prio_rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+	__attribute__((warn_unused_result));
 #endif /* UNIV_SYNC_DEBUG */
 /******************************************************************//**
 Checks if somebody has locked the rw-lock in the specified mode. */
@@ -588,7 +711,7 @@ shared locks are allowed. To prevent starving of a writer blocked by
 readers, a writer may queue for x-lock by decrementing lock_word: no
 new readers will be let in while the thread waits for readers to
 exit. */
-struct rw_lock_struct {
+struct rw_lock_t {
 	volatile lint	lock_word;
 				/*!< Holds the state of the lock. */
 	volatile ulint	waiters;/*!< 1: there are waiters */
@@ -608,12 +731,12 @@ struct rw_lock_struct {
 				/*!< Thread id of writer thread. Is only
 				guaranteed to have sane and non-stale
 				value iff recursive flag is set. */
-	os_event_t	event;	/*!< Used by sync0arr.c for thread queueing */
+	os_event_t	event;	/*!< Used by sync0arr.cc for thread queueing */
 	os_event_t	wait_ex_event;
 				/*!< Event for next-writer to wait on. A thread
 				must decrement lock_word before waiting. */
 #ifndef INNODB_RW_LOCKS_USE_ATOMICS
-	mutex_t	mutex;		/*!< The mutex protecting rw_lock_struct */
+	ib_mutex_t	mutex;		/*!< The mutex protecting rw_lock_t */
 #endif /* INNODB_RW_LOCKS_USE_ATOMICS */
 
 	UT_LIST_NODE_T(rw_lock_t) list;
@@ -646,15 +769,36 @@ struct rw_lock_struct {
 	unsigned	last_x_line:14;	/*!< Line number where last time x-locked */
 #ifdef UNIV_DEBUG
 	ulint	magic_n;	/*!< RW_LOCK_MAGIC_N */
-/** Value of rw_lock_struct::magic_n */
+/** Value of rw_lock_t::magic_n */
 #define	RW_LOCK_MAGIC_N	22643
 #endif /* UNIV_DEBUG */
 };
 
+/** The structure implementing a priority rw lock.  */
+struct prio_rw_lock_t {
+	struct rw_lock_t	base_lock;	/* The regular rw latch
+						provides the lock word etc. for
+						the priority rw lock  */
+	volatile ulint		high_priority_s_waiters;
+						/* If 1, high priority S
+						waiters exist */
+	os_event_t		high_priority_s_event; /* High priority wait
+						array event for S waiters */
+	volatile ulint		high_priority_x_waiters;
+						/* If 1, high priority X
+						waiters exist */
+	os_event_t		high_priority_x_event;
+						/* High priority wait array
+						event for X waiters */
+	volatile ulint		high_priority_wait_ex_waiter;
+						/* If 1, a waiting next-writer
+						exists and is high-priority */
+};
+
 #ifdef UNIV_SYNC_DEBUG
 /** The structure for storing debug info of an rw-lock.  All access to this
 structure must be protected by rw_lock_debug_mutex_enter(). */
-struct	rw_lock_debug_struct {
+struct	rw_lock_debug_t {
 
 	os_thread_id_t thread_id;  /*!< The thread id of the thread which
 				locked the rw-lock */
@@ -691,9 +835,6 @@ rw_lock_s_lock_gen()
 rw_lock_s_lock_nowait()
 rw_lock_s_unlock_gen()
 rw_lock_free()
-
-Two function APIs rw_lock_x_unlock_direct() and rw_lock_s_unlock_direct()
-do not have any caller/user, they are not instrumented.
 */
 
 #ifdef UNIV_PFS_RWLOCK
@@ -718,6 +859,26 @@ pfs_rw_lock_create_func(
 	const char*	cmutex_name);	/*!< in: mutex name */
 
 /******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func()
+NOTE! Please use the corresponding macro rw_lock_create(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+	PSI_rwlock_key  key,		/*!< in: key registered with
+					performance schema */
+	prio_rw_lock_t*	lock,		/*!< in: rw lock */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline,		/*!< in: file line where created */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name);	/*!< in: mutex name */
+
+/******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_x_lock_func()
 NOTE! Please use the corresponding macro rw_lock_x_lock(), not
 directly this function! */
@@ -730,6 +891,21 @@ pfs_rw_lock_x_lock_func(
 				be passed to another thread to unlock */
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+
 /******************************************************************//**
 Performance schema instrumented wrap function for
 rw_lock_x_lock_func_nowait()
@@ -742,6 +918,7 @@ pfs_rw_lock_x_lock_func_nowait(
 	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
+
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_lock_func()
 NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
@@ -755,6 +932,21 @@ pfs_rw_lock_s_lock_func(
 				be passed to another thread to unlock */
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_lock_func()
 NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
@@ -771,6 +963,21 @@ pfs_rw_lock_s_lock_low(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
 /******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				lock will be passed to another
+				thread to unlock */
+	const char*	file_name, /*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_x_lock_func()
 NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
 this function! */
@@ -784,6 +991,19 @@ pfs_rw_lock_x_lock_func(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
 /******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_unlock_func()
 NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly
 this function! */
@@ -799,6 +1019,20 @@ pfs_rw_lock_s_unlock_func(
 	rw_lock_t*	lock);	/*!< in/out: rw-lock */
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				lock may have been passed to another
+			        thread to unlock */
+#endif
+	prio_rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
 NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
 this function! */
 UNIV_INLINE
@@ -812,6 +1046,20 @@ pfs_rw_lock_x_unlock_func(
 #endif
 	rw_lock_t*	lock);	/*!< in/out: rw-lock */
 /******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				lock may have been passed to another
+				thread to unlock */
+#endif
+	prio_rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_free_func()
 NOTE! Please use the corresponding macro rw_lock_free(), not directly
 this function! */
@@ -820,6 +1068,15 @@ void
 pfs_rw_lock_free_func(
 /*==================*/
 	rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+	prio_rw_lock_t*	lock);	/*!< in: rw-lock */
 #endif  /* UNIV_PFS_RWLOCK */
 
 
diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic
index 706ccbc00de..c625ee39035 100644
--- a/storage/xtradb/include/sync0rw.ic
+++ b/storage/xtradb/include/sync0rw.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,17 +31,22 @@ Created 9/11/1995 Heikki Tuuri
 *******************************************************/
 
 /******************************************************************//**
-Lock an rw-lock in shared mode for the current thread. If the rw-lock is
-locked in exclusive mode, or there is an exclusive lock request waiting,
-the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+Lock a regular or priority rw-lock in shared mode for the current thread. If
+the rw-lock is locked in exclusive mode, or there is an exclusive lock request
+waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
 waiting for the lock before suspending the thread. */
 UNIV_INTERN
 void
 rw_lock_s_lock_spin(
 /*================*/
-	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	void*		_lock,	/*!< in: pointer to rw-lock */
 	ulint		pass,	/*!< in: pass value; != 0, if the lock will
 				be passed to another thread to unlock */
+	bool		priority_lock,
+				/*!< in: whether the lock is a priority lock */
+	bool		high_priority,
+				/*!< in: whether we are acquiring a priority
+				lock with high priority */
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line);	/*!< in: line where requested */
 #ifdef UNIV_SYNC_DEBUG
@@ -80,6 +85,20 @@ rw_lock_get_waiters(
 }
 
 /********************************************************************//**
+Check if there are threads waiting for the priority rw-lock.
+@return	1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+	const prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return rw_lock_get_waiters(&lock->base_lock)
+		| lock->high_priority_s_waiters
+		| lock->high_priority_x_waiters;
+}
+
+/********************************************************************//**
 Sets lock->waiters to 1. It is not an error if lock->waiters is already
 1. On platforms where ATOMIC builtins are used this function enforces a
 memory barrier. */
@@ -128,15 +147,28 @@ rw_lock_get_writer(
 		/* return NOT_LOCKED in s-lock state, like the writer
 		member of the old lock implementation. */
 		return(RW_LOCK_NOT_LOCKED);
-	} else if (((-lock_word) % X_LOCK_DECR) == 0) {
+	} else if ((lock_word == 0) || (lock_word <= -X_LOCK_DECR)) {
 		return(RW_LOCK_EX);
 	} else {
-                ut_ad(lock_word > -X_LOCK_DECR);
+		ut_ad(lock_word > -X_LOCK_DECR);
 		return(RW_LOCK_WAIT_EX);
 	}
 }
 
 /******************************************************************//**
+Returns the write-status of the priority lock - this function made more sense
+with the old rw_lock implementation.
+@return	RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+	const prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return(rw_lock_get_writer(&lock->base_lock));
+}
+
+/******************************************************************//**
 Returns the number of readers.
 @return	number of readers */
 UNIV_INLINE
@@ -156,9 +188,21 @@ rw_lock_get_reader_count(
 	return(0);
 }
 
+/******************************************************************//**
+Returns the number of readers.
+@return	number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	const prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return(rw_lock_get_reader_count(&lock->base_lock));
+}
+
 #ifndef INNODB_RW_LOCKS_USE_ATOMICS
 UNIV_INLINE
-mutex_t*
+ib_mutex_t*
 rw_lock_get_mutex(
 /*==============*/
 	rw_lock_t*	lock)
@@ -178,11 +222,23 @@ rw_lock_get_x_lock_count(
 	const rw_lock_t*	lock)	/*!< in: rw-lock */
 {
 	lint lock_copy = lock->lock_word;
-	/* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
-	if (lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
+	if ((lock_copy != 0) && (lock_copy > -X_LOCK_DECR)) {
 		return(0);
 	}
-	return(((-lock_copy) / X_LOCK_DECR) + 1);
+	return((lock_copy == 0) ? 1 : (2 - (lock_copy + X_LOCK_DECR)));
+}
+
+/******************************************************************//**
+Returns the value of writer_count for the priority lock. Does not reserve the
+lock mutex, so the caller must be sure it is not changed during the call.
+@return	value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+	const prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return(rw_lock_get_x_lock_count(&lock->base_lock));
 }
 
 /******************************************************************//**
@@ -200,7 +256,7 @@ rw_lock_lock_word_decr(
 	ulint		amount)		/*!< in: amount to decrement */
 {
 #ifdef INNODB_RW_LOCKS_USE_ATOMICS
-        lint local_lock_word = lock->lock_word;
+	lint local_lock_word = lock->lock_word;
 	while (local_lock_word > 0) {
 		if (os_compare_and_swap_lint(&lock->lock_word,
 					     local_lock_word,
@@ -244,7 +300,7 @@ rw_lock_lock_word_incr(
 
 	mutex_exit(&(lock->mutex));
 
-        return(local_lock_word);
+	return(local_lock_word);
 #endif /* INNODB_RW_LOCKS_USE_ATOMICS */
 }
 
@@ -308,7 +364,6 @@ rw_lock_s_lock_low(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
 	if (!rw_lock_lock_word_decr(lock, 1)) {
 		/* Locking did not succeed */
 		return(FALSE);
@@ -318,7 +373,7 @@ rw_lock_s_lock_low(
 	rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
 #endif
 	/* These debugging values are not set safely: they may be incorrect
-        or even refer to a line that is invalid for the file name. */
+	or even refer to a line that is invalid for the file name. */
 	lock->last_s_file_name = file_name;
 	lock->last_s_line = line;
 
@@ -326,58 +381,6 @@ rw_lock_s_lock_low(
 }
 
 /******************************************************************//**
-Low-level function which locks an rw-lock in s-mode when we know that it
-is possible and none else is currently accessing the rw-lock structure.
-Then we can do the locking without reserving the mutex. */
-UNIV_INLINE
-void
-rw_lock_s_lock_direct(
-/*==================*/
-	rw_lock_t*	lock,		/*!< in/out: rw-lock */
-	const char*	file_name,	/*!< in: file name where requested */
-	ulint		line)		/*!< in: line where lock requested */
-{
-	ut_ad(lock->lock_word == X_LOCK_DECR);
-
-	/* Indicate there is a new reader by decrementing lock_word */
-	lock->lock_word--;
-
-	lock->last_s_file_name = file_name;
-	lock->last_s_line = line;
-
-#ifdef UNIV_SYNC_DEBUG
-	rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line);
-#endif
-}
-
-/******************************************************************//**
-Low-level function which locks an rw-lock in x-mode when we know that it
-is not locked and none else is currently accessing the rw-lock structure.
-Then we can do the locking without reserving the mutex. */
-UNIV_INLINE
-void
-rw_lock_x_lock_direct(
-/*==================*/
-	rw_lock_t*	lock,		/*!< in/out: rw-lock */
-	const char*	file_name,	/*!< in: file name where requested */
-	ulint		line)		/*!< in: line where lock requested */
-{
-	ut_ad(rw_lock_validate(lock));
-	ut_ad(lock->lock_word == X_LOCK_DECR);
-
-	lock->lock_word -= X_LOCK_DECR;
-	lock->writer_thread = os_thread_get_curr_id();
-	lock->recursive = TRUE;
-
-	lock->last_x_file_name = file_name;
-	lock->last_x_line = line;
-
-#ifdef UNIV_SYNC_DEBUG
-	rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
-#endif
-}
-
-/******************************************************************//**
 NOTE! Use the corresponding macro, not directly this function! Lock an
 rw-lock in shared mode for the current thread. If the rw-lock is locked
 in exclusive mode, or there is an exclusive lock request waiting, the
@@ -409,14 +412,81 @@ rw_lock_s_lock_func(
 	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
 	if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
 
 		return; /* Success */
 	} else {
 		/* Did not succeed, try spin wait */
 
-		rw_lock_s_lock_spin(lock, pass, file_name, line);
+		rw_lock_s_lock_spin(lock, pass, false, false, file_name, line);
+
+		return;
+	}
+}
+
+/******************************************************************//**
+Return true if waiters of higher priority than the current thread
+exist.
+@return true if waiters of higher priority exist */
+UNIV_INLINE
+bool
+rw_lock_higher_prio_waiters_exist(
+/*==============================*/
+	bool	priority_lock,	/*!< in: whether the lock is a priority lock */
+	bool	high_priority,	/*!< in: whether we are acquiring a priority
+				lock with high priority */
+	void*	lock)		/*!< in: rw lock */
+{
+	if (high_priority || !priority_lock) {
+		ut_ad(!(!priority_lock && high_priority));
+		return(false);
+	}
+
+	ut_ad(priority_lock && !high_priority);
+
+	prio_rw_lock_t *prio_rw_lock = (prio_rw_lock_t *) lock;
+	return prio_rw_lock->high_priority_wait_ex_waiter > 0
+		|| prio_rw_lock->high_priority_s_waiters > 0
+		|| prio_rw_lock->high_priority_x_waiters > 0;
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock a priority rw-lock in shared
+mode for the current thread, using the relative thread priority.  If the
+rw-lock is locked in exclusive mode, or there is an exclusive lock request
+waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	bool	high_priority = srv_current_thread_priority > 0;
+
+	/* Do not attempt to acquire a low-priority S latch if there are
+	high-priority waiters even if such attempt would be successful.  This
+	is to prevent a high priority X request from being starved by a
+	sequence of overlapping regular priority S requests.  */
+
+	if (!rw_lock_higher_prio_waiters_exist(true, high_priority, lock)
+	    && rw_lock_s_lock_low(&lock->base_lock, pass, file_name, line)) {
+
+		return; /* Success */
+	} else {
+		/* Did not succeed, try spin wait */
+		rw_lock_s_lock_spin(lock, pass, true, high_priority, file_name,
+				    line);
 
 		return;
 	}
@@ -435,8 +505,6 @@ rw_lock_x_lock_func_nowait(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	os_thread_id_t	curr_thread	= os_thread_get_curr_id();
-
 	ibool success;
 
 #ifdef INNODB_RW_LOCKS_USE_ATOMICS
@@ -456,13 +524,19 @@ rw_lock_x_lock_func_nowait(
 		rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
 
 	} else if (lock->recursive
-		   && os_thread_eq(lock->writer_thread, curr_thread)) {
+		   && os_thread_eq(lock->writer_thread,
+				   os_thread_get_curr_id())) {
 		/* Relock: this lock_word modification is safe since no other
 		threads can modify (lock, unlock, or reserve) lock_word while
 		there is an exclusive writer and this is the writer thread. */
-		lock->lock_word -= X_LOCK_DECR;
+		if (lock->lock_word == 0) {
+			lock->lock_word = -X_LOCK_DECR;
+		} else {
+			lock->lock_word--;
+		}
 
-		ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
+		/* Watch for too many recursive locks */
+		ut_ad(lock->lock_word < 0);
 
 	} else {
 		/* Failure */
@@ -492,7 +566,9 @@ rw_lock_s_unlock_func(
 #endif
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
-	ut_ad((lock->lock_word % X_LOCK_DECR) != 0);
+	ut_ad(lock->lock_word > -X_LOCK_DECR);
+	ut_ad(lock->lock_word != 0);
+	ut_ad(lock->lock_word < X_LOCK_DECR);
 
 #ifdef UNIV_SYNC_DEBUG
 	rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
@@ -502,10 +578,10 @@ rw_lock_s_unlock_func(
 	if (rw_lock_lock_word_incr(lock, 1) == 0) {
 
 		/* wait_ex waiter exists. It may not be asleep, but we signal
-                anyway. We do not wake other waiters, because they can't
-                exist without wait_ex waiter and wait_ex waiter goes first.*/
+		anyway. We do not wake other waiters, because they can't
+		exist without wait_ex waiter and wait_ex waiter goes first.*/
 		os_event_set(lock->wait_ex_event);
-		sync_array_object_signalled(sync_primary_wait_array);
+		sync_array_object_signalled();
 
 	}
 
@@ -517,43 +593,94 @@ rw_lock_s_unlock_func(
 }
 
 /******************************************************************//**
-Releases a shared mode lock when we know there are no waiters and none
-else will access the lock during the time this function is executed. */
+Releases a shared mode priority lock. */
 UNIV_INLINE
 void
-rw_lock_s_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				  been passed to another thread to unlock */
+#endif
+	prio_rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
-	ut_ad(lock->lock_word < X_LOCK_DECR);
+	lint lock_word;
+
+	ut_ad(lock->base_lock.lock_word > -X_LOCK_DECR);
+	ut_ad(lock->base_lock.lock_word != 0);
+	ut_ad(lock->base_lock.lock_word < X_LOCK_DECR);
 
 #ifdef UNIV_SYNC_DEBUG
-	rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+	rw_lock_remove_debug_info(&lock->base_lock, pass, RW_LOCK_SHARED);
 #endif
 
-	/* Decrease reader count by incrementing lock_word */
-	lock->lock_word++;
+	/* Increment lock_word to indicate 1 less reader */
+	lock_word = rw_lock_lock_word_incr(&lock->base_lock, 1);
+	if (lock_word == 0) {
+
+		/* A waiting next-writer exists, either high priority or
+		regular.  Wake up the first waiter in this order: 1) high
+		priority next-writer; 2) high priority X waiters; 3) high
+		priority S waiters; 4) regular priority next-waiter.  This
+		allows high priority requests to overtake an already-waiting
+		regular priority next-waiter.  */
+		if (lock->high_priority_wait_ex_waiter) {
+
+			lock->high_priority_wait_ex_waiter = 0;
+			/* Note that we do not have a separate high priority
+			next-waiter event.  There can be only one such waiter,
+			here we already know it's high priority, no
+			regular-priority wakeup may happen.  */
+			os_event_set(lock->base_lock.wait_ex_event);
+		} else if (lock->high_priority_x_waiters) {
+
+			lock->high_priority_x_waiters = 0;
+			os_event_set(lock->high_priority_x_event);
+		} else if (lock->high_priority_s_waiters) {
+
+			lock->high_priority_s_waiters = 0;
+			os_event_set(lock->high_priority_s_event);
+		} else {
+
+			os_event_set(lock->base_lock.wait_ex_event);
+		}
+		sync_array_object_signalled();
+	} else if (lock_word == X_LOCK_DECR) {
+
+		/* S-waiters may exist during an S unlock if a high-priority
+		thread released it, because low-priority threads are prevented
+		from acquiring S lock while high-priority thread holds it.  */
+		if (lock->base_lock.waiters) {
+
+			rw_lock_reset_waiter_flag(&lock->base_lock);
+			os_event_set(lock->base_lock.event);
+			sync_array_object_signalled();
+		}
+	}
 
-	ut_ad(!lock->waiters);
 	ut_ad(rw_lock_validate(lock));
+
 #ifdef UNIV_SYNC_PERF_STAT
 	rw_s_exit_count++;
 #endif
 }
 
 /******************************************************************//**
-Releases an exclusive mode lock. */
+Prepares an exclusive mode lock release: resets the recursion flag and removes
+the debug information if needed and returns the required lock word increment
+value.
+@return lock word increment value to perform the unlock */
 UNIV_INLINE
-void
-rw_lock_x_unlock_func(
-/*==================*/
+ulint
+rw_lock_x_prepare_unlock(
+/*=====================*/
 #ifdef UNIV_SYNC_DEBUG
 	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
 				been passed to another thread to unlock */
 #endif
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
-	ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+	ut_ad(lock->lock_word == 0 || lock->lock_word <= -X_LOCK_DECR);
 
 	/* lock->recursive flag also indicates if lock->writer_thread is
 	valid or stale. If we are the last of the recursive callers
@@ -570,14 +697,46 @@ rw_lock_x_unlock_func(
 	rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
 #endif
 
-	if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
+	ulint x_lock_incr;
+	if (lock->lock_word == 0) {
+		x_lock_incr = X_LOCK_DECR;
+	} else if (lock->lock_word == -X_LOCK_DECR) {
+		x_lock_incr = X_LOCK_DECR;
+	} else {
+		ut_ad(lock->lock_word < -X_LOCK_DECR);
+		x_lock_incr = 1;
+	}
+
+	return(x_lock_incr);
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	ulint x_lock_incr = rw_lock_x_prepare_unlock(
+#ifdef UNIV_SYNC_DEBUG
+						     pass,
+#endif
+						     lock);
+
+	if (rw_lock_lock_word_incr(lock, x_lock_incr) == X_LOCK_DECR) {
 		/* Lock is now free. May have to signal read/write waiters.
-                We do not need to signal wait_ex waiters, since they cannot
-                exist when there is a writer. */
+		We do not need to signal wait_ex waiters, since they cannot
+		exist when there is a writer. */
+
 		if (lock->waiters) {
 			rw_lock_reset_waiter_flag(lock);
 			os_event_set(lock->event);
-			sync_array_object_signalled(sync_primary_wait_array);
+			sync_array_object_signalled();
 		}
 	}
 
@@ -589,30 +748,50 @@ rw_lock_x_unlock_func(
 }
 
 /******************************************************************//**
-Releases an exclusive mode lock when we know there are no waiters, and
-none else will access the lock during the time this function is executed. */
+Releases an exclusive mode priority lock. */
 UNIV_INLINE
 void
-rw_lock_x_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	prio_rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
-	/* Reset the exclusive lock if this thread no longer has an x-mode
-	lock */
-
-	ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
-
+	ulint x_lock_incr = rw_lock_x_prepare_unlock(
 #ifdef UNIV_SYNC_DEBUG
-	rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+						     pass,
 #endif
+						     &lock->base_lock);
 
-	if (lock->lock_word == 0) {
-		lock->recursive = FALSE;
-	}
+	if (rw_lock_lock_word_incr(&lock->base_lock, x_lock_incr)
+	    == X_LOCK_DECR) {
+
+		/* Priority lock is now free.  Signal any waiters in this
+		order: 1) high priority X waiters; 2) high priority S waiters;
+		3) regular priority waiters.
+		We do not need to signal wait_ex waiters, since they cannot
+		exist when there is a writer. */
+
+		if (lock->high_priority_x_waiters) {
 
-	lock->lock_word += X_LOCK_DECR;
+			lock->high_priority_x_waiters = 0;
+			os_event_set(lock->high_priority_x_event);
+			sync_array_object_signalled();
+		} else if (lock->high_priority_s_waiters) {
+
+			lock->high_priority_s_waiters = 0;
+			os_event_set(lock->high_priority_s_event);
+			sync_array_object_signalled();
+		} else if (lock->base_lock.waiters) {
+
+			rw_lock_reset_waiter_flag(&lock->base_lock);
+			os_event_set(lock->base_lock.event);
+			sync_array_object_signalled();
+		}
+	}
 
-	ut_ad(!lock->waiters);
 	ut_ad(rw_lock_validate(lock));
 
 #ifdef UNIV_SYNC_PERF_STAT
@@ -643,9 +822,42 @@ pfs_rw_lock_create_func(
 	const char*	cmutex_name)	/*!< in: mutex name */
 {
 	/* Initialize the rwlock for performance schema */
-	lock->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
-				? PSI_server->init_rwlock(key, lock)
-				: NULL;
+	lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock);
+
+	/* The actual function to initialize an rwlock */
+	rw_lock_create_func(lock,
+# ifdef UNIV_DEBUG
+#  ifdef UNIV_SYNC_DEBUG
+			    level,
+#  endif /* UNIV_SYNC_DEBUG */
+			    cfile_name,
+			    cline,
+# endif /* UNIV_DEBUG */
+			    cmutex_name);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func().
+NOTE! Please use the corresponding macro rw_lock_create(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+	mysql_pfs_key_t	key,		/*!< in: key registered with
+					performance schema */
+	prio_rw_lock_t*	lock,		/*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+#  ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+#  endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline,		/*!< in: file line where created */
+# endif /* UNIV_DEBUG */
+	const char*	cmutex_name)	/*!< in: mutex name */
+{
+	/* Initialize the rwlock for performance schema */
+	lock->base_lock.pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock);
 
 	/* The actual function to initialize an rwlock */
 	rw_lock_create_func(lock,
@@ -658,6 +870,7 @@ pfs_rw_lock_create_func(
 # endif /* UNIV_DEBUG */
 			    cmutex_name);
 }
+
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_x_lock_func()
 NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
@@ -672,26 +885,61 @@ pfs_rw_lock_x_lock_func(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
 
-	/* Record the entry of rw x lock request in performance schema */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
+		/* Record the entry of rw x lock request in performance schema */
+		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line);
 
-		if (locker) {
-			PSI_server->start_rwlock_wrwait(locker,
-							file_name, line);
-		}
+		rw_lock_x_lock_func(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
 	}
+	else
+	{
+		rw_lock_x_lock_func(lock, pass, file_name, line);
+	}
+}
 
-	rw_lock_x_lock_func(lock, pass, file_name, line);
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	if (lock->base_lock.pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
+
+		/* Record the entry of rw x lock request in performance schema */
+		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+			&state, lock->base_lock.pfs_psi, PSI_RWLOCK_WRITELOCK,
+			file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_wrwait(locker, 0);
+		rw_lock_x_lock_func(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
+	}
+	else
+	{
+		rw_lock_x_lock_func(lock, pass, file_name, line);
 	}
 }
+
 /******************************************************************//**
 Performance schema instrumented wrap function for
 rw_lock_x_lock_func_nowait()
@@ -707,25 +955,25 @@ pfs_rw_lock_x_lock_func_nowait(
 				requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
 	ibool	ret;
 
-	/* Record the entry of rw x lock request in performance schema */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state		state;
 
-		if (locker) {
-			PSI_server->start_rwlock_wrwait(locker,
-							file_name, line);
-		}
-	}
+		/* Record the entry of rw x lock request in performance schema */
+		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line);
 
-	ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+		ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_wrwait(locker, 0);
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, ret);
+	}
+	else
+	{
+		ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
 	}
 
 	return(ret);
@@ -740,13 +988,34 @@ pfs_rw_lock_free_func(
 /*==================*/
 	rw_lock_t*	lock)	/*!< in: pointer to rw-lock */
 {
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->destroy_rwlock(lock->pfs_psi);
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi);
 		lock->pfs_psi = NULL;
 	}
 
 	rw_lock_free_func(lock);
 }
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+	prio_rw_lock_t*	lock)	/*!< in: pointer to rw-lock */
+{
+	if (lock->base_lock.pfs_psi != NULL)
+	{
+		PSI_RWLOCK_CALL(destroy_rwlock)(lock->base_lock.pfs_psi);
+		lock->base_lock.pfs_psi = NULL;
+	}
+
+	rw_lock_free_func(lock);
+}
+
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_lock_func()
 NOTE! Please use the corresponding macro rw_lock_s_lock(), not
@@ -763,25 +1032,67 @@ pfs_rw_lock_s_lock_func(
 				requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
 
-	/* Instrumented to inform we are aquiring a shared rwlock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK);
-		if (locker) {
-			PSI_server->start_rwlock_rdwait(locker,
-							file_name, line);
-		}
+		/* Instrumented to inform we are acquiring a shared rwlock */
+		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line);
+
+		rw_lock_s_lock_func(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+	}
+	else
+	{
+		rw_lock_s_lock_func(lock, pass, file_name, line);
 	}
 
-	rw_lock_s_lock_func(lock, pass, file_name, line);
+	return;
+}
 
-	if (locker) {
-		PSI_server->end_rwlock_rdwait(locker, 0);
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				lock will be passed to another
+				thread to unlock */
+	const char*	file_name,/*!< in: file name where lock
+				requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	if (lock->base_lock.pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
+
+		/* Instrumented to inform we are acquiring a shared rwlock */
+		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+			&state, lock->base_lock.pfs_psi, PSI_RWLOCK_READLOCK,
+			file_name, line);
+
+		rw_lock_s_lock_func(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
 	}
+	else
+	{
+		rw_lock_s_lock_func(lock, pass, file_name, line);
+	}
+
+	return;
 }
+
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_lock_func()
 NOTE! Please use the corresponding macro rw_lock_s_lock(), not
@@ -798,30 +1109,51 @@ pfs_rw_lock_s_lock_low(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
 	ibool	ret;
 
-	/* Instrumented to inform we are aquiring a shared rwlock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK);
-		if (locker) {
-			PSI_server->start_rwlock_rdwait(locker,
-							file_name, line);
-		}
-	}
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
 
-	ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+		/* Instrumented to inform we are acquiring a shared rwlock */
+		locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_rdwait(locker, 0);
+		ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, ret);
+	}
+	else
+	{
+		ret = rw_lock_s_lock_low(lock, pass, file_name, line);
 	}
 
 	return(ret);
 }
 
 /******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_low()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function!
+@return	TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				  lock will be passed to another
+				  thread to unlock */
+	const char*	file_name, /*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	return(pfs_rw_lock_s_lock_low(&lock->base_lock, pass,
+				      file_name, line));
+}
+
+/******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_x_unlock_func()
 NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
 this function! */
@@ -837,9 +1169,34 @@ pfs_rw_lock_x_unlock_func(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 	/* Inform performance schema we are unlocking the lock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->unlock_rwlock(lock->pfs_psi);
-	}
+	if (lock->pfs_psi != NULL)
+		PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+
+	rw_lock_x_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+		pass,
+#endif
+		lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+		ulint		pass,	/*!< in: pass value; != 0, if the
+					lock may have been passed to another
+					thread to unlock */
+#endif
+		prio_rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	/* Inform performance schema we are unlocking the lock */
+	if (lock->base_lock.pfs_psi != NULL)
+		PSI_RWLOCK_CALL(unlock_rwlock)(lock->base_lock.pfs_psi);
 
 	rw_lock_x_unlock_func(
 #ifdef UNIV_SYNC_DEBUG
@@ -864,9 +1221,8 @@ pfs_rw_lock_s_unlock_func(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 	/* Inform performance schema we are unlocking the lock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->unlock_rwlock(lock->pfs_psi);
-	}
+	if (lock->pfs_psi != NULL)
+		PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
 
 	rw_lock_s_unlock_func(
 #ifdef UNIV_SYNC_DEBUG
@@ -875,4 +1231,32 @@ pfs_rw_lock_s_unlock_func(
 		lock);
 
 }
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the
+				  lock may have been passed to another
+				  thread to unlock */
+#endif
+	prio_rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	/* Inform performance schema we are unlocking the lock */
+	if (lock->base_lock.pfs_psi != NULL)
+		PSI_RWLOCK_CALL(unlock_rwlock)(lock->base_lock.pfs_psi);
+
+	rw_lock_s_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+		pass,
+#endif
+		lock);
+
+}
+
 #endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
index b3b99b10630..f54c6d59af9 100644
--- a/storage/xtradb/include/sync0sync.h
+++ b/storage/xtradb/include/sync0sync.h
@@ -2,6 +2,7 @@
 
 Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -40,9 +41,10 @@ Created 9/5/1995 Heikki Tuuri
 #include "os0thread.h"
 #include "os0sync.h"
 #include "sync0arr.h"
+#include "ut0counter.h"
 
 #if  defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
-extern my_bool	timed_mutexes;
+extern "C" my_bool	timed_mutexes;
 #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
 
 #ifdef _WIN32
@@ -53,25 +55,19 @@ typedef byte lock_word_t;
 #endif
 
 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
-/* There are mutexes/rwlocks that we want to exclude from
-instrumentation even if their corresponding performance schema
-define is set. And this PFS_NOT_INSTRUMENTED is used
-as the key value to dentify those objects that would
-be excluded from instrumentation. */
-# define PFS_NOT_INSTRUMENTED		ULINT32_UNDEFINED
-
-# define PFS_IS_INSTRUMENTED(key)	((key) != PFS_NOT_INSTRUMENTED)
 
 /* By default, buffer mutexes and rwlocks will be excluded from
 instrumentation due to their large number of instances. */
 # define PFS_SKIP_BUFFER_MUTEX_RWLOCK
 
+/* By default, event->mutex will also be excluded from instrumentation */
+# define PFS_SKIP_EVENT_MUTEX
+
 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
 
 #ifdef UNIV_PFS_MUTEX
 /* Key defines to register InnoDB mutexes with performance schema */
 extern mysql_pfs_key_t	autoinc_mutex_key;
-extern mysql_pfs_key_t	btr_search_enabled_mutex_key;
 extern mysql_pfs_key_t	buffer_block_mutex_key;
 extern mysql_pfs_key_t	buf_pool_mutex_key;
 extern mysql_pfs_key_t	buf_pool_zip_mutex_key;
@@ -79,12 +75,17 @@ extern mysql_pfs_key_t	buf_pool_LRU_list_mutex_key;
 extern mysql_pfs_key_t	buf_pool_free_list_mutex_key;
 extern mysql_pfs_key_t	buf_pool_zip_free_mutex_key;
 extern mysql_pfs_key_t	buf_pool_zip_hash_mutex_key;
+extern mysql_pfs_key_t	buf_pool_flush_state_mutex_key;
 extern mysql_pfs_key_t	cache_last_read_mutex_key;
 extern mysql_pfs_key_t	dict_foreign_err_mutex_key;
 extern mysql_pfs_key_t	dict_sys_mutex_key;
 extern mysql_pfs_key_t	file_format_max_mutex_key;
 extern mysql_pfs_key_t	fil_system_mutex_key;
 extern mysql_pfs_key_t	flush_list_mutex_key;
+extern mysql_pfs_key_t	fts_bg_threads_mutex_key;
+extern mysql_pfs_key_t	fts_delete_mutex_key;
+extern mysql_pfs_key_t	fts_optimize_mutex_key;
+extern mysql_pfs_key_t	fts_doc_id_mutex_key;
 extern mysql_pfs_key_t	hash_table_mutex_key;
 extern mysql_pfs_key_t	ibuf_bitmap_mutex_key;
 extern mysql_pfs_key_t	ibuf_mutex_key;
@@ -92,7 +93,9 @@ extern mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 extern mysql_pfs_key_t	log_bmp_sys_mutex_key;
 extern mysql_pfs_key_t	log_sys_mutex_key;
 extern mysql_pfs_key_t	log_flush_order_mutex_key;
-extern mysql_pfs_key_t	kernel_mutex_key;
+# ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t	server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
 # ifdef UNIV_MEM_DEBUG
 extern mysql_pfs_key_t	mem_hash_mutex_key;
 # endif /* UNIV_MEM_DEBUG */
@@ -100,6 +103,7 @@ extern mysql_pfs_key_t	mem_pool_mutex_key;
 extern mysql_pfs_key_t	mutex_list_mutex_key;
 extern mysql_pfs_key_t	purge_sys_bh_mutex_key;
 extern mysql_pfs_key_t	recv_sys_mutex_key;
+extern mysql_pfs_key_t	recv_writer_mutex_key;
 extern mysql_pfs_key_t	rseg_mutex_key;
 # ifdef UNIV_SYNC_DEBUG
 extern mysql_pfs_key_t	rw_lock_debug_mutex_key;
@@ -109,13 +113,29 @@ extern mysql_pfs_key_t	rw_lock_mutex_key;
 extern mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
 extern mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
 extern mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t	srv_threads_mutex_key;
 extern mysql_pfs_key_t	srv_monitor_file_mutex_key;
-extern mysql_pfs_key_t	syn_arr_mutex_key;
 # ifdef UNIV_SYNC_DEBUG
 extern mysql_pfs_key_t	sync_thread_mutex_key;
 # endif /* UNIV_SYNC_DEBUG */
-extern mysql_pfs_key_t	trx_doublewrite_mutex_key;
+extern mysql_pfs_key_t	buf_dblwr_mutex_key;
 extern mysql_pfs_key_t	trx_undo_mutex_key;
+extern mysql_pfs_key_t	trx_mutex_key;
+extern mysql_pfs_key_t	lock_sys_mutex_key;
+extern mysql_pfs_key_t	lock_sys_wait_mutex_key;
+extern mysql_pfs_key_t	trx_sys_mutex_key;
+extern mysql_pfs_key_t	srv_sys_mutex_key;
+extern mysql_pfs_key_t	srv_sys_tasks_mutex_key;
+#ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t	srv_conc_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+#ifndef HAVE_ATOMIC_BUILTINS_64
+extern mysql_pfs_key_t	monitor_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+extern mysql_pfs_key_t	event_os_mutex_key;
+extern mysql_pfs_key_t	ut_list_mutex_key;
+extern mysql_pfs_key_t	os_mutex_key;
+extern mysql_pfs_key_t  zip_pad_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
 /******************************************************************//**
@@ -140,6 +160,8 @@ if "UNIV_PFS_MUTEX" is defined:
 
 mutex_create
 mutex_enter
+mutex_enter_first
+mutex_enter_last
 mutex_exit
 mutex_enter_nowait
 mutex_free
@@ -175,6 +197,12 @@ necessary only if the memory block containing it is freed. */
 # define mutex_enter_nowait(M)					\
 	pfs_mutex_enter_nowait_func((M), __FILE__, __LINE__)
 
+# define mutex_enter_first(M)					\
+	pfs_mutex_enter_func((M), __FILE__, __LINE__, HIGH_PRIO)
+
+# define mutex_enter_last(M)					\
+	pfs_mutex_enter_func((M), __FILE__, __LINE__, LOW_PRIO)
+
 # define mutex_exit(M)	pfs_mutex_exit_func(M)
 
 # define mutex_free(M)	pfs_mutex_free_func(M)
@@ -201,6 +229,12 @@ original non-instrumented functions */
 # define mutex_enter_nowait(M)	\
 	mutex_enter_nowait_func((M), __FILE__, __LINE__)
 
+# define mutex_enter_first(M)	\
+	mutex_enter_func((M), __FILE__, __LINE__, HIGH_PRIO)
+
+# define mutex_enter_last(M)	\
+	mutex_enter_func((M), __FILE__, __LINE__, LOW_PRIO)
+
 # define mutex_exit(M)	mutex_exit_func(M)
 
 # define mutex_free(M)	mutex_free_func(M)
@@ -216,7 +250,7 @@ UNIV_INTERN
 void
 mutex_create_func(
 /*==============*/
-	mutex_t*	mutex,		/*!< in: pointer to memory */
+	ib_mutex_t*	mutex,		/*!< in: pointer to memory */
 #ifdef UNIV_DEBUG
 # ifdef UNIV_SYNC_DEBUG
 	ulint		level,		/*!< in: level */
@@ -227,6 +261,26 @@ mutex_create_func(
 	const char*	cmutex_name);	/*!< in: mutex name */
 
 /******************************************************************//**
+Creates, or rather, initializes a priority mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint			level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*		cfile_name,	/*!< in: file name where
+						created */
+	ulint			cline,		/*!< in: file line where
+						created */
+#endif /* UNIV_DEBUG */
+	const char*		cmutex_name);	/*!< in: mutex name */
+/******************************************************************//**
 NOTE! Use the corresponding macro mutex_free(), not directly this function!
 Calling this function is obligatory only if the memory buffer containing
 the mutex is freed. Removes a mutex object from the mutex list. The mutex
@@ -235,7 +289,17 @@ UNIV_INTERN
 void
 mutex_free_func(
 /*============*/
-	mutex_t*	mutex);	/*!< in: mutex */
+	ib_mutex_t*	mutex);	/*!< in: mutex */
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a priority mutex object from the mutex list. The
+mutex is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free_func(
+/*============*/
+	ib_prio_mutex_t*	mutex);	/*!< in: mutex */
 /**************************************************************//**
 NOTE! The following macro should be used in mutex locking, not the
 corresponding function. */
@@ -252,9 +316,29 @@ UNIV_INLINE
 void
 mutex_enter_func(
 /*=============*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where locked */
 	ulint		line);		/*!< in: line where locked */
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a priority mutex for the current thread. If the mutex is
+reserved the function spins a preset time (controlled by SYNC_SPIN_ROUNDS)
+waiting for the mutex before suspending the thread. If the thread is suspended,
+the priority argument value determines the relative order for its wake up.  Any
+HIGH_PRIO waiters will be woken up before any LOW_PRIO waiters.  In case of
+DEFAULT_PRIO, the relative priority will be set according to
+srv_current_thread_priority.  */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where
+						locked */
+	ulint			line,		/*!< in: line where locked */
+	enum ib_sync_priority	priority = DEFAULT_PRIO);
+						/*!<in: mutex acquisition
+						priority */
 /********************************************************************//**
 NOTE! Use the corresponding macro in the header file, not this function
 directly. Tries to lock the mutex for the current thread. If the lock is not
@@ -264,10 +348,24 @@ UNIV_INTERN
 ulint
 mutex_enter_nowait_func(
 /*====================*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where mutex
 					requested */
 	ulint		line);		/*!< in: line where requested */
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return	0 if succeed, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where mutex
+						requested */
+	ulint			line);		/*!< in: line where
+						requested */
 /******************************************************************//**
 NOTE! Use the corresponding macro mutex_exit(), not directly this function!
 Unlocks a mutex owned by the current thread. */
@@ -275,7 +373,15 @@ UNIV_INLINE
 void
 mutex_exit_func(
 /*============*/
-	mutex_t*	mutex);	/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex);	/*!< in: pointer to mutex */
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+	ib_prio_mutex_t*	mutex);	/*!< in: pointer to mutex */
 
 
 #ifdef UNIV_PFS_MUTEX
@@ -290,7 +396,7 @@ void
 pfs_mutex_create_func(
 /*==================*/
 	PSI_mutex_key	key,		/*!< in: Performance Schema key */
-	mutex_t*	mutex,		/*!< in: pointer to memory */
+	ib_mutex_t*	mutex,		/*!< in: pointer to memory */
 # ifdef UNIV_DEBUG
 #  ifdef UNIV_SYNC_DEBUG
 	ulint		level,		/*!< in: level */
@@ -300,6 +406,29 @@ pfs_mutex_create_func(
 # endif /* UNIV_DEBUG */
 	const char*	cmutex_name);
 /******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+#  ifdef UNIV_SYNC_DEBUG
+	ulint			level,		/*!< in: level */
+#  endif /* UNIV_SYNC_DEBUG */
+	const char*		cfile_name,	/*!< in: file name where
+						created */
+	ulint			cline,		/*!< in: file line where
+						created */
+# endif /* UNIV_DEBUG */
+	const char*		cmutex_name);
+/******************************************************************//**
 NOTE! Please use the corresponding macro mutex_enter(), not directly
 this function!
 This is a performance schema instrumented wrapper function for
@@ -308,9 +437,25 @@ UNIV_INLINE
 void
 pfs_mutex_enter_func(
 /*=================*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where locked */
 	ulint		line);		/*!< in: line where locked */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where
+						locked */
+	ulint			line,		/*!< in: line where locked */
+	enum ib_sync_priority	priority = DEFAULT_PRIO);
+						/*!<in: mutex acquisition
+						priority */
 /********************************************************************//**
 NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
 this function!
@@ -321,10 +466,25 @@ UNIV_INLINE
 ulint
 pfs_mutex_enter_nowait_func(
 /*========================*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where mutex
 					requested */
 	ulint		line);		/*!< in: line where requested */
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func.
+@return	0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where mutex
+						requested */
+	ulint			line);		/*!< in: line where
+						requested */
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_exit(), not directly
 this function!
@@ -334,7 +494,17 @@ UNIV_INLINE
 void
 pfs_mutex_exit_func(
 /*================*/
-	mutex_t*	mutex);	/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex);	/*!< in: pointer to mutex */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrapper function for mutex_exit_func() with performance schema
+instrumentation. Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+	ib_prio_mutex_t*	mutex);	/*!< in: pointer to mutex */
 
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_free(), not directly
@@ -345,7 +515,17 @@ UNIV_INLINE
 void
 pfs_mutex_free_func(
 /*================*/
-	mutex_t*	mutex);	/*!< in: mutex */
+	ib_mutex_t*	mutex);	/*!< in: mutex */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+	ib_prio_mutex_t*	mutex);	/*!< in: mutex */
 
 #endif /* UNIV_PFS_MUTEX */
 
@@ -383,7 +563,7 @@ UNIV_INTERN
 ibool
 mutex_validate(
 /*===========*/
-	const mutex_t*	mutex);	/*!< in: mutex */
+	const ib_mutex_t*	mutex);	/*!< in: mutex */
 /******************************************************************//**
 Checks that the current thread owns the mutex. Works only
 in the debug version.
@@ -392,7 +572,17 @@ UNIV_INTERN
 ibool
 mutex_own(
 /*======*/
-	const mutex_t*	mutex)	/*!< in: mutex */
+	const ib_mutex_t*	mutex)	/*!< in: mutex */
+	__attribute__((warn_unused_result));
+/******************************************************************//**
+Checks that the current thread owns the priority mutex. Works only
+in the debug version.
+@return	TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+	const ib_prio_mutex_t*	mutex)	/*!< in: priority mutex */
 	__attribute__((warn_unused_result));
 #endif /* UNIV_DEBUG */
 #ifdef UNIV_SYNC_DEBUG
@@ -463,7 +653,7 @@ UNIV_INTERN
 void
 mutex_get_debug_info(
 /*=================*/
-	mutex_t*	mutex,		/*!< in: mutex */
+	ib_mutex_t*	mutex,		/*!< in: mutex */
 	const char**	file_name,	/*!< out: file where requested */
 	ulint*		line,		/*!< out: line where requested */
 	os_thread_id_t* thread_id);	/*!< out: id of the thread which owns
@@ -483,7 +673,7 @@ UNIV_INLINE
 lock_word_t
 mutex_get_lock_word(
 /*================*/
-	const mutex_t*	mutex);	/*!< in: mutex */
+	const ib_mutex_t*	mutex);	/*!< in: mutex */
 #ifdef UNIV_SYNC_DEBUG
 /******************************************************************//**
 NOT to be used outside this module except in debugging! Gets the waiters
@@ -493,7 +683,7 @@ UNIV_INLINE
 ulint
 mutex_get_waiters(
 /*==============*/
-	const mutex_t*	mutex);	/*!< in: mutex */
+	const ib_mutex_t*	mutex);	/*!< in: mutex */
 #endif /* UNIV_SYNC_DEBUG */
 
 /*
@@ -596,15 +786,28 @@ V
 File system pages
 |
 V
-Kernel mutex				If a kernel operation needs a file
-|					page allocation, it must reserve the
-|					fsp x-latch before acquiring the kernel
-|					mutex.
+lock_sys_wait_mutex			Mutex protecting lock timeout data
+|
+V
+lock_sys_mutex				Mutex protecting lock_sys_t
+|
+V
+trx_sys->mutex				Mutex protecting trx_sys_t
+|
+V
+Threads mutex				Background thread scheduling mutex
+|
+V
+query_thr_mutex				Mutex protecting query threads
+|
+V
+trx_mutex				Mutex protecting trx_t fields
+|
 V
 Search system mutex
 |
 V
-Buffer pool mutex
+Buffer pool mutexes
 |
 V
 Log mutex
@@ -614,7 +817,8 @@ Any other latch
 V
 Memory pool mutex */
 
-/* Latching order levels */
+/* Latching order levels. If you modify these, you have to also update
+sync_thread_add_level(). */
 
 /* User transaction locks are higher than any of the latch levels below:
 no latches are allowed when a thread goes to wait for a normal table
@@ -634,12 +838,14 @@ or row lock! */
 					trx_i_s_cache_t::last_read_mutex */
 #define SYNC_FILE_FORMAT_TAG	1200	/* Used to serialize access to the
 					file format tag */
-#define	SYNC_DICT_OPERATION	1001	/* table create, drop, etc. reserve
+#define	SYNC_DICT_OPERATION	1010	/* table create, drop, etc. reserve
 					this in X-mode; implicit or backround
 					operations purge, rollback, foreign
 					key checks reserve this in S-mode */
+#define SYNC_FTS_CACHE		1005	/* FTS cache rwlock */
 #define SYNC_DICT		1000
 #define SYNC_DICT_AUTOINC_MUTEX	999
+#define SYNC_STATS_AUTO_RECALC	997
 #define SYNC_DICT_HEADER	995
 #define SYNC_IBUF_HEADER	914
 #define SYNC_IBUF_PESS_INSERT_MUTEX 912
@@ -657,26 +863,34 @@ or row lock! */
 #define SYNC_EXTERN_STORAGE	500
 #define	SYNC_FSP		400
 #define	SYNC_FSP_PAGE		395
-/*------------------------------------- Insert buffer headers */
+/*------------------------------------- Change buffer headers */
 #define SYNC_IBUF_MUTEX		370	/* ibuf_mutex */
-/*------------------------------------- Insert buffer tree */
+/*------------------------------------- Change buffer tree */
 #define SYNC_IBUF_INDEX_TREE	360
 #define SYNC_IBUF_TREE_NODE_NEW	359
 #define SYNC_IBUF_TREE_NODE	358
 #define	SYNC_IBUF_BITMAP_MUTEX	351
 #define	SYNC_IBUF_BITMAP	350
+/*------------------------------------- Change log for online create index */
+#define SYNC_INDEX_ONLINE_LOG	340
 /*------------------------------------- MySQL query cache mutex */
 /*------------------------------------- MySQL binlog mutex */
 /*-------------------------------*/
-#define	SYNC_KERNEL		300
-#define SYNC_REC_LOCK		299
-#define	SYNC_TRX_LOCK_HEAP	298
+#define SYNC_LOCK_WAIT_SYS	300
+#define SYNC_LOCK_SYS		299
+#define SYNC_TRX_SYS		298
+#define SYNC_TRX		297
+#define SYNC_THREADS		295
+#define SYNC_REC_LOCK		294
 #define SYNC_TRX_SYS_HEADER	290
 #define	SYNC_PURGE_QUEUE	200
 #define SYNC_LOG_ONLINE		175
 #define SYNC_LOG		170
-#define SYNC_LOG_FLUSH_ORDER	156
+#define SYNC_LOG_FLUSH_ORDER	147
 #define SYNC_RECV		168
+#define SYNC_FTS_CACHE_INIT	166	/* Used for FTS cache initialization */
+#define SYNC_FTS_BG_THREADS	165
+#define SYNC_FTS_OPTIMIZE       164     // FIXME: is this correct number, test
 #define	SYNC_WORK_QUEUE		162
 #define	SYNC_SEARCH_SYS		160	/* NOTE that if we have a memory
 					heap that can be extended to the
@@ -684,16 +898,15 @@ or row lock! */
 					SYNC_SEARCH_SYS, as memory allocation
 					can call routines there! Otherwise
 					the level is SYNC_MEM_HASH. */
-#define	SYNC_BUF_LRU_LIST	158
-#define	SYNC_BUF_PAGE_HASH	157
-#define	SYNC_BUF_BLOCK		155	/* Block mutex */
-#define	SYNC_BUF_FREE_LIST	153
-#define	SYNC_BUF_ZIP_FREE	152
-#define	SYNC_BUF_ZIP_HASH	151
-#define	SYNC_BUF_POOL		150	/* Buffer pool mutex */
-#define	SYNC_BUF_FLUSH_LIST	145	/* Buffer flush list mutex */
-#define SYNC_DOUBLEWRITE	140
-#define	SYNC_OUTER_ANY_LATCH	136
+#define	SYNC_BUF_LRU_LIST	151
+#define	SYNC_BUF_PAGE_HASH	149	/* buf_pool->page_hash rw_lock */
+#define	SYNC_BUF_BLOCK		146	/* Block mutex */
+#define	SYNC_BUF_FREE_LIST	145
+#define	SYNC_BUF_ZIP_FREE	144
+#define	SYNC_BUF_ZIP_HASH	143
+#define	SYNC_BUF_FLUSH_STATE	142
+#define	SYNC_BUF_FLUSH_LIST	141	/* Buffer flush list mutex */
+#define	SYNC_DOUBLEWRITE	139
 #define	SYNC_ANY_LATCH		135
 #define	SYNC_MEM_HASH		131
 #define	SYNC_MEM_POOL		130
@@ -705,14 +918,17 @@ or row lock! */
 #define RW_LOCK_SHARED		352
 #define RW_LOCK_WAIT_EX		353
 #define SYNC_MUTEX		354
+#define SYNC_PRIO_MUTEX		355
+#define PRIO_RW_LOCK_EX		356
+#define PRIO_RW_LOCK_SHARED	357
 
 /* NOTE! The structure appears here only for the compiler to know its size.
 Do not use its fields directly! The structure used in the spin lock
 implementation of a mutual exclusion semaphore. */
 
 /** InnoDB mutex */
-struct mutex_struct {
-	os_event_t	event;	/*!< Used by sync0arr.c for the wait queue */
+struct ib_mutex_t {
+	os_event_t	event;	/*!< Used by sync0arr.cc for the wait queue */
 	volatile lock_word_t	lock_word;	/*!< lock_word is the target
 				of the atomic test-and-set instruction when
 				atomic operations are enabled. */
@@ -722,11 +938,11 @@ struct mutex_struct {
 		os_fast_mutex;	/*!< We use this OS mutex in place of lock_word
 				when atomic operations are not enabled */
 #endif
-	volatile ulint	waiters;	/*!< This ulint is set to 1 if there are (or
+	ulint	waiters;	/*!< This ulint is set to 1 if there are (or
 				may be) threads waiting in the global wait
 				array for this mutex to be released.
 				Otherwise, this is 0. */
-	UT_LIST_NODE_T(mutex_t)	list; /*!< All allocated mutexes are put into
+	UT_LIST_NODE_T(ib_mutex_t)	list; /*!< All allocated mutexes are put into
 				a list.	Pointers to the next and prev. */
 #ifdef UNIV_SYNC_DEBUG
 	const char*	file_name;	/*!< File where the mutex was locked */
@@ -736,21 +952,17 @@ struct mutex_struct {
 #ifdef UNIV_DEBUG
 	const char*	cfile_name;/*!< File name where mutex created */
 	ulint		cline;	/*!< Line where created */
+#endif
+	ulong		count_os_wait;	/*!< count of os_wait */
+#ifdef UNIV_DEBUG
+
+/** Value of ib_mutex_t::magic_n */
+# define MUTEX_MAGIC_N	979585UL
+
 	os_thread_id_t thread_id; /*!< The thread id of the thread
 				which locked the mutex. */
 	ulint		magic_n;	/*!< MUTEX_MAGIC_N */
-/** Value of mutex_struct::magic_n */
-# define MUTEX_MAGIC_N	(ulint)979585
-#endif /* UNIV_DEBUG */
-	ulong		count_os_wait;	/*!< count of os_wait */
-#ifdef UNIV_DEBUG
-	ulong		count_using;	/*!< count of times mutex used */
-	ulong		count_spin_loop; /*!< count of spin loops */
-	ulong		count_spin_rounds;/*!< count of spin rounds */
-	ulong		count_os_yield;	/*!< count of os_wait */
-	ulonglong	lspent_time;	/*!< mutex os_wait timer msec */
-	ulonglong	lmax_spent_time;/*!< mutex os_wait timer msec */
-	ulint		mutex_type;	/*!< 0=usual mutex, 1=rw_lock mutex */
+	ulint		ib_mutex_type;	/*!< 0=usual mutex, 1=rw_lock mutex */
 #endif /* UNIV_DEBUG */
 	const char*	cmutex_name;	/*!< mutex name */
 #ifdef UNIV_PFS_MUTEX
@@ -759,10 +971,19 @@ struct mutex_struct {
 #endif
 };
 
-/** The global array of wait cells for implementation of the databases own
-mutexes and read-write locks. */
-extern sync_array_t*	sync_primary_wait_array;/* Appears here for
-						debugging purposes only! */
+/** XtraDB priority mutex */
+struct ib_prio_mutex_t {
+	ib_mutex_t	base_mutex;	/* The regular mutex provides the lock
+					word etc. for the priority mutex  */
+	os_event_t	high_priority_event; /* High priority wait array
+					event */
+	volatile ulint	high_priority_waiters; /* Set to 1 if there are (or
+					may be) threads that asked for this
+					mutex to be acquired with high priority
+					in the global wait array for this mutex
+					to be released.  Otherwise, this is
+					0.  */
+};
 
 /** Constant determining how long spin wait is continued before suspending
 the thread. A value 600 rounds on a 1995 100 MHz Pentium seems to correspond
@@ -770,9 +991,15 @@ to 20 microseconds. */
 
 #define	SYNC_SPIN_ROUNDS	srv_n_spin_wait_rounds
 
-extern	ib_int64_t	mutex_spin_round_count;
-extern	ib_int64_t	mutex_spin_wait_count;
-extern	ib_int64_t	mutex_os_wait_count;
+/** The number of iterations in the mutex_spin_wait() spin loop.
+Intended for performance monitoring. */
+extern ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_spin_round_count;
+/** The number of mutex_spin_wait() calls.  Intended for
+performance monitoring. */
+extern ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_spin_wait_count;
+/** The number of OS waits in mutex_spin_wait().  Intended for
+performance monitoring. */
+extern ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_os_wait_count;
 
 /** The number of mutex_exit calls. Intended for performance monitoring. */
 extern	ib_int64_t	mutex_exit_count;
@@ -786,13 +1013,37 @@ extern ibool	sync_order_checks_on;
 extern ibool	sync_initialized;
 
 /** Global list of database mutexes (not OS mutexes) created. */
-typedef UT_LIST_BASE_NODE_T(mutex_t)  ut_list_base_node_t;
+typedef UT_LIST_BASE_NODE_T(ib_mutex_t)  ut_list_base_node_t;
 /** Global list of database mutexes (not OS mutexes) created. */
 extern ut_list_base_node_t  mutex_list;
 
 /** Mutex protecting the mutex_list variable */
-extern mutex_t mutex_list_mutex;
+extern ib_mutex_t mutex_list_mutex;
 
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+	ib_mutex_t*		mutex,		/*!< in: mutex guarding the
+						decrement */
+	volatile ulint*		var,		/*!< in/out: variable to
+						decrement */
+	ulint			delta);		/*!< in: delta to decrement */
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+	ib_mutex_t*		mutex,		/*!< in: mutex guarding the
+						increment */
+	volatile ulint*		var,		/*!< in/out: variable to
+						increment */
+	ulint			delta);		/*!< in: delta to increment */
+#endif /* !HAVE_ATOMIC_BUILTINS */
 
 #ifndef UNIV_NONINL
 #include "sync0sync.ic"
diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic
index 73e7379cac1..396005ec83a 100644
--- a/storage/xtradb/include/sync0sync.ic
+++ b/storage/xtradb/include/sync0sync.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,17 +36,20 @@ UNIV_INTERN
 void
 mutex_set_waiters(
 /*==============*/
-	mutex_t*	mutex,	/*!< in: mutex */
+	ib_mutex_t*	mutex,	/*!< in: mutex */
 	ulint		n);	/*!< in: value to set */
 /******************************************************************//**
-Reserves a mutex for the current thread. If the mutex is reserved, the
-function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
-for the mutex before suspending the thread. */
+Reserves a mutex or a priority mutex for the current thread. If the mutex is
+reserved, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS)
+waiting for the mutex before suspending the thread. */
 UNIV_INTERN
 void
 mutex_spin_wait(
 /*============*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	void*		_mutex,		/*!< in: pointer to mutex */
+	bool		high_priority,	/*!< in: whether the mutex is a
+					priority mutex with high priority
+					specified */
 	const char*	file_name,	/*!< in: file name where mutex
 					requested */
 	ulint		line);		/*!< in: line where requested */
@@ -57,7 +60,7 @@ UNIV_INTERN
 void
 mutex_set_debug_info(
 /*=================*/
-	mutex_t*	mutex,		/*!< in: mutex */
+	ib_mutex_t*	mutex,		/*!< in: mutex */
 	const char*	file_name,	/*!< in: file where requested */
 	ulint		line);		/*!< in: line where requested */
 #endif /* UNIV_SYNC_DEBUG */
@@ -67,7 +70,7 @@ UNIV_INTERN
 void
 mutex_signal_object(
 /*================*/
-	mutex_t*	mutex);	/*!< in: mutex */
+	ib_mutex_t*	mutex);	/*!< in: mutex */
 
 /******************************************************************//**
 Performs an atomic test-and-set instruction to the lock_word field of a
@@ -75,9 +78,9 @@ mutex.
 @return	the previous value of lock_word: 0 or 1 */
 UNIV_INLINE
 byte
-mutex_test_and_set(
+ib_mutex_test_and_set(
 /*===============*/
-	mutex_t*	mutex)	/*!< in: mutex */
+	ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 #if defined(HAVE_ATOMIC_BUILTINS)
 	return(os_atomic_test_and_set_byte(&mutex->lock_word, 1));
@@ -94,7 +97,7 @@ mutex_test_and_set(
 		mutex->lock_word = 1;
 	}
 
-	return((byte)ret);
+	return((byte) ret);
 #endif
 }
 
@@ -105,7 +108,7 @@ UNIV_INLINE
 void
 mutex_reset_lock_word(
 /*==================*/
-	mutex_t*	mutex)	/*!< in: mutex */
+	ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 #if defined(HAVE_ATOMIC_BUILTINS)
 	/* In theory __sync_lock_release should be used to release the lock.
@@ -125,7 +128,7 @@ UNIV_INLINE
 lock_word_t
 mutex_get_lock_word(
 /*================*/
-	const mutex_t*	mutex)	/*!< in: mutex */
+	const ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	ut_ad(mutex);
 
@@ -139,7 +142,7 @@ UNIV_INLINE
 ulint
 mutex_get_waiters(
 /*==============*/
-	const mutex_t*	mutex)	/*!< in: mutex */
+	const ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	const volatile ulint*	ptr;	/*!< declared volatile to ensure that
 					the value is read from memory */
@@ -158,7 +161,7 @@ UNIV_INLINE
 void
 mutex_exit_func(
 /*============*/
-	mutex_t*	mutex)	/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex)	/*!< in: pointer to mutex */
 {
 	ut_ad(mutex_own(mutex));
 
@@ -192,6 +195,55 @@ mutex_exit_func(
 }
 
 /******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+	ib_prio_mutex_t*	mutex)	/*!< in: pointer to mutex */
+{
+	ut_ad(mutex_own(mutex));
+
+	ut_d(mutex->base_mutex.thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+	sync_thread_reset_level(&mutex->base_mutex);
+#endif
+	mutex_reset_lock_word(&mutex->base_mutex);
+
+	/* A problem: we assume that mutex_reset_lock_word
+	is a memory barrier, that is when we read the waiters
+	field next, the read must be serialized in memory
+	after the reset. A speculative processor might
+	perform the read first, which could leave a waiting
+	thread hanging indefinitely.
+
+	Our current solution is to call
+	sync_arr_wake_threads_if_sema_free() every second
+	to wake up possible hanging threads if
+	they are missed in mutex_signal_object. */
+
+	/* Wake up any high priority waiters first.  */
+	if (mutex->high_priority_waiters != 0) {
+
+		mutex->high_priority_waiters = 0;
+		os_event_set(mutex->high_priority_event);
+		sync_array_object_signalled();
+
+	} else if (mutex_get_waiters(&mutex->base_mutex) != 0) {
+
+		mutex_signal_object(&mutex->base_mutex);
+	}
+
+#ifdef UNIV_SYNC_PERF_STAT
+	mutex_exit_count++;
+#endif
+
+}
+
+
+/******************************************************************//**
 Locks a mutex for the current thread. If the mutex is reserved, the function
 spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
 before suspending the thread. */
@@ -199,7 +251,7 @@ UNIV_INLINE
 void
 mutex_enter_func(
 /*=============*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where locked */
 	ulint		line)		/*!< in: line where locked */
 {
@@ -209,9 +261,7 @@ mutex_enter_func(
 	/* Note that we do not peek at the value of lock_word before trying
 	the atomic test_and_set; we could peek, and possibly save time. */
 
-	ut_d(mutex->count_using++);
-
-	if (!mutex_test_and_set(mutex)) {
+	if (!ib_mutex_test_and_set(mutex)) {
 		ut_d(mutex->thread_id = os_thread_get_curr_id());
 #ifdef UNIV_SYNC_DEBUG
 		mutex_set_debug_info(mutex, file_name, line);
@@ -219,9 +269,55 @@ mutex_enter_func(
 		return;	/* Succeeded! */
 	}
 
-	mutex_spin_wait(mutex, file_name, line);
+	mutex_spin_wait(mutex, false, file_name, line);
 }
 
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a priority mutex for the current thread. If the mutex is
+reserved the function spins a preset time (controlled by SYNC_SPIN_ROUNDS)
+waiting for the mutex before suspending the thread. If the thread is suspended,
+the priority argument value determines the relative order for its wake up.  Any
+HIGH_PRIO waiters will be woken up before any LOW_PRIO waiters.  In case of
+DEFAULT_PRIO, the relative priority will be set according to
+srv_current_thread_priority.  */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where
+						locked */
+	ulint			line,		/*!< in: line where locked */
+	enum ib_sync_priority	priority)
+						/*!<in: mutex acquisition
+						priority */
+{
+	bool	high_priority;
+
+	ut_ad(mutex_validate(&mutex->base_mutex));
+	ut_ad(!mutex_own(mutex));
+
+	/* Note that we do not peek at the value of lock_word before trying
+	the atomic test_and_set; we could peek, and possibly save time. */
+
+	if (!ib_mutex_test_and_set(&mutex->base_mutex)) {
+		ut_d(mutex->base_mutex.thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+		mutex_set_debug_info(&mutex->base_mutex, file_name, line);
+#endif
+		return;	/* Succeeded! */
+	}
+
+	if (UNIV_LIKELY(priority == DEFAULT_PRIO)) {
+		high_priority = srv_current_thread_priority;
+	} else {
+		high_priority = (priority == HIGH_PRIO);
+	}
+	mutex_spin_wait(mutex, high_priority, file_name, line);
+}
+
+
 #ifdef UNIV_PFS_MUTEX
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_enter(), not directly
@@ -232,28 +328,62 @@ UNIV_INLINE
 void
 pfs_mutex_enter_func(
 /*=================*/
-	mutex_t*	mutex,	/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,	/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where locked */
 	ulint		line)		/*!< in: line where locked */
 {
-	struct PSI_mutex_locker*	locker = NULL;
-	PSI_mutex_locker_state		state;
-	int	result = 0;
-
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		locker = PSI_server->get_thread_mutex_locker(
-				&state, mutex->pfs_psi, PSI_MUTEX_LOCK);
-		if (locker) {
-			PSI_server->start_mutex_wait(locker, file_name, line);
+	if (mutex->pfs_psi != NULL) {
+		PSI_mutex_locker*	locker;
+		PSI_mutex_locker_state	state;
+
+		locker = PSI_MUTEX_CALL(start_mutex_wait)(
+			&state, mutex->pfs_psi,
+			PSI_MUTEX_LOCK, file_name, line);
+
+		mutex_enter_func(mutex, file_name, line);
+
+		if (locker != NULL) {
+			PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
 		}
+	} else {
+		mutex_enter_func(mutex, file_name, line);
 	}
+}
 
-	mutex_enter_func(mutex, file_name, line);
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where
+						locked */
+	ulint			line,		/*!< in: line where locked */
+	enum ib_sync_priority	priority)	/*!<in: mutex acquisition
+						priority */
+{
+	if (mutex->base_mutex.pfs_psi != NULL) {
+		PSI_mutex_locker*	locker;
+		PSI_mutex_locker_state	state;
 
-	if (locker) {
-		PSI_server->end_mutex_wait(locker, result);
+		locker = PSI_MUTEX_CALL(start_mutex_wait)(
+			&state, mutex->base_mutex.pfs_psi,
+			PSI_MUTEX_LOCK, file_name, line);
+
+		mutex_enter_func(mutex, file_name, line, priority);
+
+		if (locker != NULL) {
+			PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
+		}
+	} else {
+		mutex_enter_func(mutex, file_name, line, priority);
 	}
 }
+
 /********************************************************************//**
 NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
 this function!
@@ -264,31 +394,53 @@ UNIV_INLINE
 ulint
 pfs_mutex_enter_nowait_func(
 /*========================*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name,	/*!< in: file name where mutex
 					requested */
 	ulint		line)		/*!< in: line where requested */
 {
-	ulint	ret;
-	struct PSI_mutex_locker*	locker = NULL;
-	PSI_mutex_locker_state		state;
-
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		locker = PSI_server->get_thread_mutex_locker(
-				&state, mutex->pfs_psi, PSI_MUTEX_TRYLOCK);
-		if (locker) {
-			PSI_server->start_mutex_wait(locker, file_name, line);
-		}
-	}
+	ulint		ret;
+
+	if (mutex->pfs_psi != NULL) {
+		PSI_mutex_locker*	locker;
+		PSI_mutex_locker_state		state;
+
+		locker = PSI_MUTEX_CALL(start_mutex_wait)(
+			&state, mutex->pfs_psi,
+			PSI_MUTEX_TRYLOCK, file_name, line);
 
-	ret = mutex_enter_nowait_func(mutex, file_name, line);
+		ret = mutex_enter_nowait_func(mutex, file_name, line);
 
-	if (locker) {
-		PSI_server->end_mutex_wait(locker, ret);
+		if (locker != NULL) {
+			PSI_MUTEX_CALL(end_mutex_wait)(locker, (int) ret);
+		}
+	} else {
+		ret = mutex_enter_nowait_func(mutex, file_name, line);
 	}
 
 	return(ret);
 }
+
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func.
+@return	0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*		file_name,	/*!< in: file name where mutex
+						  requested */
+	ulint			line)		/*!< in: line where
+						  requested */
+{
+	return pfs_mutex_enter_nowait_func(&mutex->base_mutex, file_name,
+					   line);
+}
+
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_exit(), not directly
 this function!
@@ -298,15 +450,34 @@ UNIV_INLINE
 void
 pfs_mutex_exit_func(
 /*================*/
-	mutex_t*	mutex)	/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex)	/*!< in: pointer to mutex */
+{
+	if (mutex->pfs_psi != NULL) {
+		PSI_MUTEX_CALL(unlock_mutex)(mutex->pfs_psi);
+	}
+
+	mutex_exit_func(mutex);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+	ib_prio_mutex_t*	mutex)	/*!< in: pointer to mutex */
 {
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		PSI_server->unlock_mutex(mutex->pfs_psi);
+	if (mutex->base_mutex.pfs_psi != NULL) {
+		PSI_MUTEX_CALL(unlock_mutex)(mutex->base_mutex.pfs_psi);
 	}
 
 	mutex_exit_func(mutex);
 }
 
+
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_create(), not directly
 this function!
@@ -318,7 +489,7 @@ void
 pfs_mutex_create_func(
 /*==================*/
 	mysql_pfs_key_t	key,		/*!< in: Performance Schema key */
-	mutex_t*	mutex,		/*!< in: pointer to memory */
+	ib_mutex_t*	mutex,		/*!< in: pointer to memory */
 # ifdef UNIV_DEBUG
 #  ifdef UNIV_SYNC_DEBUG
 	ulint		level,		/*!< in: level */
@@ -328,9 +499,7 @@ pfs_mutex_create_func(
 # endif /* UNIV_DEBUG */
 	const char*	cmutex_name)	/*!< in: mutex name */
 {
-	mutex->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
-				? PSI_server->init_mutex(key, mutex)
-				: NULL;
+	mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
 
 	mutex_create_func(mutex,
 # ifdef UNIV_DEBUG
@@ -342,6 +511,45 @@ pfs_mutex_create_func(
 # endif /* UNIV_DEBUG */
 			  cmutex_name);
 }
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+#  ifdef UNIV_SYNC_DEBUG
+	ulint			level,		/*!< in: level */
+#  endif /* UNIV_SYNC_DEBUG */
+	const char*		cfile_name,	/*!< in: file name where
+						created */
+	ulint			cline,		/*!< in: file line where
+						  created */
+# endif /* UNIV_DEBUG */
+	const char*		cmutex_name)
+{
+	mutex->base_mutex.pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
+
+	mutex_create_func(mutex,
+# ifdef UNIV_DEBUG
+#  ifdef UNIV_SYNC_DEBUG
+			  level,
+#  endif /* UNIV_SYNC_DEBUG */
+			  cfile_name,
+			  cline,
+# endif /* UNIV_DEBUG */
+			  cmutex_name);
+}
+
+
 /******************************************************************//**
 NOTE! Please use the corresponding macro mutex_free(), not directly
 this function!
@@ -351,14 +559,74 @@ UNIV_INLINE
 void
 pfs_mutex_free_func(
 /*================*/
-	mutex_t*	mutex)	/*!< in: mutex */
+	ib_mutex_t*	mutex)	/*!< in: mutex */
 {
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		PSI_server->destroy_mutex(mutex->pfs_psi);
+	if (mutex->pfs_psi != NULL) {
+		PSI_MUTEX_CALL(destroy_mutex)(mutex->pfs_psi);
 		mutex->pfs_psi = NULL;
 	}
 
 	mutex_free_func(mutex);
 }
 
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+	ib_prio_mutex_t*	mutex)	/*!< in: mutex */
+{
+	if (mutex->base_mutex.pfs_psi != NULL) {
+		PSI_MUTEX_CALL(destroy_mutex)(mutex->base_mutex.pfs_psi);
+		mutex->base_mutex.pfs_psi = NULL;
+	}
+
+	mutex_free_func(mutex);
+}
+
+
 #endif /* UNIV_PFS_MUTEX */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+	ib_mutex_t*	mutex,		/*!< in: mutex guarding the dec */
+	volatile ulint*	var,		/*!< in/out: variable to decrement */
+	ulint		delta)		/*!< in: delta to decrement */
+{
+	mutex_enter(mutex);
+
+	/* I don't think we will encounter a situation where
+	this check will not be required. */
+	ut_ad(*var >= delta);
+
+	*var -= delta;
+
+	mutex_exit(mutex);
+}
+
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+	ib_mutex_t*	mutex,		/*!< in: mutex guarding the increment */
+	volatile ulint*	var,		/*!< in/out: variable to increment */
+	ulint		delta)		/*!< in: delta to increment */
+{
+	mutex_enter(mutex);
+
+	*var += delta;
+
+	mutex_exit(mutex);
+}
+#endif /* !HAVE_ATOMIC_BUILTINS */
diff --git a/storage/xtradb/include/sync0types.h b/storage/xtradb/include/sync0types.h
index 5e800240888..67f613ab8ae 100644
--- a/storage/xtradb/include/sync0types.h
+++ b/storage/xtradb/include/sync0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,9 +26,19 @@ Created 9/5/1995 Heikki Tuuri
 #ifndef sync0types_h
 #define sync0types_h
 
-/** Rename mutex_t to avoid name space collision on some systems */
-#define mutex_t ib_mutex_t
-/** InnoDB mutex */
-typedef struct mutex_struct		mutex_t;
+struct ib_mutex_t;
+
+/* The relative priority of the current thread.  If 0, low priority; if 1, high
+priority.  */
+extern UNIV_THREAD_LOCAL ulint srv_current_thread_priority;
+
+struct ib_prio_mutex_t;
+
+/** Priority mutex and rwlatch acquisition priorities */
+enum ib_sync_priority {
+	DEFAULT_PRIO,
+	LOW_PRIO,
+	HIGH_PRIO
+};
 
 #endif
diff --git a/storage/xtradb/include/trx0i_s.h b/storage/xtradb/include/trx0i_s.h
index c67227369a7..ac5e00c6834 100644
--- a/storage/xtradb/include/trx0i_s.h
+++ b/storage/xtradb/include/trx0i_s.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -66,36 +66,34 @@ do {								\
 		strncpy(buff, data, constraint);		\
 		buff[constraint] = '\0';			\
 								\
-		field = ha_storage_put_memlim(			\
+		field = static_cast<const char*>(		\
+			ha_storage_put_memlim(			\
 			(tcache)->storage, buff, constraint + 1,\
-			MAX_ALLOWED_FOR_STORAGE(tcache));	\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
 	} else {						\
-		field = ha_storage_put_str_memlim(		\
+		field = static_cast<const char*>(		\
+			ha_storage_put_str_memlim(		\
 			(tcache)->storage, data,		\
-			MAX_ALLOWED_FOR_STORAGE(tcache));	\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
 	}							\
 } while (0)
 
 /** A row of INFORMATION_SCHEMA.innodb_locks */
-typedef struct i_s_locks_row_struct	i_s_locks_row_t;
-/** A row of INFORMATION_SCHEMA.innodb_trx */
-typedef struct i_s_trx_row_struct i_s_trx_row_t;
-/** A row of INFORMATION_SCHEMA.innodb_lock_waits */
-typedef struct i_s_lock_waits_row_struct i_s_lock_waits_row_t;
+struct i_s_locks_row_t;
 
 /** Objects of trx_i_s_cache_t::locks_hash */
-typedef struct i_s_hash_chain_struct	i_s_hash_chain_t;
+struct i_s_hash_chain_t;
 
 /** Objects of this type are added to the hash table
 trx_i_s_cache_t::locks_hash */
-struct i_s_hash_chain_struct {
+struct i_s_hash_chain_t {
 	i_s_locks_row_t*	value;	/*!< row of
 					INFORMATION_SCHEMA.innodb_locks*/
 	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
 };
 
 /** This structure represents INFORMATION_SCHEMA.innodb_locks row */
-struct i_s_locks_row_struct {
+struct i_s_locks_row_t {
 	trx_id_t	lock_trx_id;	/*!< transaction identifier */
 	const char*	lock_mode;	/*!< lock mode from
 					lock_get_mode_str() */
@@ -126,16 +124,16 @@ struct i_s_locks_row_struct {
 };
 
 /** This structure represents INFORMATION_SCHEMA.innodb_trx row */
-struct i_s_trx_row_struct {
+struct i_s_trx_row_t {
 	trx_id_t		trx_id;		/*!< transaction identifier */
 	const char*		trx_state;	/*!< transaction state from
 						trx_get_que_state_str() */
-	ib_time_t		trx_started;	/*!< trx_struct::start_time */
+	ib_time_t		trx_started;	/*!< trx_t::start_time */
 	const i_s_locks_row_t*	requested_lock_row;
 					/*!< pointer to a row
 					in innodb_locks if trx
 					is waiting, or NULL */
-	ib_time_t	trx_wait_started; /*!< trx_struct::wait_started */
+	ib_time_t	trx_wait_started; /*!< trx_t::wait_started */
 	ullint		trx_weight;	/*!< TRX_WEIGHT() */
 	ulint		trx_mysql_thread_id; /*!< thd_get_thread_id() */
 	const char*	trx_query;	/*!< MySQL statement being
@@ -143,46 +141,49 @@ struct i_s_trx_row_struct {
 	struct charset_info_st*	trx_query_cs;
 					/*!< charset encode the MySQL
 					statement */
-	const char*	trx_operation_state; /*!< trx_struct::op_info */
+	const char*	trx_operation_state; /*!< trx_t::op_info */
 	ulint		trx_tables_in_use;/*!< n_mysql_tables_in_use in
-					 trx_struct */
+					 trx_t */
 	ulint		trx_tables_locked;
 					/*!< mysql_n_tables_locked in
-					trx_struct */
+					trx_t */
 	ulint		trx_lock_structs;/*!< list len of trx_locks in
-					trx_struct */
+					trx_t */
 	ulint		trx_lock_memory_bytes;
 					/*!< mem_heap_get_size(
 					trx->lock_heap) */
 	ulint		trx_rows_locked;/*!< lock_number_of_rows_locked() */
-	ullint		trx_rows_modified;/*!< trx_struct::undo_no */
+	ullint		trx_rows_modified;/*!< trx_t::undo_no */
 	ulint		trx_concurrency_tickets;
 					/*!< n_tickets_to_enter_innodb in
-					trx_struct */
+					trx_t */
 	const char*	trx_isolation_level;
-					/*!< isolation_level in trx_struct*/
+					/*!< isolation_level in trx_t */
 	ibool		trx_unique_checks;
-					/*!< check_unique_secondary in
-					trx_struct*/
+					/*!< check_unique_secondary in trx_t*/
 	ibool		trx_foreign_key_checks;
-					/*!< check_foreigns in trx_struct */
+					/*!< check_foreigns in trx_t */
 	const char*	trx_foreign_key_error;
-					/*!< detailed_error in trx_struct */
+					/*!< detailed_error in trx_t */
 	ibool		trx_has_search_latch;
-					/*!< has_search_latch in trx_struct */
+					/*!< has_search_latch in trx_t */
 	ulint		trx_search_latch_timeout;
-					/*!< search_latch_timeout in
-					trx_struct */
+					/*!< search_latch_timeout in trx_t */
+	ulint		trx_is_read_only;
+					/*!< trx_t::read_only */
+	ulint		trx_is_autocommit_non_locking;
+					/*!< trx_is_autocommit_non_locking(trx)
+					*/
 };
 
 /** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
-struct i_s_lock_waits_row_struct {
+struct i_s_lock_waits_row_t {
 	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
 	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
 };
 
 /** Cache of INFORMATION_SCHEMA table data */
-typedef struct trx_i_s_cache_struct	trx_i_s_cache_t;
+struct trx_i_s_cache_t;
 
 /** Auxiliary enum used by functions that need to select one of the
 INFORMATION_SCHEMA tables */
@@ -307,4 +308,8 @@ trx_i_s_create_lock_id(
 	ulint			lock_id_size);/*!< in: size of the lock id
 					buffer */
 
+UNIV_INTERN
+void
+trx_i_s_get_lock_sys_memory_usage(ulint *constant, ulint *variable);
+
 #endif /* trx0i_s_h */
diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h
index f8f662125a7..a862523c092 100644
--- a/storage/xtradb/include/trx0purge.h
+++ b/storage/xtradb/include/trx0purge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -52,17 +52,6 @@ trx_purge_get_log_from_hist(
 /*========================*/
 	fil_addr_t	node_addr);	/*!< in: file address of the history
 					list node of the log */
-/*****************************************************************//**
-Checks if trx_id is >= purge_view: then it is guaranteed that its update
-undo log still exists in the system.
-@return TRUE if is sure that it is preserved, also if the function
-returns FALSE, it is possible that the undo log still exists in the
-system */
-UNIV_INTERN
-ibool
-trx_purge_update_undo_must_exist(
-/*=============================*/
-	trx_id_t	trx_id);/*!< in: transaction id */
 /********************************************************************//**
 Creates the global purge system control structure and inits the history
 mutex. */
@@ -70,7 +59,8 @@ UNIV_INTERN
 void
 trx_purge_sys_create(
 /*=================*/
-	ib_bh_t*	ib_bh);	/*!< in/own: UNDO log min binary heap*/
+	ulint		n_purge_threads,/*!< in: number of purge threads */
+	ib_bh_t*	ib_bh);		/*!< in/own: UNDO log min binary heap*/
 /********************************************************************//**
 Frees the global purge system control structure. */
 UNIV_INTERN
@@ -88,26 +78,6 @@ trx_purge_add_update_undo_to_history(
 	page_t*	undo_page,	/*!< in: update undo log header page,
 				x-latched */
 	mtr_t*	mtr);		/*!< in: mtr */
-/********************************************************************//**
-Fetches the next undo log record from the history list to purge. It must be
-released with the corresponding release function.
-@return copy of an undo log record or pointer to trx_purge_dummy_rec,
-if the whole undo log can skipped in purge; NULL if none left */
-UNIV_INTERN
-trx_undo_rec_t*
-trx_purge_fetch_next_rec(
-/*=====================*/
-	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
-	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
-				purge array */
-	mem_heap_t*	heap);	/*!< in: memory heap where copied */
-/*******************************************************************//**
-Releases a reserved purge undo record. */
-UNIV_INTERN
-void
-trx_purge_rec_release(
-/*==================*/
-	trx_undo_inf_t*	cell);	/*!< in: storage cell */
 /*******************************************************************//**
 This function runs a purge batch.
 @return	number of undo log pages handled in the batch */
@@ -115,48 +85,102 @@ UNIV_INTERN
 ulint
 trx_purge(
 /*======*/
-	ulint	limit);		/*!< in: the maximum number of records to
-				purge in one batch */
-/******************************************************************//**
-Prints information of the purge system to stderr. */
+	ulint	n_purge_threads,	/*!< in: number of purge tasks to
+					submit to task queue. */
+	ulint	limit,			/*!< in: the maximum number of
+					records to purge in one batch */
+	bool	truncate);		/*!< in: truncate history if true */
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
 UNIV_INTERN
 void
-trx_purge_sys_print(void);
-/*======================*/
+trx_purge_stop(void);
+/*================*/
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
+UNIV_INTERN
+void
+trx_purge_run(void);
+/*================*/
+
+/** Purge states */
+enum purge_state_t {
+	PURGE_STATE_INIT,		/*!< Purge instance created */
+	PURGE_STATE_RUN,		/*!< Purge should be running */
+	PURGE_STATE_STOP,		/*!< Purge should be stopped */
+	PURGE_STATE_EXIT,		/*!< Purge has been shutdown */
+	PURGE_STATE_DISABLED		/*!< Purge was never started */
+};
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void);
+/*=================*/
+
+/** This is the purge pointer/iterator. We need both the undo no and the
+transaction no up to which purge has parsed and applied the records. */
+struct purge_iter_t {
+	trx_id_t	trx_no;		/*!< Purge has advanced past all
+					transactions whose number is less
+					than this */
+	undo_no_t	undo_no;	/*!< Purge has advanced past all records
+					whose undo number is less than this */
+};
 
 /** The control structure used in the purge operation */
-struct trx_purge_struct{
-	ulint		state;		/*!< Purge system state */
+struct trx_purge_t{
 	sess_t*		sess;		/*!< System session running the purge
 					query */
 	trx_t*		trx;		/*!< System transaction running the
-					purge
-					query: this trx is not in the trx list
-					of the trx system and it never ends */
-	que_t*		query;		/*!< The query graph which will do the
-					parallelized purge operation */
-	rw_lock_t	latch;		/*!< The latch protecting the purge
-					view.  A purge operation must acquire
-					an x-latch here for the instant at which
+					purge query: this trx is not in the
+					trx list of the trx system and it
+					never ends */
+	prio_rw_lock_t	latch;		/*!< The latch protecting the purge
+					view. A purge operation must acquire an
+					x-latch here for the instant at which
 					it changes the purge view: an undo
 					log operation can prevent this by
-					obtaining an s-latch here. */
+					obtaining an s-latch here. It also
+					protects state and running */
+	os_event_t	event;		/*!< State signal event */
+	ulint		n_stop;		/*!< Counter to track number of stops */
+	volatile bool	running;	/*!< true, if purge is active,
+					we check this without the latch too */
+	volatile purge_state_t	state;	/*!< Purge coordinator thread states,
+					we check this in several places
+					without holding the latch. */
+	que_t*		query;		/*!< The query graph which will do the
+					parallelized purge operation */
 	read_view_t*	view;		/*!< The purge will not remove undo logs
 					which are >= this view (purge view) */
+	read_view_t*	prebuilt_clone;	/*!< Pre-built view which is used as a
+					temporary clone of the oldest view in
+					read_view_purge_open() */
 	read_view_t*	prebuilt_view;	/*!< Pre-built view array */
-	ulonglong	n_pages_handled;/*!< Approximate number of undo log
-					pages processed in purge */
-	ulonglong	handle_limit;	/*!< Target of how many pages to get
-					processed in the current purge */
+	volatile ulint	n_submitted;	/*!< Count of total tasks submitted
+					to the task queue */
+	volatile ulint	n_completed;	/*!< Count of total tasks completed */
+
 	/*------------------------------*/
 	/* The following two fields form the 'purge pointer' which advances
 	during a purge, and which is used in history list truncation */
 
-	trx_id_t	purge_trx_no;	/*!< Purge has advanced past all
-					transactions whose number is less
-					than this */
-	undo_no_t	purge_undo_no;	/*!< Purge has advanced past all records
-					whose undo number is less than this */
+	purge_iter_t	iter;		/* Limit up to which we have read and
+					parsed the UNDO log records.  Not
+					necessarily purged from the indexes.
+					Note that this can never be less than
+					the limit below, we check for this
+					invariant in trx0purge.cc */
+	purge_iter_t	limit;		/* The 'purge pointer' which advances
+					during a purge, and which is used in
+					history list truncation */
+#ifdef UNIV_DEBUG
+	purge_iter_t	done;		/* Indicate 'purge pointer' which have
+					purged already accurately. */
+#endif /* UNIV_DEBUG */
 	/*-----------------------------*/
 	ibool		next_stored;	/*!< TRUE if the info of the next record
 					to purge is stored below: if yes, then
@@ -175,9 +199,6 @@ struct trx_purge_struct{
 					the next record to purge belongs */
 	ulint		hdr_offset;	/*!< Header byte offset on the page */
 	/*-----------------------------*/
-	trx_undo_arr_t*	arr;		/*!< Array of transaction numbers and
-					undo numbers of the undo records
-					currently under processing in purge */
 	mem_heap_t*	heap;		/*!< Temporary storage used during a
 					purge: can be emptied after purge
 					completes */
@@ -185,12 +206,15 @@ struct trx_purge_struct{
 	ib_bh_t*	ib_bh;		/*!< Binary min-heap, ordered on
 					rseg_queue_t::trx_no. It is protected
 					by the bh_mutex */
-	mutex_t		bh_mutex;	/*!< Mutex protecting ib_bh */
+	ib_mutex_t		bh_mutex;	/*!< Mutex protecting ib_bh */
+};
+
+/** Info required to purge a record */
+struct trx_purge_rec_t {
+	trx_undo_rec_t*	undo_rec;	/*!< Record to purge */
+	roll_ptr_t	roll_ptr;	/*!< File pointer to UNDO record */
 };
 
-#define TRX_PURGE_ON		1	/* purge operation is running */
-#define TRX_STOP_PURGE		2	/* purge operation is stopped, or
-					it should be stopped */
 #ifndef UNIV_NONINL
 #include "trx0purge.ic"
 #endif
diff --git a/storage/xtradb/include/trx0purge.ic b/storage/xtradb/include/trx0purge.ic
index 800d26ba51b..ca9cc1fb894 100644
--- a/storage/xtradb/include/trx0purge.ic
+++ b/storage/xtradb/include/trx0purge.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -41,3 +41,22 @@ trx_purge_get_log_from_hist(
 	return(node_addr);
 }
 
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Debug check that the purge limit has not advanced past the purge iterator.
+@return	TRUE always; a violated limit <= iter invariant trips ut_ad() */
+UNIV_INLINE
+ibool
+trx_purge_check_limit(void)
+/*=======================*/
+{
+	ut_ad(purge_sys->limit.trx_no <= purge_sys->iter.trx_no);
+
+	if (purge_sys->limit.trx_no == purge_sys->iter.trx_no) {
+		ut_ad(purge_sys->limit.undo_no <= purge_sys->iter.undo_no);
+	}
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
diff --git a/storage/xtradb/include/trx0rec.h b/storage/xtradb/include/trx0rec.h
index a6e54d6dfd1..50da55d2ea3 100644
--- a/storage/xtradb/include/trx0rec.h
+++ b/storage/xtradb/include/trx0rec.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -105,10 +105,11 @@ trx_undo_rec_get_pars(
 					TRX_UNDO_INSERT_REC, ... */
 	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
 					for update type records */
-	ibool*		updated_extern,	/*!< out: TRUE if we updated an
+	bool*		updated_extern,	/*!< out: true if we updated an
 					externally stored fild */
 	undo_no_t*	undo_no,	/*!< out: undo log record number */
-	table_id_t*	table_id);	/*!< out: table id */
+	table_id_t*	table_id)	/*!< out: table id */
+	__attribute__((nonnull));
 /*******************************************************************//**
 Builds a row reference from an undo log record.
 @return	pointer to remaining part of undo record */
@@ -178,8 +179,9 @@ trx_undo_update_rec_get_update(
 				needed is allocated */
 	upd_t**		upd);	/*!< out, own: update vector */
 /*******************************************************************//**
-Builds a partial row from an update undo log record. It contains the
-columns which occur as ordering in any index of the table.
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
 @return	pointer to remaining part of undo record */
 UNIV_INTERN
 byte*
@@ -197,8 +199,9 @@ trx_undo_rec_get_partial_row(
 	ibool		ignore_prefix, /*!< in: flag to indicate if we
 				expect blob prefixes in undo. Used
 				only in the assertion. */
-	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
 				needed is allocated */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Writes information to an undo log about an insert, update, or a delete marking
 of a clustered index record. This information is used in a rollback of the
@@ -206,7 +209,7 @@ transaction and in consistent reads that must look to the history of this
 transaction.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 trx_undo_report_row_operation(
 /*==========================*/
 	ulint		flags,		/*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
@@ -225,10 +228,12 @@ trx_undo_report_row_operation(
 	const rec_t*	rec,		/*!< in: case of an update or delete
 					marking, the record in the clustered
 					index, otherwise NULL */
-	roll_ptr_t*	roll_ptr);	/*!< out: rollback pointer to the
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	roll_ptr_t*	roll_ptr)	/*!< out: rollback pointer to the
 					inserted undo log record,
 					0 if BTR_NO_UNDO_LOG
 					flag was specified */
+	__attribute__((nonnull(3,4,10), warn_unused_result));
 /******************************************************************//**
 Copies an undo record to heap. This function can be called if we know that
 the undo log record exists.
@@ -238,35 +243,17 @@ trx_undo_rec_t*
 trx_undo_get_undo_rec_low(
 /*======================*/
 	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
-	mem_heap_t*	heap);		/*!< in: memory heap where copied */
-/******************************************************************//**
-Copies an undo record to heap.
-
-NOTE: the caller must have latches on the clustered index page and
-purge_view.
-
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
-truncated and we cannot fetch the old version */
-UNIV_INTERN
-ulint
-trx_undo_get_undo_rec(
-/*==================*/
-	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
-	trx_id_t	trx_id,		/*!< in: id of the trx that generated
-					the roll pointer: it points to an
-					undo log of this transaction */
-	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
-	mem_heap_t*	heap);		/*!< in: memory heap where copied */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
-Build a previous version of a clustered index record. This function checks
-that the caller has a latch on the index page of the clustered index record
-and an s-latch on the purge_view. This guarantees that the stack of versions
-is locked.
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
-earlier than purge_view, which means that it may have been removed,
-DB_ERROR if corrupted record */
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+which means that it may have been removed */
 UNIV_INTERN
-ulint
+bool
 trx_undo_prev_version_build(
 /*========================*/
 	const rec_t*	index_rec,/*!< in: clustered index record in the
@@ -275,12 +262,13 @@ trx_undo_prev_version_build(
 				index_rec page and purge_view */
 	const rec_t*	rec,	/*!< in: version of a clustered index record */
 	dict_index_t*	index,	/*!< in: clustered index */
-	ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
 	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
 				needed is allocated */
-	rec_t**		old_vers);/*!< out, own: previous version, or NULL if
+	rec_t**		old_vers)/*!< out, own: previous version, or NULL if
 				rec is the first inserted version, or if
 				history data has been deleted */
+	__attribute__((nonnull));
 #endif /* !UNIV_HOTBACKUP */
 /***********************************************************//**
 Parses a redo log record of adding an undo log record.
diff --git a/storage/xtradb/include/trx0rec.ic b/storage/xtradb/include/trx0rec.ic
index 4fc5a7147f9..08704f6b821 100644
--- a/storage/xtradb/include/trx0rec.ic
+++ b/storage/xtradb/include/trx0rec.ic
@@ -90,7 +90,7 @@ trx_undo_rec_get_offset(
 /*====================*/
 	undo_no_t	undo_no)	/*!< in: undo no read from node */
 {
-	return (3 + mach_ull_get_much_compressed_size(undo_no));
+	return(3 + mach_ull_get_much_compressed_size(undo_no));
 }
 
 /***********************************************************************//**
@@ -108,6 +108,6 @@ trx_undo_rec_copy(
 	len = mach_read_from_2(undo_rec)
 		- ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
 	ut_ad(len < UNIV_PAGE_SIZE);
-	return(mem_heap_dup(heap, undo_rec, len));
+	return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len));
 }
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0roll.h b/storage/xtradb/include/trx0roll.h
index db68ae0a8d6..aa3dbb1f6cd 100644
--- a/storage/xtradb/include/trx0roll.h
+++ b/storage/xtradb/include/trx0roll.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,13 +27,12 @@ Created 3/26/1996 Heikki Tuuri
 #define trx0roll_h
 
 #include "univ.i"
+#include "btr0types.h"
 #include "trx0trx.h"
 #include "trx0types.h"
 #include "mtr0mtr.h"
 #include "trx0sys.h"
 
-#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
-
 /*******************************************************************//**
 Determines if this transaction is rolling back an incomplete transaction
 in crash recovery.
@@ -53,12 +52,6 @@ trx_savept_take(
 /*============*/
 	trx_t*	trx);	/*!< in: transaction */
 /*******************************************************************//**
-Creates an undo number array. */
-UNIV_INTERN
-trx_undo_arr_t*
-trx_undo_arr_create(void);
-/*=====================*/
-/*******************************************************************//**
 Frees an undo number array. */
 UNIV_INTERN
 void
@@ -74,13 +67,6 @@ trx_undo_arr_get_nth_info(
 /*======================*/
 	trx_undo_arr_t*	arr,	/*!< in: undo number array */
 	ulint		n);	/*!< in: position */
-/***********************************************************************//**
-Tries truncate the undo logs. */
-UNIV_INTERN
-void
-trx_roll_try_truncate(
-/*==================*/
-	trx_t*	trx);	/*!< in/out: transaction */
 /********************************************************************//**
 Pops the topmost record when the two undo logs of a transaction are seen
 as a single stack of records ordered by their undo numbers. Inserts the
@@ -116,19 +102,6 @@ trx_undo_rec_release(
 /*=================*/
 	trx_t*		trx,	/*!< in/out: transaction */
 	undo_no_t	undo_no);/*!< in: undo number */
-/*********************************************************************//**
-Starts a rollback operation. */
-UNIV_INTERN
-void
-trx_rollback(
-/*=========*/
-	trx_t*		trx,	/*!< in: transaction */
-	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
-	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread */
 /*******************************************************************//**
 Rollback or clean up any incomplete transactions which were
 encountered in crash recovery.  If the transaction already was
@@ -147,38 +120,13 @@ committed, then we clean up a possible insert undo log. If the
 transaction was not yet committed, then we roll it back.
 Note: this is done in a background thread.
 @return	a dummy parameter */
-UNIV_INTERN
+extern "C" UNIV_INTERN
 os_thread_ret_t
-trx_rollback_or_clean_all_recovered(
-/*================================*/
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
 	void*	arg __attribute__((unused)));
 			/*!< in: a dummy parameter required by
 			os_thread_create */
-/****************************************************************//**
-Finishes a transaction rollback. */
-UNIV_INTERN
-void
-trx_finish_rollback_off_kernel(
-/*===========================*/
-	que_t*		graph,	/*!< in: undo graph which can now be freed */
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if this parameter is
-				NULL, it is ignored */
-/****************************************************************//**
-Builds an undo 'query' graph for a transaction. The actual rollback is
-performed by executing this query graph like a query subprocedure call.
-The reply about the completion of the rollback will be sent by this
-graph.
-@return	own: the query graph */
-UNIV_INTERN
-que_t*
-trx_roll_graph_build(
-/*=================*/
-	trx_t*	trx);	/*!< in: trx handle */
 /*********************************************************************//**
 Creates a rollback command node struct.
 @return	own: rollback node struct */
@@ -199,29 +147,32 @@ trx_rollback_step(
 Rollback a transaction used in MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 trx_rollback_for_mysql(
 /*===================*/
-	trx_t*	trx);	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
+	__attribute__((nonnull));
 /*******************************************************************//**
 Rollback the latest SQL statement for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 trx_rollback_last_sql_stat_for_mysql(
 /*=================================*/
-	trx_t*	trx);	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
+	__attribute__((nonnull));
 /*******************************************************************//**
-Rollback a transaction used in MySQL.
+Rollback a transaction to a given savepoint or do a complete rollback.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
-trx_general_rollback_for_mysql(
-/*===========================*/
+dberr_t
+trx_rollback_to_savepoint(
+/*======================*/
 	trx_t*		trx,	/*!< in: transaction handle */
-	trx_savept_t*	savept);/*!< in: pointer to savepoint undo number, if
+	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
 				partial rollback requested, or NULL for
 				complete rollback */
+	__attribute__((nonnull(1)));
 /*******************************************************************//**
 Rolls back a transaction back to a named savepoint. Modifications after the
 savepoint are undone but InnoDB does NOT release the corresponding locks
@@ -232,17 +183,18 @@ were set after this savepoint are deleted.
 @return if no savepoint of the name found then DB_NO_SAVEPOINT,
 otherwise DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_rollback_to_savepoint_for_mysql(
 /*================================*/
 	trx_t*		trx,			/*!< in: transaction handle */
 	const char*	savepoint_name,		/*!< in: savepoint name */
-	ib_int64_t*	mysql_binlog_cache_pos);/*!< out: the MySQL binlog cache
+	ib_int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
 						position corresponding to this
 						savepoint; MySQL needs this
 						information to remove the
 						binlog entries of the queries
 						executed after the savepoint */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
 Creates a named savepoint. If the transaction is not yet started, starts it.
 If there is already a savepoint of the same name, this call erases that old
@@ -250,40 +202,30 @@ savepoint and replaces it with a new. Savepoints are deleted in a transaction
 commit or rollback.
 @return	always DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_savepoint_for_mysql(
 /*====================*/
 	trx_t*		trx,			/*!< in: transaction handle */
 	const char*	savepoint_name,		/*!< in: savepoint name */
-	ib_int64_t	binlog_cache_pos);	/*!< in: MySQL binlog cache
+	ib_int64_t	binlog_cache_pos)	/*!< in: MySQL binlog cache
 						position corresponding to this
 						connection at the time of the
 						savepoint */
-
+	__attribute__((nonnull));
 /*******************************************************************//**
 Releases a named savepoint. Savepoints which
 were set after this savepoint are deleted.
 @return if no savepoint of the name found then DB_NO_SAVEPOINT,
 otherwise DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_release_savepoint_for_mysql(
 /*============================*/
 	trx_t*		trx,			/*!< in: transaction handle */
-	const char*	savepoint_name);	/*!< in: savepoint name */
-
-/*******************************************************************//**
-Frees a single savepoint struct. */
-UNIV_INTERN
-void
-trx_roll_savepoint_free(
-/*=====================*/
-	trx_t*			trx,	/*!< in: transaction handle */
-	trx_named_savept_t*	savep);	/*!< in: savepoint to free */
-
+	const char*	savepoint_name)		/*!< in: savepoint name */
+	__attribute__((nonnull, warn_unused_result));
 /*******************************************************************//**
-Frees savepoint structs starting from savep, if savep == NULL then
-free all savepoints. */
+Frees savepoint structs starting from savep. */
 UNIV_INTERN
 void
 trx_roll_savepoints_free(
@@ -293,34 +235,35 @@ trx_roll_savepoints_free(
 					if this is NULL, free all savepoints
 					of trx */
 
-/** A cell of trx_undo_arr_struct; used during a rollback and a purge */
-struct	trx_undo_inf_struct{
+/** A cell of trx_undo_arr_t; used during a rollback and a purge */
+struct	trx_undo_inf_t{
+	ibool		in_use;	/*!< true if cell is being used */
 	trx_id_t	trx_no;	/*!< transaction number: not defined during
 				a rollback */
 	undo_no_t	undo_no;/*!< undo number of an undo record */
-	ibool		in_use;	/*!< TRUE if the cell is in use */
 };
 
 /** During a rollback and a purge, undo numbers of undo records currently being
 processed are stored in this array */
 
-struct trx_undo_arr_struct{
+struct trx_undo_arr_t{
 	ulint		n_cells;	/*!< number of cells in the array */
-	ulint		n_used;		/*!< number of cells currently in use */
+	ulint		n_used;		/*!< number of cells in use */
 	trx_undo_inf_t*	infos;		/*!< the array of undo infos */
 	mem_heap_t*	heap;		/*!< memory heap from which allocated */
 };
 
 /** Rollback node states */
 enum roll_node_state {
-	ROLL_NODE_SEND = 1,	/*!< about to send a rollback signal to
-				the transaction */
-	ROLL_NODE_WAIT		/*!< rollback signal sent to the transaction,
-				waiting for completion */
+	ROLL_NODE_NONE = 0,		/*!< Unknown state */
+	ROLL_NODE_SEND,			/*!< about to send a rollback signal to
+					the transaction */
+	ROLL_NODE_WAIT			/*!< rollback signal sent to the
+				       	transaction, waiting for completion */
 };
 
 /** Rollback command node in a query graph */
-struct roll_node_struct{
+struct roll_node_t{
 	que_common_t		common;	/*!< node type: QUE_NODE_ROLLBACK */
 	enum roll_node_state	state;	/*!< node execution state */
 	ibool			partial;/*!< TRUE if we want a partial
@@ -328,10 +271,11 @@ struct roll_node_struct{
 	trx_savept_t		savept;	/*!< savepoint to which to
 					roll back, in the case of a
 					partial rollback */
+	que_thr_t*		undo_thr;/*!< undo query graph */
 };
 
 /** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
-struct trx_named_savept_struct{
+struct trx_named_savept_t{
 	char*		name;		/*!< savepoint name */
 	trx_savept_t	savept;		/*!< the undo number corresponding to
 					the savepoint */
diff --git a/storage/xtradb/include/trx0roll.ic b/storage/xtradb/include/trx0roll.ic
index 6a4a5f54459..178e9bb730a 100644
--- a/storage/xtradb/include/trx0roll.ic
+++ b/storage/xtradb/include/trx0roll.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h
index 703b6e411a5..b9c84ef2b06 100644
--- a/storage/xtradb/include/trx0rseg.h
+++ b/storage/xtradb/include/trx0rseg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,6 +29,7 @@ Created 3/26/1996 Heikki Tuuri
 #include "univ.i"
 #include "trx0types.h"
 #include "trx0sys.h"
+#include "ut0bh.h"
 
 /******************************************************************//**
 Gets a rollback segment header.
@@ -86,11 +87,11 @@ trx_rsegf_undo_find_free(
 /******************************************************************//**
 Looks for a rollback segment, based on the rollback segment id.
 @return	rollback segment */
-UNIV_INTERN
+UNIV_INLINE
 trx_rseg_t*
 trx_rseg_get_on_id(
 /*===============*/
-	ulint	id);	/*!< in: rollback segment id */
+	ulint	id);		/*!< in: rollback segment id */
 /****************************************************************//**
 Creates a rollback segment header. This function is called only when
 a new rollback segment is created in the database.
@@ -107,30 +108,42 @@ trx_rseg_header_create(
 	mtr_t*	mtr);		/*!< in: mtr */
 /*********************************************************************//**
 Creates the memory copies for rollback segments and initializes the
-rseg list and array in trx_sys at a database startup. */
+rseg array in trx_sys at a database startup. */
 UNIV_INTERN
 void
-trx_rseg_list_and_array_init(
-/*=========================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+trx_rseg_array_init(
+/*================*/
+	trx_sysf_t*	sys_header,	/*!< in/out: trx system header */
 	ib_bh_t*	ib_bh,		/*!< in: rseg queue */
-	mtr_t*		mtr);		/*!< in: mtr */
-
+	mtr_t*		mtr);		/*!< in/out: mtr */
 /***************************************************************************
 Free's an instance of the rollback segment in memory. */
 UNIV_INTERN
 void
 trx_rseg_mem_free(
 /*==============*/
-	trx_rseg_t*	rseg);		/* in, own: instance to free */
+	trx_rseg_t*	rseg);		/*!< in, own: instance to free */
 
 /*********************************************************************
 Creates a rollback segment. */
 UNIV_INTERN
 trx_rseg_t*
-trx_rseg_create(void);
-/*==================*/
-
+trx_rseg_create(
+/*============*/
+	ulint	space);			/*!< in: id of UNDO tablespace */
+
+/********************************************************************
+Get the number of unique rollback tablespaces in use except space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have space for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+	ulint*		space_ids);	/*!< out: array of space ids of
+					UNDO tablespaces */
 /* Number of undo log slots in a rollback segment file copy */
 #define TRX_RSEG_N_SLOTS	(UNIV_PAGE_SIZE / 16)
 
@@ -138,11 +151,11 @@ trx_rseg_create(void);
 #define TRX_RSEG_MAX_N_TRXS	(TRX_RSEG_N_SLOTS / 2)
 
 /* The rollback segment memory object */
-struct trx_rseg_struct{
+struct trx_rseg_t{
 	/*--------------------------------------------------------*/
 	ulint		id;	/*!< rollback segment id == the index of
 				its slot in the trx system file copy */
-	mutex_t		mutex;	/*!< mutex protecting the fields in this
+	ib_prio_mutex_t		mutex;	/*!< mutex protecting the fields in this
 				struct except id, which is constant */
 	ulint		space;	/*!< space where the rollback segment is
 				header is placed */
@@ -176,20 +189,14 @@ struct trx_rseg_struct{
 					yet purged log */
 	ibool		last_del_marks;	/*!< TRUE if the last not yet purged log
 					needs purging */
-	/*--------------------------------------------------------*/
-	UT_LIST_NODE_T(trx_rseg_t) rseg_list;
-					/* the list of the rollback segment
-					memory objects */
 };
 
 /** For prioritising the rollback segments for purge. */
-struct rseg_queue_struct {
-	trx_id_t	trx_no;		/*!< trx_rseg_t::last_trx_no */
-	trx_rseg_t*	rseg;		/*!< Rollback segment */
+struct rseg_queue_t {
+        trx_id_t	trx_no;         /*!< trx_rseg_t::last_trx_no */
+        trx_rseg_t*     rseg;           /*!< Rollback segment */
 };
 
-typedef struct rseg_queue_struct rseg_queue_t;
-
 /* Undo log segment slot in a rollback segment header */
 /*-------------------------------------------------------------*/
 #define	TRX_RSEG_SLOT_PAGE_NO	0	/* Page number of the header page of
diff --git a/storage/xtradb/include/trx0rseg.ic b/storage/xtradb/include/trx0rseg.ic
index bb2684576d3..30743da9b8c 100644
--- a/storage/xtradb/include/trx0rseg.ic
+++ b/storage/xtradb/include/trx0rseg.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -86,7 +86,7 @@ trx_rsegf_get_nth_undo(
 	ulint		n,	/*!< in: index of slot */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+	if (n >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to get slot %lu of rseg\n",
 			(ulong) n);
@@ -108,7 +108,7 @@ trx_rsegf_set_nth_undo(
 	ulint		page_no,/*!< in: page number of the undo log segment */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+	if (n >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to set slot %lu of rseg\n",
 			(ulong) n);
@@ -150,3 +150,18 @@ trx_rsegf_undo_find_free(
 
 	return(ULINT_UNDEFINED);
 }
+
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return	rollback segment */
+UNIV_INLINE
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+	ulint	id)	/*!< in: rollback segment id */
+{
+	ut_a(id < TRX_SYS_N_RSEGS);
+
+	return(trx_sys->rseg_array[id]);
+}
+
diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h
index f284790630c..7b97c6e99cd 100644
--- a/storage/xtradb/include/trx0sys.h
+++ b/storage/xtradb/include/trx0sys.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -41,6 +41,9 @@ Created 3/26/1996 Heikki Tuuri
 #include "ut0bh.h"
 #include "read0types.h"
 #include "page0types.h"
+#include "ut0bh.h"
+
+typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t;
 
 /** In a MySQL replication slave, in crash recovery we store the master log
 file name and position here. */
@@ -53,9 +56,6 @@ there was no master log position info inside InnoDB.*/
 extern ib_int64_t	trx_sys_mysql_master_log_pos;
 /* @} */
 
-extern char		trx_sys_mysql_relay_log_name[];
-extern ib_int64_t	trx_sys_mysql_relay_log_pos;
-
 /** If this MySQL server uses binary logging, after InnoDB has been inited
 and if it has done a crash recovery, we store the binlog file name and position
 here. */
@@ -69,53 +69,6 @@ extern ib_int64_t	trx_sys_mysql_bin_log_pos;
 /** The transaction system */
 extern trx_sys_t*	trx_sys;
 
-/** Doublewrite system */
-extern trx_doublewrite_t*	trx_doublewrite;
-/** The following is set to TRUE when we are upgrading from pre-4.1
-format data files to the multiple tablespaces format data files */
-extern ibool			trx_doublewrite_must_reset_space_ids;
-/** Set to TRUE when the doublewrite buffer is being created */
-extern ibool			trx_doublewrite_buf_is_being_created;
-/** The following is TRUE when we are using the database in the
-post-4.1 format, i.e., we have successfully upgraded, or have created
-a new database installation */
-extern ibool			trx_sys_multiple_tablespace_format;
-
-/****************************************************************//**
-Creates the doublewrite buffer to a new InnoDB installation. The header of the
-doublewrite buffer is placed on the trx system header page. */
-UNIV_INTERN
-void
-trx_sys_create_doublewrite_buf(void);
-/*================================*/
-/****************************************************************//**
-At a database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function uses a possible doublewrite buffer to restore
-half-written pages in the data files. */
-UNIV_INTERN
-void
-trx_sys_doublewrite_init_or_restore_pages(
-/*======================================*/
-	ibool	restore_corrupt_pages);	/*!< in: TRUE=restore pages */
-/****************************************************************//**
-Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
-multiple tablespace format. */
-UNIV_INTERN
-void
-trx_sys_mark_upgraded_to_multiple_tablespaces(void);
-/*===============================================*/
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-UNIV_INTERN
-ibool
-trx_doublewrite_page_inside(
-/*========================*/
-	ulint	page_no);	/*!< in: page number */
 /***************************************************************//**
 Checks if a page address is the trx sys header page.
 @return	TRUE if trx sys header page */
@@ -125,42 +78,26 @@ trx_sys_hdr_page(
 /*=============*/
 	ulint	space,	/*!< in: space */
 	ulint	page_no);/*!< in: page number */
-/***************************************************************//**
-Checks if a space is the system tablespaces.
-@return TRUE if system tablespace */
-UNIV_INLINE
-ibool
-trx_sys_sys_space(
-/*==============*/
-	ulint	space);	/*!< in: space */
-/***************************************************************//**
-Checks if a space is the doublewrite tablespace.
-@return TRUE if doublewrite tablespace */
-UNIV_INLINE
-ibool
-trx_sys_doublewrite_space(
-/*======================*/
-	ulint	space);	/*!< in: space */
 /*****************************************************************//**
 Creates and initializes the central memory structures for the transaction
-system. This is called when the database is started. */
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
 UNIV_INTERN
-void
+ib_bh_t*
 trx_sys_init_at_db_start(void);
 /*==========================*/
 /*****************************************************************//**
-Creates and initializes the transaction system at the database creation. */
+Creates the trx_sys instance and initializes ib_bh and mutex. */
 UNIV_INTERN
 void
 trx_sys_create(void);
 /*================*/
 /*****************************************************************//**
-Creates and initializes the dummy transaction system page for tablespace. */
+Creates and initializes the transaction system at the database creation. */
 UNIV_INTERN
 void
-trx_sys_dummy_create(
-/*=================*/
-	ulint	space);
+trx_sys_create_sys_pages(void);
+/*==========================*/
 /****************************************************************//**
 Looks for a free slot for a rollback segment in the trx system file copy.
 @return	slot index or ULINT_UNDEFINED if not found */
@@ -178,16 +115,6 @@ trx_sys_get_nth_rseg(
 /*=================*/
 	trx_sys_t*	sys,	/*!< in: trx system */
 	ulint		n);	/*!< in: index of slot */
-/***************************************************************//**
-Sets the pointer in the nth slot of the rseg array. */
-UNIV_INLINE
-void
-trx_sys_set_nth_rseg(
-/*=================*/
-	trx_sys_t*	sys,	/*!< in: trx system */
-	ulint		n,	/*!< in: index of slot */
-	trx_rseg_t*	rseg);	/*!< in: pointer to rseg object, NULL if slot
-				not in use */
 /**********************************************************************//**
 Gets a pointer to the transaction system file copy and x-locks its page.
 @return	pointer to system file copy, page x-locked */
@@ -248,6 +175,14 @@ UNIV_INLINE
 trx_id_t
 trx_sys_get_new_trx_id(void);
 /*========================*/
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void);
+/*========================*/
 
 /*************************************************************//**
 Find a slot for a given trx ID in a descriptors array.
@@ -286,39 +221,75 @@ trx_read_trx_id(
 /*============*/
 	const byte*	ptr);	/*!< in: pointer to memory from where to read */
 /****************************************************************//**
-Looks for the trx handle with the given id in trx_list.
-@return	the trx handle or NULL if not found */
+Looks for the trx instance with the given id in the rw trx_list.
+The caller must be holding trx_sys->mutex.
+@return	the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
 trx_t*
-trx_get_on_id(
-/*==========*/
+trx_get_rw_trx_by_id(
+/*=================*/
 	trx_id_t	trx_id);/*!< in: trx id to search for */
 /****************************************************************//**
-Returns the minumum trx id in trx list. This is the smallest id for which
-the trx can possibly be active. (But, you must look at the trx->conc_state to
+Returns the minimum trx id in rw trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->state to
 find out if the minimum trx id transaction itself is active, or already
 committed.)
 @return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
 UNIV_INLINE
 trx_id_t
-trx_list_get_min_trx_id(void);
-/*=========================*/
+trx_rw_min_trx_id(void);
+/*===================*/
 /****************************************************************//**
-Checks if a transaction with the given id is active.
-@return	TRUE if active */
+Returns pointer to a transaction instance if a rw transaction with the given id
+is active. Caller must hold trx_sys->mutex. If the caller is not holding
+lock_sys->mutex, the transaction may already have been committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
-ibool
-trx_is_active(
-/*==========*/
-	trx_id_t	trx_id);/*!< in: trx id of the transaction */
+trx_t*
+trx_rw_get_active_trx_by_id(
+/*========================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex. If the caller is not holding lock_sys->mutex, the
+transaction may already have been committed.
+@return	true if a rw transaction with the given id is active. */
+UNIV_INLINE
+bool
+trx_rw_is_active_low(
+/*=================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return	true if a rw transaction with the given id is active. */
+UNIV_INLINE
+bool
+trx_rw_is_active(
+/*=============*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+#ifdef UNIV_DEBUG
 /****************************************************************//**
-Checks that trx is in the trx list.
+Checks whether a trx is in one of rw_trx_list or ro_trx_list.
 @return	TRUE if is in */
 UNIV_INTERN
 ibool
 trx_in_trx_list(
 /*============*/
-	trx_t*	in_trx);/*!< in: trx */
+	const trx_t*	in_trx)		/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 /***********************************************************//**
 Assert that a transaction has been recovered.
@@ -339,8 +310,7 @@ UNIV_INTERN
 void
 trx_sys_update_mysql_binlog_offset(
 /*===============================*/
-	trx_sysf_t*	sys_header,
-	const char*	file_name_in,/*!< in: MySQL log file name */
+	const char*	file_name,/*!< in: MySQL log file name */
 	ib_int64_t	offset,	/*!< in: position in that log file */
 	ulint		field,	/*!< in: offset of the MySQL log info field in
 				the trx sys header */
@@ -353,14 +323,6 @@ void
 trx_sys_print_mysql_binlog_offset(void);
 /*===================================*/
 /*****************************************************************//**
-Prints to stderr the MySQL master log offset info in the trx system header
-COMMIT set of fields if the magic number shows it valid and stores it
-in global variables. */
-UNIV_INTERN
-void
-trx_sys_print_committed_mysql_master_log_pos(void);
-/*==============================================*/
-/*****************************************************************//**
 Prints to stderr the MySQL master log offset info in the trx system header if
 the magic number shows it valid. */
 UNIV_INTERN
@@ -388,14 +350,12 @@ UNIV_INTERN
 void
 trx_sys_file_format_tag_init(void);
 /*==============================*/
-#ifndef UNIV_HOTBACKUP
 /*****************************************************************//**
 Shutdown/Close the transaction system. */
 UNIV_INTERN
 void
 trx_sys_close(void);
 /*===============*/
-#endif /* !UNIV_HOTBACKUP */
 /*****************************************************************//**
 Get the name representation of the file format from its id.
 @return	pointer to the name */
@@ -415,31 +375,30 @@ trx_sys_file_format_max_set(
 	ulint		format_id,	/*!< in: file format id */
 	const char**	name);		/*!< out: max file format name or
 					NULL if not needed. */
-/*****************************************************************//**
-Get the name representation of the file format from its id.
-@return	pointer to the max format name */
+/*********************************************************************
+Creates the rollback segments
+@return number of rollback segments that are active. */
 UNIV_INTERN
-const char*
-trx_sys_file_format_max_get(void);
-/*=============================*/
+ulint
+trx_sys_create_rsegs(
+/*=================*/
+	ulint	n_spaces,	/*!< number of tablespaces for UNDO logs */
+	ulint	n_rsegs);	/*!< number of rollback segments to create */
 /*****************************************************************//**
-Check for the max file format tag stored on disk.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
+Get the number of transactions in the system, independent of their state.
+@return count of transactions in trx_sys_t::rw_trx_list */
+UNIV_INLINE
 ulint
-trx_sys_file_format_max_check(
-/*==========================*/
-	ulint		max_format_id);	/*!< in: the max format id to check */
-/********************************************************************//**
-Update the file format tag in the system tablespace only if the given
-format id is greater than the known max id.
-@return	TRUE if format_id was bigger than the known max id */
+trx_sys_get_n_rw_trx(void);
+/*======================*/
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
 UNIV_INTERN
-ibool
-trx_sys_file_format_max_upgrade(
-/*============================*/
-	const char**	name,		/*!< out: max file format name */
-	ulint		format_id);	/*!< in: file format identifier */
+ulint
+trx_sys_any_active_transactions(void);
+/*=================================*/
 #else /* !UNIV_HOTBACKUP */
 /*****************************************************************//**
 Prints to stderr the MySQL binlog info in the system header if the
@@ -476,6 +435,32 @@ trx_sys_read_pertable_file_format_id(
 				datafile */
 	ulint *format_id);	/*!< out: file format of the per-table
 				data file */
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Get the name representation of the maximum file format.
+@return	pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_sys_file_format_max_check(
+/*==========================*/
+	ulint		max_format_id);	/*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return	TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+	const char**	name,		/*!< out: max file format name */
+	ulint		format_id);	/*!< in: file format identifier */
 /*****************************************************************//**
 Get the name representation of the file format from its id.
 @return	pointer to the name */
@@ -485,22 +470,20 @@ trx_sys_file_format_id_to_name(
 /*===========================*/
 	const ulint	id);	/*!< in: id of the file format */
 
-#endif /* !UNIV_HOTBACKUP */
-/*********************************************************************
-Creates the rollback segments */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_sys_t::trx_list. */
 UNIV_INTERN
-void
-trx_sys_create_rsegs(
-/*=================*/
-	ulint	n_rsegs);	/*!< number of rollback segments to create */
+ibool
+trx_sys_validate_trx_list(void);
+/*===========================*/
+#endif /* UNIV_DEBUG */
 
 /* The automatically created system rollback segment has this id */
 #define TRX_SYS_SYSTEM_RSEG_ID	0
 
 /* Space id and page no where the trx system file copy resides */
 #define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
-#define	TRX_DOUBLEWRITE_SPACE	0xFFFFFFE0UL	/* the doublewrite buffer tablespace if used */
-#define	TRX_SYS_SPACE_MAX	9	/* reserved max space id for system tablespaces */
 #include "fsp0fsp.h"
 #define	TRX_SYS_PAGE_NO	FSP_TRX_SYS_PAGE_NO
 
@@ -545,23 +528,15 @@ We must remember this limit in order to keep file compatibility. */
 @see trx_sys_mysql_master_log_name
 @see trx_sys_mysql_bin_log_name */
 #define TRX_SYS_MYSQL_LOG_NAME_LEN	512
-#define TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN	480	/* (500 - 12) is dead line. */
 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
 #define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
 
-//#if UNIV_PAGE_SIZE < 4096
-//# error "UNIV_PAGE_SIZE < 4096"
-//#endif
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
+#endif
 /** The offset of the MySQL replication info in the trx system header;
-this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below.  These are
-written at prepare time and are the main copy. */
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
 #define TRX_SYS_MYSQL_MASTER_LOG_INFO	(UNIV_PAGE_SIZE - 2000)
-#define TRX_SYS_MYSQL_RELAY_LOG_INFO	(UNIV_PAGE_SIZE - 1500)
-
-/** The copy of the above which is made at transaction COMMIT time. If binlog
-crash recovery rollbacks a PREPAREd transaction, they are copied back. */
-#define TRX_SYS_COMMIT_MASTER_LOG_INFO	(UNIV_PAGE_SIZE - 3000)
-#define TRX_SYS_COMMIT_RELAY_LOG_INFO	(UNIV_PAGE_SIZE - 2500)
 
 /** The offset of the MySQL binlog offset info in the trx system header */
 #define TRX_SYS_MYSQL_LOG_INFO		(UNIV_PAGE_SIZE - 1000)
@@ -613,7 +588,7 @@ crash recovery rollbacks a PREPAREd transaction, they are copied back. */
 /** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
 we must reset the doublewrite buffer, because starting from 4.1.x the
 space id of a data page is stored into
-FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
 #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
 
 /*-------------------------------------------------------------*/
@@ -647,29 +622,22 @@ identifier is added to this 64-bit constant. */
 #define TRX_DESCR_ARRAY_INITIAL_SIZE 	1000
 
 #ifndef UNIV_HOTBACKUP
-/** Doublewrite control struct */
-struct trx_doublewrite_struct{
-	mutex_t	mutex;		/*!< mutex protecting the first_free field and
-				write_buf */
-	ulint	block1;		/*!< the page number of the first
-				doublewrite block (64 pages) */
-	ulint	block2;		/*!< page number of the second block */
-	ulint	first_free;	/*!< first free position in write_buf measured
-				in units of UNIV_PAGE_SIZE */
-	byte*	write_buf;	/*!< write buffer used in writing to the
-				doublewrite buffer, aligned to an
-				address divisible by UNIV_PAGE_SIZE
-				(which is required by Windows aio) */
-	byte*	write_buf_unaligned;
-				/*!< pointer to write_buf, but unaligned */
-	buf_page_t**
-		buf_block_arr;	/*!< array to store pointers to the buffer
-				blocks which have been cached to write_buf */
-};
-
-/** The transaction system central memory data structure; protected by the
-kernel mutex */
-struct trx_sys_struct{
+/** The transaction system central memory data structure. */
+struct trx_sys_t{
+
+	ib_mutex_t		mutex;		/*!< mutex protecting most fields in
+					this structure except when noted
+					otherwise */
+	ulint		n_prepared_trx;	/*!< Number of transactions currently
+					in the XA PREPARED state */
+	ulint		n_prepared_recovered_trx; /*!< Number of transactions
+					currently in XA PREPARED state that are
+					also recovered. Such transactions cannot
+					be added during runtime. They can only
+					occur after recovery if mysqld crashed
+					while there were XA PREPARED
+					transactions. We disable query cache
+					if such transactions exist. */
 	trx_id_t	max_trx_id;	/*!< The smallest number not yet
 					assigned as a transaction id or
 					transaction number */
@@ -685,35 +653,53 @@ struct trx_sys_struct{
 					descriptors array. */
 	char		pad3[64];	/*!< Ensure descriptors do not share
 					cache line with other fields */
-	UT_LIST_BASE_NODE_T(trx_t) trx_list;
-					/*!< List of active and committed in
-					memory transactions, sorted on trx id,
-					biggest first */
+#ifdef UNIV_DEBUG
+	trx_id_t	rw_max_trx_id;	/*!< Max trx id of read-write transactions
+					which exist or existed */
+#endif
+	trx_list_t	rw_trx_list;	/*!< List of active and committed in
+					memory read-write transactions, sorted
+					on trx id, biggest first. Recovered
+					transactions are always on this list. */
 	char		pad4[64];	/*!< Ensure list base nodes do not
 					share cache line with other fields */
-	UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
-					/*!< List of transactions created
-					for MySQL */
+	trx_list_t	ro_trx_list;	/*!< List of active and committed in
+					memory read-only transactions, sorted
+					on trx id, biggest first. NOTE:
+					The order for read-only transactions
+					is not necessary. We should exploit
+					this and increase concurrency during
+					add/remove. */
 	char		pad5[64];	/*!< Ensure list base nodes do not
 					share cache line with other fields */
-	UT_LIST_BASE_NODE_T(trx_t) trx_serial_list;
+	trx_list_t	mysql_trx_list;	/*!< List of transactions created
+					for MySQL. All transactions on
+					ro_trx_list are on mysql_trx_list. The
+					rw_trx_list can contain system
+					transactions and recovered transactions
+					that will not be in the mysql_trx_list.
+					There can be active non-locking
+					auto-commit read only transactions that
+					are on this list but not on ro_trx_list.
+					mysql_trx_list may additionally contain
+					transactions that have not yet been
+					started in InnoDB. */
+	char		pad6[64];	/*!< Ensure list base nodes do not
+					share cache line with other fields */
+	trx_list_t	trx_serial_list;
 					/*!< trx->no ordered List of
 					transactions in either TRX_PREPARED or
 					TRX_ACTIVE which have already been
 					assigned a serialization number */
-	char		pad6[64];	/*!< Ensure trx_serial_list does not
-					share cache line with other fields */
-	UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
-					/*!< List of rollback segment
-					objects */
 	char		pad7[64];	/*!< Ensure list base nodes do not
 					share cache line with other fields */
-	trx_rseg_t*	latest_rseg;	/*!< Latest rollback segment in the
-					round-robin assignment of rollback
-					segments to transactions */
-	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
+	trx_rseg_t*	const rseg_array[TRX_SYS_N_RSEGS];
 					/*!< Pointer array to rollback
-					segments; NULL if slot not in use */
+					segments; NULL if slot not in use;
+					created and destroyed in
+					single-threaded mode; not protected
+					by any mutex, because it is read-only
+					during multi-threaded operation */
 	ulint		rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
 					list (update undo logs for committed
 					transactions), protected by
diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic
index 17c94105cee..699148cff6d 100644
--- a/storage/xtradb/include/trx0sys.ic
+++ b/storage/xtradb/include/trx0sys.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -71,40 +71,6 @@ trx_sys_hdr_page(
 }
 
 /***************************************************************//**
-Checks if a space is the system tablespaces.
-@return TRUE if system tablespace */
-UNIV_INLINE
-ibool
-trx_sys_sys_space(
-/*==============*/
-	ulint	space)	/*!< in: space */
-{
-	if (srv_doublewrite_file) {
-		/* several spaces are reserved */
-		return((ibool)(space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE));
-	} else {
-		return((ibool)(space == TRX_SYS_SPACE));
-	}
-}
-
-/***************************************************************//**
-Checks if a space is the doublewrite tablespace.
-@return TRUE if doublewrite tablespace */
-UNIV_INLINE
-ibool
-trx_sys_doublewrite_space(
-/*======================*/
-	ulint	space)	/*!< in: space */
-{
-	if (srv_doublewrite_file) {
-		/* doublewrite buffer is separated */
-		return((ibool)(space == TRX_DOUBLEWRITE_SPACE));
-	} else {
-		return((ibool)(space == TRX_SYS_SPACE));
-	}
-}
-
-/***************************************************************//**
 Gets the pointer in the nth slot of the rseg array.
 @return	pointer to rseg object, NULL if slot not in use */
 UNIV_INLINE
@@ -114,28 +80,11 @@ trx_sys_get_nth_rseg(
 	trx_sys_t*	sys,	/*!< in: trx system */
 	ulint		n)	/*!< in: index of slot */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(n < TRX_SYS_N_RSEGS);
 
 	return(sys->rseg_array[n]);
 }
 
-/***************************************************************//**
-Sets the pointer in the nth slot of the rseg array. */
-UNIV_INLINE
-void
-trx_sys_set_nth_rseg(
-/*=================*/
-	trx_sys_t*	sys,	/*!< in: trx system */
-	ulint		n,	/*!< in: index of slot */
-	trx_rseg_t*	rseg)	/*!< in: pointer to rseg object, NULL if slot
-				not in use */
-{
-	ut_ad(n < TRX_SYS_N_RSEGS);
-
-	sys->rseg_array[n] = rseg;
-}
-
 /**********************************************************************//**
 Gets a pointer to the transaction system header and x-latches its page.
 @return	pointer to system header, page x-latched. */
@@ -171,7 +120,6 @@ trx_sysf_rseg_get_space(
 	ulint		i,		/*!< in: slot index == rseg id */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -193,7 +141,6 @@ trx_sysf_rseg_get_page_no(
 	mtr_t*		mtr)		/*!< in: mtr */
 {
 	ut_ad(sys_header);
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
 	return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
@@ -213,7 +160,6 @@ trx_sysf_rseg_set_space(
 	ulint		space,		/*!< in: space id */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -237,7 +183,6 @@ trx_sysf_rseg_set_page_no(
 					slot is reset to unused */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -285,30 +230,96 @@ trx_read_trx_id(
 }
 
 /****************************************************************//**
-Looks for the trx handle with the given id in trx_list.
-@return	the trx handle or NULL if not found */
+Looks for the trx handle with the given id in rw_trx_list.
+The caller must be holding trx_sys->mutex.
+@return	the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
 trx_t*
-trx_get_on_id(
-/*==========*/
+trx_get_rw_trx_by_id(
+/*=================*/
 	trx_id_t	trx_id)	/*!< in: trx id to search for */
 {
-	trx_t*	trx;
+	trx_t*		trx;
+	ulint		len;
+	trx_t*		first;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	len = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	if (len == 0) {
+		return(NULL);
+	}
+
+	/* Because the list is ordered on trx id in descending order,
+	we try to speed things up a bit. */
+
+	trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	assert_trx_in_rw_list(trx);
+
+	if (trx_id == trx->id) {
+		return(trx);
+	} else if (len == 1 || trx_id > trx->id) {
+		return(NULL);
+	}
+
+	first = trx;
 
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+	assert_trx_in_rw_list(trx);
 
-	while (trx != NULL) {
-		if (trx_id == trx->id) {
+	if (trx_id == trx->id) {
+		return(trx);
+	} else if (len == 2 || trx_id < trx->id) {
+		return(NULL);
+	}
 
-			return(trx);
+	/* Search the list from the lower end (tail). */
+	if (trx_id < (first->id + trx->id) >> 1) {
+		for (trx = UT_LIST_GET_PREV(trx_list, trx);
+		     trx != NULL && trx_id > trx->id;
+		     trx = UT_LIST_GET_PREV(trx_list, trx)) {
+			assert_trx_in_rw_list(trx);
+		}
+	} else {
+		for (trx = UT_LIST_GET_NEXT(trx_list, first);
+		     trx != NULL && trx_id < trx->id;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+			assert_trx_in_rw_list(trx);
 		}
+	}
+
+	return((trx != NULL && trx->id == trx_id) ? trx : NULL);
+}
 
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
+/****************************************************************//**
+Returns the minimum trx id in rw trx list. This is the smallest id for which
+the rw trx can possibly be active. (But, you must look at the trx->state
+to find out if the minimum trx id transaction itself is active, or already
+committed.) The caller must be holding trx_sys->mutex.
+@return	the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_rw_min_trx_id_low(void)
+/*=======================*/
+{
+	trx_id_t	id;
+	const trx_t*	trx;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+
+	if (trx == NULL) {
+		id = trx_sys->max_trx_id;
+	} else {
+		assert_trx_in_rw_list(trx);
+		id = trx->id;
 	}
 
-	return(NULL);
+	return(id);
 }
 
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
@@ -321,62 +332,138 @@ trx_assert_recovered(
 /*=================*/
 	trx_id_t	trx_id)		/*!< in: transaction identifier */
 {
-	trx_t*		trx;
+	const trx_t*	trx;
 
-	mutex_enter(&kernel_mutex);
-	trx = trx_get_on_id(trx_id);
-	ut_a(trx);
+	mutex_enter(&trx_sys->mutex);
+
+	trx = trx_get_rw_trx_by_id(trx_id);
 	ut_a(trx->is_recovered);
-	mutex_exit(&kernel_mutex);
+
+	mutex_exit(&trx_sys->mutex);
 
 	return(TRUE);
 }
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 /****************************************************************//**
-Returns the minumum trx id in trx list. This is the smallest id for which
-the trx can possibly be active. (But, you must look at the trx->conc_state to
-find out if the minimum trx id transaction itself is active, or already
+Returns the minimum trx id in rw trx list. This is the smallest id for which
+the rw trx can possibly be active. (But, you must look at the trx->state
+to find out if the minimum trx id transaction itself is active, or already
 committed.)
-@return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+@return	the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */
 UNIV_INLINE
 trx_id_t
-trx_list_get_min_trx_id(void)
-/*=========================*/
+trx_rw_min_trx_id(void)
+/*===================*/
 {
-	trx_t*	trx;
+	trx_id_t	id;
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	mutex_enter(&trx_sys->mutex);
 
-	trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+	id = trx_rw_min_trx_id_low();
 
-	if (trx == NULL) {
+	mutex_exit(&trx_sys->mutex);
+
+	return(id);
+}
+
+/****************************************************************//**
+Returns pointer to a transaction instance if a rw transaction with the given id
+is active. Caller must hold trx_sys->mutex. If the caller is not holding
+lock_sys->mutex, the transaction may already have been committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_get_active_trx_by_id(
+/*========================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt)	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+{
+	trx_t*		trx;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	if (trx_id < trx_rw_min_trx_id_low()) {
+
+		trx = NULL;
+	} else if (trx_id >= trx_sys->max_trx_id) {
+
+		/* There must be corruption: we let the caller handle the
+		diagnostic prints in this case. */
 
-		return(trx_sys->max_trx_id);
+		trx = NULL;
+		if (corrupt != NULL) {
+			*corrupt = TRUE;
+		}
+	} else {
+		trx = trx_get_rw_trx_by_id(trx_id);
+
+		if (trx != NULL
+		    && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) {
+
+			trx = NULL;
+		}
 	}
 
-	return(trx->id);
+	return(trx);
 }
 
 /****************************************************************//**
-Checks if a transaction with the given id is active.
-@return	TRUE if active */
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex. If the caller is not holding lock_sys->mutex, the
+transaction may already have been committed.
+@return	true if a rw transaction with the given id is active. */
 UNIV_INLINE
-ibool
-trx_is_active(
-/*==========*/
-	trx_id_t	trx_id)	/*!< in: trx id of the transaction */
+bool
+trx_rw_is_active_low(
+/*=================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt)	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	if (trx_find_descriptor(trx_sys->descriptors,
-				trx_sys->descr_n_used,
-				trx_id)) {
+	if (UNIV_UNLIKELY(trx_id >= trx_sys->max_trx_id)) {
 
-		return(TRUE);
+		/* There must be corruption: we let the caller handle the
+		diagnostic prints in this case. */
+
+		if (corrupt != NULL) {
+			*corrupt = TRUE;
+		}
+
+		return(false);
 	}
 
-	return(FALSE);
+	return(trx_find_descriptor(trx_sys->descriptors, trx_sys->descr_n_used,
+				   trx_id) != NULL);
+}
+
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return	true if a rw transaction with the given id is active. */
+UNIV_INLINE
+bool
+trx_rw_is_active(
+/*=============*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt)	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+{
+	bool res;
+
+	mutex_enter(&trx_sys->mutex);
+
+	res = trx_rw_is_active_low(trx_id, corrupt);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(res);
 }
 
 /*****************************************************************//**
@@ -387,9 +474,7 @@ trx_id_t
 trx_sys_get_new_trx_id(void)
 /*========================*/
 {
-	trx_id_t	id;
-
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
 	/* VERY important: after the database is started, max_trx_id value is
 	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if
@@ -398,14 +483,60 @@ trx_sys_get_new_trx_id(void)
 	Thus trx id values will not overlap when the database is
 	repeatedly started! */
 
-	if ((ulint) trx_sys->max_trx_id % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+	if (!(trx_sys->max_trx_id % (trx_id_t) TRX_SYS_TRX_ID_WRITE_MARGIN)) {
 
 		trx_sys_flush_max_trx_id();
 	}
 
-	id = trx_sys->max_trx_id++;
+	return(trx_sys->max_trx_id++);
+}
 
-	return(id);
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void)
+/*========================*/
+{
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+	trx_id_t	max_trx_id;
+#endif
+
+	ut_ad(!mutex_own(&trx_sys->mutex));
+
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+	/* Avoid torn reads. */
+	mutex_enter(&trx_sys->mutex);
+	max_trx_id = trx_sys->max_trx_id;
+	mutex_exit(&trx_sys->mutex);
+	return(max_trx_id);
+#else
+	/* Perform a dirty read. Callers should be prepared for stale
+	values, and we know that the value fits in a machine word, so
+	that it will be read and written atomically. */
+	return(trx_sys->max_trx_id);
+#endif
+}
+
+/*****************************************************************//**
+Get the number of rw transactions in the system, independent of their state.
+@return count of transactions in trx_sys_t::rw_trx_list */
+UNIV_INLINE
+ulint
+trx_sys_get_n_rw_trx(void)
+/*======================*/
+{
+	ulint	n_trx;
+
+	mutex_enter(&trx_sys->mutex);
+
+	n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(n_trx);
 }
 
 
@@ -418,10 +549,10 @@ trx_find_descriptor(
 /*================*/
 	const trx_id_t*	descriptors,	/*!< in: descriptors array */
 	ulint		n_descr,	/*!< in: array size */
-	trx_id_t	trx_id)		/*!< in: trx pointer */
+	trx_id_t	trx_id)		/*!< in: trx id */
 {
 	ut_ad(descriptors != trx_sys->descriptors ||
-	      mutex_own(&kernel_mutex));
+	      mutex_own(&trx_sys->mutex));
 
 	if (UNIV_UNLIKELY(n_descr == 0)) {
 
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
index 4ab8e5b2cc5..82e9a90fcfb 100644
--- a/storage/xtradb/include/trx0trx.h
+++ b/storage/xtradb/include/trx0trx.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,23 +31,18 @@ Created 3/26/1996 Heikki Tuuri
 #include "dict0types.h"
 #ifndef UNIV_HOTBACKUP
 #include "lock0types.h"
+#include "log0log.h"
 #include "usr0types.h"
 #include "que0types.h"
 #include "mem0mem.h"
 #include "read0types.h"
 #include "trx0xa.h"
 #include "ut0vec.h"
+#include "fts0fts.h"
 
 /** Dummy session used currently in MySQL interface */
 extern sess_t*	trx_dummy_sess;
 
-/** Number of transactions currently allocated for MySQL: protected by
-the kernel mutex */
-extern ulint	trx_n_mysql_transactions;
-/** Number of transactions currently in the XA PREPARED state: protected by
-the kernel mutex */
-extern ulint	trx_n_prepared;
-
 /********************************************************************//**
 In XtraDB it is impossible for a transaction to own a search latch outside of
 InnoDB code, so there is nothing to release on demand.  We keep this function to
@@ -82,15 +77,6 @@ const dict_index_t*
 trx_get_error_info(
 /*===============*/
 	const trx_t*	trx);	/*!< in: trx object */
-/****************************************************************//**
-Creates and initializes a transaction object.
-@return	own: the transaction */
-UNIV_INTERN
-trx_t*
-trx_create(
-/*=======*/
-	sess_t*	sess)	/*!< in: session */
-	__attribute__((nonnull));
 /********************************************************************//**
 Creates a transaction object for MySQL.
 @return	own: transaction object */
@@ -106,11 +92,11 @@ trx_t*
 trx_allocate_for_background(void);
 /*=============================*/
 /********************************************************************//**
-Frees a transaction object. */
+Frees a transaction object of a background operation of the master thread. */
 UNIV_INTERN
 void
-trx_free(
-/*=====*/
+trx_free_for_background(
+/*====================*/
 	trx_t*	trx);	/*!< in, own: trx object */
 /********************************************************************//**
 At shutdown, frees a transaction object that is in the PREPARED state. */
@@ -127,13 +113,6 @@ void
 trx_free_for_mysql(
 /*===============*/
 	trx_t*	trx);	/*!< in, own: trx object */
-/********************************************************************//**
-Frees a transaction object of a background operation of the master thread. */
-UNIV_INTERN
-void
-trx_free_for_background(
-/*====================*/
-	trx_t*	trx);	/*!< in, own: trx object */
 /****************************************************************//**
 Creates trx objects for transactions and initializes the trx list of
 trx_sys at database start. Rollback segment and undo log lists must
@@ -144,51 +123,87 @@ UNIV_INTERN
 void
 trx_lists_init_at_db_start(void);
 /*============================*/
-/****************************************************************//**
-Starts a new transaction.
-@return TRUE if success, FALSE if the rollback segment could not
-support this many transactions */
-UNIV_INTERN
-ibool
-trx_start(
-/*======*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start_low(
-/*==========*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started_xa(t)				\
+	{							\
+	(t)->start_line = __LINE__;				\
+	(t)->start_file = __FILE__;				\
+	trx_start_if_not_started_xa_low((t));			\
+	}
+#else
+#define trx_start_if_not_started_xa(t)				\
+	trx_start_if_not_started_xa_low((t))
+#endif /* UNIV_DEBUG */
+
 /*************************************************************//**
 Starts the transaction if it is not yet started. */
-UNIV_INLINE
+UNIV_INTERN
 void
-trx_start_if_not_started(
-/*=====================*/
+trx_start_if_not_started_xa_low(
+/*============================*/
 	trx_t*	trx);	/*!< in: transaction */
 /*************************************************************//**
-Starts the transaction if it is not yet started. Assumes we have reserved
-the kernel mutex! */
-UNIV_INLINE
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
 void
 trx_start_if_not_started_low(
 /*=========================*/
 	trx_t*	trx);	/*!< in: transaction */
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started(t)				\
+	{							\
+	(t)->start_line = __LINE__;				\
+	(t)->start_file = __FILE__;				\
+	trx_start_if_not_started_low((t));			\
+	}
+#else
+#define trx_start_if_not_started(t)				\
+	trx_start_if_not_started_low((t))
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+UNIV_INTERN
+void
+trx_start_for_ddl_low(
+/*==================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	trx_dict_op_t	op)	/*!< in: dictionary operation type */
+	__attribute__((nonnull));
+
+#ifdef UNIV_DEBUG
+#define trx_start_for_ddl(t, o)					\
+	{							\
+	ut_ad((t)->start_file == 0);				\
+	(t)->start_line = __LINE__;				\
+	(t)->start_file = __FILE__;				\
+	trx_start_for_ddl_low((t), (o));			\
+	}
+#else
+#define trx_start_for_ddl(t, o)					\
+	trx_start_for_ddl_low((t), (o))
+#endif /* UNIV_DEBUG */
+
 /****************************************************************//**
 Commits a transaction. */
 UNIV_INTERN
 void
-trx_commit_off_kernel(
-/*==================*/
-	trx_t*	trx);	/*!< in: transaction */
+trx_commit(
+/*=======*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	__attribute__((nonnull));
+/****************************************************************//**
+Commits a transaction and a mini-transaction. */
+UNIV_INTERN
+void
+trx_commit_low(
+/*===========*/
+	trx_t*	trx,	/*!< in/out: transaction */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction (will be committed),
+			or NULL if trx made no modifications */
+	__attribute__((nonnull(1)));
 /****************************************************************//**
 Cleans up a transaction at database startup. The cleanup is needed if
 the transaction already got to the middle of a commit when the database
@@ -202,18 +217,17 @@ trx_cleanup_at_db_startup(
 Does the transaction commit for MySQL.
 @return	DB_SUCCESS or error number */
 UNIV_INTERN
-ulint
+dberr_t
 trx_commit_for_mysql(
 /*=================*/
-	trx_t*	trx);	/*!< in: trx handle */
+	trx_t*	trx);	/*!< in/out: transaction */
 /**********************************************************************//**
-Does the transaction prepare for MySQL.
-@return	0 or error number */
+Does the transaction prepare for MySQL. */
 UNIV_INTERN
-ulint
+void
 trx_prepare_for_mysql(
 /*==================*/
-	trx_t*	trx);	/*!< in: trx handle */
+	trx_t*	trx);	/*!< in/out: trx handle */
 /**********************************************************************//**
 This function is used to find number of prepared transactions and
 their transaction objects for a recovery.
@@ -227,7 +241,9 @@ trx_recover_for_mysql(
 /*******************************************************************//**
 This function is used to find one X/Open XA distributed transaction
 which is in the prepared state
-@return	trx or NULL; on match, the trx->xid will be invalidated */
+@return	trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
 UNIV_INTERN
 trx_t *
 trx_get_trx_by_xid(
@@ -235,13 +251,13 @@ trx_get_trx_by_xid(
 	const XID*	xid);	/*!< in: X/Open XA transaction identifier */
 /**********************************************************************//**
 If required, flushes the log to disk if we called trx_commit_for_mysql()
-with trx->flush_log_later == TRUE.
-@return	0 or error number */
+with trx->flush_log_later == TRUE. */
 UNIV_INTERN
-ulint
+void
 trx_commit_complete_for_mysql(
 /*==========================*/
-	trx_t*	trx);	/*!< in: trx handle */
+	trx_t*	trx)	/*!< in/out: transaction */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Marks the latest SQL statement ended. */
 UNIV_INTERN
@@ -259,86 +275,20 @@ read_view_t*
 trx_assign_read_view(
 /*=================*/
 	trx_t*	trx);	/*!< in: active transaction */
-/***********************************************************//**
-The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
-the TRX_QUE_RUNNING state and releases query threads which were
-waiting for a lock in the wait_thrs list. */
-UNIV_INTERN
-void
-trx_end_lock_wait(
-/*==============*/
-	trx_t*	trx);	/*!< in: transaction */
 /****************************************************************//**
-Sends a signal to a trx object. */
+Prepares a transaction for commit/rollback. */
 UNIV_INTERN
 void
-trx_sig_send(
-/*=========*/
-	trx_t*		trx,		/*!< in: trx handle */
-	ulint		type,		/*!< in: signal type */
-	ulint		sender,		/*!< in: TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
-					reply, or NULL; if type is
-					TRX_SIG_END_WAIT, this must be NULL */
-	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
-					NULL */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-/****************************************************************//**
-Send the reply message when a signal in the queue of the trx has
-been handled. */
-UNIV_INTERN
-void
-trx_sig_reply(
-/*==========*/
-	trx_sig_t*	sig,		/*!< in: signal */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/****************************************************************//**
-Removes the signal object from a trx signal queue. */
-UNIV_INTERN
-void
-trx_sig_remove(
-/*===========*/
-	trx_t*		trx,	/*!< in: trx handle */
-	trx_sig_t*	sig);	/*!< in, own: signal */
-/****************************************************************//**
-Starts handling of a trx signal. */
-UNIV_INTERN
-void
-trx_sig_start_handle(
-/*=================*/
-	trx_t*		trx,		/*!< in: trx handle */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/****************************************************************//**
-Ends signal handling. If the session is in the error state, and
-trx->graph_before_signal_handling != NULL, returns control to the error
-handling routine of the graph (currently only returns the control to the
-graph root which then sends an error message to the client). */
-UNIV_INTERN
-void
-trx_end_signal_handling(
-/*====================*/
-	trx_t*	trx);	/*!< in: trx */
+trx_commit_or_rollback_prepare(
+/*===========================*/
+	trx_t*	trx);	/*!< in/out: transaction */
 /*********************************************************************//**
 Creates a commit command node struct.
 @return	own: commit node struct */
 UNIV_INTERN
 commit_node_t*
-commit_node_create(
-/*===============*/
+trx_commit_node_create(
+/*===================*/
 	mem_heap_t*	heap);	/*!< in: mem heap where created */
 /***********************************************************//**
 Performs an execution step for a commit type node in a query graph.
@@ -350,37 +300,59 @@ trx_commit_step(
 	que_thr_t*	thr);	/*!< in: query thread */
 
 /**********************************************************************//**
-Prints info about a transaction to the given file. The caller must own the
-kernel mutex. */
+Prints info about a transaction.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+	FILE*		f,
+			/*!< in: output stream */
+	const trx_t*	trx,
+			/*!< in: transaction */
+	ulint		max_query_len,
+			/*!< in: max query length to print,
+			or 0 to use the default max length */
+	ulint		n_rec_locks,
+			/*!< in: lock_number_of_rows_locked(&trx->lock) */
+	ulint		n_trx_locks,
+			/*!< in: length of trx->lock.trx_locks */
+	ulint		heap_size)
+			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys->mutex and trx_sys->mutex. */
 UNIV_INTERN
 void
 trx_print(
 /*======*/
-	FILE*	f,		/*!< in: output stream */
-	trx_t*	trx,		/*!< in: transaction */
-	ulint	max_query_len);	/*!< in: max query length to print, or 0 to
-				   use the default max length */
-
-/** Type of data dictionary operation */
-typedef enum trx_dict_op {
-	/** The transaction is not modifying the data dictionary. */
-	TRX_DICT_OP_NONE = 0,
-	/** The transaction is creating a table or an index, or
-	dropping a table.  The table must be dropped in crash
-	recovery.  This and TRX_DICT_OP_NONE are the only possible
-	operation modes in crash recovery. */
-	TRX_DICT_OP_TABLE = 1,
-	/** The transaction is creating or dropping an index in an
-	existing table.  In crash recovery, the data dictionary
-	must be locked, but the table must not be dropped. */
-	TRX_DICT_OP_INDEX = 2
-} trx_dict_op_t;
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+	__attribute__((nonnull));
 
 /**********************************************************************//**
 Determine if a transaction is a dictionary operation.
 @return	dictionary operation mode */
 UNIV_INLINE
-enum trx_dict_op
+enum trx_dict_op_t
 trx_get_dict_operation(
 /*===================*/
 	const trx_t*	trx)	/*!< in: transaction */
@@ -392,18 +364,49 @@ void
 trx_set_dict_operation(
 /*===================*/
 	trx_t*			trx,	/*!< in/out: transaction */
-	enum trx_dict_op	op);	/*!< in: operation, not
+	enum trx_dict_op_t	op);	/*!< in: operation, not
 					TRX_DICT_OP_NONE */
 
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return	TRUE if trx->state == state */
+UNIV_INLINE
+ibool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state)	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
+	__attribute__((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
 Determines if the currently running transaction has been interrupted.
 @return	TRUE if interrupted */
 UNIV_INTERN
 ibool
 trx_is_interrupted(
 /*===============*/
-	trx_t*	trx);	/*!< in: transaction */
+	const trx_t*	trx);	/*!< in: transaction */
 /**********************************************************************//**
 Determines if the currently running transaction is in strict mode.
 @return	TRUE if strict */
@@ -421,7 +424,7 @@ Calculates the "weight" of a transaction. The weight of one transaction
 is estimated as the number of altered rows + the number of locked rows.
 @param t	transaction
 @return		transaction weight */
-#define TRX_WEIGHT(t)	((t)->undo_no + UT_LIST_GET_LEN((t)->trx_locks))
+#define TRX_WEIGHT(t)	((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
 
 /*******************************************************************//**
 Compares the "weight" (or size) of two transactions. Transactions that
@@ -449,6 +452,16 @@ trx_get_que_state_str(
 /*==================*/
 	const trx_t*	trx);	/*!< in: transaction */
 
+/****************************************************************//**
+Assign a read-only transaction a rollback-segment, if it is attempting
+to write to a TEMPORARY table. */
+UNIV_INTERN
+void
+trx_assign_rseg(
+/*============*/
+	trx_t*		trx);		/*!< A read-only transaction that
+					needs a rollback segment (RBS). */
+
 /*************************************************************//**
 Callback function for trx_find_descriptor() to compare trx IDs. */
 UNIV_INTERN
@@ -466,53 +479,309 @@ trx_release_descriptor(
 /*===================*/
 	trx_t* trx);	/*!< in: trx pointer */
 
-/* Signal to a transaction */
-struct trx_sig_struct{
-	unsigned	type:3;		/*!< signal type */
-	unsigned	sender:1;	/*!< TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver;	/*!< non-NULL if the sender of the signal
-					wants reply after the operation induced
-					by the signal is completed */
-	trx_savept_t	savept;		/*!< possible rollback savepoint */
-	UT_LIST_NODE_T(trx_sig_t)
-			signals;	/*!< queue of pending signals to the
-					transaction */
-	UT_LIST_NODE_T(trx_sig_t)
-			reply_signals;	/*!< list of signals for which the sender
-					transaction is waiting a reply */
+/*******************************************************************//**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx	transaction
+@return		lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(trx)					\
+	((trx)->mysql_thd != NULL					\
+	 ? thd_lock_wait_timeout((trx)->mysql_thd)			\
+	 : 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+(implied read-only).
+@param t	transaction
+@return true	if non-locking autocommit select transaction. */
+#define trx_is_autocommit_non_locking(t)				\
+((t)->auto_commit && (t)->will_lock == 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+with an explicit check for the read-only status.
+@param t	transaction
+@return true	if non-locking autocommit read-only transaction. */
+#define trx_is_ac_nl_ro(t)						\
+((t)->read_only && trx_is_autocommit_non_locking((t)))
+
+/*******************************************************************//**
+Assert that the transaction is in the trx_sys_t::rw_trx_list */
+#define assert_trx_in_rw_list(t) do {					\
+	ut_ad(!(t)->read_only);						\
+	assert_trx_in_list(t);						\
+} while (0)
+
+/*******************************************************************//**
+Assert that the transaction is either in trx_sys->ro_trx_list or
+trx_sys->rw_trx_list but not both and it cannot be an autocommit
+non-locking select */
+#define assert_trx_in_list(t) do {					\
+	ut_ad((t)->in_ro_trx_list == (t)->read_only);			\
+	ut_ad((t)->in_rw_trx_list == !(t)->read_only);			\
+	ut_ad(!trx_is_autocommit_non_locking((t)));			\
+	switch ((t)->state) {						\
+	case TRX_STATE_PREPARED:					\
+		/* fall through */					\
+	case TRX_STATE_ACTIVE:						\
+	case TRX_STATE_COMMITTED_IN_MEMORY:				\
+		continue;						\
+	case TRX_STATE_NOT_STARTED:					\
+		break;							\
+	}								\
+	ut_error;							\
+} while (0)
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(t)				\
+	do {								\
+		if (trx_is_autocommit_non_locking(t)) {			\
+			trx_state_t	t_state = (t)->state;		\
+			ut_ad((t)->read_only);				\
+			ut_ad(!(t)->is_recovered);			\
+			ut_ad(!(t)->in_ro_trx_list);			\
+			ut_ad(!(t)->in_rw_trx_list);			\
+			ut_ad((t)->in_mysql_trx_list);			\
+			ut_ad(t_state == TRX_STATE_NOT_STARTED		\
+			      || t_state == TRX_STATE_ACTIVE);		\
+		} else {						\
+			assert_trx_in_list(t);				\
+		}							\
+	} while (0)
+#else /* UNIV_DEBUG */
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(trx) ((void)0)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Latching protocol for trx_lock_t::que_state.  trx_lock_t::que_state
+captures the state of the query thread during the execution of a query.
+This is different from a transaction state. The query state of a transaction
+can be updated asynchronously by other threads.  The other threads can be
+system threads, like the timeout monitor thread or user threads executing
+other queries. Another thing to be mindful of is that there is a delay between
+when a query thread is put into LOCK_WAIT state and before it actually starts
+waiting.  Between these two events it is possible that the query thread is
+granted the lock it was waiting for, which implies that the state can be changed
+asynchronously.
+
+All these operations take place within the context of locking. Therefore state
+changes within the locking code must acquire both the lock mutex and the
+trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
+trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
+to only acquire the trx->mutex.
+To query the state either of the mutexes is sufficient within the locking
+code and no mutex is required when the query thread is no longer waiting. */
+
+/** The locks and state of an active transaction. Protected by
+lock_sys->mutex, trx->mutex or both. */
+struct trx_lock_t {
+	ulint		n_active_thrs;	/*!< number of active query threads */
+
+	trx_que_t	que_state;	/*!< valid when trx->state
+					== TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
+					TRX_QUE_LOCK_WAIT, ... */
+
+	lock_t*		wait_lock;	/*!< if trx execution state is
+					TRX_QUE_LOCK_WAIT, this points to
+					the lock request, otherwise this is
+					NULL; set to non-NULL when holding
+					both trx->mutex and lock_sys->mutex;
+					set to NULL when holding
+					lock_sys->mutex; readers should
+					hold lock_sys->mutex, except when
+					they are holding trx->mutex and
+					wait_lock==NULL */
+	ib_uint64_t	deadlock_mark;	/*!< A mark field that is initialized
+					to and checked against lock_mark_counter
+					by lock_deadlock_recursive(). */
+	ibool		was_chosen_as_deadlock_victim;
+					/*!< when the transaction decides to
+					wait for a lock, it sets this to FALSE;
+					if another transaction chooses this
+					transaction as a victim in deadlock
+					resolution, it sets this to TRUE.
+					Protected by trx->mutex. */
+	time_t		wait_started;	/*!< lock wait started at this time,
+					protected only by lock_sys->mutex */
+
+	que_thr_t*	wait_thr;	/*!< query thread belonging to this
+					trx that is in QUE_THR_LOCK_WAIT
+					state. For threads suspended in a
+					lock wait, this is protected by
+					lock_sys->mutex. Otherwise, this may
+					only be modified by the thread that is
+					serving the running transaction. */
+
+	mem_heap_t*	lock_heap;	/*!< memory heap for trx_locks;
+					protected by lock_sys->mutex */
+
+	UT_LIST_BASE_NODE_T(lock_t)
+			trx_locks;	/*!< locks requested
+					by the transaction;
+					insertions are protected by trx->mutex
+					and lock_sys->mutex; removals are
+					protected by lock_sys->mutex */
+
+	ib_vector_t*	table_locks;	/*!< All table locks requested by this
+					transaction, including AUTOINC locks */
+
+	ibool		cancel;		/*!< TRUE if the transaction is being
+					rolled back either via deadlock
+					detection or due to lock timeout. The
+					caller has to acquire the trx_t::mutex
+					in order to cancel the locks. In
+					lock_trx_table_locks_remove() we
+					check for this cancel of a transaction's
+					locks and avoid reacquiring the trx
+					mutex to prevent recursive deadlocks.
+					Protected by both the lock sys mutex
+					and the trx_t::mutex. */
 };
 
 #define TRX_MAGIC_N	91118598
 
-/* The transaction handle; every session has a trx object which is freed only
-when the session is freed; in addition there may be session-less transactions
-rolling back after a database recovery */
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_operation_lock before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have os_thread_get_curr_id() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_resurrected() may access resurrected (connectionless)
+transactions while the system is already processing new user
+transactions. The trx_sys->mutex prevents a race condition between it
+and lock_trx_release_locks() [invoked by trx_commit()].
 
-struct trx_struct{
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding trx_sys->mutex and lock_sys->mutex.
+
+* When a transaction handle is in the trx_sys->mysql_trx_list or
+trx_sys->trx_list, some of its fields must not be modified without
+holding trx_sys->mutex exclusively.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys->mutex and sometimes by trx->mutex. */
+
+struct trx_t{
 	ulint		magic_n;
 
+	ib_mutex_t	mutex;		/*!< Mutex protecting the fields
+					state and lock
+					(except some fields of lock, which
+					are protected by lock_sys->mutex) */
+
+	/** State of the trx from the point of view of concurrency control
+	and the valid state transitions.
+
+	Possible states:
+
+	TRX_STATE_NOT_STARTED
+	TRX_STATE_ACTIVE
+	TRX_STATE_PREPARED
+	TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+	Valid state transitions are:
+
+	Regular transactions:
+	* NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+	Auto-commit non-locking read-only:
+	* NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+	XA (2PC):
+	* NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+	Recovered XA:
+	* NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+	XA (2PC) (shutdown before ROLLBACK or COMMIT):
+	* NOT_STARTED -> PREPARED -> (freed)
+
+	Latching and various transaction lists membership rules:
+
+	XA (2PC) transactions are always treated as non-autocommit.
+
+	Transitions to ACTIVE or NOT_STARTED occur when
+	!in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed).
+
+	Autocommit non-locking read-only transactions move between states
+	without holding any mutex. They are !in_rw_trx_list, !in_ro_trx_list.
+
+	When a transaction is NOT_STARTED, it can be in_mysql_trx_list if
+	it is a user transaction. It cannot be in ro_trx_list or rw_trx_list.
+
+	ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.
+	The transition ACTIVE->PREPARED is protected by trx_sys->mutex.
+
+	ACTIVE->COMMITTED is possible when the transaction is in
+	ro_trx_list or rw_trx_list.
+
+	Transitions to COMMITTED are protected by both lock_sys->mutex
+	and trx->mutex.
+
+	NOTE: Some of these state change constraints are an overkill,
+	currently only required for a consistent view for printing stats.
+	This unnecessarily adds a huge cost for the general case.
+
+	NOTE: In the future we should add read only transactions to the
+	ro_trx_list the first time they try to acquire a lock i.e. by default
+	we treat all read-only transactions as non-locking.  */
+	trx_state_t	state;
+
+	trx_lock_t	lock;		/*!< Information about the transaction
+					locks and state. Protected by
+					trx->mutex or lock_sys->mutex
+					or both */
+	ulint		is_recovered;	/*!< 0=normal transaction,
+					1=recovered, must be rolled back,
+					protected by trx_sys->mutex when
+					trx->in_rw_trx_list holds */
+
 	/* These fields are not protected by any mutex. */
 	const char*	op_info;	/*!< English text describing the
 					current operation, or an empty
 					string */
-	ulint		state;		/*!< state of the trx from the point of
-					view of concurrency control: TRX_ACTIVE,
-					TRX_COMMITTED_IN_MEMORY, ...  This was
-					called 'conc_state' in the upstream and
-					has been renamed in Percona Server,
-					because changing it's value to/from
-					either TRX_ACTIVE or TRX_PREPARED
-					requires calling
-					trx_reserve_descriptor() /
-					trx_release_descriptor(). Different name
-					ensures we notice any new code changing
-					the state. */
+	ulint		isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+	ulint		check_foreigns;	/*!< normally TRUE, but if the user
+					wants to suppress foreign key checks,
+					(in table imports, for example) we
+					set this FALSE */
 	/*------------------------------*/
 	/* MySQL has a transaction coordinator to coordinate two phase
-       	commit between multiple storage engines and the binary log. When
-       	an engine participates in a transaction, it's responsible for
-       	registering itself using the trans_register_ha() API. */
+	commit between multiple storage engines and the binary log. When
+	an engine participates in a transaction, it's responsible for
+	registering itself using the trans_register_ha() API. */
 	unsigned	is_registered:1;/* This flag is set to 1 after the
 				       	transaction has been registered with
 				       	the coordinator using the XA API, and
@@ -521,17 +790,9 @@ struct trx_struct{
 					this is set to 1 then registered should
 					also be set to 1. This is used in the
 					XA code */
-	unsigned	is_in_trx_serial_list:1;
-					/* Set when transaction is in the
-					trx_serial_list */
 	/*------------------------------*/
-	ulint		isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
-	ulint		check_foreigns;	/* normally TRUE, but if the user
-					wants to suppress foreign key checks,
-					(in table imports, for example) we
-					set this FALSE */
 	ulint		check_unique_secondary;
-					/* normally TRUE, but if the user
+					/*!< normally TRUE, but if the user
 					wants to speed up inserts by
 					suppressing unique key checks
 					for secondary indexes when we decide
@@ -549,123 +810,120 @@ struct trx_struct{
 					defer flush of the logs to disk
 					until after we release the
 					mutex. */
-	ulint		must_flush_log_later;/* this flag is set to TRUE in
-					trx_commit_off_kernel() if
-					flush_log_later was TRUE, and there
-					were modifications by the transaction;
-					in that case we must flush the log
-					in trx_commit_complete_for_mysql() */
+	ulint		must_flush_log_later;/*!< this flag is set to TRUE in
+					trx_commit() if flush_log_later was
+					TRUE, and there were modifications by
+					the transaction; in that case we must
+					flush the log in
+					trx_commit_complete_for_mysql() */
 	ulint		duplicates;	/*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
-	ibool		has_search_latch;
-					/* TRUE if this trx has latched any
+	bool		has_search_latch;
+					/*!< true if this trx has latched any
 					search system latch in S-mode */
-	ulint		deadlock_mark;	/*!< a mark field used in deadlock
-					checking algorithm.  */
+	ulint		search_latch_timeout;
+					/*!< If we notice that someone is
+					waiting for our S-lock on the search
+					latch to be released, we wait in
+					row0sel.cc for BTR_SEA_TIMEOUT new
+					searches until we try to keep
+					the search latch again over
+					calls from MySQL; this is intended
+					to reduce contention on the search
+					latch */
 	trx_dict_op_t	dict_operation;	/**< @see enum trx_dict_op */
 
 	/* Fields protected by the srv_conc_mutex. */
 	ulint		declared_to_be_inside_innodb;
-					/* this is TRUE if we have declared
+					/*!< this is TRUE if we have declared
 					this transaction in
 					srv_conc_enter_innodb to be inside the
 					InnoDB engine */
-
-	/* Fields protected by dict_operation_lock. The very latch
-	it is used to track. */
+	ulint		n_tickets_to_enter_innodb;
+					/*!< this can be > 0 only when
+					declared_to_... is TRUE; when we come
+					to srv_conc_enter_innodb, if the value
+					here is > 0, we decrement this by 1 */
 	ulint		dict_operation_lock_mode;
 					/*!< 0, RW_S_LATCH, or RW_X_LATCH:
 					the latch mode trx currently holds
-					on dict_operation_lock */
+					on dict_operation_lock. Protected
+					by dict_operation_lock. */
+
+	trx_id_t	no;		/*!< transaction serialization number:
+					max trx id shortly before the
+					transaction is moved to
+					COMMITTED_IN_MEMORY state.
+					Protected by trx_sys_t::mutex
+					when trx->in_rw_trx_list. Initially
+					set to TRX_ID_MAX. */
 
-	/* All the next fields are protected by the kernel mutex, except the
-	undo logs which are protected by undo_mutex */
-	ulint		is_purge;	/*!< 0=user transaction, 1=purge */
-	ulint		is_recovered;	/*!< 0=normal transaction,
-					1=recovered, must be rolled back */
-	ulint		que_state;	/*!< valid when conc_state
-					== TRX_ACTIVE: TRX_QUE_RUNNING,
-					TRX_QUE_LOCK_WAIT, ... */
-	ulint		handling_signals;/* this is TRUE as long as the trx
-					is handling signals */
 	time_t		start_time;	/*!< time the trx object was created
 					or the state last time became
-					TRX_ACTIVE */
+					TRX_STATE_ACTIVE */
 	trx_id_t	id;		/*!< transaction id */
 	XID		xid;		/*!< X/Open XA transaction
 					identification to identify a
 					transaction branch */
-	trx_id_t	no;		/*!< transaction serialization number ==
-					max trx id when the transaction is
-					moved to COMMITTED_IN_MEMORY state */
-	ib_uint64_t	commit_lsn;	/*!< lsn at the time of the commit */
+	lsn_t		commit_lsn;	/*!< lsn at the time of the commit */
 	table_id_t	table_id;	/*!< Table to drop iff dict_operation
-					is TRUE, or 0. */
+					== TRX_DICT_OP_TABLE, or 0. */
 	/*------------------------------*/
-	void*		mysql_thd;	/*!< MySQL thread handle corresponding
+	THD*		mysql_thd;	/*!< MySQL thread handle corresponding
 					to this trx, or NULL */
 	const char*	mysql_log_file_name;
-					/* if MySQL binlog is used, this field
+					/*!< if MySQL binlog is used, this field
 					contains a pointer to the latest file
 					name; this is NULL if binlog is not
 					used */
-	ib_int64_t	mysql_log_offset;/* if MySQL binlog is used, this field
-					contains the end offset of the binlog
-					entry */
-	const char*	mysql_master_log_file_name;
-					/* if the database server is a MySQL
-					replication slave, we have here the
-					master binlog name up to which
-					replication has processed; otherwise
-					this is a pointer to a null
-					character */
-	ib_int64_t	mysql_master_log_pos;
-					/* if the database server is a MySQL
-					replication slave, this is the
-					position in the log file up to which
-					replication has processed */
-	const char*	mysql_relay_log_file_name;
-	ib_int64_t	mysql_relay_log_pos;
+	ib_int64_t	mysql_log_offset;
+					/*!< if MySQL binlog is used, this
+					field contains the end offset of the
+					binlog entry */
 	time_t		idle_start;
 	ib_int64_t	last_stmt_start;
 	/*------------------------------*/
-	ulint		n_mysql_tables_in_use; /* number of Innobase tables
+	ulint		n_mysql_tables_in_use; /*!< number of Innobase tables
 					used in the processing of the current
 					SQL statement in MySQL */
 	ulint		mysql_n_tables_locked;
-					/* how many tables the current SQL
+					/*!< how many tables the current SQL
 					statement uses, except those
 					in consistent read */
-	ulint		search_latch_timeout;
-					/* If we notice that someone is
-					waiting for our S-lock on the search
-					latch to be released, we wait in
-					row0sel.c for BTR_SEA_TIMEOUT new
-					searches until we try to keep
-					the search latch again over
-					calls from MySQL; this is intended
-					to reduce contention on the search
-					latch */
-	/*------------------------------*/
-	ulint		n_tickets_to_enter_innodb;
-					/* this can be > 0 only when
-					declared_to_... is TRUE; when we come
-					to srv_conc_innodb_enter, if the value
-					here is > 0, we decrement this by 1 */
 	/*------------------------------*/
 	UT_LIST_NODE_T(trx_t)
-			trx_list;	/*!< list of transactions */
+			trx_list;	/*!< list of transactions;
+					protected by trx_sys->mutex.
+					The same node is used for both
+					trx_sys_t::ro_trx_list and
+					trx_sys_t::rw_trx_list */
+#ifdef UNIV_DEBUG
+	/** The following two fields are mutually exclusive. */
+	/* @{ */
+
+	ibool		in_ro_trx_list;	/*!< TRUE if in trx_sys->ro_trx_list */
+	ibool		in_rw_trx_list;	/*!< TRUE if in trx_sys->rw_trx_list */
+	/* @} */
+#endif /* UNIV_DEBUG */
 	UT_LIST_NODE_T(trx_t)
 			mysql_trx_list;	/*!< list of transactions created for
-					MySQL */
+					MySQL; protected by trx_sys->mutex */
+#ifdef UNIV_DEBUG
+	ibool		in_mysql_trx_list;
+					/*!< TRUE if in
+					trx_sys->mysql_trx_list */
+#endif /* UNIV_DEBUG */
 	UT_LIST_NODE_T(trx_t)
 			trx_serial_list;/*!< list node for
 					trx_sys->trx_serial_list */
+	bool		in_trx_serial_list;
+					/*!< Set when transaction is in the
+					trx_serial_list */
 	/*------------------------------*/
-	ulint		error_state;	/*!< 0 if no error, otherwise error
+	dberr_t		error_state;	/*!< 0 if no error, otherwise error
 					number; NOTE That ONLY the thread
 					doing the transaction is allowed to
 					set this field: this is NOT protected
-					by the kernel mutex */
+					by any mutex */
 	const dict_index_t*error_info;	/*!< if the error number indicates a
 					duplicate key error, a pointer to
 					the problematic index is stored here */
@@ -679,47 +937,8 @@ struct trx_struct{
 					survive over a transaction commit, if
 					it is a stored procedure with a COMMIT
 					WORK statement, for instance */
-	ulint		n_active_thrs;	/*!< number of active query threads */
-	que_t*		graph_before_signal_handling;
-					/* value of graph when signal handling
-					for this trx started: this is used to
-					return control to the original query
-					graph for error processing */
-	trx_sig_t	sig;		/*!< one signal object can be allocated
-					in this space, avoiding mem_alloc */
-	UT_LIST_BASE_NODE_T(trx_sig_t)
-			signals;	/*!< queue of processed or pending
-					signals to the trx */
-	UT_LIST_BASE_NODE_T(trx_sig_t)
-			reply_signals;	/*!< list of signals sent by the query
-					threads of this trx for which a thread
-					is waiting for a reply; if this trx is
-					killed, the reply requests in the list
-					must be canceled */
-	/*------------------------------*/
-	lock_t*		wait_lock;	/*!< if trx execution state is
-					TRX_QUE_LOCK_WAIT, this points to
-					the lock request, otherwise this is
-					NULL */
-	ibool		was_chosen_as_deadlock_victim;
-					/* when the transaction decides to wait
-					for a lock, it sets this to FALSE;
-					if another transaction chooses this
-					transaction as a victim in deadlock
-					resolution, it sets this to TRUE */
-	time_t		wait_started;	/*!< lock wait started at this time */
-	UT_LIST_BASE_NODE_T(que_thr_t)
-			wait_thrs;	/*!< query threads belonging to this
-					trx that are in the QUE_THR_LOCK_WAIT
-					state */
-	/*------------------------------*/
-	mem_heap_t*	lock_heap;	/*!< memory heap for the locks of the
-					transaction */
-	UT_LIST_BASE_NODE_T(lock_t)
-			trx_locks;	/*!< locks reserved by the transaction */
-	/*------------------------------*/
 	read_view_t*	global_read_view;
-					/* consistent read view associated
+					/*!< consistent read view associated
 					to a transaction or NULL */
 	read_view_t*	read_view;	/*!< consistent read view used in the
 					transaction or NULL, this read view
@@ -733,7 +952,7 @@ struct trx_struct{
 			trx_savepoints;	/*!< savepoints set with SAVEPOINT ...,
 					oldest first */
 	/*------------------------------*/
-	mutex_t		undo_mutex;	/*!< mutex protecting the fields in this
+	ib_mutex_t	undo_mutex;	/*!< mutex protecting the fields in this
 					section (down to undo_no_arr), EXCEPT
 					last_sql_stat_start, which can be
 					accessed only when we know that there
@@ -747,7 +966,7 @@ struct trx_struct{
 					the number of modified/inserted
 					rows in a transaction */
 	trx_savept_t	last_sql_stat_start;
-					/* undo_no when the last sql statement
+					/*!< undo_no when the last sql statement
 					was started: in case of an error, trx
 					is rolled back down to this undo
 					number; see note at undo_mutex! */
@@ -773,7 +992,39 @@ struct trx_struct{
 					transaction. Note that these are
 					also in the lock list trx_locks. This
 					vector needs to be freed explicitly
-					when the trx_t instance is desrtoyed */
+					when the trx instance is destroyed.
+					Protected by lock_sys->mutex. */
+	/*------------------------------*/
+	ibool		read_only;	/*!< TRUE if transaction is flagged
+					as a READ-ONLY transaction.
+					if !auto_commit || will_lock > 0
+					then it will be added to the list
+					trx_sys_t::ro_trx_list. A read only
+					transaction will not be assigned an
+					UNDO log. Non-locking auto-commit
+					read-only transaction will not be on
+					either list. */
+	ibool		auto_commit;	/*!< TRUE if it is an autocommit */
+	ulint		will_lock;	/*!< Will acquire some locks. Increment
+					each time we determine that a lock will
+					be acquired by the MySQL layer. */
+	bool		ddl;		/*!< true if it is a transaction that
+					is being started for a DDL operation */
+	/*------------------------------*/
+	fts_trx_t*	fts_trx;	/*!< FTS information, or NULL if
+					transaction hasn't modified tables
+					with FTS indexes (yet). */
+	doc_id_t	fts_next_doc_id;/*!< The document id used for updates */
+	/*------------------------------*/
+	ulint		flush_tables;	/*!< if "covering" the FLUSH TABLES,
+					count of tables being flushed. */
+
+	/*------------------------------*/
+#ifdef UNIV_DEBUG
+	ulint		start_line;	/*!< Track where it was started from */
+	const char*	start_file;	/*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
 	/*------------------------------*/
 	char detailed_error[256];	/*!< detailed error message for last
 					error, or empty. */
@@ -790,23 +1041,6 @@ struct trx_struct{
 	ibool		take_stats;
 };
 
-#define TRX_MAX_N_THREADS	32	/* maximum number of
-					concurrent threads running a
-					single operation of a
-					transaction, e.g., a parallel
-					query */
-/* Transaction concurrency states (trx->conc_state) */
-#define	TRX_NOT_STARTED		0
-#define	TRX_ACTIVE		1
-#define	TRX_COMMITTED_IN_MEMORY	2
-#define	TRX_PREPARED		3	/* Support for 2PC/XA */
-
-/* Transaction execution states when trx->conc_state == TRX_ACTIVE */
-#define TRX_QUE_RUNNING		0	/* transaction is running */
-#define TRX_QUE_LOCK_WAIT	1	/* transaction is waiting for a lock */
-#define TRX_QUE_ROLLING_BACK	2	/* transaction is rolling back */
-#define TRX_QUE_COMMITTING	3	/* transaction is committing */
-
 /* Transaction isolation levels (trx->isolation_level) */
 #define TRX_ISO_READ_UNCOMMITTED	0	/* dirty read: non-locking
 						SELECTs are performed so that
@@ -853,7 +1087,6 @@ Multiple flags can be combined with bitwise OR. */
 #define TRX_SIG_TOTAL_ROLLBACK		1
 #define TRX_SIG_ROLLBACK_TO_SAVEPT	2
 #define TRX_SIG_COMMIT			3
-#define	TRX_SIG_ERROR_OCCURRED		4
 #define TRX_SIG_BREAK_EXECUTION		5
 
 /* Sender types of a signal */
@@ -876,13 +1109,40 @@ enum commit_node_state {
 };
 
 /** Commit command node in a query graph */
-struct commit_node_struct{
+struct commit_node_t{
 	que_common_t	common;	/*!< node type: QUE_NODE_COMMIT */
 	enum commit_node_state
 			state;	/*!< node execution state */
 };
 
 
+/** Test if trx->mutex is owned; t is a pointer to the transaction. */
+#define trx_mutex_own(t) mutex_own(&(t)->mutex)
+
+/** Acquire the trx->mutex; t is a pointer to the transaction. */
+#define trx_mutex_enter(t) do {			\
+	mutex_enter(&(t)->mutex);		\
+} while (0)
+
+/** Release the trx->mutex; t is a pointer to the transaction. */
+#define trx_mutex_exit(t) do {			\
+	mutex_exit(&(t)->mutex);		\
+} while (0)
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern prio_rw_lock_t*	btr_search_latch_arr;
 
 #ifndef UNIV_NONINL
 #include "trx0trx.ic"
diff --git a/storage/xtradb/include/trx0trx.ic b/storage/xtradb/include/trx0trx.ic
index 97dda69f013..787931dc4b6 100644
--- a/storage/xtradb/include/trx0trx.ic
+++ b/storage/xtradb/include/trx0trx.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,50 +23,48 @@ The transaction
 Created 3/26/1996 Heikki Tuuri
 *******************************************************/
 
-/********************************************************************//**
-In XtraDB it is impossible for a transaction to own a search latch outside of
-InnoDB code, so there is nothing to release on demand.  We keep this function to
-simplify maintenance.*/
-UNIV_INLINE
-void
-trx_search_latch_release_if_reserved(
-/*=================================*/
-	trx_t*	   trx __attribute__((unused))) /*!< in: transaction */
-{
-	ut_ad(!trx->has_search_latch);
-}
-
-/*************************************************************//**
-Starts the transaction if it is not yet started. */
-UNIV_INLINE
-void
-trx_start_if_not_started(
-/*=====================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	ut_ad(trx->state != TRX_COMMITTED_IN_MEMORY);
-
-	if (trx->state == TRX_NOT_STARTED) {
-
-		trx_start(trx, ULINT_UNDEFINED);
-	}
-}
-
-/*************************************************************//**
-Starts the transaction if it is not yet started. Assumes we have reserved
-the kernel mutex! */
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return	TRUE if trx->state == state */
 UNIV_INLINE
-void
-trx_start_if_not_started_low(
-/*=========================*/
-	trx_t*	trx)	/*!< in: transaction */
+ibool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state)	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
 {
-	ut_ad(trx->state != TRX_COMMITTED_IN_MEMORY);
-
-	if (trx->state == TRX_NOT_STARTED) {
-
-		trx_start_low(trx, ULINT_UNDEFINED);
+#ifdef UNIV_DEBUG
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+		ut_ad(!trx_is_autocommit_non_locking(trx));
+		return(trx->state == state);
+
+	case TRX_STATE_ACTIVE:
+		assert_trx_nonlocking_or_in_list(trx);
+		return(state == trx->state);
+
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		assert_trx_in_list(trx);
+		return(state == trx->state);
+
+	case TRX_STATE_NOT_STARTED:
+		/* This state is not allowed for running transactions. */
+		ut_a(state == TRX_STATE_NOT_STARTED);
+		ut_ad(!trx->in_rw_trx_list);
+		ut_ad(!trx->in_ro_trx_list);
+		return(state == trx->state);
 	}
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(trx->state == state);
 }
 
 /****************************************************************//**
@@ -92,7 +90,7 @@ trx_get_que_state_str(
 	const trx_t*	trx)	/*!< in: transaction */
 {
 	/* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
-	switch (trx->que_state) {
+	switch (trx->lock.que_state) {
 	case TRX_QUE_RUNNING:
 		return("RUNNING");
 	case TRX_QUE_LOCK_WAIT:
@@ -110,12 +108,12 @@ trx_get_que_state_str(
 Determine if a transaction is a dictionary operation.
 @return	dictionary operation mode */
 UNIV_INLINE
-enum trx_dict_op
+enum trx_dict_op_t
 trx_get_dict_operation(
 /*===================*/
 	const trx_t*	trx)	/*!< in: transaction */
 {
-	enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation;
+	trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
 
 #ifdef UNIV_DEBUG
 	switch (op) {
@@ -126,7 +124,7 @@ trx_get_dict_operation(
 	}
 	ut_error;
 #endif /* UNIV_DEBUG */
-	return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE));
+	return(op);
 }
 /**********************************************************************//**
 Flag a transaction a dictionary operation. */
@@ -135,11 +133,11 @@ void
 trx_set_dict_operation(
 /*===================*/
 	trx_t*			trx,	/*!< in/out: transaction */
-	enum trx_dict_op	op)	/*!< in: operation, not
+	enum trx_dict_op_t	op)	/*!< in: operation, not
 					TRX_DICT_OP_NONE */
 {
 #ifdef UNIV_DEBUG
-	enum trx_dict_op	old_op = trx_get_dict_operation(trx);
+	enum trx_dict_op_t	old_op = trx_get_dict_operation(trx);
 
 	switch (op) {
 	case TRX_DICT_OP_NONE:
@@ -161,5 +159,19 @@ trx_set_dict_operation(
 ok:
 #endif /* UNIV_DEBUG */
 
+	trx->ddl = true;
 	trx->dict_operation = op;
 }
+
+/********************************************************************//**
+In XtraDB it is impossible for a transaction to own a search latch outside of
+InnoDB code, so there is nothing to release on demand.  We keep this function to
+simplify maintenance.*/
+UNIV_INLINE
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+	trx_t*	   trx __attribute__((unused))) /*!< in: transaction */
+{
+	ut_ad(!trx->has_search_latch);
+}
diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h
index 7303892bec4..7ca95131328 100644
--- a/storage/xtradb/include/trx0types.h
+++ b/storage/xtradb/include/trx0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,38 +29,70 @@ Created 3/26/1996 Heikki Tuuri
 #include "ut0byte.h"
 
 /** printf(3) format used for printing DB_TRX_ID and other system fields */
-#define TRX_ID_FMT		"%llX"
+#define TRX_ID_FMT		IB_ID_FMT
 
 /** maximum length that a formatted trx_t::id could take, not including
 the terminating NUL character. */
 #define TRX_ID_MAX_LEN		17
 
+/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
+enum trx_que_t {
+	TRX_QUE_RUNNING,		/*!< transaction is running */
+	TRX_QUE_LOCK_WAIT,		/*!< transaction is waiting for
+					a lock */
+	TRX_QUE_ROLLING_BACK,		/*!< transaction is rolling back */
+	TRX_QUE_COMMITTING		/*!< transaction is committing */
+};
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+	TRX_STATE_NOT_STARTED,
+	TRX_STATE_ACTIVE,
+	TRX_STATE_PREPARED,			/* Support for 2PC/XA */
+	TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Type of data dictionary operation */
+enum trx_dict_op_t {
+	/** The transaction is not modifying the data dictionary. */
+	TRX_DICT_OP_NONE = 0,
+	/** The transaction is creating a table or an index, or
+	dropping a table.  The table must be dropped in crash
+	recovery.  This and TRX_DICT_OP_NONE are the only possible
+	operation modes in crash recovery. */
+	TRX_DICT_OP_TABLE = 1,
+	/** The transaction is creating or dropping an index in an
+	existing table.  In crash recovery, the data dictionary
+	must be locked, but the table must not be dropped. */
+	TRX_DICT_OP_INDEX = 2
+};
+
 /** Memory objects */
 /* @{ */
 /** Transaction */
-typedef struct trx_struct	trx_t;
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
 /** Transaction system */
-typedef struct trx_sys_struct	trx_sys_t;
-/** Doublewrite information */
-typedef struct trx_doublewrite_struct	trx_doublewrite_t;
+struct trx_sys_t;
 /** Signal */
-typedef struct trx_sig_struct	trx_sig_t;
+struct trx_sig_t;
 /** Rollback segment */
-typedef struct trx_rseg_struct	trx_rseg_t;
+struct trx_rseg_t;
 /** Transaction undo log */
-typedef struct trx_undo_struct	trx_undo_t;
+struct trx_undo_t;
 /** Array of undo numbers of undo records being rolled back or purged */
-typedef struct trx_undo_arr_struct trx_undo_arr_t;
+struct trx_undo_arr_t;
 /** A cell of trx_undo_arr_t */
-typedef struct trx_undo_inf_struct trx_undo_inf_t;
+struct trx_undo_inf_t;
 /** The control structure used in the purge operation */
-typedef struct trx_purge_struct	trx_purge_t;
+struct trx_purge_t;
 /** Rollback command node in a query graph */
-typedef struct roll_node_struct	roll_node_t;
+struct roll_node_t;
 /** Commit command node in a query graph */
-typedef struct commit_node_struct commit_node_t;
+struct commit_node_t;
 /** SAVEPOINT command node in a query graph */
-typedef struct trx_named_savept_struct trx_named_savept_t;
+struct trx_named_savept_t;
 /* @} */
 
 /** Rollback contexts */
@@ -87,10 +119,11 @@ typedef ib_id_t	roll_ptr_t;
 /** Undo number */
 typedef ib_id_t	undo_no_t;
 
+/** Maximum transaction identifier */
+#define TRX_ID_MAX	IB_ID_MAX
+
 /** Transaction savepoint */
-typedef struct trx_savept_struct trx_savept_t;
-/** Transaction savepoint */
-struct trx_savept_struct{
+struct trx_savept_t{
 	undo_no_t	least_undo_no;	/*!< least undo number to undo */
 };
 
diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h
index 4a1e40af505..61b0dabb1e6 100644
--- a/storage/xtradb/include/trx0undo.h
+++ b/storage/xtradb/include/trx0undo.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -65,6 +65,15 @@ ibool
 trx_undo_roll_ptr_is_insert(
 /*========================*/
 	roll_ptr_t	roll_ptr);	/*!< in: roll pointer */
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return	true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+	const byte*	trx_id)	/*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+	__attribute__((nonnull, pure, warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /*****************************************************************//**
 Writes a roll ptr to an index page. In case that the size changes in
@@ -166,6 +175,7 @@ trx_undo_get_prev_rec(
 	trx_undo_rec_t*	rec,	/*!< in: undo record */
 	ulint		page_no,/*!< in: undo log header page number */
 	ulint		offset,	/*!< in: undo log header offset on page */
+	bool		shared,	/*!< in: true=S-latch, false=X-latch */
 	mtr_t*		mtr);	/*!< in: mtr */
 /***********************************************************************//**
 Gets the next record in an undo log.
@@ -282,14 +292,15 @@ trx_undo_lists_init(
 Assigns an undo log for a transaction. A new undo log is created or a cached
 undo log reused.
 @return DB_SUCCESS if undo log assign successful, possible error codes
-are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY
 DB_OUT_OF_MEMORY */
 UNIV_INTERN
-ulint
+dberr_t
 trx_undo_assign_undo(
 /*=================*/
 	trx_t*		trx,	/*!< in: transaction */
-	ulint		type);	/*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+	ulint		type)	/*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+	__attribute__((nonnull, warn_unused_result));
 /******************************************************************//**
 Sets the state of the undo log segment at a transaction finish.
 @return	undo log segment header page, x-latched */
@@ -404,7 +415,7 @@ trx_undo_mem_free(
 /** Transaction undo log memory object; this is protected by the undo_mutex
 in the corresponding transaction object */
 
-struct trx_undo_struct{
+struct trx_undo_t{
 	/*-----------------------------*/
 	ulint		id;		/*!< undo log slot number within the
 					rollback segment */
@@ -412,8 +423,8 @@ struct trx_undo_struct{
 					TRX_UNDO_UPDATE */
 	ulint		state;		/*!< state of the corresponding undo log
 					segment */
-	ibool		del_marks;	/*!< relevant only in an update undo log:
-					this is TRUE if the transaction may
+	ibool		del_marks;	/*!< relevant only in an update undo
+					log: this is TRUE if the transaction may
 					have delete marked records, because of
 					a delete of a row or an update of an
 					indexed field; purge is then
@@ -435,8 +446,8 @@ struct trx_undo_struct{
 					in bytes, or 0 for uncompressed */
 	ulint		hdr_page_no;	/*!< page number of the header page in
 					the undo log */
-	ulint		hdr_offset;	/*!< header offset of the undo log on the
-					page */
+	ulint		hdr_offset;	/*!< header offset of the undo log on
+				       	the page */
 	ulint		last_page_no;	/*!< page number of the last page in the
 					undo log; this may differ from
 					top_page_no during a rollback */
@@ -582,8 +593,8 @@ quite a large overhead. */
 #define	TRX_UNDO_XA_XID		(TRX_UNDO_XA_BQUAL_LEN + 4)
 /*--------------------------------------------------------------*/
 #define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
-				/*!< Total size of the undo log header
-				with the XA XID */
+					/*!< Total size of the undo log header
+					with the XA XID */
 /* @} */
 
 #ifndef UNIV_NONINL
diff --git a/storage/xtradb/include/trx0undo.ic b/storage/xtradb/include/trx0undo.ic
index a12d38116b6..577759d6c3d 100644
--- a/storage/xtradb/include/trx0undo.ic
+++ b/storage/xtradb/include/trx0undo.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -101,6 +101,21 @@ trx_undo_roll_ptr_is_insert(
 	ut_ad(roll_ptr < (1ULL << 56));
 	return((ibool) (roll_ptr >> 55));
 }
+
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return	true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+	const byte*	trx_id)	/*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+	return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7));
+}
 #endif /* !UNIV_HOTBACKUP */
 
 /*****************************************************************//**
diff --git a/storage/xtradb/include/trx0xa.h b/storage/xtradb/include/trx0xa.h
index 97c24c899a7..7caddfb7ba4 100644
--- a/storage/xtradb/include/trx0xa.h
+++ b/storage/xtradb/include/trx0xa.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
index 0b105f573c2..c9447245124 100644
--- a/storage/xtradb/include/univ.i
+++ b/storage/xtradb/include/univ.i
@@ -1,8 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
-Copyright (c) 2009, Sun Microsystems, Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -10,12 +9,6 @@ briefly in the InnoDB documentation. The contributions by Google are
 incorporated with their permission, and subject to the conditions contained in
 the file COPYING.Google.
 
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
-
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
 Foundation; version 2 of the License.
@@ -25,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -49,10 +42,16 @@ Created 1/20/1994 Heikki Tuuri
 #define _IB_TO_STR(s)	#s
 #define IB_TO_STR(s)	_IB_TO_STR(s)
 
-#include <mysql_version.h>
+#define INNODB_VERSION_MAJOR	5
+#define INNODB_VERSION_MINOR	6
+#define INNODB_VERSION_BUGFIX	14
+
+#ifndef PERCONA_INNODB_VERSION
+#define PERCONA_INNODB_VERSION 62.0
+#endif
 
-#define INNODB_VERSION_MAJOR	MYSQL_MAJOR_VERSION
-#define INNODB_VERSION_MINOR	MYSQL_MINOR_VERSION
+/* Enable UNIV_LOG_ARCHIVE in XtraDB */
+#define UNIV_LOG_ARCHIVE 1
 
 /* The following is the InnoDB version as shown in
 SELECT plugin_version FROM information_schema.plugins;
@@ -63,15 +62,15 @@ component, i.e. we show M.N.P as M.N */
 #define INNODB_VERSION_SHORT	\
 	(INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
 
-#ifndef PERCONA_INNODB_VERSION
-#define PERCONA_INNODB_VERSION 31.1
-#endif
-
-#define INNODB_VERSION_STR	MYSQL_SERVER_VERSION "-" IB_TO_STR(PERCONA_INNODB_VERSION)
+#define INNODB_VERSION_STR			\
+	IB_TO_STR(INNODB_VERSION_MAJOR) "."	\
+	IB_TO_STR(INNODB_VERSION_MINOR) "."	\
+	IB_TO_STR(INNODB_VERSION_BUGFIX) "-"	\
+	IB_TO_STR(PERCONA_INNODB_VERSION)
 
 #define REFMAN "http://dev.mysql.com/doc/refman/"	\
-	IB_TO_STR(MYSQL_MAJOR_VERSION) "."		\
-	IB_TO_STR(MYSQL_MINOR_VERSION) "/en/"
+	IB_TO_STR(INNODB_VERSION_MAJOR) "."		\
+	IB_TO_STR(INNODB_VERSION_MINOR) "/en/"
 
 #ifdef MYSQL_DYNAMIC_PLUGIN
 /* In the dynamic plugin, redefine some externally visible symbols
@@ -105,10 +104,10 @@ if we are compiling on Windows. */
 # include <my_pthread.h>
 #endif /* UNIV_HOTBACKUP */
 
-/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
 # include <sys/stat.h>
 # if !defined(__WIN__)
-#  include <sys/mman.h> /* mmap() for os0proc.c */
+#  include <sys/mman.h> /* mmap() for os0proc.cc */
 # endif
 
 /* Include the header file generated by GNU autoconf */
@@ -125,21 +124,21 @@ if we are compiling on Windows. */
 /* We only try to do explicit inlining of functions with gcc and
 Sun Studio */
 
-# if !defined(__GNUC__) && !(defined(__SUNPRO_C) || defined(__SUNPRO_CC))
-#  undef  UNIV_MUST_NOT_INLINE			/* Remove compiler warning */
-#  define UNIV_MUST_NOT_INLINE
-# endif
-
 # ifdef HAVE_PREAD
 #  define HAVE_PWRITE
 # endif
 
 #endif /* #if (defined(WIN32) || ... */
 
+#ifndef __WIN__
+#define __STDC_FORMAT_MACROS    /* Enable C99 printf format macros */
+#include <inttypes.h>
+#endif /* !__WIN__ */
+
 /* Following defines are to enable performance schema
 instrumentation in each of four InnoDB modules if
 HAVE_PSI_INTERFACE is defined. */
-#ifdef HAVE_PSI_INTERFACE
+#if defined HAVE_PSI_INTERFACE && !defined UNIV_HOTBACKUP
 # define UNIV_PFS_MUTEX
 # define UNIV_PFS_RWLOCK
 /* For I/O instrumentation, performance schema rely
@@ -151,8 +150,22 @@ resolved */
 #  define UNIV_PFS_IO
 # endif
 # define UNIV_PFS_THREAD
+
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. And this PFS_NOT_INSTRUMENTED is used
+as the key value to identify those objects that would
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED		ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key)	((key) != PFS_NOT_INSTRUMENTED)
+
 #endif /* HAVE_PSI_INTERFACE */
 
+#ifdef __WIN__
+# define YY_NO_UNISTD_H 1
+#endif /* __WIN__ */
+
 /*			DEBUG VERSION CONTROL
 			===================== */
 
@@ -180,8 +193,6 @@ command. Not tested on Windows. */
 						debugging without UNIV_DEBUG */
 #define UNIV_BLOB_LIGHT_DEBUG			/* Enable off-page column
 						debugging without UNIV_DEBUG */
-#define UNIV_BLOB_NULL_DEBUG			/* Enable deep off-page
-						column debugging */
 #define UNIV_DEBUG				/* Enable ut_ad() assertions
 						and disable UNIV_INLINE */
 #define UNIV_DEBUG_LOCK_VALIDATE		/* Enable
@@ -202,6 +213,9 @@ assumes that no BLOBs survive server restart */
 #define UNIV_IBUF_COUNT_DEBUG			/* debug the insert buffer;
 this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
 and the insert buffer must be empty when the database is started */
+#define UNIV_PERF_DEBUG                         /* debug flag that enables
+                                                light weight performance
+                                                related stuff. */
 #define UNIV_SYNC_DEBUG				/* debug mutex and latch
 operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_SEARCH_DEBUG			/* debug B-tree comparisons */
@@ -210,7 +224,7 @@ operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_SEARCH_PERF_STAT			/* statistics for the
 						adaptive hash index */
 #define UNIV_SRV_PRINT_LATCH_WAITS		/* enable diagnostic output
-						in sync0sync.c */
+						in sync0sync.cc */
 #define UNIV_BTR_PRINT				/* enable functions for
 						printing B-trees */
 #define UNIV_ZIP_DEBUG				/* extensive consistency checks
@@ -220,6 +234,11 @@ operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_AIO_DEBUG				/* prints info about
 						submitted and reaped AIO
 						requests to the log. */
+#define UNIV_STATS_DEBUG			/* prints various stats
+						related debug info from
+						dict0stats.cc */
+#define FTS_INTERNAL_DIAG_PRINT                 /* FTS internal debugging
+                                                info output */
 #endif
 
 #define UNIV_BTR_DEBUG				/* check B-tree links */
@@ -242,7 +261,9 @@ easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
 #else
 # define UNIV_INTERN
 #endif
-#if defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+#if defined(INNODB_COMPILER_HINTS)      \
+    && defined __GNUC__                 \
+    && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
 /** Starting with GCC 4.3, the "cold" attribute is used to inform the
 compiler that a function is unlikely executed.  The function is
 optimized for size rather than speed and on many targets it is placed
@@ -256,40 +277,41 @@ rarely invoked function for size instead for speed. */
 # define UNIV_COLD /* empty */
 #endif
 
+#ifdef UNIV_LINUX
+# define UNIV_THREAD_LOCAL __thread
+#else
+/* FIXME: the TLS variables are silently broken on other platforms for now */
+# define UNIV_THREAD_LOCAL
+#endif
+
 #ifndef UNIV_MUST_NOT_INLINE
 /* Definition for inline version */
 
-#ifdef __WIN__
-# define UNIV_INLINE	__inline
-#elif defined(__SUNPRO_CC) || defined(__SUNPRO_C)
-# define UNIV_INLINE static inline
-#else
-# define UNIV_INLINE static __inline__
-#endif
+#define UNIV_INLINE static inline
 
-#else
+#else /* !UNIV_MUST_NOT_INLINE */
 /* If we want to compile a noninlined version we use the following macro
 definitions: */
 
 #define UNIV_NONINL
 #define UNIV_INLINE	UNIV_INTERN
 
-#endif	/* UNIV_DEBUG */
+#endif /* !UNIV_MUST_NOT_INLINE */
 
 #ifdef _WIN32
 #define UNIV_WORD_SIZE		4
 #elif defined(_WIN64)
 #define UNIV_WORD_SIZE		8
 #else
-/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+/** MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
 #define UNIV_WORD_SIZE		SIZEOF_LONG
 #endif
 
-/* The following alignment is used in memory allocations in memory heap
+/** The following alignment is used in memory allocations in memory heap
 management to ensure correct alignment for doubles etc. */
-#define UNIV_MEM_ALIGNMENT      8
+#define UNIV_MEM_ALIGNMENT	8
 
-/* The following alignment is used in aligning lints etc. */
+/** The following alignment is used in aligning lints etc. */
 #define UNIV_WORD_ALIGNMENT	UNIV_WORD_SIZE
 
 /*
@@ -315,16 +337,62 @@ enum innodb_file_formats_enum {
 
 typedef enum innodb_file_formats_enum innodb_file_formats_t;
 
-/* The 2-logarithm of UNIV_PAGE_SIZE: */
-/* #define UNIV_PAGE_SIZE_SHIFT	14 */
-#define UNIV_PAGE_SIZE_SHIFT_MAX	14
+/** Minimum supported file format */
+#define UNIV_FORMAT_MIN		UNIV_FORMAT_A
+
+/** Maximum supported file format */
+#define UNIV_FORMAT_MAX		UNIV_FORMAT_B
+
+/** The 2-logarithm of UNIV_PAGE_SIZE: */
 #define UNIV_PAGE_SIZE_SHIFT	srv_page_size_shift
-/* The universal page size of the database */
-/* #define UNIV_PAGE_SIZE		(1u << UNIV_PAGE_SIZE_SHIFT) */
-#define UNIV_PAGE_SIZE		srv_page_size
-#define UNIV_PAGE_SIZE_MAX	(1u << UNIV_PAGE_SIZE_SHIFT_MAX)
 
-/* Maximum number of parallel threads in a parallelized operation */
+/** The universal page size of the database */
+#define UNIV_PAGE_SIZE		((ulint) srv_page_size)
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN		10
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+Even though a 16k uncompressed page can theoretically be compressed
+into a larger compressed page, it is not a useful feature so we will
+limit both with this same constant. */
+#define UNIV_ZIP_SIZE_SHIFT_MAX		14
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN	12
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX	14
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF	14
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG	14
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN	(1 << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX	(1 << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF	(1 << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG	(1 << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN	(1 << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX	(1 << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Number of supported page sizes (The convention 'ssize' is used
+for 'log2 minus 9' or the number of shifts starting with 512.)
+This number varies depending on UNIV_PAGE_SIZE. */
+#define UNIV_PAGE_SSIZE_MAX					\
+	(UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/** Maximum number of parallel threads in a parallelized operation */
 #define UNIV_MAX_PARALLELISM	32
 
 /** This is the "mbmaxlen" for my_charset_filename (defined in
@@ -338,12 +406,11 @@ FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
 terminating '\0'. InnoDB can handle longer names internally */
 #define MAX_TABLE_NAME_LEN	320
 
-
-/* The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
 the MySQL's NAME_LEN, see check_and_convert_db_name(). */
 #define MAX_DATABASE_NAME_LEN	MAX_TABLE_NAME_LEN
 
-/* MAX_FULL_NAME_LEN defines the full name path including the
+/** MAX_FULL_NAME_LEN defines the full name path including the
 database name and table name. In addition, 14 bytes is added for:
 	2 for surrounding quotes around table name
 	1 for the separating dot (.)
@@ -351,6 +418,16 @@ database name and table name. In addition, 14 bytes is added for:
 #define MAX_FULL_NAME_LEN				\
 	(MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
 
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN		(NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN	(NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
 /*
 			UNIVERSAL TYPE DEFINITIONS
 			==========================
@@ -359,41 +436,47 @@ database name and table name. In addition, 14 bytes is added for:
 /* Note that inside MySQL 'byte' is defined as char on Linux! */
 #define byte			unsigned char
 
-/* Define an unsigned integer type that is exactly 32 bits. */
-
-#if SIZEOF_INT == 4
-typedef unsigned int		ib_uint32_t;
-#elif SIZEOF_LONG == 4
-typedef unsigned long		ib_uint32_t;
-#else
-#error "Neither int or long is 4 bytes"
-#endif
-
 /* Another basic type we use is unsigned long integer which should be equal to
 the word size of the machine, that is on a 32-bit platform 32 bits, and on a
 64-bit platform 64 bits. We also give the printf format for the type as a
 macro ULINTPF. */
 
+
+#ifdef __WIN__
+/* Visual Studio integer types and printf formats; UINT64PFx prints hex. */
+# define UINT32PF	"%I32u"
+# define INT64PF	"%I64d"
+# define UINT64PF	"%I64u"
+# define UINT64PFx	"%016I64x"	/* 16 hex digits, like PRIx64 below */
+# define DBUG_LSN_PF    "%llu"
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+typedef unsigned __int32 ib_uint32_t;
+#else
+/* C99 <inttypes.h> types and formats; keep a space before each PRI macro. */
+# define UINT32PF	"%" PRIu32
+# define INT64PF	"%" PRId64
+# define UINT64PF	"%" PRIu64
+# define UINT64PFx	"%016" PRIx64
+# define DBUG_LSN_PF    UINT64PF
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+#endif /* __WIN__ */
+
+# define IB_ID_FMT	UINT64PF
+
 #ifdef _WIN64
 typedef unsigned __int64	ulint;
-#define ULINTPF			"%I64u"
 typedef __int64			lint;
+# define ULINTPF		UINT64PF
 #define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONGLONG
 #else
 typedef unsigned long int	ulint;
-#define ULINTPF			"%lu"
 typedef long int		lint;
+# define ULINTPF		"%lu"
 #define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONG
-#endif
-
-#ifdef __WIN__
-typedef __int64			ib_int64_t;
-typedef unsigned __int64	ib_uint64_t;
-#elif !defined(UNIV_HOTBACKUP)
-/* Note: longlong and ulonglong come from MySQL headers. */
-typedef longlong		ib_int64_t;
-typedef ulonglong		ib_uint64_t;
-#endif
+#endif /* _WIN64 */
 
 #ifndef UNIV_HOTBACKUP
 typedef unsigned long long int	ullint;
@@ -405,27 +488,33 @@ typedef unsigned long long int	ullint;
 #endif
 #endif
 
-/* The 'undefined' value for a ulint */
+/** The 'undefined' value for a ulint */
 #define ULINT_UNDEFINED		((ulint)(-1))
 
+#define ULONG_UNDEFINED		((ulong)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED	((ib_uint64_t)(-1))
+
 /** The bitmask of 32-bit unsigned integer */
 #define ULINT32_MASK		0xFFFFFFFF
-/* The undefined 32-bit unsigned integer */
+/** The undefined 32-bit unsigned integer */
 #define	ULINT32_UNDEFINED	ULINT32_MASK
 
-/* Maximum value for a ulint */
+/** Maximum value for a ulint */
 #define ULINT_MAX		((ulint)(-2))
 
-/* Maximum value for ib_uint64_t */
-#define IB_ULONGLONG_MAX	((ib_uint64_t) (~0ULL))
+/** Maximum value for ib_uint64_t */
+#define IB_UINT64_MAX		((ib_uint64_t) (~0ULL))
 
 /** The generic InnoDB system object identifier data type */
-typedef ib_uint64_t	ib_id_t;
+typedef ib_uint64_t		ib_id_t;
+#define IB_ID_MAX		IB_UINT64_MAX
 
-/* The 'undefined' value for a ullint */
+/** The 'undefined' value for a ullint */
 #define ULLINT_UNDEFINED        ((ullint)(-1))
 
-/* This 'ibool' type is used within Innobase. Remember that different included
+/** This 'ibool' type is used within Innobase. Remember that different included
 headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
 #define ibool			ulint
 
@@ -436,7 +525,9 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
 
 #endif
 
-/* The following number as the length of a logical field means that the field
+#define UNIV_NOTHROW
+
+/** The following number as the length of a logical field means that the field
 has the SQL NULL as its value. NOTE that because we assume that the length
 of a field is a 32-bit integer when we store it, for example, to an undo log
 on disk, we must have also this number fit in 32 bits, also in 64-bit
@@ -444,15 +535,23 @@ computers! */
 
 #define UNIV_SQL_NULL ULINT32_UNDEFINED
 
-/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
 number indicate that a field contains a reference to an externally
 stored part of the field in the tablespace. The length field then
 contains the sum of the following flag and the locally stored len. */
 
 #define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX)
 
-/* Some macros to improve branch prediction and reduce cache misses */
 #if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+#define HAVE_GCC_GT_2
+/* Tell the compiler that variable/function is unused. */
+# define UNIV_UNUSED    __attribute__ ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* CHECK FOR GCC VER_GT_2 */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(INNODB_COMPILER_HINTS) && defined(HAVE_GCC_GT_2)
 /* Tell the compiler that 'expr' probably evaluates to 'constant'. */
 # define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
 /* Tell the compiler that a pointer is likely to be NULL */
@@ -463,19 +562,30 @@ it is read. */
 /* Minimize cache-miss latency by moving data at addr into a cache before
 it is read or written. */
 # define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
 /* Sun Studio includes sun_prefetch.h as of version 5.9 */
 #elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \
        || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590)
+
 # include <sun_prefetch.h>
+
 #if __SUNPRO_C >= 0x550
 # undef UNIV_INTERN
 # define UNIV_INTERN __hidden
 #endif /* __SUNPRO_C >= 0x550 */
-/* Use sun_prefetch when compile with Sun Studio */
+
 # define UNIV_EXPECT(expr,value) (expr)
 # define UNIV_LIKELY_NULL(expr) (expr)
-# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
-# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+
+# if defined(INNODB_COMPILER_HINTS)
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+#  define UNIV_PREFETCH_R(addr) ((void) 0)
+#  define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+# else
+#  define UNIV_PREFETCH_R(addr) ((void) 0)
+#  define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif /* INNODB_COMPILER_HINTS */
+
 #else
 /* Dummy versions of the macros */
 # define UNIV_EXPECT(expr,value) (expr)
@@ -483,6 +593,7 @@ it is read or written. */
 # define UNIV_PREFETCH_R(addr) ((void) 0)
 # define UNIV_PREFETCH_RW(addr) ((void) 0)
 #endif
+
 /* Tell the compiler that cond is likely to hold */
 #define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
 /* Tell the compiler that cond is unlikely to hold */
@@ -513,17 +624,25 @@ typedef void* os_thread_ret_t;
 # define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
 # define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
 # define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
-# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr)
 # define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
-# define UNIV_MEM_ASSERT_RW(addr, size) do {				\
+# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {		\
 	const void* _p = (const void*) (ulint)				\
 		VALGRIND_CHECK_MEM_IS_DEFINED(addr, size);		\
-	if (UNIV_LIKELY_NULL(_p))					\
+	if (UNIV_LIKELY_NULL(_p)) {					\
 		fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n",	\
 			__FILE__, __LINE__,				\
 			(const void*) (addr), (unsigned) (size), (long)	\
 			(((const char*) _p) - ((const char*) (addr))));	\
-	} while (0)
+		if (should_abort) {					\
+			ut_error;					\
+		}							\
+	}								\
+} while (0)
+# define UNIV_MEM_ASSERT_RW(addr, size)					\
+	UNIV_MEM_ASSERT_RW_LOW(addr, size, false)
+# define UNIV_MEM_ASSERT_RW_ABORT(addr, size)				\
+	UNIV_MEM_ASSERT_RW_LOW(addr, size, true)
 # define UNIV_MEM_ASSERT_W(addr, size) do {				\
 	const void* _p = (const void*) (ulint)				\
 		VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size);		\
@@ -533,15 +652,22 @@ typedef void* os_thread_ret_t;
 			(const void*) (addr), (unsigned) (size), (long)	\
 			(((const char*) _p) - ((const char*) (addr))));	\
 	} while (0)
+# define UNIV_MEM_TRASH(addr, c, size) do {				\
+	ut_d(memset(addr, c, size));					\
+	UNIV_MEM_INVALID(addr, size);					\
+	} while (0)
 #else
 # define UNIV_MEM_VALID(addr, size) do {} while(0)
 # define UNIV_MEM_INVALID(addr, size) do {} while(0)
 # define UNIV_MEM_FREE(addr, size) do {} while(0)
 # define UNIV_MEM_ALLOC(addr, size) do {} while(0)
-# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_DESC(addr, size) do {} while(0)
 # define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {} while(0)
 # define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) do {} while(0)
 # define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+# define UNIV_MEM_TRASH(addr, c, size) do {} while(0)
 #endif
 #define UNIV_MEM_ASSERT_AND_FREE(addr, size) do {	\
 	UNIV_MEM_ASSERT_W(addr, size);			\
@@ -552,6 +678,7 @@ typedef void* os_thread_ret_t;
 	UNIV_MEM_ALLOC(addr, size);			\
 } while (0)
 
-extern ulint	srv_page_size_shift;
-extern ulint	srv_page_size;
+extern ulong	srv_page_size_shift;
+extern ulong	srv_page_size;
+
 #endif
diff --git a/storage/xtradb/include/usr0sess.h b/storage/xtradb/include/usr0sess.h
index bcc2f0d1d99..b5c80b97b43 100644
--- a/storage/xtradb/include/usr0sess.h
+++ b/storage/xtradb/include/usr0sess.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -51,8 +51,9 @@ sess_close(
 /*=======*/
 	sess_t*		sess);		/* in, own: session object */
 
-/* The session handle. All fields are protected by the kernel mutex */
-struct sess_struct{
+/* The session handle. This data structure is only used by purge and is
+not really necessary. We should get rid of it. */
+struct sess_t{
 	ulint		state;		/*!< state of the session */
 	trx_t*		trx;		/*!< transaction object permanently
 					assigned for the session: the
diff --git a/storage/xtradb/include/usr0sess.ic b/storage/xtradb/include/usr0sess.ic
index 1dcca8a3853..284e59537fe 100644
--- a/storage/xtradb/include/usr0sess.ic
+++ b/storage/xtradb/include/usr0sess.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/usr0types.h b/storage/xtradb/include/usr0types.h
index 6c224e6db17..6ba937cacc8 100644
--- a/storage/xtradb/include/usr0types.h
+++ b/storage/xtradb/include/usr0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,6 @@ Created 6/25/1996 Heikki Tuuri
 #ifndef usr0types_h
 #define usr0types_h
 
-typedef struct sess_struct	sess_t;
+struct sess_t;
 
 #endif
diff --git a/storage/xtradb/include/ut0bh.h b/storage/xtradb/include/ut0bh.h
index e89d76a51b3..84ea6dd915a 100644
--- a/storage/xtradb/include/ut0bh.h
+++ b/storage/xtradb/include/ut0bh.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,7 +31,7 @@ Created 2010-05-28 by Sunny Bains
 /** Comparison function for objects in the binary heap. */
 typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2);
 
-typedef struct ib_bh_struct ib_bh_t;
+struct ib_bh_t;
 
 /**********************************************************************//**
 Get the number of elements in the binary heap.
@@ -138,7 +138,7 @@ ib_bh_pop(
 	ib_bh_t*	ib_bh);			/*!< in/out: instance */
 
 /** Binary heap data structure */
-struct ib_bh_struct {
+struct ib_bh_t {
 	ulint		max_elems;		/*!< max elements allowed */
 	ulint		n_elems;		/*!< current size */
 	ulint		sizeof_elem;		/*!< sizeof element */
diff --git a/storage/xtradb/include/ut0bh.ic b/storage/xtradb/include/ut0bh.ic
index 4d04f9b6f49..a604237665d 100644
--- a/storage/xtradb/include/ut0bh.ic
+++ b/storage/xtradb/include/ut0bh.ic
@@ -1,5 +1,6 @@
 /***************************************************************************//**
-Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
+
+Copyright (c) 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -10,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -122,4 +123,3 @@ ib_bh_last(
 		: ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1));
 }
 
-
diff --git a/storage/xtradb/include/ut0byte.h b/storage/xtradb/include/ut0byte.h
index 0c23e999268..5bdd553ca80 100644
--- a/storage/xtradb/include/ut0byte.h
+++ b/storage/xtradb/include/ut0byte.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,7 +32,7 @@ Created 1/20/1994 Heikki Tuuri
 
 /*******************************************************//**
 Creates a 64-bit integer out of two 32-bit integers.
-@return	created dulint */
+@return	created integer */
 UNIV_INLINE
 ib_uint64_t
 ut_ull_create(
diff --git a/storage/xtradb/include/ut0byte.ic b/storage/xtradb/include/ut0byte.ic
index 2892c5429fb..873d98c727e 100644
--- a/storage/xtradb/include/ut0byte.ic
+++ b/storage/xtradb/include/ut0byte.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -25,7 +25,7 @@ Created 5/30/1994 Heikki Tuuri
 
 /*******************************************************//**
 Creates a 64-bit integer out of two 32-bit integers.
-@return	created dulint */
+@return	created integer */
 UNIV_INLINE
 ib_uint64_t
 ut_ull_create(
@@ -90,7 +90,7 @@ ut_align(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1)));
+	return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1)));
 }
 
 /*********************************************************//**
@@ -110,7 +110,7 @@ ut_align_down(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return((void*)((((ulint)ptr)) & ~(align_no - 1)));
+	return((void*)((((ulint) ptr)) & ~(align_no - 1)));
 }
 
 /*********************************************************//**
@@ -130,7 +130,7 @@ ut_align_offset(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return(((ulint)ptr) & (align_no - 1));
+	return(((ulint) ptr) & (align_no - 1));
 }
 
 /*****************************************************************//**
diff --git a/storage/xtradb/include/ut0counter.h b/storage/xtradb/include/ut0counter.h
new file mode 100644
index 00000000000..fe0f36dfff2
--- /dev/null
+++ b/storage/xtradb/include/ut0counter.h
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0counter.h
+
+Counter utility class
+
+Created 2012/04/12 by Sunny Bains
+*******************************************************/
+
+#ifndef UT0COUNTER_H
+#define UT0COUNTER_H
+
+#include "univ.i"
+#include <string.h>
+#include "os0thread.h"
+
+/** CPU cache line size */
+#define CACHE_LINE_SIZE		64
+
+/** Default number of slots to use in ib_counter_t */
+#define IB_N_SLOTS		64
+
+/** Get the offset into the counter array. */
+template <typename Type, int N>
+struct generic_indexer_t {
+	/** Default constructor/destructor should be OK. */
+
+        /** @return offset within m_counter */
+        size_t offset(size_t index) const UNIV_NOTHROW {
+                return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type)));
+        }
+};
+
+#ifdef HAVE_SCHED_GETCPU
+#include <utmpx.h>
+/** Use the cpu id to index into the counter array. If it fails then
+use the thread id. */
+template <typename Type, int N>
+struct get_sched_indexer_t : public generic_indexer_t<Type, N> {
+	/** Default constructor/destructor should be OK. */
+
+	/* @return result from sched_getcpu(), the thread id if it fails. */
+	size_t get_rnd_index() const UNIV_NOTHROW {
+
+		size_t	cpu = sched_getcpu();
+		if (cpu == -1) {
+			cpu = (lint) os_thread_get_curr_id();
+		}
+
+		return(cpu);
+	}
+};
+#endif /* HAVE_SCHED_GETCPU */
+
+/** Use the thread id to index into the counter array. */
+template <typename Type, int N>
+struct thread_id_indexer_t : public generic_indexer_t<Type, N> {
+	/** Default constructor/destructor should be OK. */
+
+	/* @return a random number, currently we use the thread id. Where
+	thread id is represented as a pointer, it may not work as
+	effectively. */
+	size_t get_rnd_index() const UNIV_NOTHROW {
+		return((lint) os_thread_get_curr_id());
+	}
+};
+
+/** For counters where N=1 */
+template <typename Type, int N=1>
+struct single_indexer_t {
+	/** Default constructor/destructor should be OK. */
+
+        /** @return offset within m_counter */
+        size_t offset(size_t index) const UNIV_NOTHROW {
+		ut_ad(N == 1);
+                return((CACHE_LINE_SIZE / sizeof(Type)));
+        }
+
+	/* @return 1 */
+	size_t get_rnd_index() const UNIV_NOTHROW {
+		ut_ad(N == 1);
+		return(1);
+	}
+};
+
+/** Class for using fuzzy counters. The counter is not protected by any
+mutex and the results are not guaranteed to be 100% accurate but close
+enough. Creates an array of counters and separates each element by the
+CACHE_LINE_SIZE bytes */
+template <
+	typename Type,
+	int N = IB_N_SLOTS,
+	template<typename, int> class Indexer = thread_id_indexer_t>
+class ib_counter_t {
+public:
+	ib_counter_t() { memset(m_counter, 0x0, sizeof(m_counter)); }
+
+	~ib_counter_t()
+	{
+		ut_ad(validate());
+	}
+
+	bool validate() UNIV_NOTHROW {
+#ifdef UNIV_DEBUG
+		size_t	n = (CACHE_LINE_SIZE / sizeof(Type));
+
+		/* Check that we aren't writing outside our defined bounds. */
+		for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) {
+			for (size_t j = 1; j < n - 1; ++j) {
+				ut_ad(m_counter[i + j] == 0);
+			}
+		}
+#endif /* UNIV_DEBUG */
+		return(true);
+	}
+
+	/** If you can't use a good index id. Increment by 1. */
+	void inc() UNIV_NOTHROW { add(1); }
+
+	/** If you can't use a good index id.
+	* @param n  - is the amount to increment */
+	void add(Type n) UNIV_NOTHROW {
+		size_t	i = m_policy.offset(m_policy.get_rnd_index());
+
+		ut_ad(i < UT_ARR_SIZE(m_counter));
+
+		m_counter[i] += n;
+	}
+
+	/** Use this if you can use a unique identifier, saves a
+	call to get_rnd_index().
+	@param index - index into a slot
+	@param n - amount to increment */
+	void add(size_t index, Type n) UNIV_NOTHROW {
+		size_t	i = m_policy.offset(index);
+
+		ut_ad(i < UT_ARR_SIZE(m_counter));
+
+		m_counter[i] += n;
+	}
+
+	/** If you can't use a good index id. Decrement by 1. */
+	void dec() UNIV_NOTHROW { sub(1); }
+
+	/** If you can't use a good index id.
+	* @param n  - is the amount to decrement */
+	void sub(Type n) UNIV_NOTHROW {
+		size_t	i = m_policy.offset(m_policy.get_rnd_index());
+
+		ut_ad(i < UT_ARR_SIZE(m_counter));
+
+		m_counter[i] -= n;
+	}
+
+	/** Use this if you can use a unique identifier, saves a
+	call to get_rnd_index().
+	@param index - index into a slot
+	@param n - amount to decrement */
+	void sub(size_t index, Type n) UNIV_NOTHROW {
+		size_t	i = m_policy.offset(index);
+
+		ut_ad(i < UT_ARR_SIZE(m_counter));
+
+		m_counter[i] -= n;
+	}
+
+	/* @return total value - not 100% accurate, since it is not atomic. */
+	operator Type() const UNIV_NOTHROW {
+		Type	total = 0;
+
+		for (size_t i = 0; i < N; ++i) {
+			total += m_counter[m_policy.offset(i)];
+		}
+
+		return(total);
+	}
+
+private:
+	/** Indexer into the array */
+	Indexer<Type, N>m_policy;
+
+        /** Slot 0 is unused. */
+	Type		m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))];
+};
+
+#endif /* UT0COUNTER_H */
diff --git a/storage/xtradb/include/ut0crc32.h b/storage/xtradb/include/ut0crc32.h
new file mode 100644
index 00000000000..86217692764
--- /dev/null
+++ b/storage/xtradb/include/ut0crc32.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0crc32.h
+CRC32 implementation
+
+Created Aug 10, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0crc32_h
+#define ut0crc32_h
+
+#include "univ.i"
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not do any
+allocations, would not hurt if called twice, but would be pointless. */
+UNIV_INTERN
+void
+ut_crc32_init();
+/*===========*/
+
+/********************************************************************//**
+Calculates CRC32.
+@param ptr	- data over which to calculate CRC32.
+@param len	- data length in bytes.
+@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41,
+or 0x1EDC6F41 without the high-order bit) */
+typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len);
+
+extern ib_ut_crc32_t	ut_crc32;
+
+extern bool	ut_crc32_sse2_enabled;
+
+#endif /* ut0crc32_h */
diff --git a/storage/xtradb/include/ut0dbg.h b/storage/xtradb/include/ut0dbg.h
index 5a854326b7b..6a4afe99597 100644
--- a/storage/xtradb/include/ut0dbg.h
+++ b/storage/xtradb/include/ut0dbg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,12 @@ Created 1/30/1994 Heikki Tuuri
 #ifndef ut0dbg_h
 #define ut0dbg_h
 
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a		assert
+#define ut_ad		assert
+#define ut_error	assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
 #include "univ.i"
 #include <stdlib.h>
 #include "os0thread.h"
@@ -97,10 +103,10 @@ ut_dbg_assertion_failed(
 #include <sys/resource.h>
 
 /** structure used for recording usage statistics */
-typedef struct speedo_struct {
+struct speedo_t {
 	struct rusage	ru;	/*!< getrusage() result */
 	struct timeval	tv;	/*!< gettimeofday() result */
-} speedo_t;
+};
 
 /*******************************************************************//**
 Resets a speedo (records the current time in it). */
@@ -121,4 +127,6 @@ speedo_show(
 
 #endif /* UNIV_COMPILE_TEST_FUNCS */
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h
index 4cfe4b9d8ce..29fc8669ce4 100644
--- a/storage/xtradb/include/ut0list.h
+++ b/storage/xtradb/include/ut0list.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -48,9 +48,8 @@ automatically freeing the list node when the item's heap is freed.
 
 #include "mem0mem.h"
 
-typedef struct ib_list_struct ib_list_t;
-typedef struct ib_list_node_struct ib_list_node_t;
-typedef struct ib_list_helper_struct ib_list_helper_t;
+struct ib_list_t;
+struct ib_list_node_t;
 
 /****************************************************************//**
 Create a new list using mem_alloc. Lists created with this function must be
@@ -142,8 +141,17 @@ ib_list_get_last(
 /*=============*/
 	ib_list_t*	list);	/*!< in: list */
 
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if empty else FALSE */
+	const ib_list_t*	list);	/* in: list */
+
 /* List. */
-struct ib_list_struct {
+struct ib_list_t {
 	ib_list_node_t*		first;		/*!< first node */
 	ib_list_node_t*		last;		/*!< last node */
 	ibool			is_heap_list;	/*!< TRUE if this list was
@@ -151,7 +159,7 @@ struct ib_list_struct {
 };
 
 /* A list node. */
-struct ib_list_node_struct {
+struct ib_list_node_t {
 	ib_list_node_t*		prev;		/*!< previous node */
 	ib_list_node_t*		next;		/*!< next node */
 	void*			data;		/*!< user data */
@@ -160,7 +168,7 @@ struct ib_list_node_struct {
 /* Quite often, the only additional piece of data you need is the per-item
 memory heap, so we have this generic struct available to use in those
 cases. */
-struct ib_list_helper_struct {
+struct ib_list_helper_t {
 	mem_heap_t*	heap;		/*!< memory heap */
 	void*		data;		/*!< user data */
 };
diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic
index c8810675ca0..d9dcb2eac99 100644
--- a/storage/xtradb/include/ut0list.ic
+++ b/storage/xtradb/include/ut0list.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -46,3 +46,15 @@ ib_list_get_last(
 {
 	return(list->last);
 }
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if empty else FALSE */
+	const ib_list_t*	list)	/* in: list */
+{
+	return(!(list->first || list->last));
+}
diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h
index 9bb4bc7723f..b53e7ade4c1 100644
--- a/storage/xtradb/include/ut0lst.h
+++ b/storage/xtradb/include/ut0lst.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,10 +28,17 @@ Created 9/10/1995 Heikki Tuuri
 
 #include "univ.i"
 
+/*******************************************************************//**
+Return offset of F in POD T.
+@param T	- POD pointer
+@param F	- Field in T */
+#define IB_OFFSETOF(T, F)						\
+	(reinterpret_cast<byte*>(&(T)->F) - reinterpret_cast<byte*>(T))
+
 /* This module implements the two-way linear list which should be used
 if a list is used in the database. Note that a single struct may belong
 to two or more lists, provided that the list are given different names.
-An example of the usage of the lists can be found in fil0fil.c. */
+An example of the usage of the lists can be found in fil0fil.cc. */
 
 /*******************************************************************//**
 This macro expands to the unnamed type definition of a struct which acts
@@ -39,12 +46,16 @@ as the two-way list base node. The base node contains pointers
 to both ends of the list and a count of nodes in the list (excluding
 the base node from the count).
 @param TYPE	the name of the list node data type */
-#define UT_LIST_BASE_NODE_T(TYPE)\
-struct {\
-	ulint	count;	/*!< count of nodes in list */\
-	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
-	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
-}\
+template <typename TYPE>
+struct ut_list_base {
+	typedef TYPE elem_type;
+
+	ulint	count;	/*!< count of nodes in list */
+	TYPE*	start;	/*!< pointer to list start, NULL if empty */
+	TYPE*	end;	/*!< pointer to list end, NULL if empty */
+};
+
+#define UT_LIST_BASE_NODE_T(TYPE)	ut_list_base<TYPE>
 
 /*******************************************************************//**
 This macro expands to the unnamed type definition of a struct which
@@ -54,20 +65,36 @@ The name of the field in the node struct should be the name given
 to the list.
 @param TYPE	the list node type name */
 /* Example:
-typedef struct LRU_node_struct	LRU_node_t;
-struct LRU_node_struct {
+struct LRU_node_t {
 	UT_LIST_NODE_T(LRU_node_t)	LRU_list;
 	...
 }
 The example implements an LRU list of name LRU_list. Its nodes are of type
 LRU_node_t. */
 
-#define UT_LIST_NODE_T(TYPE)\
-struct {\
-	TYPE *	prev;	/*!< pointer to the previous node,\
-			NULL if start of list */\
-	TYPE *	next;	/*!< pointer to next node, NULL if end of list */\
-}\
+template <typename TYPE>
+struct ut_list_node {
+	TYPE* 	prev;	/*!< pointer to the previous node,
+			NULL if start of list */
+	TYPE* 	next;	/*!< pointer to next node, NULL if end of list */
+};
+
+#define UT_LIST_NODE_T(TYPE)	ut_list_node<TYPE>
+
+/*******************************************************************//**
+Get the list node at offset.
+@param elem	- list element
+@param offset	- offset within element.
+@return reference to list node. */
+template <typename Type>
+ut_list_node<Type>&
+ut_elem_get_node(Type&	elem, size_t offset)
+{
+	ut_a(offset < sizeof(elem));
+
+	return(*reinterpret_cast<ut_list_node<Type>*>(
+		reinterpret_cast<byte*>(&elem) + offset));
+}
 
 /*******************************************************************//**
 Initializes the base node of a two-way list.
@@ -82,108 +109,197 @@ Initializes the base node of a two-way list.
 
 /*******************************************************************//**
 Adds the node as the first element in a two-way linked list.
+@param list	the base node (not a pointer to it)
+@param elem	the element to add
+@param offset	offset of list node in elem. */
+template <typename List, typename Type>
+void
+ut_list_prepend(
+	List&		list,
+	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+ 	elem_node.prev = 0;
+ 	elem_node.next = list.start;
+
+	if (list.start != 0) {
+		ut_list_node<Type>&	base_node =
+			ut_elem_get_node(*list.start, offset);
+
+		ut_ad(list.start != &elem);
+
+		base_node.prev = &elem;
+	}
+
+	list.start = &elem;
+
+	if (list.end == 0) {
+		list.end = &elem;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be added to the list.
-*/
-#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
-{\
-	ut_ad(N);\
-	((BASE).count)++;\
-	((N)->NAME).next = (BASE).start;\
-	((N)->NAME).prev = NULL;\
-	if (UNIV_LIKELY((BASE).start != NULL)) {\
-		ut_ad((BASE).start != (N));\
-		(((BASE).start)->NAME).prev = (N);\
-	}\
-	(BASE).start = (N);\
-	if (UNIV_UNLIKELY((BASE).end == NULL)) {\
-		(BASE).end = (N);\
-	}\
-}\
+@param LIST	the base node (not a pointer to it)
+@param ELEM	the element to add */
+#define UT_LIST_ADD_FIRST(NAME, LIST, ELEM)	\
+	ut_list_prepend(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list	list
+@param elem	the element to add
+@param offset	offset of list node in elem */
+template <typename List, typename Type>
+void
+ut_list_append(
+	List&		list,
+	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+	elem_node.next = 0;
+	elem_node.prev = list.end;
+
+	if (list.end != 0) {
+		ut_list_node<Type>&	base_node =
+			ut_elem_get_node(*list.end, offset);
+
+		ut_ad(list.end != &elem);
+
+		base_node.next = &elem;
+	}
+
+	list.end = &elem;
+
+	if (list.start == 0) {
+		list.start = &elem;
+	}
+
+	++list.count;
+}
 
 /*******************************************************************//**
 Adds the node as the last element in a two-way linked list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be added to the list
-*/
-#define UT_LIST_ADD_LAST(NAME, BASE, N)\
-{\
-	ut_ad(N != NULL);\
-	((BASE).count)++;\
-	((N)->NAME).prev = (BASE).end;\
-	((N)->NAME).next = NULL;\
-	if ((BASE).end != NULL) {\
-		ut_ad((BASE).end != (N));\
-		(((BASE).end)->NAME).next = (N);\
-	}\
-	(BASE).end = (N);\
-	if ((BASE).start == NULL) {\
-		(BASE).start = (N);\
-	}\
-}\
+@param LIST	list
+@param ELEM	the element to add */
+#define UT_LIST_ADD_LAST(NAME, LIST, ELEM)\
+	ut_list_append(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
 
 /*******************************************************************//**
-Inserts a NODE2 after NODE1 in a list.
+Inserts a ELEM2 after ELEM1 in a list.
+@param list	the base node
+@param elem1	node after which ELEM2 is inserted
+@param elem2	node being inserted after elem1
+@param offset	offset of list node in elem1 and elem2 */
+template <typename List, typename Type>
+void
+ut_list_insert(
+	List&		list,
+	Type&		elem1,
+	Type&		elem2,
+	size_t		offset)
+{
+	ut_ad(&elem1 != &elem2);
+
+	ut_list_node<Type>&	elem1_node = ut_elem_get_node(elem1, offset);
+	ut_list_node<Type>&	elem2_node = ut_elem_get_node(elem2, offset);
+
+	elem2_node.prev = &elem1;
+	elem2_node.next = elem1_node.next;
+
+	if (elem1_node.next != NULL) {
+		ut_list_node<Type>&	next_node =
+			ut_elem_get_node(*elem1_node.next, offset);
+
+		next_node.prev = &elem2;
+	}
+
+	elem1_node.next = &elem2;
+
+	if (list.end == &elem1) {
+		list.end = &elem2;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Inserts a ELEM2 after ELEM1 in a list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param NODE1	pointer to node after which NODE2 is inserted
-@param NODE2	pointer to node being inserted after NODE1
-*/
-#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
-{\
-	ut_ad(NODE1);\
-	ut_ad(NODE2);\
-	ut_ad((NODE1) != (NODE2));\
-	((BASE).count)++;\
-	((NODE2)->NAME).prev = (NODE1);\
-	((NODE2)->NAME).next = ((NODE1)->NAME).next;\
-	if (((NODE1)->NAME).next != NULL) {\
-		((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
-	}\
-	((NODE1)->NAME).next = (NODE2);\
-	if ((BASE).end == (NODE1)) {\
-		(BASE).end = (NODE2);\
-	}\
-}\
+@param LIST	the base node
+@param ELEM1	node after which ELEM2 is inserted
+@param ELEM2	node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(NAME, LIST, ELEM1, ELEM2)\
+	ut_list_insert(LIST, *ELEM1, *ELEM2, IB_OFFSETOF(ELEM1, NAME))
 
 #ifdef UNIV_LIST_DEBUG
 /** Invalidate the pointers in a list node.
 @param NAME	list name
 @param N	pointer to the node that was removed */
-# define UT_LIST_REMOVE_CLEAR(NAME, N)		\
-((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+# define UT_LIST_REMOVE_CLEAR(N)					\
+	(N).next = (Type*) -1;						\
+	(N).prev = (N).next
 #else
 /** Invalidate the pointers in a list node.
 @param NAME	list name
 @param N	pointer to the node that was removed */
-# define UT_LIST_REMOVE_CLEAR(NAME, N) do {} while (0)
-#endif
+# define UT_LIST_REMOVE_CLEAR(N)
+#endif /* UNIV_LIST_DEBUG */
 
 /*******************************************************************//**
 Removes a node from a two-way linked list.
-@param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be removed from the list
-*/
-#define UT_LIST_REMOVE(NAME, BASE, N)					\
-do {									\
-	ut_ad(N);							\
-	ut_a((BASE).count > 0);						\
-	((BASE).count)--;						\
-	if (((N)->NAME).next != NULL) {					\
-		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;	\
-	} else {							\
-		(BASE).end = ((N)->NAME).prev;				\
-	}								\
-	if (((N)->NAME).prev != NULL) {					\
-		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;	\
-	} else {							\
-		(BASE).start = ((N)->NAME).next;			\
-	}								\
-	UT_LIST_REMOVE_CLEAR(NAME, N);					\
-} while (0)
+@param list	the base node (not a pointer to it)
+@param elem	node to be removed from the list
+@param offset	offset of list node within elem */
+template <typename List, typename Type>
+void
+ut_list_remove(
+	List&		list,
+ 	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+	ut_a(list.count > 0);
+
+	if (elem_node.next != NULL) {
+		ut_list_node<Type>&	next_node =
+			ut_elem_get_node(*elem_node.next, offset);
+
+		next_node.prev = elem_node.prev;
+	} else {
+		list.end = elem_node.prev;
+	}
+
+	if (elem_node.prev != NULL) {
+		ut_list_node<Type>&	prev_node =
+			ut_elem_get_node(*elem_node.prev, offset);
+
+		prev_node.next = elem_node.next;
+	} else {
+		list.start = elem_node.next;
+	}
+
+	UT_LIST_REMOVE_CLEAR(elem_node);
+
+	--list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param NAME	list name
+@param LIST	the base node (not a pointer to it)
+@param ELEM	node to be removed from the list */
+#define UT_LIST_REMOVE(NAME, LIST, ELEM)				\
+	ut_list_remove(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
 
 /********************************************************************//**
 Gets the next node in a two-way list.
@@ -223,39 +339,70 @@ Gets the last node in a two-way list.
 #define UT_LIST_GET_LAST(BASE)\
 	(BASE).end
 
+struct	NullValidate { void operator()(const void* elem) { } };
+
+/********************************************************************//**
+Iterate over all the elements and call the functor for each element.
+@param list	base node (not a pointer to it)
+@param functor	Functor that is called for each element in the list
+@param node	pointer to member node within list element */
+template <typename List, class Functor>
+void
+ut_list_map(
+	List&		list,
+	ut_list_node<typename List::elem_type>
+			List::elem_type::*node,
+	Functor		functor)
+{
+	ulint		count = 0;
+
+	for (typename List::elem_type* elem = list.start;
+	     elem != 0;
+	     elem = (elem->*node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param list	base node (not a pointer to it)
+@param functor	Functor that is called for each element in the list
+@param node	pointer to member node within list element */
+template <typename List, class Functor>
+void
+ut_list_validate(
+	List&		list,
+	ut_list_node<typename List::elem_type>
+			List::elem_type::*node,
+	Functor		functor = NullValidate())
+{
+	ut_list_map(list, node, functor);
+
+	ulint		count = 0;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*node).prev, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
 /********************************************************************//**
 Checks the consistency of a two-way list.
 @param NAME		the name of the list
 @param TYPE		node type
-@param BASE		base node (not a pointer to it)
-@param ASSERTION	a condition on ut_list_node_313 */
-#define UT_LIST_VALIDATE(NAME, TYPE, BASE, ASSERTION)			\
-do {									\
-	ulint	ut_list_i_313;						\
-	TYPE*	ut_list_node_313;					\
-									\
-	ut_list_node_313 = (BASE).start;				\
-									\
-	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
-		ut_a(ut_list_node_313);					\
-		ASSERTION;						\
-		ut_ad((ut_list_node_313->NAME).next || !ut_list_i_313);	\
-		ut_list_node_313 = (ut_list_node_313->NAME).next;	\
-	}								\
-									\
-	ut_a(ut_list_node_313 == NULL);					\
-									\
-	ut_list_node_313 = (BASE).end;					\
-									\
-	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
-		ut_a(ut_list_node_313);					\
-		ASSERTION;						\
-		ut_ad((ut_list_node_313->NAME).prev || !ut_list_i_313);	\
-		ut_list_node_313 = (ut_list_node_313->NAME).prev;	\
-	}								\
-									\
-	ut_a(ut_list_node_313 == NULL);					\
-} while (0)
-
-#endif
+@param LIST		base node (not a pointer to it)
+@param FUNCTOR		called for each list element */
+#define UT_LIST_VALIDATE(NAME, TYPE, LIST, FUNCTOR)			\
+	ut_list_validate(LIST, &TYPE::NAME, FUNCTOR)
+
+#define UT_LIST_CHECK(NAME, TYPE, LIST)					\
+	ut_list_validate(LIST, &TYPE::NAME, NullValidate())
 
+#endif /* ut0lst.h */
diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h
index 16c31c2c36c..af7eb4e9b1d 100644
--- a/storage/xtradb/include/ut0mem.h
+++ b/storage/xtradb/include/ut0mem.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -101,7 +101,7 @@ ut_free(
 	void* ptr);  /*!< in, own: memory block, can be NULL */
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
-Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not
 use this function because the allocation functions in mem0mem.h are the
 recommended ones in InnoDB.
 
@@ -211,6 +211,18 @@ ut_strreplace(
 	const char*	s1,	/*!< in: string to replace */
 	const char*	s2);	/*!< in: string to replace s1 with */
 
+/********************************************************************
+Concatenate 3 strings.*/
+
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with mem_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
 /**********************************************************************//**
 Converts a raw binary data to a NUL-terminated hex string. The output is
 truncated if there is not enough space in "hex", make sure "hex_size" is at
diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic
index de701bd50e3..5c9071d52cc 100644
--- a/storage/xtradb/include/ut0mem.ic
+++ b/storage/xtradb/include/ut0mem.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -280,7 +280,7 @@ ut_str_sql_format(
 		switch (ch) {
 		case '\0':
 
-			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+			if (buf_size - buf_i < 4) {
 
 				goto func_exit;
 			}
@@ -292,7 +292,7 @@ ut_str_sql_format(
 		case '\'':
 		case '\\':
 
-			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+			if (buf_size - buf_i < 4) {
 
 				goto func_exit;
 			}
diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h
index 0540e1ee386..5c25104b5d7 100644
--- a/storage/xtradb/include/ut0rbt.h
+++ b/storage/xtradb/include/ut0rbt.h
@@ -1,12 +1,6 @@
 /***************************************************************************//**
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 /******************************************************************//**
@@ -50,24 +44,19 @@ Created 2007-03-20 Sunny Bains
 #define	FALSE		0
 #endif
 
-/* Red black tree typedefs */
-typedef struct ib_rbt_struct ib_rbt_t;
-typedef struct ib_rbt_node_struct ib_rbt_node_t;
-/* FIXME: Iterator is a better name than _bound_ */
-typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
+struct ib_rbt_node_t;
 typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
 typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
 
 /** Red black tree color types */
-enum ib_rbt_color_enum {
+enum ib_rbt_color_t {
 	IB_RBT_RED,
 	IB_RBT_BLACK
 };
 
-typedef enum ib_rbt_color_enum ib_rbt_color_t;
-
 /** Red black tree node */
-struct ib_rbt_node_struct {
+struct ib_rbt_node_t {
 	ib_rbt_color_t	color;			/* color of this node */
 
 	ib_rbt_node_t*	left;			/* points left child */
@@ -78,7 +67,7 @@ struct ib_rbt_node_struct {
 };
 
 /** Red black tree instance.*/
-struct	ib_rbt_struct {
+struct	ib_rbt_t {
 	ib_rbt_node_t*	nil;			/* Black colored node that is
 						used as a sentinel. This is
 						pre-allocated too.*/
@@ -90,12 +79,16 @@ struct	ib_rbt_struct {
 	ulint		n_nodes;		/* Total number of data nodes */
 
 	ib_rbt_compare	compare;		/* Fn. to use for comparison */
+	ib_rbt_arg_compare
+			compare_with_arg;	/* Fn. to use for comparison
+						with argument */
 	ulint		sizeof_value;		/* Sizeof the item in bytes */
+	void*		cmp_arg;		/* Compare func argument */
 };
 
 /** The result of searching for a key in the tree, this is useful for
 a speedy lookup and insert if key doesn't exist.*/
-struct ib_rbt_bound_struct {
+struct ib_rbt_bound_t {
 	const ib_rbt_node_t*
 			last;			/* Last node visited */
 
@@ -137,6 +130,18 @@ rbt_create(
 	size_t		sizeof_value,		/*!< in: size in bytes */
 	ib_rbt_compare	compare);		/*!< in: comparator */
 /**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return	rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+	size_t		sizeof_value,		/*!< in: size in bytes */
+	ib_rbt_arg_compare
+			compare,		/*!< in: comparator */
+	void*		cmp_arg);		/*!< in: compare fn arg */
+/**********************************************************************//**
 Delete a node from the red black tree, identified by key */
 UNIV_INTERN
 ibool
@@ -280,7 +285,10 @@ rbt_search_cmp(
 	const ib_rbt_t*	tree,			/*!< in: rb tree */
 	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
 	const void*	key,			/*!< in: key to search */
-	ib_rbt_compare	compare);		/*!< in: comparator */
+	ib_rbt_compare	compare,		/*!< in: comparator */
+	ib_rbt_arg_compare
+			arg_compare);		/*!< in: fn to compare items
+						with argument */
 /**********************************************************************//**
 Clear the tree, deletes (and free's) all the nodes. */
 UNIV_INTERN
diff --git a/storage/xtradb/include/ut0rnd.h b/storage/xtradb/include/ut0rnd.h
index bed2c668c60..53b769849a5 100644
--- a/storage/xtradb/include/ut0rnd.h
+++ b/storage/xtradb/include/ut0rnd.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,6 +28,8 @@ Created 1/20/1994 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "ut0byte.h"
 
 /** The 'character code' for end of field or string (used
@@ -87,16 +89,6 @@ ut_hash_ulint(
 	ulint	 key,		/*!< in: value to be hashed */
 	ulint	 table_size);	/*!< in: hash table size */
 /*************************************************************//**
-Folds a pair of ulints.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_ulint_pair(
-/*===============*/
-	ulint	n1,	/*!< in: ulint */
-	ulint	n2)	/*!< in: ulint */
-	__attribute__((const));
-/*************************************************************//**
 Folds a 64-bit integer.
 @return	folded value */
 UNIV_INLINE
@@ -114,23 +106,6 @@ ut_fold_string(
 /*===========*/
 	const char*	str)	/*!< in: null-terminated string */
 	__attribute__((pure));
-/*************************************************************//**
-Folds a binary string.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_binary(
-/*===========*/
-	const byte*	str,	/*!< in: string of bytes */
-	ulint		len)	/*!< in: length */
-	__attribute__((pure));
-UNIV_INLINE
-ulint
-ut_fold_binary_32(
-/*==============*/
-	const byte*	str,	/*!< in: string of bytes */
-	ulint		len)	/*!< in: length */
-	__attribute__((pure));
 /***********************************************************//**
 Looks for a prime number slightly greater than the given argument.
 The prime is chosen so that it is not near any power of 2.
@@ -142,6 +117,29 @@ ut_find_prime(
 	ulint	n)	/*!< in: positive number > 100 */
 	__attribute__((const));
 
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+	__attribute__((const));
+/*************************************************************//**
+Folds a binary string.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+	__attribute__((pure));
+
 
 #ifndef UNIV_NONINL
 #include "ut0rnd.ic"
diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic
index 30bd32726fa..024c59e553b 100644
--- a/storage/xtradb/include/ut0rnd.ic
+++ b/storage/xtradb/include/ut0rnd.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -25,6 +25,9 @@ Created 5/30/1994 Heikki Tuuri
 
 #define UT_HASH_RANDOM_MASK	1463735687
 #define UT_HASH_RANDOM_MASK2	1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
 #define UT_RND1			151117737
 #define UT_RND2			119785373
 #define UT_RND3			 85689495
@@ -156,20 +159,6 @@ ut_hash_ulint(
 }
 
 /*************************************************************//**
-Folds a pair of ulints.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_ulint_pair(
-/*===============*/
-	ulint	n1,	/*!< in: ulint */
-	ulint	n2)	/*!< in: ulint */
-{
-	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
-		^ UT_HASH_RANDOM_MASK) + n2);
-}
-
-/*************************************************************//**
 Folds a 64-bit integer.
 @return	folded value */
 UNIV_INLINE
@@ -203,6 +192,22 @@ ut_fold_string(
 	return(fold);
 }
 
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+{
+	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+		^ UT_HASH_RANDOM_MASK) + n2);
+}
+
 /*************************************************************//**
 Folds a binary string.
 @return	folded value */
@@ -213,40 +218,37 @@ ut_fold_binary(
 	const byte*	str,	/*!< in: string of bytes */
 	ulint		len)	/*!< in: length */
 {
-	const byte*	str_end	= str + len;
 	ulint		fold = 0;
+	const byte*	str_end	= str + (len & 0xFFFFFFF8);
 
 	ut_ad(str || !len);
 
 	while (str < str_end) {
-		fold = ut_fold_ulint_pair(fold, (ulint)(*str));
-
-		str++;
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
 	}
 
-	return(fold);
-}
-
-UNIV_INLINE
-ulint
-ut_fold_binary_32(
-/*==============*/
-	const byte*	str,	/*!< in: string of bytes */
-	ulint		len)	/*!< in: length */
-{
-	const ib_uint32_t*	str_end = (const ib_uint32_t*) (str + len);
-	const ib_uint32_t*	str_32 = (const ib_uint32_t*) str;
-	ulint			fold = 0;
-
-	ut_ad(str);
-	/* This function is only for word-aligned data */
-	ut_ad(len % 4 == 0);
-	ut_ad((ulint)str % 4 == 0);
-
-	while (str_32 < str_end) {
-		fold = ut_fold_ulint_pair(fold, (ulint)(*str_32));
-
-		str_32++;
+	switch (len & 0x7) {
+	case 7:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 6:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 5:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 4:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 3:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 2:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 1:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
 	}
 
 	return(fold);
diff --git a/storage/xtradb/include/ut0sort.h b/storage/xtradb/include/ut0sort.h
index 8cc73e65b2a..75648b5c317 100644
--- a/storage/xtradb/include/ut0sort.h
+++ b/storage/xtradb/include/ut0sort.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h
index 37f1c6064b6..163dc23b363 100644
--- a/storage/xtradb/include/ut0ut.h
+++ b/storage/xtradb/include/ut0ut.h
@@ -1,13 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Sun Microsystems, Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -18,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +28,8 @@ Created 1/20/1994 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "db0err.h"
 
 #ifndef UNIV_HOTBACKUP
@@ -46,6 +41,8 @@ Created 1/20/1994 Heikki Tuuri
 #include <ctype.h>
 #endif
 
+#include <stdarg.h> /* for va_list */
+
 /** Index name prefix in fast index creation */
 #define	TEMP_INDEX_PREFIX	'\377'
 /** Index name prefix in fast index creation, as a string constant */
@@ -55,27 +52,32 @@ Created 1/20/1994 Heikki Tuuri
 typedef time_t	ib_time_t;
 
 #ifndef UNIV_HOTBACKUP
-#if defined(HAVE_PAUSE_INSTRUCTION)
+# if defined(HAVE_PAUSE_INSTRUCTION)
    /* According to the gcc info page, asm volatile means that the
    instruction has important side-effects and must not be removed.
    Also asm volatile may trigger a memory barrier (spilling all registers
    to memory). */
-#  define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
-#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
+#  ifdef __SUNPRO_CC
+#   define UT_RELAX_CPU() asm ("pause" )
+#  else
+#   define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+#  endif /* __SUNPRO_CC */
+
+# elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
 #  define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
-#elif defined(HAVE_WINDOWS_ATOMICS)
-   /* In the Win32 API, the x86 PAUSE instruction is executed by calling
-   the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
-   independent way by using YieldProcessor. */
-#  define UT_RELAX_CPU() YieldProcessor()
-#elif defined(HAVE_ATOMIC_BUILTINS)
+# elif defined(HAVE_ATOMIC_BUILTINS)
 #  define UT_RELAX_CPU() do { \
      volatile lint	volatile_var; \
      os_compare_and_swap_lint(&volatile_var, 0, 1); \
    } while (0)
-#else
+# elif defined(HAVE_WINDOWS_ATOMICS)
+   /* In the Win32 API, the x86 PAUSE instruction is executed by calling
+   the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+   independent way by using YieldProcessor. */
+#  define UT_RELAX_CPU() YieldProcessor()
+# else
 #  define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
-#endif
+# endif
 
 /*********************************************************************//**
 Delays execution for at most max_wait_us microseconds or returns earlier
@@ -94,16 +96,9 @@ do {								\
 } while (0)
 #endif /* !UNIV_HOTBACKUP */
 
-/********************************************************//**
-Gets the high 32 bits in a ulint. That is makes a shift >> 32,
-but since there seem to be compiler bugs in both gcc and Visual C++,
-we do this by a special conversion.
-@return	a >> 32 */
-UNIV_INTERN
-ulint
-ut_get_high32(
-/*==========*/
-	ulint	a);	/*!< in: ulint */
+template <class T> T ut_min(T a, T b) { return(a < b ? a : b); }
+template <class T> T ut_max(T a, T b) { return(a > b ? a : b); }
+
 /******************************************************//**
 Calculates the minimum of two ulints.
 @return	minimum */
@@ -122,15 +117,6 @@ ut_max(
 /*===*/
 	ulint	 n1,	/*!< in: first number */
 	ulint	 n2);	/*!< in: second number */
-/******************************************************//**
-Calculates the maximum of two ib_uint64_t values.
-@return	the maximum */
-UNIV_INLINE
-ib_uint64_t
-ut_max_uint64(
-/*==========*/
-	ib_uint64_t	n1,	/*!< in: first number */
-	ib_uint64_t	n2);	/*!< in: second number */
 /****************************************************************//**
 Calculates minimum of two ulint-pairs. */
 UNIV_INLINE
@@ -270,6 +256,16 @@ ut_time_ms(void);
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************//**
+Returns the number of milliseconds since some epoch.  The
+value may wrap around.  It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+
+/**********************************************************//**
 Returns the difference of two times in seconds.
 @return	time2 - time1 expressed in seconds */
 UNIV_INTERN
@@ -278,6 +274,9 @@ ut_difftime(
 /*========*/
 	ib_time_t	time2,	/*!< in: time */
 	ib_time_t	time1);	/*!< in: time */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
 /**********************************************************//**
 Prints a timestamp to a file. */
 UNIV_INTERN
@@ -286,6 +285,9 @@ ut_print_timestamp(
 /*===============*/
 	FILE*	file)	/*!< in: file where to print */
 	UNIV_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
 /**********************************************************//**
 Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
 UNIV_INTERN
@@ -343,7 +345,7 @@ ut_print_filename(
 
 #ifndef UNIV_HOTBACKUP
 /* Forward declaration of transaction handle */
-struct trx_struct;
+struct trx_t;
 
 /**********************************************************************//**
 Outputs a fixed-length string, quoted as an SQL identifier.
@@ -355,7 +357,7 @@ void
 ut_print_name(
 /*==========*/
 	FILE*		f,	/*!< in: output stream */
-	struct trx_struct*trx,	/*!< in: transaction */
+	const trx_t*	trx,	/*!< in: transaction */
 	ibool		table_id,/*!< in: TRUE=print a table name,
 				FALSE=print other identifier */
 	const char*	name);	/*!< in: name to print */
@@ -370,13 +372,31 @@ void
 ut_print_namel(
 /*===========*/
 	FILE*		f,	/*!< in: output stream */
-	struct trx_struct*trx,	/*!< in: transaction (NULL=no quotes) */
+	const trx_t*	trx,	/*!< in: transaction (NULL=no quotes) */
 	ibool		table_id,/*!< in: TRUE=print a table name,
 				FALSE=print other identifier */
 	const char*	name,	/*!< in: name to print */
 	ulint		namelen);/*!< in: length of name */
 
 /**********************************************************************//**
+Formats a table or index name, quoted as an SQL identifier. If the name
+contains a slash '/', the result will contain two identifiers separated by
+a period (.), as in SQL database_name.identifier.
+@return pointer to 'formatted' */
+UNIV_INTERN
+char*
+ut_format_name(
+/*===========*/
+	const char*	name,		/*!< in: table or index name, must be
+					'\0'-terminated */
+	ibool		is_table,	/*!< in: if TRUE then 'name' is a table
+					name */
+	char*		formatted,	/*!< out: formatted result, will be
+					'\0'-terminated */
+	ulint		formatted_size);/*!< in: no more than this number of
+					bytes will be written to 'formatted' */
+
+/**********************************************************************//**
 Catenate files. */
 UNIV_INTERN
 void
@@ -388,6 +408,22 @@ ut_copy_file(
 
 #ifdef __WIN__
 /**********************************************************************//**
+A substitute for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	va_list		ap);	/*!< in: format values */
+
+/**********************************************************************//**
 A substitute for snprintf(3), formatted output conversion into
 a limited buffer.
 @return number of characters that would have been printed if the size
@@ -402,6 +438,15 @@ ut_snprintf(
 	...);			/*!< in: format values */
 #else
 /**********************************************************************//**
+A wrapper for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this macro discards the return value, so it DOES
+NOT yield the number of characters that would have been printed if the
+buffer was unlimited. This matches the __WIN__ ut_vsnprintf(), where VC's
+_vsnprintf() returns -1 in this case and estimating it via _vscprintf()
+would need another copy of "ap", but VC does not provide va_copy(). */
+# define ut_vsnprintf(buf, size, fmt, ap)	\
+	((void) vsnprintf(buf, size, fmt, ap))
+/**********************************************************************//**
 A wrapper for snprintf(3), formatted output conversion into
 a limited buffer. */
 # define ut_snprintf	snprintf
@@ -415,11 +460,25 @@ UNIV_INTERN
 const char*
 ut_strerr(
 /*======*/
-	enum db_err	num);	/*!< in: error number */
+	dberr_t	num);	/*!< in: error number */
+
+/****************************************************************
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+	ulint*	arr,		/*!< in/out: array to sort */
+	ulint*	aux_arr,	/*!< in/out: aux array to use in sort */
+	ulint	low,		/*!< in: lower bound */
+	ulint	high)		/*!< in: upper bound */
+	__attribute__((nonnull));
 
 #ifndef UNIV_NONINL
 #include "ut0ut.ic"
 #endif
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
 
diff --git a/storage/xtradb/include/ut0ut.ic b/storage/xtradb/include/ut0ut.ic
index 019b3d216cf..4e0f76e1957 100644
--- a/storage/xtradb/include/ut0ut.ic
+++ b/storage/xtradb/include/ut0ut.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -49,19 +49,6 @@ ut_max(
 	return((n1 <= n2) ? n2 : n1);
 }
 
-/******************************************************//**
-Calculates the maximum of two ib_uint64_t values.
-@return	the maximum */
-UNIV_INLINE
-ib_uint64_t
-ut_max_uint64(
-/*==========*/
-	ib_uint64_t	n1,	/*!< in: first number */
-	ib_uint64_t	n2)	/*!< in: second number */
-{
-	return((n1 <= n2) ? n2 : n1);
-}
-
 /****************************************************************//**
 Calculates minimum of two ulint-pairs. */
 UNIV_INLINE
diff --git a/storage/xtradb/include/ut0vec.h b/storage/xtradb/include/ut0vec.h
index 316ae87c2cb..432fb348a09 100644
--- a/storage/xtradb/include/ut0vec.h
+++ b/storage/xtradb/include/ut0vec.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,59 +29,116 @@ Created 4/6/2006 Osku Salerma
 #include "univ.i"
 #include "mem0mem.h"
 
-/** An automatically resizing vector data type. */
-typedef struct ib_vector_struct ib_vector_t;
+struct ib_alloc_t;
+struct ib_vector_t;
 
-/* An automatically resizing vector datatype with the following properties:
+typedef void* (*ib_mem_alloc_t)(
+					/* out: Pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	ulint		size);		/* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	void*		ptr);		/* in: Memory to free */
 
- -Contains void* items.
+typedef void* (*ib_mem_resize_t)(
+					/* out: Pointer to resized memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator */
+	void*		ptr,		/* in: Memory to resize */
+	ulint		old_size,	/* in: Old memory size in bytes */
+	ulint		new_size);	/* in: New size in bytes */
 
- -The items are owned by the caller.
+typedef int (*ib_compare_t)(const void*, const void*);
 
- -All memory allocation is done through a heap owned by the caller, who is
- responsible for freeing it when done with the vector.
+/* An automatically resizing vector datatype with the following properties:
 
- -When the vector is resized, the old memory area is left allocated since it
- uses the same heap as the new memory area, so this is best used for
- relatively small or short-lived uses.
+ -All memory allocation is done through an allocator, which is responsible for
+ freeing it when done with the vector.
 */
 
-/****************************************************************//**
-Create a new vector with the given initial size.
-@return	vector */
+/* This is useful shorthand for elements of type void* */
+#define	ib_vector_getp(v, n)	(*(void**) ib_vector_get(v, n))
+#define	ib_vector_getp_const(v, n)	(*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v)	((v)->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
 UNIV_INTERN
 ib_vector_t*
 ib_vector_create(
 /*=============*/
-	mem_heap_t*	heap,	/*!< in: heap */
-	ulint		size);	/*!< in: initial size */
+					/* out: vector */
+	ib_alloc_t*	alloc,		/* in: Allocator */
+					/* in: size of the data item */
+	ulint		sizeof_value,
+	ulint		size);		/* in: initial size */
 
-/****************************************************************//**
-Push a new element to the vector, increasing its size if necessary. */
-UNIV_INTERN
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
 void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Push a new element to the vector, increasing its size if necessary,
+if elem is not NULL then elem is copied to the vector.*/
+UNIV_INLINE
+void*
 ib_vector_push(
 /*===========*/
+					/* out: pointer to the "new" element */
+	ib_vector_t*	vec,		/* in/out: vector */
+	const void*	elem);		/* in: data element */
+
+/********************************************************************
+Pop the last element from the vector.*/
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+					/* out: pointer to the popped element */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element, or NULL if not found */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
 	ib_vector_t*	vec,	/*!< in: vector */
-	void*		elem);	/*!< in: data element */
+	const void*	elem);	/*!< in: value to remove */
 
-/****************************************************************//**
-Get the number of elements in the vector.
-@return	number of elements in vector */
+/********************************************************************
+Get the number of elements in the vector. */
 UNIV_INLINE
 ulint
 ib_vector_size(
 /*===========*/
-	const ib_vector_t*	vec);	/*!< in: vector */
+					/* out: number of elements in vector */
+	const ib_vector_t*	vec);	/* in: vector */
 
-/****************************************************************//**
+/********************************************************************
+Increase the size of the vector. */
+UNIV_INTERN
+void
+ib_vector_resize(
+/*=============*/
+					/* out: void */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
 Test whether a vector is empty or not.
-@return	TRUE if empty */
+@return TRUE if empty */
 UNIV_INLINE
 ibool
 ib_vector_is_empty(
 /*===============*/
-	const ib_vector_t*	vec);	/*!< in: vector */
+	const ib_vector_t*	vec);    /*!< in: vector */
 
 /****************************************************************//**
 Get the n'th element.
@@ -93,6 +150,15 @@ ib_vector_get(
 	ib_vector_t*	vec,	/*!< in: vector */
 	ulint		n);	/*!< in: element index to get */
 
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n);	/* in: element index to get */
 /****************************************************************//**
 Get last element. The vector must not be empty.
 @return	last element */
@@ -101,7 +167,6 @@ void*
 ib_vector_get_last(
 /*===============*/
 	ib_vector_t*	vec);	/*!< in: vector */
-
 /****************************************************************//**
 Set the n'th element. */
 UNIV_INLINE
@@ -112,33 +177,161 @@ ib_vector_set(
 	ulint		n,	/*!< in: element index to set */
 	void*		elem);	/*!< in: data element */
 
-/****************************************************************//**
-Remove the last element from the vector. */
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
 UNIV_INLINE
 void*
-ib_vector_pop(
-/*==========*/
-	ib_vector_t*	vec);	/*!< in: vector */
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec);		/* in/out: vector */
 
-/****************************************************************//**
-Free the underlying heap of the vector. Note that vec is invalid
-after this call. */
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
 UNIV_INLINE
 void
-ib_vector_free(
+ib_vector_sort(
+/*===========*/
+	ib_vector_t*	vec,		/* in/out: vector */
+	ib_compare_t	compare);	/* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
 /*===========*/
-	ib_vector_t*	vec);	/*!< in,own: vector */
-
-/** An automatically resizing vector data type. */
-struct ib_vector_struct {
-	mem_heap_t*	heap;	/*!< heap */
-	void**		data;	/*!< data elements */
-	ulint		used;	/*!< number of elements currently used */
-	ulint		total;	/*!< number of elements allocated */
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+					/* out: heap allocator instance */
+	mem_heap_t*	heap);		/* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/********************************************************************
+Wrapper for ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+Wrapper for ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+Wrapper for ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create a ut allocator. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void);
+/*=========================*/
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_t {
+	ib_mem_alloc_t	mem_malloc;	/* For allocating memory */
+	ib_mem_free_t	mem_release;	/* For freeing memory */
+	ib_mem_resize_t	mem_resize;	/* For resizing memory */
+	void*		arg;		/* Currently if not NULL then it
+					points to the heap instance */
+};
+
+/* See comment at beginning of file. */
+struct ib_vector_t {
+	ib_alloc_t*	allocator;	/* Allocator, because one size
+					doesn't fit all */
+	void*		data;		/* data elements */
+	ulint		used;		/* number of elements currently used */
+	ulint		total;		/* number of elements allocated */
+					/* Size of a data item */
+	ulint		sizeof_value;
 };
 
 #ifndef UNIV_NONINL
 #include "ut0vec.ic"
 #endif
 
-#endif
+#endif /* IB_VECTOR_H */
diff --git a/storage/xtradb/include/ut0vec.ic b/storage/xtradb/include/ut0vec.ic
index fce41362d3a..f41a85e1d1d 100644
--- a/storage/xtradb/include/ut0vec.ic
+++ b/storage/xtradb/include/ut0vec.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,21 +23,169 @@ A vector of pointers to data items
 Created 4/6/2006 Osku Salerma
 ************************************************************************/
 
-/****************************************************************//**
-Get number of elements in vector.
-@return	number of elements in vector */
+#define	IB_VEC_OFFSET(v, i)	((v)->sizeof_value * (i))
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size)		/* in: size in bytes */
+{
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr UNIV_UNUSED)	/* in: memory (unused) */
+{
+	/* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	void*		new_ptr;
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	new_ptr = mem_heap_alloc(heap, new_size);
+	memcpy(new_ptr, old_ptr, old_size);
+
+	return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+	mem_heap_t*	heap)		/* in: heap to use */
+{
+	ib_alloc_t*	heap_alloc;
+
+	heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+	heap_alloc->arg = heap;
+	heap_alloc->mem_release = ib_heap_free;
+	heap_alloc->mem_malloc = ib_heap_malloc;
+	heap_alloc->mem_resize = ib_heap_resize;
+
+	return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+	mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Wrapper around ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	ulint		size)			/* in: size in bytes */
+{
+	return(ut_malloc(size));
+}
+
+/********************************************************************
+Wrapper around ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr)			/* in: memory to free */
+{
+	ut_free(ptr);
+}
+
+/********************************************************************
+Wrapper around ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size UNIV_UNUSED,/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	return(ut_realloc(old_ptr, new_size));
+}
+
+/********************************************************************
+Create a ut allocator. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void)
+/*========================*/
+{
+	ib_alloc_t*	ib_ut_alloc;
+
+	ib_ut_alloc = (ib_alloc_t*) ut_malloc(sizeof(*ib_ut_alloc));
+
+	ib_ut_alloc->arg = NULL;
+	ib_ut_alloc->mem_release = ib_ut_free;
+	ib_ut_alloc->mem_malloc = ib_ut_malloc;
+	ib_ut_alloc->mem_resize = ib_ut_resize;
+
+	return(ib_ut_alloc);
+}
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+	ut_free(ib_ut_alloc);
+}
+
+/********************************************************************
+Get number of elements in vector. */
 UNIV_INLINE
 ulint
 ib_vector_size(
 /*===========*/
-	const ib_vector_t*	vec)	/*!< in: vector */
+					/* out: number of elements in vector*/
+	const ib_vector_t*	vec)	/* in: vector */
 {
 	return(vec->used);
 }
 
 /****************************************************************//**
-Get n'th element.
-@return	n'th element */
+Get n'th element. */
 UNIV_INLINE
 void*
 ib_vector_get(
@@ -47,9 +195,23 @@ ib_vector_get(
 {
 	ut_a(n < vec->used);
 
-	return(vec->data[n]);
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
 }
 
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n)	/* in: element index to get */
+{
+	ut_a(n < vec->used);
+
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
 /****************************************************************//**
 Get last element. The vector must not be empty.
 @return	last element */
@@ -61,7 +223,7 @@ ib_vector_get_last(
 {
 	ut_a(vec->used > 0);
 
-	return(vec->data[vec->used - 1]);
+	return((byte*) ib_vector_get(vec, vec->used - 1));
 }
 
 /****************************************************************//**
@@ -74,9 +236,52 @@ ib_vector_set(
 	ulint		n,	/*!< in: element index to set */
 	void*		elem)	/*!< in: data element */
 {
+	void*		slot;
+
 	ut_a(n < vec->used);
 
-	vec->data[n] = elem;
+	slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+	memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+					/* out: void */
+	ib_vector_t*	vec)		/* in: vector */
+{
+	vec->used = 0;
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec)		/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
 }
 
 /****************************************************************//**
@@ -86,35 +291,130 @@ UNIV_INLINE
 void*
 ib_vector_pop(
 /*==========*/
-	ib_vector_t*    vec)    /*!< in/out: vector */
+				/* out: pointer to element */
+	ib_vector_t*	vec)	/* in: vector */
 {
-	void*           elem;
+	void*		elem;
 
 	ut_a(vec->used > 0);
-	--vec->used;
-	elem = vec->data[vec->used];
 
-	ut_d(vec->data[vec->used] = NULL);
-	UNIV_MEM_INVALID(&vec->data[vec->used], sizeof(*vec->data));
+	elem = ib_vector_last(vec);
+	--vec->used;
 
 	return(elem);
 }
 
-/****************************************************************//**
-Free the underlying heap of the vector. Note that vec is invalid
-after this call. */
+/********************************************************************
+Append an element to the vector, if elem != NULL then copy the data
+from elem.*/
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+				/* out: pointer to the "new" element */
+	ib_vector_t*	vec,	/* in: vector */
+	const void*	elem)	/* in: element to add (can be NULL) */
+{
+	void*		last;
+
+	if (vec->used >= vec->total) {
+		ib_vector_resize(vec);
+	}
+
+	last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+	memset(last, 0, vec->sizeof_value);
+#endif
+
+	if (elem) {
+		memcpy(last, elem, vec->sizeof_value);
+	}
+
+	++vec->used;
+
+	return(last);
+}
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element, or NULL if not found */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	const void*	elem)	/*!< in: value to remove */
+{
+	void*		current = NULL;
+	void*		next;
+	ulint		i;
+	ulint		old_used_count = vec->used;
+
+	for (i = 0; i < vec->used; i++) {
+		current = ib_vector_get(vec, i);
+
+		if (*(void**) current == elem) {
+			if (i == vec->used - 1) {
+				return(ib_vector_pop(vec));
+			}
+
+			next = ib_vector_get(vec, i + 1);
+			memmove(current, next, vec->sizeof_value
+			        * (vec->used - i - 1));
+			--vec->used;
+			break;
+		}
+	}
+
+	return((old_used_count != vec->used) ? current : NULL);
+}
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+				/* out: void */
+	ib_vector_t*	vec,	/* in: vector */
+	ib_compare_t	compare)/* in: the comparator to use for sort */
+{
+	qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
 UNIV_INLINE
 void
 ib_vector_free(
 /*===========*/
-	ib_vector_t*    vec)    /*!< in, own: vector */
+	ib_vector_t*	vec)		/* in, own: vector */
 {
-	mem_heap_free(vec->heap);
+	/* Currently we only support two types of allocators, heap
+	and ut_malloc(), when the heap is freed all the elements are
+	freed too. With ut allocator, we need to free the elements,
+	the vector instance and the allocator separately. */
+
+	/* Only the heap allocator uses the arg field. */
+	if (vec->allocator->arg) {
+		mem_heap_free((mem_heap_t*) vec->allocator->arg);
+	} else {
+		ib_alloc_t*	allocator;
+
+		allocator = vec->allocator;
+
+		allocator->mem_release(allocator, vec->data);
+		allocator->mem_release(allocator, vec);
+
+		ib_ut_allocator_free(allocator);
+	}
 }
 
-/****************************************************************//**
+/********************************************************************
 Test whether a vector is empty or not.
-@return	TRUE if empty */
+@return TRUE if empty */
 UNIV_INLINE
 ibool
 ib_vector_is_empty(
diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h
index aedcc2b435d..33385ddf2d4 100644
--- a/storage/xtradb/include/ut0wqueue.h
+++ b/storage/xtradb/include/ut0wqueue.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -37,7 +37,7 @@ processing.
 #include "os0sync.h"
 #include "sync0types.h"
 
-typedef struct ib_wqueue_struct ib_wqueue_t;
+struct ib_wqueue_t;
 
 /****************************************************************//**
 Create a new work queue.
@@ -66,6 +66,16 @@ ib_wqueue_add(
 	mem_heap_t*	heap);	/*!< in: memory heap to use for allocating the
 				list node */
 
+/********************************************************************
+Check if queue is empty. */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+					/* out: TRUE if queue empty
+					else FALSE */
+	const ib_wqueue_t*      wq);    /* in: work queue */
+
 /****************************************************************//**
 Wait for a work item to appear in the queue.
 @return	work item */
@@ -75,9 +85,19 @@ ib_wqueue_wait(
 /*===========*/
 	ib_wqueue_t*	wq);	/*!< in: work queue */
 
+/********************************************************************
+Wait for a work item to appear in the queue for specified time. */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+					/* out: work item or NULL on timeout*/
+	ib_wqueue_t*	wq,		/* in: work queue */
+	ib_time_t	wait_in_usecs); /* in: wait time in micro seconds */
+
 /* Work queue. */
-struct ib_wqueue_struct {
-	mutex_t		mutex;	/*!< mutex protecting everything */
+struct ib_wqueue_t {
+	ib_mutex_t		mutex;	/*!< mutex protecting everything */
 	ib_list_t*	items;	/*!< work item list */
 	os_event_t	event;	/*!< event we use to signal additions to list */
 };
diff --git a/storage/xtradb/lock/lock0iter.c b/storage/xtradb/lock/lock0iter.cc
index 506ec02875a..b424d2fc757 100644
--- a/storage/xtradb/lock/lock0iter.c
+++ b/storage/xtradb/lock/lock0iter.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file lock/lock0iter.c
+@file lock/lock0iter.cc
 Lock queue iterator. Can iterate over table and record
 lock queues.
 
@@ -32,9 +32,6 @@ Created July 16, 2007 Vasil Dimov
 #include "lock0priv.h"
 #include "ut0dbg.h"
 #include "ut0lst.h"
-#ifdef UNIV_DEBUG
-# include "srv0srv.h" /* kernel_mutex */
-#endif /* UNIV_DEBUG */
 
 /*******************************************************************//**
 Initialize lock queue iterator so that it starts to iterate from
@@ -54,7 +51,7 @@ lock_queue_iterator_reset(
 	ulint			bit_no)	/*!< in: record number in the
 					heap */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	iter->current_lock = lock;
 
@@ -90,7 +87,7 @@ lock_queue_iterator_get_prev(
 {
 	const lock_t*	prev_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	switch (lock_get_type_low(iter->current_lock)) {
 	case LOCK_REC:
diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.cc
index 547f13828b9..3e60680882a 100644
--- a/storage/xtradb/lock/lock0lock.c
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file lock/lock0lock.c
+@file lock/lock0lock.cc
 The transaction lock system
 
 Created 5/7/1996 Heikki Tuuri
@@ -37,21 +37,31 @@ Created 5/7/1996 Heikki Tuuri
 #include "usr0sess.h"
 #include "trx0purge.h"
 #include "dict0mem.h"
+#include "dict0boot.h"
 #include "trx0sys.h"
+#include "pars0pars.h" /* pars_complete_graph_for_exec() */
+#include "que0que.h" /* que_node_get_parent() */
+#include "row0mysql.h" /* row_mysql_handle_errors() */
+#include "row0sel.h" /* sel_node_create(), sel_node_t */
+#include "row0types.h" /* sel_node_t */
+#include "srv0mon.h"
+#include "ut0vec.h"
 #include "btr0btr.h"
+#include "dict0boot.h"
+#include <set>
 
 /* Restricts the length of search we will do in the waits-for
 graph of transactions */
 #define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
 
-/* Restricts the recursion depth of the search we will do in the waits-for
-graph of transactions */
+/* Restricts the search depth we will do in the waits-for graph of
+transactions */
 #define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200
 
 /* When releasing transaction locks, this specifies how often we release
-the kernel mutex for a moment to give also others access to it */
+the lock mutex for a moment to give also others access to it */
 
-#define LOCK_RELEASE_KERNEL_INTERVAL	1000
+#define LOCK_RELEASE_INTERVAL		1000
 
 /* Safety margin when creating a new record lock: this many extra records
 can be inserted to the page without need to create a lock with a bigger
@@ -293,6 +303,8 @@ waiting, in its lock queue. Solution: We can copy the locks as gap type
 locks, so that also the waiting locks are transformed to granted gap type
 locks on the inserted record. */
 
+#define LOCK_STACK_SIZE		OS_THREAD_MAX_N
+
 /* LOCK COMPATIBILITY MATRIX
  *    IS IX S  X  AI
  * IS +	 +  +  -  +
@@ -308,18 +320,14 @@ locks on the inserted record. */
  * statement-level MySQL binlog.
  * See also lock_mode_compatible().
  */
-#define LK(a,b) (1 << ((a) * LOCK_NUM + (b)))
-#define LKS(a,b) LK(a,b) | LK(b,a)
-
-/* Define the lock compatibility matrix in a ulint.  The first line below
-defines the diagonal entries.  The following lines define the compatibility
-for LOCK_IX, LOCK_S, and LOCK_AUTO_INC using LKS(), since the matrix
-is symmetric. */
-#define LOCK_MODE_COMPATIBILITY 0					\
- | LK(LOCK_IS, LOCK_IS) | LK(LOCK_IX, LOCK_IX) | LK(LOCK_S, LOCK_S)	\
- | LKS(LOCK_IX, LOCK_IS) | LKS(LOCK_IS, LOCK_AUTO_INC)			\
- | LKS(LOCK_S, LOCK_IS)							\
- | LKS(LOCK_AUTO_INC, LOCK_IS) | LKS(LOCK_AUTO_INC, LOCK_IX)
+static const byte lock_compatibility_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  TRUE,  TRUE,  FALSE,  TRUE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  TRUE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  FALSE, FALSE, FALSE, FALSE,  FALSE},
+ /* AI */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE}
+};
 
 /* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
  *    IS IX S  X  AI
@@ -330,17 +338,55 @@ is symmetric. */
  * AI -  -  -  -  +
  * See lock_mode_stronger_or_eq().
  */
+static const byte lock_strength_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  FALSE, FALSE,  FALSE, FALSE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  TRUE,  TRUE,  TRUE,  TRUE,   TRUE},
+ /* AI */ {  FALSE, FALSE, FALSE, FALSE,  TRUE}
+};
+
+/** Deadlock check context. */
+struct lock_deadlock_ctx_t {
+	const trx_t*	start;		/*!< Joining transaction that is
+					requesting a lock in an incompatible
+					mode */
 
-/* Define the stronger-or-equal lock relation in a ulint.  This relation
-contains all pairs LK(mode1, mode2) where mode1 is stronger than or
-equal to mode2. */
-#define LOCK_MODE_STRONGER_OR_EQ 0					\
- | LK(LOCK_IS, LOCK_IS)							\
- | LK(LOCK_IX, LOCK_IS) | LK(LOCK_IX, LOCK_IX)				\
- | LK(LOCK_S, LOCK_IS) | LK(LOCK_S, LOCK_S)				\
- | LK(LOCK_AUTO_INC, LOCK_AUTO_INC)					\
- | LK(LOCK_X, LOCK_IS) | LK(LOCK_X, LOCK_IX) | LK(LOCK_X, LOCK_S)	\
- | LK(LOCK_X, LOCK_AUTO_INC) | LK(LOCK_X, LOCK_X)
+	const lock_t*	wait_lock;	/*!< Lock that trx wants */
+
+	ib_uint64_t	mark_start;	/*!<  Value of lock_mark_count at
+					the start of the deadlock check. */
+
+	ulint		depth;		/*!< Stack depth */
+
+	ulint		cost;		/*!< Calculation steps thus far */
+
+	ibool		too_deep;	/*!< TRUE if search was too deep and
+					was aborted */
+};
+
+/** DFS visited node information used during deadlock checking. */
+struct lock_stack_t {
+	const lock_t*	lock;			/*!< Current lock */
+	const lock_t*	wait_lock;		/*!< Waiting for lock */
+	ulint		heap_no;		/*!< heap number if rec lock */
+};
+
+/** Stack to use during DFS search. Currently only a single stack is required
+because there is no parallel deadlock check. This stack is protected by
+the lock_sys_t::mutex. */
+static lock_stack_t*	lock_stack;
+
+/** The count of the types of locks. */
+static const ulint	lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_mutex_key;
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_wait_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
 
 #ifdef UNIV_DEBUG
 UNIV_INTERN ibool	lock_print_waits	= FALSE;
@@ -349,9 +395,9 @@ UNIV_INTERN ibool	lock_print_waits	= FALSE;
 Validates the lock system.
 @return	TRUE if ok */
 static
-ibool
-lock_validate(void);
-/*===============*/
+bool
+lock_validate();
+/*============*/
 
 /*********************************************************************//**
 Validates the record lock queues on a page.
@@ -367,48 +413,26 @@ lock_rec_validate_page(
 /* The lock system */
 UNIV_INTERN lock_sys_t*	lock_sys	= NULL;
 
-/* We store info on the latest deadlock error to this buffer. InnoDB
+/** We store info on the latest deadlock error to this buffer. InnoDB
 Monitor will then fetch it and print */
 UNIV_INTERN ibool	lock_deadlock_found = FALSE;
-UNIV_INTERN FILE*	lock_latest_err_file;
-
-/* Flags for recursive deadlock search */
-#define LOCK_VICTIM_IS_START	1
-#define LOCK_VICTIM_IS_OTHER	2
-#define LOCK_EXCEED_MAX_DEPTH	3
+/** Only created if !srv_read_only_mode */
+static FILE*		lock_latest_err_file;
 
 /********************************************************************//**
-Checks if a lock request results in a deadlock.
-@return TRUE if a deadlock was detected and we chose trx as a victim;
-FALSE if no deadlock, or there was a deadlock, but we chose other
-transaction(s) as victim(s) */
-static
-ibool
-lock_deadlock_occurs(
-/*=================*/
-	lock_t*	lock,	/*!< in: lock the transaction is requesting */
-	trx_t*	trx);	/*!< in: transaction */
-/********************************************************************//**
-Looks recursively for a deadlock.
-@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
-deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
-deadlock was found and we chose some other trx as a victim: we must do
-the search again in this last case because there may be another
-deadlock!
-LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found this function will resolve the deadlock by choosing a victim transaction
+and rolling it back. It will attempt to resolve all deadlocks. The returned
+transaction id will be the joining transaction id or 0 if some other
+transaction was chosen as a victim and rolled back or no deadlock found.
+
+@return id of transaction chosen as victim or 0 */
 static
-ulint
-lock_deadlock_recursive(
-/*====================*/
-	trx_t*	start,		/*!< in: recursion starting point */
-	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
-	lock_t*	wait_lock,	/*!< in:  lock that is waiting to be granted */
-	ulint*	cost,		/*!< in/out: number of calculation steps thus
-				far: if this exceeds LOCK_MAX_N_STEPS_...
-				we return LOCK_EXCEED_MAX_DEPTH */
-	ulint	depth);		/*!< in: recursion depth: if this exceeds
-				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
-				return LOCK_EXCEED_MAX_DEPTH */
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*===========================*/
+	const lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	const trx_t*	trx);	/*!< in: transaction */
 
 /*********************************************************************//**
 Gets the nth bit of a record lock.
@@ -420,8 +444,7 @@ lock_rec_get_nth_bit(
 	const lock_t*	lock,	/*!< in: record lock */
 	ulint		i)	/*!< in: index of the bit */
 {
-	ulint	byte_index;
-	ulint	bit_index;
+	const byte*	b;
 
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
@@ -431,62 +454,64 @@ lock_rec_get_nth_bit(
 		return(FALSE);
 	}
 
-	byte_index = i / 8;
-	bit_index = i % 8;
+	b = ((const byte*) &lock[1]) + (i / 8);
 
-	return(1 & ((const byte*) &lock[1])[byte_index] >> bit_index);
+	return(1 & *b >> (i % 8));
 }
 
-/*************************************************************************/
-
-#define lock_mutex_enter_kernel()	mutex_enter(&kernel_mutex)
-#define lock_mutex_exit_kernel()	mutex_exit(&kernel_mutex)
+/*********************************************************************//**
+Reports that a transaction id is not sensible, i.e., it is in the future. */
+UNIV_INTERN
+void
+lock_report_trx_id_insanity(
+/*========================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	trx_id_t	max_trx_id)	/*!< in: trx_sys_get_max_trx_id() */
+{
+	ut_print_timestamp(stderr);
+	fputs("  InnoDB: Error: transaction id associated with record\n",
+	      stderr);
+	rec_print_new(stderr, rec, offsets);
+	fputs("InnoDB: in ", stderr);
+	dict_index_name_print(stderr, NULL, index);
+	fprintf(stderr, "\n"
+		"InnoDB: is " TRX_ID_FMT " which is higher than the"
+		" global trx id counter " TRX_ID_FMT "!\n"
+		"InnoDB: The table is corrupt. You have to do"
+		" dump + drop + reimport.\n",
+		trx_id, max_trx_id);
+}
 
 /*********************************************************************//**
 Checks that a transaction id is sensible, i.e., not in the future.
-@return	TRUE if ok */
+@return	true if ok */
+#ifdef UNIV_DEBUG
 UNIV_INTERN
-ibool
+#else
+static __attribute__((nonnull, warn_unused_result))
+#endif
+bool
 lock_check_trx_id_sanity(
 /*=====================*/
 	trx_id_t	trx_id,		/*!< in: trx id */
 	const rec_t*	rec,		/*!< in: user record */
 	dict_index_t*	index,		/*!< in: index */
-	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
-	ibool		has_kernel_mutex)/*!< in: TRUE if the caller owns the
-					kernel mutex */
+	const ulint*	offsets)	/*!< in: rec_get_offsets(rec, index) */
 {
-	ibool	is_ok		= TRUE;
+	bool		is_ok;
+	trx_id_t	max_trx_id;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	if (!has_kernel_mutex) {
-		mutex_enter(&kernel_mutex);
-	}
-
-	/* A sanity check: the trx_id in rec must be smaller than the global
-	trx id counter */
-
-	if (UNIV_UNLIKELY(trx_id >= trx_sys->max_trx_id)) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: transaction id associated"
-		      " with record\n",
-		      stderr);
-		rec_print_new(stderr, rec, offsets);
-		fputs("InnoDB: in ", stderr);
-		dict_index_name_print(stderr, NULL, index);
-		fprintf(stderr, "\n"
-			"InnoDB: is " TRX_ID_FMT " which is higher than the"
-			" global trx id counter " TRX_ID_FMT "!\n"
-			"InnoDB: The table is corrupt. You have to do"
-			" dump + drop + reimport.\n",
-			(ullint) trx_id, (ullint) trx_sys->max_trx_id);
+	max_trx_id = trx_sys_get_max_trx_id();
+	is_ok = trx_id < max_trx_id;
 
-		is_ok = FALSE;
-	}
-
-	if (!has_kernel_mutex) {
-		mutex_exit(&kernel_mutex);
+	if (UNIV_UNLIKELY(!is_ok)) {
+		lock_report_trx_id_insanity(trx_id,
+					    rec, index, offsets, max_trx_id);
 	}
 
 	return(is_ok);
@@ -494,10 +519,10 @@ lock_check_trx_id_sanity(
 
 /*********************************************************************//**
 Checks that a record is seen in a consistent read.
-@return TRUE if sees, or FALSE if an earlier version of the record
+@return true if sees, or false if an earlier version of the record
 should be retrieved */
 UNIV_INTERN
-ibool
+bool
 lock_clust_rec_cons_read_sees(
 /*==========================*/
 	const rec_t*	rec,	/*!< in: user record which should be read or
@@ -513,8 +538,7 @@ lock_clust_rec_cons_read_sees(
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
 	/* NOTE that we call this function while holding the search
-	system latch. To obey the latching order we must NOT reserve the
-	kernel mutex here! */
+	system latch. */
 
 	trx_id = row_get_rec_trx_id(rec, index, offsets);
 
@@ -525,14 +549,14 @@ lock_clust_rec_cons_read_sees(
 Checks that a non-clustered index record is seen in a consistent read.
 
 NOTE that a non-clustered index page contains so little information on
-its modifications that also in the case FALSE, the present version of
+its modifications that also in the case false, the present version of
 rec may be the right, but we must check this from the clustered index
 record.
 
-@return TRUE if certainly sees, or FALSE if an earlier version of the
+@return true if certainly sees, or false if an earlier version of the
 clustered index record might be needed */
 UNIV_INTERN
-ulint
+bool
 lock_sec_rec_cons_read_sees(
 /*========================*/
 	const rec_t*		rec,	/*!< in: user record which
@@ -545,12 +569,11 @@ lock_sec_rec_cons_read_sees(
 	ut_ad(page_rec_is_user_rec(rec));
 
 	/* NOTE that we might call this function while holding the search
-	system latch. To obey the latching order we must NOT reserve the
-	kernel mutex here! */
+	system latch. */
 
 	if (recv_recovery_is_on()) {
 
-		return(FALSE);
+		return(false);
 	}
 
 	max_trx_id = page_get_max_trx_id(page_align(rec));
@@ -567,15 +590,36 @@ lock_sys_create(
 /*============*/
 	ulint	n_cells)	/*!< in: number of slots in lock hash table */
 {
-	lock_sys = mem_alloc(sizeof(lock_sys_t));
+	ulint	lock_sys_sz;
+
+	lock_sys_sz = sizeof(*lock_sys)
+		+ OS_THREAD_MAX_N * sizeof(srv_slot_t);
+
+	lock_sys = static_cast<lock_sys_t*>(mem_zalloc(lock_sys_sz));
+
+	lock_stack = static_cast<lock_stack_t*>(
+		mem_zalloc(sizeof(*lock_stack) * LOCK_STACK_SIZE));
+
+	void*	ptr = &lock_sys[1];
+
+	lock_sys->waiting_threads = static_cast<srv_slot_t*>(ptr);
+
+	lock_sys->last_slot = lock_sys->waiting_threads;
+
+	mutex_create(lock_sys_mutex_key, &lock_sys->mutex, SYNC_LOCK_SYS);
+
+	mutex_create(lock_sys_wait_mutex_key,
+		     &lock_sys->wait_mutex, SYNC_LOCK_WAIT_SYS);
+
+	lock_sys->timeout_event = os_event_create();
 
 	lock_sys->rec_hash = hash_create(n_cells);
 	lock_sys->rec_num = 0;
 
-	/* hash_create_mutexes(lock_sys->rec_hash, 2, SYNC_REC_LOCK); */
-
-	lock_latest_err_file = os_file_create_tmpfile();
-	ut_a(lock_latest_err_file);
+	if (!srv_read_only_mode) {
+		lock_latest_err_file = os_file_create_tmpfile();
+		ut_a(lock_latest_err_file);
+	}
 }
 
 /*********************************************************************//**
@@ -591,8 +635,15 @@ lock_sys_close(void)
 	}
 
 	hash_table_free(lock_sys->rec_hash);
+
+	mutex_free(&lock_sys->mutex);
+	mutex_free(&lock_sys->wait_mutex);
+
+	mem_free(lock_stack);
 	mem_free(lock_sys);
+
 	lock_sys = NULL;
+	lock_stack = NULL;
 }
 
 /*********************************************************************//**
@@ -603,7 +654,7 @@ ulint
 lock_get_size(void)
 /*===============*/
 {
-	return((ulint)sizeof(lock_t));
+	return((ulint) sizeof(lock_t));
 }
 
 /*********************************************************************//**
@@ -617,26 +668,21 @@ lock_get_mode(
 {
 	ut_ad(lock);
 
-	return(lock->type_mode & LOCK_MODE_MASK);
+	return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
 }
 
 /*********************************************************************//**
 Gets the wait flag of a lock.
-@return	TRUE if waiting */
+@return	LOCK_WAIT if waiting, 0 if not */
 UNIV_INLINE
-ibool
+ulint
 lock_get_wait(
 /*==========*/
 	const lock_t*	lock)	/*!< in: lock */
 {
 	ut_ad(lock);
 
-	if (UNIV_UNLIKELY(lock->type_mode & LOCK_WAIT)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_WAIT);
 }
 
 /*********************************************************************//**
@@ -657,11 +703,21 @@ lock_get_src_table(
 	dict_table_t*	src;
 	lock_t*		lock;
 
+	ut_ad(!lock_mutex_own());
+
 	src = NULL;
 	*mode = LOCK_NONE;
 
-	for (lock = UT_LIST_GET_FIRST(trx->trx_locks);
-	     lock;
+	/* The trx mutex protects the trx_locks for our purposes.
+	Other transactions could want to convert one of our implicit
+	record locks to an explicit one. For that, they would need our
+	trx mutex. Waiting locks can be removed while only holding
+	lock_sys->mutex, but this is a running transaction and cannot
+	thus be holding any waiting locks. */
+	trx_mutex_enter(trx);
+
+	for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+	     lock != NULL;
 	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
 		lock_table_t*	tab_lock;
 		enum lock_mode	lock_mode;
@@ -680,12 +736,14 @@ lock_get_src_table(
 			    || UT_LIST_GET_FIRST(src->locks) != lock) {
 				/* We only support the case when
 				there is only one lock on this table. */
-				return(NULL);
+				src = NULL;
+				goto func_exit;
 			}
 		} else if (src != tab_lock->table) {
 			/* The transaction is locking more than
 			two tables (src and dest): abort */
-			return(NULL);
+			src = NULL;
+			goto func_exit;
 		}
 
 		/* Check that the source table is locked by
@@ -694,7 +752,8 @@ lock_get_src_table(
 		if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) {
 			if (*mode != LOCK_NONE && *mode != lock_mode) {
 				/* There are multiple locks on src. */
-				return(NULL);
+				src = NULL;
+				goto func_exit;
 			}
 			*mode = lock_mode;
 		}
@@ -705,6 +764,8 @@ lock_get_src_table(
 		src = dest;
 	}
 
+func_exit:
+	trx_mutex_exit(trx);
 	return(src);
 }
 
@@ -718,8 +779,8 @@ UNIV_INTERN
 ibool
 lock_is_table_exclusive(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx)	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	const trx_t*		trx)	/*!< in: transaction */
 {
 	const lock_t*	lock;
 	ibool		ok	= FALSE;
@@ -727,10 +788,10 @@ lock_is_table_exclusive(
 	ut_ad(table);
 	ut_ad(trx);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	for (lock = UT_LIST_GET_FIRST(table->locks);
-	     lock;
+	     lock != NULL;
 	     lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) {
 		if (lock->trx != trx) {
 			/* A lock on the table is held
@@ -760,7 +821,7 @@ not_ok:
 	}
 
 func_exit:
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	return(ok);
 }
@@ -772,12 +833,15 @@ void
 lock_set_lock_and_trx_wait(
 /*=======================*/
 	lock_t*	lock,	/*!< in: lock */
-	trx_t*	trx)	/*!< in: trx */
+	trx_t*	trx)	/*!< in/out: trx */
 {
 	ut_ad(lock);
-	ut_ad(trx->wait_lock == NULL);
+	ut_ad(lock->trx == trx);
+	ut_ad(trx->lock.wait_lock == NULL);
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
 
-	trx->wait_lock = lock;
+	trx->lock.wait_lock = lock;
 	lock->type_mode |= LOCK_WAIT;
 }
 
@@ -788,15 +852,15 @@ UNIV_INLINE
 void
 lock_reset_lock_and_trx_wait(
 /*=========================*/
-	lock_t*	lock)	/*!< in: record lock */
+	lock_t*	lock)	/*!< in/out: record lock */
 {
 	ut_ad(lock_get_wait(lock));
+	ut_ad(lock_mutex_own());
 
 	/* Reset the back pointer in trx to this waiting lock request */
-
 	if (!(lock->type_mode & LOCK_CONV_BY_OTHER)) {
-		ut_ad((lock->trx)->wait_lock == lock);
-		(lock->trx)->wait_lock = NULL;
+		ut_ad(lock->trx->lock.wait_lock == lock);
+		lock->trx->lock.wait_lock = NULL;
 	} else {
 		ut_ad(lock_get_type_low(lock) == LOCK_REC);
 	}
@@ -805,9 +869,9 @@ lock_reset_lock_and_trx_wait(
 
 /*********************************************************************//**
 Gets the gap flag of a record lock.
-@return	TRUE if gap flag set */
+@return	LOCK_GAP or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_gap(
 /*=============*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -815,19 +879,14 @@ lock_rec_get_gap(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_GAP) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_GAP);
 }
 
 /*********************************************************************//**
 Gets the LOCK_REC_NOT_GAP flag of a record lock.
-@return	TRUE if LOCK_REC_NOT_GAP flag set */
+@return	LOCK_REC_NOT_GAP or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_rec_not_gap(
 /*=====================*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -835,19 +894,14 @@ lock_rec_get_rec_not_gap(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_REC_NOT_GAP) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_REC_NOT_GAP);
 }
 
 /*********************************************************************//**
 Gets the waiting insert flag of a record lock.
-@return	TRUE if gap flag set */
+@return	LOCK_INSERT_INTENTION or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_insert_intention(
 /*==========================*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -855,12 +909,7 @@ lock_rec_get_insert_intention(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_INSERT_INTENTION) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_INSERT_INTENTION);
 }
 
 /*********************************************************************//**
@@ -873,12 +922,10 @@ lock_mode_stronger_or_eq(
 	enum lock_mode	mode1,	/*!< in: lock mode */
 	enum lock_mode	mode2)	/*!< in: lock mode */
 {
-	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
-	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
-	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
-	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+	ut_ad((ulint) mode1 < lock_types);
+	ut_ad((ulint) mode2 < lock_types);
 
-	return((LOCK_MODE_STRONGER_OR_EQ) & LK(mode1, mode2));
+	return(lock_strength_matrix[mode1][mode2]);
 }
 
 /*********************************************************************//**
@@ -891,12 +938,10 @@ lock_mode_compatible(
 	enum lock_mode	mode1,	/*!< in: lock mode */
 	enum lock_mode	mode2)	/*!< in: lock mode */
 {
-	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
-	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
-	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
-	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+	ut_ad((ulint) mode1 < lock_types);
+	ut_ad((ulint) mode2 < lock_types);
 
-	return((LOCK_MODE_COMPATIBILITY) & LK(mode1, mode2));
+	return(lock_compatibility_matrix[mode1][mode2]);
 }
 
 /*********************************************************************//**
@@ -924,7 +969,8 @@ lock_rec_has_to_wait(
 	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
 
 	if (trx != lock2->trx
-	    && !lock_mode_compatible(LOCK_MODE_MASK & type_mode,
+	    && !lock_mode_compatible(static_cast<enum lock_mode>(
+			             LOCK_MODE_MASK & type_mode),
 				     lock_get_mode(lock2))) {
 
 		/* We have somewhat complex rules when gap type record locks
@@ -1112,14 +1158,14 @@ lock_rec_get_next_on_page_const(
 	ulint	space;
 	ulint	page_no;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
 	space = lock->un_member.rec_lock.space;
 	page_no = lock->un_member.rec_lock.page_no;
 
 	for (;;) {
-		lock = HASH_GET_NEXT(hash, lock);
+		lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock));
 
 		if (!lock) {
 
@@ -1161,46 +1207,41 @@ lock_rec_get_first_on_page_addr(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = HASH_GET_FIRST(lock_sys->rec_hash,
-			      lock_rec_hash(space, page_no));
-	while (lock) {
-		if ((lock->un_member.rec_lock.space == space)
-		    && (lock->un_member.rec_lock.page_no == page_no)) {
+	for (lock = static_cast<lock_t*>(
+			HASH_GET_FIRST(lock_sys->rec_hash,
+				       lock_rec_hash(space, page_no)));
+	      lock != NULL;
+	      lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+		if (lock->un_member.rec_lock.space == space
+		    && lock->un_member.rec_lock.page_no == page_no) {
 
 			break;
 		}
-
-		lock = HASH_GET_NEXT(hash, lock);
 	}
 
 	return(lock);
 }
 
 /*********************************************************************//**
-Returns TRUE if there are explicit record locks on a page.
-@return	TRUE if there are explicit record locks on the page */
+Determines if there are explicit record locks on a page.
+@return	an explicit record lock on the page, or NULL if there are none */
 UNIV_INTERN
-ibool
+lock_t*
 lock_rec_expl_exist_on_page(
 /*========================*/
 	ulint	space,	/*!< in: space id */
 	ulint	page_no)/*!< in: page number */
 {
-	ibool	ret;
-
-	mutex_enter(&kernel_mutex);
-
-	if (lock_rec_get_first_on_page_addr(space, page_no)) {
-		ret = TRUE;
-	} else {
-		ret = FALSE;
-	}
+	lock_t*	lock;
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_enter();
+	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	lock_mutex_exit();
 
-	return(ret);
+	return(lock);
 }
 
 /*********************************************************************//**
@@ -1218,20 +1259,20 @@ lock_rec_get_first_on_page(
 	ulint	space	= buf_block_get_space(block);
 	ulint	page_no	= buf_block_get_page_no(block);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	hash = buf_block_get_lock_hash_val(block);
 
-	lock = HASH_GET_FIRST(lock_sys->rec_hash, hash);
+	for (lock = static_cast<lock_t*>(
+			HASH_GET_FIRST( lock_sys->rec_hash, hash));
+	     lock != NULL;
+	     lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
 
-	while (lock) {
 		if ((lock->un_member.rec_lock.space == space)
 		    && (lock->un_member.rec_lock.page_no == page_no)) {
 
 			break;
 		}
-
-		lock = HASH_GET_NEXT(hash, lock);
 	}
 
 	return(lock);
@@ -1247,7 +1288,7 @@ lock_rec_get_next(
 	ulint	heap_no,/*!< in: heap number of the record */
 	lock_t*	lock)	/*!< in: lock */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	do {
 		ut_ad(lock_get_type_low(lock) == LOCK_REC);
@@ -1258,6 +1299,19 @@ lock_rec_get_next(
 }
 
 /*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return	next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+	ulint		heap_no,/*!< in: heap number of the record */
+	const lock_t*	lock)	/*!< in: lock */
+{
+	return(lock_rec_get_next(heap_no, (lock_t*) lock));
+}
+
+/*********************************************************************//**
 Gets the first explicit lock request on a record.
 @return	first lock, NULL if none exists */
 UNIV_INLINE
@@ -1269,7 +1323,7 @@ lock_rec_get_first(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	for (lock = lock_rec_get_first_on_page(block); lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
@@ -1321,7 +1375,7 @@ lock_rec_copy(
 
 	size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
 
-	return(mem_heap_dup(heap, lock, size));
+	return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size)));
 }
 
 /*********************************************************************//**
@@ -1339,15 +1393,16 @@ lock_rec_get_prev(
 	ulint	page_no;
 	lock_t*	found_lock	= NULL;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     /* No op */;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	for (;;) {
 		ut_ad(lock);
 
 		if (lock == in_lock) {
@@ -1359,46 +1414,55 @@ lock_rec_get_prev(
 
 			found_lock = lock;
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 }
 
 /*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/
 
 /*********************************************************************//**
-Checks if a transaction has the specified table lock, or stronger.
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
 @return	lock or NULL */
 UNIV_INLINE
-lock_t*
+const lock_t*
 lock_table_has(
 /*===========*/
-	trx_t*		trx,	/*!< in: transaction */
-	dict_table_t*	table,	/*!< in: table */
-	enum lock_mode	mode)	/*!< in: lock mode */
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	enum lock_mode		mode)	/*!< in: lock mode */
 {
-	lock_t*	lock;
+	lint			i;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	if (ib_vector_is_empty(trx->lock.table_locks)) {
+		return(NULL);
+	}
 
 	/* Look for stronger locks the same trx already has on the table */
 
-	lock = UT_LIST_GET_LAST(table->locks);
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+		enum lock_mode	lock_mode;
 
-	while (lock != NULL) {
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
 
-		if (lock->trx == trx
-		    && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+		if (lock == NULL) {
+			continue;
+		}
+
+		lock_mode = lock_get_mode(lock);
+
+		ut_ad(trx == lock->trx);
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_ad(lock->un_member.tab_lock.table != NULL);
 
-			/* The same trx already has locked the table in
-			a mode stronger or equal to the mode given */
+		if (table == lock->un_member.tab_lock.table
+		    && lock_mode_stronger_or_eq(lock_mode, mode)) {
 
 			ut_ad(!lock_get_wait(lock));
 
 			return(lock);
 		}
-
-		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
 	return(NULL);
@@ -1422,34 +1486,35 @@ lock_rec_has_expl(
 	const buf_block_t*	block,	/*!< in: buffer block containing
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
-	trx_t*			trx)	/*!< in: transaction */
+	const trx_t*		trx)	/*!< in: transaction */
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
 	      || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
 	ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
 
-	while (lock) {
 		if (lock->trx == trx
+		    && !lock_rec_get_insert_intention(lock)
 		    && !lock_is_wait_not_by_other(lock->type_mode)
-		    && lock_mode_stronger_or_eq(lock_get_mode(lock),
-						precise_mode & LOCK_MODE_MASK)
+		    && lock_mode_stronger_or_eq(
+			    lock_get_mode(lock),
+			    static_cast<enum lock_mode>(
+				    precise_mode & LOCK_MODE_MASK))
 		    && (!lock_rec_get_rec_not_gap(lock)
 			|| (precise_mode & LOCK_REC_NOT_GAP)
 			|| heap_no == PAGE_HEAP_NO_SUPREMUM)
 		    && (!lock_rec_get_gap(lock)
 			|| (precise_mode & LOCK_GAP)
-			|| heap_no == PAGE_HEAP_NO_SUPREMUM)
-		    && (!lock_rec_get_insert_intention(lock))) {
+			|| heap_no == PAGE_HEAP_NO_SUPREMUM)) {
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
 	return(NULL);
@@ -1460,7 +1525,7 @@ lock_rec_has_expl(
 Checks if some other transaction has a lock request in the queue.
 @return	lock or NULL */
 static
-lock_t*
+const lock_t*
 lock_rec_other_has_expl_req(
 /*========================*/
 	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X */
@@ -1477,16 +1542,17 @@ lock_rec_other_has_expl_req(
 					requests by all transactions
 					are taken into account */
 {
-	lock_t*	lock;
+	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(mode == LOCK_X || mode == LOCK_S);
 	ut_ad(gap == 0 || gap == LOCK_GAP);
 	ut_ad(wait == 0 || wait == LOCK_WAIT);
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-	while (lock) {
 		if (lock->trx != trx
 		    && (gap
 			|| !(lock_rec_get_gap(lock)
@@ -1496,8 +1562,6 @@ lock_rec_other_has_expl_req(
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
 	return(NULL);
@@ -1509,7 +1573,7 @@ Checks if some other transaction has a conflicting explicit lock request
 in the queue, so that we have to wait.
 @return	lock or NULL */
 static
-lock_t*
+const lock_t*
 lock_rec_other_has_conflicting(
 /*===========================*/
 	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X,
@@ -1519,35 +1583,21 @@ lock_rec_other_has_conflicting(
 	const buf_block_t*	block,	/*!< in: buffer block containing
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
-	trx_t*			trx)	/*!< in: our transaction */
+	const trx_t*		trx)	/*!< in: our transaction */
 {
-	lock_t*	lock;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(block, heap_no);
-
-	if (UNIV_LIKELY_NULL(lock)) {
-		if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+	const lock_t*		lock;
+	ibool			is_supremum;
 
-			do {
-				if (lock_rec_has_to_wait(trx, mode, lock,
-							 TRUE)) {
-					return(lock);
-				}
+	ut_ad(lock_mutex_own());
 
-				lock = lock_rec_get_next(heap_no, lock);
-			} while (lock);
-		} else {
+	is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
 
-			do {
-				if (lock_rec_has_to_wait(trx, mode, lock,
-							 FALSE)) {
-					return(lock);
-				}
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-				lock = lock_rec_get_next(heap_no, lock);
-			} while (lock);
+		if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+			return(lock);
 		}
 	}
 
@@ -1568,17 +1618,18 @@ lock_rec_find_similar_on_page(
 	lock_t*		lock,		/*!< in: lock_rec_get_first_on_page() */
 	const trx_t*	trx)		/*!< in: transaction */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock->trx == trx
 		    && lock->type_mode == type_mode
 		    && lock_rec_get_n_bits(lock) > heap_no) {
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
 	return(NULL);
@@ -1587,69 +1638,80 @@ lock_rec_find_similar_on_page(
 /*********************************************************************//**
 Checks if some transaction has an implicit x-lock on a record in a secondary
 index.
-@return	transaction which has the x-lock, or NULL */
+@return	transaction id of the transaction which has the x-lock, or 0;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active(). */
 static
-trx_t*
-lock_sec_rec_some_has_impl_off_kernel(
-/*==================================*/
+trx_id_t
+lock_sec_rec_some_has_impl(
+/*=======================*/
 	const rec_t*	rec,	/*!< in: user record */
 	dict_index_t*	index,	/*!< in: secondary index */
 	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
 {
+	trx_id_t	trx_id;
+	trx_id_t	max_trx_id;
 	const page_t*	page = page_align(rec);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!lock_mutex_own());
+	ut_ad(!mutex_own(&trx_sys->mutex));
 	ut_ad(!dict_index_is_clust(index));
 	ut_ad(page_rec_is_user_rec(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
+	max_trx_id = page_get_max_trx_id(page);
+
 	/* Some transaction may have an implicit x-lock on the record only
 	if the max trx id for the page >= min trx id for the trx list, or
 	database recovery is running. We do not write the changes of a page
 	max trx id to the log, and therefore during recovery, this value
 	for a page may be incorrect. */
 
-	if (page_get_max_trx_id(page) < trx_list_get_min_trx_id()
-	    && !recv_recovery_is_on()) {
+	if (max_trx_id < trx_rw_min_trx_id() && !recv_recovery_is_on()) {
 
-		return(NULL);
-	}
+		trx_id = 0;
 
-	/* Ok, in this case it is possible that some transaction has an
-	implicit x-lock. We have to look in the clustered index. */
+	} else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
 
-	if (!lock_check_trx_id_sanity(page_get_max_trx_id(page),
-				      rec, index, offsets, TRUE)) {
 		buf_page_print(page, 0, 0);
 
-		/* The page is corrupt: try to avoid a crash by returning
-		NULL */
-		return(NULL);
+		/* The page is corrupt: try to avoid a crash by returning 0 */
+		trx_id = 0;
+
+	/* In this case it is possible that some transaction has an implicit
+	x-lock. We have to look in the clustered index. */
+
+	} else {
+		trx_id = row_vers_impl_x_locked(rec, index, offsets);
 	}
 
-	return(row_vers_impl_x_locked_off_kernel(rec, index, offsets));
+	return(trx_id);
 }
 
 /*********************************************************************//**
 Return approximate number or record locks (bits set in the bitmap) for
 this transaction. Since delete-marked records may be removed, the
-record count will not be precise. */
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
 UNIV_INTERN
 ulint
 lock_number_of_rows_locked(
 /*=======================*/
-	const trx_t*	trx)	/*!< in: transaction */
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
 {
-	lock_t*	lock;
-	ulint   n_records = 0;
-	ulint	n_bits;
-	ulint	n_bit;
+	const lock_t*	lock;
+	ulint		n_records = 0;
 
-	lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	ut_ad(lock_mutex_own());
+
+	for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
 
-	while (lock) {
 		if (lock_get_type_low(lock) == LOCK_REC) {
-			n_bits = lock_rec_get_n_bits(lock);
+			ulint	n_bit;
+			ulint	n_bits = lock_rec_get_n_bits(lock);
 
 			for (n_bit = 0; n_bit < n_bits; n_bit++) {
 				if (lock_rec_get_nth_bit(lock, n_bit)) {
@@ -1657,11 +1719,9 @@ lock_number_of_rows_locked(
 				}
 			}
 		}
-
-		lock = UT_LIST_GET_NEXT(trx_locks, lock);
 	}
 
-	return (n_records);
+	return(n_records);
 }
 
 /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
@@ -1681,7 +1741,10 @@ lock_rec_create(
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
 	dict_index_t*		index,	/*!< in: index of record */
-	trx_t*			trx)	/*!< in: transaction */
+	trx_t*			trx,	/*!< in/out: transaction */
+	ibool			caller_owns_trx_mutex)
+					/*!< in: TRUE if caller owns
+					trx mutex */
 {
 	lock_t*		lock;
 	ulint		page_no;
@@ -1690,7 +1753,13 @@ lock_rec_create(
 	ulint		n_bytes;
 	const page_t*	page;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+	/* Non-locking autocommit read-only transactions should not set
+	any locks. */
+	assert_trx_in_list(trx);
 
 	space = buf_block_get_space(block);
 	page_no	= buf_block_get_page_no(block);
@@ -1712,9 +1781,8 @@ lock_rec_create(
 	n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
 	n_bytes = 1 + n_bits / 8;
 
-	lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes);
-
-	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+	lock = static_cast<lock_t*>(
+		mem_heap_alloc(trx->lock.lock_heap, sizeof(lock_t) + n_bytes));
 
 	lock->trx = trx;
 
@@ -1733,14 +1801,34 @@ lock_rec_create(
 	/* Set the bit corresponding to rec */
 	lock_rec_set_nth_bit(lock, heap_no);
 
+	index->table->n_rec_locks++;
+
+	ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted);
+
 	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), lock);
 
 	lock_sys->rec_num++;
+
+	if (!caller_owns_trx_mutex) {
+		trx_mutex_enter(trx);
+	}
+	ut_ad(trx_mutex_own(trx));
+
 	if (lock_is_wait_not_by_other(type_mode)) {
+
 		lock_set_lock_and_trx_wait(lock, trx);
 	}
 
+	UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+
+	if (!caller_owns_trx_mutex) {
+		trx_mutex_exit(trx);
+	}
+
+	MONITOR_INC(MONITOR_RECLOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_RECLOCK);
+
 	return(lock);
 }
 
@@ -1752,7 +1840,7 @@ DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that
 there was a deadlock, but another transaction was chosen as a victim,
 and we got the lock immediately: no need to wait then */
 static
-enum db_err
+dberr_t
 lock_rec_enqueue_waiting(
 /*=====================*/
 	ulint			type_mode,/*!< in: lock mode this
@@ -1772,25 +1860,30 @@ lock_rec_enqueue_waiting(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	trx_t*	trx;
-	ulint   sec;
-	ulint   ms;
+	trx_t*			trx;
+	trx_id_t		victim_trx_id;
+	ulint			sec;
+	ulint			ms;
+
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+	trx = thr_get_trx(thr);
+
+	ut_ad(trx_mutex_own(trx));
 
 	/* Test if there already is some other reason to suspend thread:
 	we do not enqueue a lock request if the query thread should be
 	stopped anyway */
 
-	if (UNIV_UNLIKELY(que_thr_stop(thr))) {
-
+	if (que_thr_stop(thr)) {
 		ut_error;
 
 		return(DB_QUE_THR_SUSPENDED);
 	}
 
-	trx = thr_get_trx(thr);
-
 	switch (trx_get_dict_operation(trx)) {
 	case TRX_DICT_OP_NONE:
 		break;
@@ -1809,9 +1902,12 @@ lock_rec_enqueue_waiting(
 	}
 
 	if (lock == NULL) {
-		/* Enqueue the lock request that will wait to be granted */
-		lock = lock_rec_create(type_mode | LOCK_WAIT,
-				       block, heap_no, index, trx);
+		/* Enqueue the lock request that will wait
+		to be granted, note that we already own
+		the trx mutex. */
+		lock = lock_rec_create(
+			type_mode | LOCK_WAIT, block, heap_no,
+			index, trx, TRUE);
 	} else {
 		ut_ad(lock->type_mode & LOCK_WAIT);
 		ut_ad(lock->type_mode & LOCK_CONV_BY_OTHER);
@@ -1820,28 +1916,43 @@ lock_rec_enqueue_waiting(
 		lock_set_lock_and_trx_wait(lock, trx);
 	}
 
-	/* Check if a deadlock occurs: if yes, remove the lock request and
-	return an error code */
+	/* Release the mutex to obey the latching order.
+	This is safe, because lock_deadlock_check_and_resolve()
+	is invoked when a lock wait is enqueued for the currently
+	running transaction. Because trx is a running transaction
+	(it is not currently suspended because of a lock wait),
+	its state can only be changed by this thread, which is
+	currently associated with the transaction. */
+
+	trx_mutex_exit(trx);
 
-	if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) {
+	victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+	trx_mutex_enter(trx);
+
+	if (victim_trx_id != 0) {
+
+		ut_ad(victim_trx_id == trx->id);
 
 		lock_reset_lock_and_trx_wait(lock);
 		lock_rec_reset_nth_bit(lock, heap_no);
 
 		return(DB_DEADLOCK);
-	}
 
-	/* If there was a deadlock but we chose another transaction as a
-	victim, it is possible that we already have the lock now granted! */
+	} else if (trx->lock.wait_lock == NULL) {
 
-	if (trx->wait_lock == NULL) {
+		/* If there was a deadlock but we chose another
+		transaction as a victim, it is possible that we
+		already have the lock now granted! */
 
 		return(DB_SUCCESS_LOCKED_REC);
 	}
 
-	trx->que_state = TRX_QUE_LOCK_WAIT;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	trx->wait_started = time(NULL);
+	trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+	trx->lock.was_chosen_as_deadlock_victim = FALSE;
+	trx->lock.wait_started = ut_time();
+
 	if (UNIV_UNLIKELY(trx->take_stats)) {
 		ut_usectime(&sec, &ms);
 		trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
@@ -1852,11 +1963,13 @@ lock_rec_enqueue_waiting(
 #ifdef UNIV_DEBUG
 	if (lock_print_waits) {
 		fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ",
-			(ullint) trx->id);
+			trx->id);
 		ut_print_name(stderr, trx, FALSE, index->name);
 	}
 #endif /* UNIV_DEBUG */
 
+	MONITOR_INC(MONITOR_LOCKREC_WAIT);
+
 	return(DB_LOCK_WAIT);
 }
 
@@ -1879,11 +1992,17 @@ lock_rec_add_to_queue(
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
 	dict_index_t*		index,	/*!< in: index of record */
-	trx_t*			trx)	/*!< in: transaction */
+	trx_t*			trx,	/*!< in/out: transaction */
+	ibool			caller_owns_trx_mutex)
+					/*!< in: TRUE if caller owns the
+					transaction mutex */
 {
 	lock_t*	lock;
+	lock_t*	first_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
 #ifdef UNIV_DEBUG
 	switch (type_mode & LOCK_MODE_MASK) {
 	case LOCK_X:
@@ -1897,7 +2016,7 @@ lock_rec_add_to_queue(
 		enum lock_mode	mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
 			? LOCK_X
 			: LOCK_S;
-		lock_t*		other_lock
+		const lock_t*	other_lock
 			= lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
 						      block, heap_no, trx);
 		ut_a(!other_lock);
@@ -1922,16 +2041,15 @@ lock_rec_add_to_queue(
 
 	/* Look for a waiting lock request on the same record or on a gap */
 
-	lock = lock_rec_get_first_on_page(block);
+	for (first_lock = lock = lock_rec_get_first_on_page(block);
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock_get_wait(lock)
-		    && (lock_rec_get_nth_bit(lock, heap_no))) {
+		    && lock_rec_get_nth_bit(lock, heap_no)) {
 
 			goto somebody_waits;
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
 	if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) {
@@ -1941,8 +2059,7 @@ lock_rec_add_to_queue(
 		we can just set the bit */
 
 		lock = lock_rec_find_similar_on_page(
-			type_mode, heap_no,
-			lock_rec_get_first_on_page(block), trx);
+			type_mode, heap_no, first_lock, trx);
 
 		if (lock) {
 
@@ -1953,7 +2070,9 @@ lock_rec_add_to_queue(
 	}
 
 somebody_waits:
-	return(lock_rec_create(type_mode, block, heap_no, index, trx));
+	return(lock_rec_create(
+			type_mode, block, heap_no, index, trx,
+			caller_owns_trx_mutex));
 }
 
 /** Record locking request status */
@@ -1991,10 +2110,11 @@ lock_rec_lock_fast(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	lock_t*	lock;
-	trx_t*	trx;
+	lock_t*			lock;
+	trx_t*			trx;
+	enum lock_rec_req_status status = LOCK_REC_SUCCESS;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -2004,6 +2124,7 @@ lock_rec_lock_fast(
 	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
 	      || mode - (LOCK_MODE_MASK & mode) == 0
 	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
 
 	DBUG_EXECUTE_IF("innodb_report_deadlock", return(LOCK_REC_FAIL););
 
@@ -2013,35 +2134,35 @@ lock_rec_lock_fast(
 
 	if (lock == NULL) {
 		if (!impl) {
-			lock_rec_create(mode, block, heap_no, index, trx);
-		}
-
-		return(LOCK_REC_SUCCESS_CREATED);
-	}
-
-	if (lock_rec_get_next_on_page(lock)) {
-
-		return(LOCK_REC_FAIL);
-	}
+			/* Note that we don't own the trx mutex. */
+			lock = lock_rec_create(
+				mode, block, heap_no, index, trx, FALSE);
 
-	if (lock->trx != trx
-	    || lock->type_mode != (mode | LOCK_REC)
-	    || lock_rec_get_n_bits(lock) <= heap_no) {
-
-		return(LOCK_REC_FAIL);
-	}
-
-	if (!impl) {
-		/* If the nth bit of the record lock is already set then we
-		do not set a new lock bit, otherwise we do set */
-
-		if (!lock_rec_get_nth_bit(lock, heap_no)) {
-			lock_rec_set_nth_bit(lock, heap_no);
-			return(LOCK_REC_SUCCESS_CREATED);
 		}
+		status = LOCK_REC_SUCCESS_CREATED;
+	} else {
+		trx_mutex_enter(trx);
+
+		if (lock_rec_get_next_on_page(lock)
+		     || lock->trx != trx
+		     || lock->type_mode != (mode | LOCK_REC)
+		     || lock_rec_get_n_bits(lock) <= heap_no) {
+
+			status = LOCK_REC_FAIL;
+		} else if (!impl) {
+			/* If the nth bit of the record lock is already set
+			then we do not set a new lock bit, otherwise we do
+			set */
+			if (!lock_rec_get_nth_bit(lock, heap_no)) {
+				lock_rec_set_nth_bit(lock, heap_no);
+				status = LOCK_REC_SUCCESS_CREATED;
+			}
+		}
+
+		trx_mutex_exit(trx);
 	}
 
-	return(LOCK_REC_SUCCESS);
+	return(status);
 }
 
 /*********************************************************************//**
@@ -2052,7 +2173,7 @@ lock, or in the case of a page supremum record, a gap type lock.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 static
-enum db_err
+dberr_t
 lock_rec_lock_slow(
 /*===============*/
 	ibool			impl,	/*!< in: if TRUE, no lock is set
@@ -2068,10 +2189,11 @@ lock_rec_lock_slow(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	trx_t*	trx;
-	lock_t*	lock;
+	trx_t*			trx;
+	lock_t*			lock;
+	dberr_t			err = DB_SUCCESS;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -2081,11 +2203,13 @@ lock_rec_lock_slow(
 	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
 	      || mode - (LOCK_MODE_MASK & mode) == 0
 	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
-
-	trx = thr_get_trx(thr);
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
 
 	DBUG_EXECUTE_IF("innodb_report_deadlock", return(DB_DEADLOCK););
 
+	trx = thr_get_trx(thr);
+	trx_mutex_enter(trx);
+
 	lock = lock_rec_has_expl(mode, block, heap_no, trx);
 	if (lock) {
 		if (lock->type_mode & LOCK_CONV_BY_OTHER) {
@@ -2110,25 +2234,33 @@ lock_rec_lock_slow(
 		/* The trx already has a strong enough lock on rec: do
 		nothing */
 
-	} else if (lock_rec_other_has_conflicting(mode, block, heap_no, trx)) {
+	} else if (lock_rec_other_has_conflicting(
+			static_cast<enum lock_mode>(mode),
+			block, heap_no, trx)) {
 
-		/* If another transaction has a non-gap conflicting request in
-		the queue, as this transaction does not have a lock strong
-		enough already granted on the record, we have to wait. */
+		/* If another transaction has a non-gap conflicting
+		request in the queue, as this transaction does not
+		have a lock strong enough already granted on the
+		record, we have to wait. */
 
 		ut_ad(lock == NULL);
 enqueue_waiting:
-		return(lock_rec_enqueue_waiting(mode, block, heap_no,
-						lock, index, thr));
+		err = lock_rec_enqueue_waiting(
+			mode, block, heap_no, lock, index, thr);
+
 	} else if (!impl) {
-		/* Set the requested lock on the record */
+		/* Set the requested lock on the record, note that
+		we already own the transaction mutex. */
 
-		lock_rec_add_to_queue(LOCK_REC | mode, block,
-				      heap_no, index, trx);
-		return(DB_SUCCESS_LOCKED_REC);
+		lock_rec_add_to_queue(
+			LOCK_REC | mode, block, heap_no, index, trx, TRUE);
+
+		err = DB_SUCCESS_LOCKED_REC;
 	}
 
-	return(DB_SUCCESS);
+	trx_mutex_exit(trx);
+
+	return(err);
 }
 
 /*********************************************************************//**
@@ -2140,7 +2272,7 @@ of a page supremum record, a gap type lock.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 static
-enum db_err
+dberr_t
 lock_rec_lock(
 /*==========*/
 	ibool			impl,	/*!< in: if TRUE, no lock is set
@@ -2156,7 +2288,7 @@ lock_rec_lock(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -2166,6 +2298,7 @@ lock_rec_lock(
 	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
 	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
 	      || mode - (LOCK_MODE_MASK & mode) == 0);
+	ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
 
 	/* We try a simplified and faster subroutine for the most
 	common cases */
@@ -2185,19 +2318,21 @@ lock_rec_lock(
 
 /*********************************************************************//**
 Checks if a waiting record lock request still has to wait in a queue.
-@return	TRUE if still has to wait */
+@return	lock that is causing the wait */
 static
-ibool
+const lock_t*
 lock_rec_has_to_wait_in_queue(
 /*==========================*/
-	lock_t*	wait_lock)	/*!< in: waiting record lock */
+	const lock_t*	wait_lock)	/*!< in: waiting record lock */
 {
-	lock_t*	lock;
-	ulint	space;
-	ulint	page_no;
-	ulint	heap_no;
+	const lock_t*	lock;
+	ulint		space;
+	ulint		page_no;
+	ulint		heap_no;
+	ulint		bit_mask;
+	ulint		bit_offset;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_wait(wait_lock));
 	ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
 
@@ -2205,54 +2340,59 @@ lock_rec_has_to_wait_in_queue(
 	page_no = wait_lock->un_member.rec_lock.page_no;
 	heap_no = lock_rec_find_set_bit(wait_lock);
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	bit_offset = heap_no / 8;
+	bit_mask = 1 << (heap_no % 8);
+
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     lock != wait_lock;
+	     lock = lock_rec_get_next_on_page_const(lock)) {
 
-	while (lock != wait_lock) {
+		const byte*	p = (const byte*) &lock[1];
 
-		if (lock_rec_get_nth_bit(lock, heap_no)
+		if (heap_no < lock_rec_get_n_bits(lock)
+		    && (p[bit_offset] & bit_mask)
 		    && lock_has_to_wait(wait_lock, lock)) {
 
-			return(TRUE);
+			return(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
-	return(FALSE);
+	return(NULL);
 }
 
 /*************************************************************//**
-Grants a lock to a waiting lock request and releases the waiting
-transaction. */
+Grants a lock to a waiting lock request and releases the waiting transaction.
+The caller must hold lock_sys->mutex but not lock->trx->mutex. */
 static
 void
 lock_grant(
 /*=======*/
 	lock_t*	lock)	/*!< in/out: waiting lock request */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	lock_reset_lock_and_trx_wait(lock);
 
+	trx_mutex_enter(lock->trx);
+
 	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
-		trx_t*		trx = lock->trx;
 		dict_table_t*	table = lock->un_member.tab_lock.table;
 
-		if (table->autoinc_trx == trx) {
+		if (UNIV_UNLIKELY(table->autoinc_trx == lock->trx)) {
 			fprintf(stderr,
 				"InnoDB: Error: trx already had"
 				" an AUTO-INC lock!\n");
 		} else {
-			table->autoinc_trx = trx;
+			table->autoinc_trx = lock->trx;
 
-			ib_vector_push(trx->autoinc_locks, lock);
+			ib_vector_push(lock->trx->autoinc_locks, &lock);
 		}
 	}
 
 #ifdef UNIV_DEBUG
 	if (lock_print_waits) {
 		fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " ends\n",
-			(ullint) lock->trx->id);
+			lock->trx->id);
 	}
 #endif /* UNIV_DEBUG */
 
@@ -2262,9 +2402,17 @@ lock_grant(
 	for it */
 
 	if (!(lock->type_mode & LOCK_CONV_BY_OTHER)
-	    && lock->trx->que_state == TRX_QUE_LOCK_WAIT) {
-		trx_end_lock_wait(lock->trx);
+	    && lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+		que_thr_t*	thr;
+
+		thr = que_thr_end_lock_wait(lock->trx);
+
+		if (thr != NULL) {
+			lock_wait_release_thread_if_suspended(thr);
+		}
 	}
+
+	trx_mutex_exit(lock->trx);
 }
 
 /*************************************************************//**
@@ -2277,7 +2425,9 @@ lock_rec_cancel(
 /*============*/
 	lock_t*	lock)	/*!< in: waiting record lock request */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	que_thr_t*	thr;
+
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 	ut_ad(!(lock->type_mode & LOCK_CONV_BY_OTHER));
 
@@ -2290,7 +2440,15 @@ lock_rec_cancel(
 
 	/* The following function releases the trx from lock wait */
 
-	trx_end_lock_wait(lock->trx);
+	trx_mutex_enter(lock->trx);
+
+	thr = que_thr_end_lock_wait(lock->trx);
+
+	if (thr != NULL) {
+		lock_wait_release_thread_if_suspended(thr);
+	}
+
+	trx_mutex_exit(lock->trx);
 }
 
 /*************************************************************//**
@@ -2301,44 +2459,53 @@ static
 void
 lock_rec_dequeue_from_page(
 /*=======================*/
-	lock_t*	in_lock)/*!< in: record lock object: all record locks which
-			are contained in this lock object are removed;
-			transactions waiting behind will get their lock
-			requests granted, if they are now qualified to it */
+	lock_t*		in_lock)	/*!< in: record lock object: all
+					record locks which are contained in
+					this lock object are removed;
+					transactions waiting behind will
+					get their lock requests granted,
+					if they are now qualified to it */
 {
-	ulint	space;
-	ulint	page_no;
-	lock_t*	lock;
-	trx_t*	trx;
+	ulint		space;
+	ulint		page_no;
+	lock_t*		lock;
+	trx_lock_t*	trx_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+	/* We may or may not be holding in_lock->trx->mutex here. */
 
-	trx = in_lock->trx;
+	trx_lock = &in_lock->trx->lock;
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
+	in_lock->index->table->n_rec_locks--;
+
 	HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), in_lock);
 	lock_sys->rec_num--;
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+	UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_RECLOCK);
 
 	/* Check if waiting locks in the queue can now be granted: grant
-	locks if there are no conflicting locks ahead. */
+	locks if there are no conflicting locks ahead. Stop at the first
+	X lock that is waiting or has been granted. */
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock_get_wait(lock)
 		    && !lock_rec_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(lock->trx != in_lock->trx);
 			lock_grant(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 }
 
@@ -2348,26 +2515,32 @@ static
 void
 lock_rec_discard(
 /*=============*/
-	lock_t*	in_lock)/*!< in: record lock object: all record locks which
-			are contained in this lock object are removed */
+	lock_t*		in_lock)	/*!< in: record lock object: all
+					record locks which are contained
+					in this lock object are removed */
 {
-	ulint	space;
-	ulint	page_no;
-	trx_t*	trx;
+	ulint		space;
+	ulint		page_no;
+	trx_lock_t*	trx_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 
-	trx = in_lock->trx;
+	trx_lock = &in_lock->trx->lock;
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
+	in_lock->index->table->n_rec_locks--;
+
 	HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), in_lock);
 	lock_sys->rec_num--;
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+	UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_RECLOCK);
 }
 
 /*************************************************************//**
@@ -2385,7 +2558,7 @@ lock_rec_free_all_from_discard_page(
 	lock_t*	lock;
 	lock_t*	next_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	space = buf_block_get_space(block);
 	page_no = buf_block_get_page_no(block);
@@ -2419,11 +2592,12 @@ lock_rec_reset_and_release_wait(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
 
-	while (lock != NULL) {
 		if (lock_is_wait_not_by_other(lock->type_mode)) {
 			lock_rec_cancel(lock);
 		} else if (lock_get_wait(lock)) {
@@ -2433,8 +2607,6 @@ lock_rec_reset_and_release_wait(
 		} else {
 			lock_rec_reset_nth_bit(lock, heap_no);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 }
 
@@ -2460,9 +2632,7 @@ lock_rec_inherit_to_gap(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(block, heap_no);
+	ut_ad(lock_mutex_own());
 
 	/* If srv_locks_unsafe_for_binlog is TRUE or session is using
 	READ COMMITTED isolation level, we do not want locks set
@@ -2470,20 +2640,21 @@ lock_rec_inherit_to_gap(
 	DO want S-locks set by a consistency constraint to be inherited also
 	then. */
 
-	while (lock != NULL) {
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
 		if (!lock_rec_get_insert_intention(lock)
 		    && !((srv_locks_unsafe_for_binlog
 			  || lock->trx->isolation_level
 			  <= TRX_ISO_READ_COMMITTED)
 			 && lock_get_mode(lock) == LOCK_X)) {
 
-			lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
-					      | lock_get_mode(lock),
-					      heir_block, heir_heap_no,
-					      lock->index, lock->trx);
+			lock_rec_add_to_queue(
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+				heir_block, heir_heap_no, lock->index,
+				lock->trx, FALSE);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 }
 
@@ -2505,23 +2676,24 @@ lock_rec_inherit_to_gap_if_gap_lock(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	lock_mutex_enter();
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
 
-	while (lock != NULL) {
 		if (!lock_rec_get_insert_intention(lock)
 		    && (heap_no == PAGE_HEAP_NO_SUPREMUM
 			|| !lock_rec_get_rec_not_gap(lock))) {
 
-			lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
-					      | lock_get_mode(lock),
-					      block, heir_heap_no,
-					      lock->index, lock->trx);
+			lock_rec_add_to_queue(
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+				block, heir_heap_no, lock->index,
+				lock->trx, FALSE);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
+
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2544,13 +2716,14 @@ lock_rec_move(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(donator, donator_heap_no);
+	ut_ad(lock_mutex_own());
 
 	ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL);
 
-	while (lock != NULL) {
+	for (lock = lock_rec_get_first(donator, donator_heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(donator_heap_no, lock)) {
+
 		const ulint	type_mode = lock->type_mode;
 
 		lock_rec_reset_nth_bit(lock, donator_heap_no);
@@ -2562,9 +2735,9 @@ lock_rec_move(
 		/* Note that we FIRST reset the bit, and then set the lock:
 		the function works also if donator == receiver */
 
-		lock_rec_add_to_queue(type_mode, receiver, receiver_heap_no,
-				      lock->index, lock->trx);
-		lock = lock_rec_get_next(donator_heap_no, lock);
+		lock_rec_add_to_queue(
+			type_mode, receiver, receiver_heap_no,
+			lock->index, lock->trx, FALSE);
 	}
 
 	ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL);
@@ -2589,12 +2762,12 @@ lock_move_reorganize_page(
 	mem_heap_t*	heap		= NULL;
 	ulint		comp;
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock = lock_rec_get_first_on_page(block);
 
 	if (lock == NULL) {
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		return;
 	}
@@ -2617,6 +2790,7 @@ lock_move_reorganize_page(
 		lock_rec_bitmap_reset(lock);
 
 		if (lock_get_wait(lock)) {
+
 			lock_reset_lock_and_trx_wait(lock);
 		}
 
@@ -2669,9 +2843,9 @@ lock_move_reorganize_page(
 				/* NOTE that the old lock bitmap could be too
 				small for the new heap number! */
 
-				lock_rec_add_to_queue(lock->type_mode, block,
-						      new_heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					lock->type_mode, block, new_heap_no,
+					lock->index, lock->trx, FALSE);
 
 				/* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM
 				&& lock_get_wait(lock)) {
@@ -2708,7 +2882,7 @@ lock_move_reorganize_page(
 #endif /* UNIV_DEBUG */
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	mem_heap_free(heap);
 
@@ -2732,7 +2906,7 @@ lock_move_rec_list_end(
 	lock_t*		lock;
 	const ulint	comp	= page_rec_is_comp(rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Note: when we move locks from record to record, waiting locks
 	and possible granted gap type locks behind them are enqueued in
@@ -2788,9 +2962,9 @@ lock_move_rec_list_end(
 						page_cur_get_rec(&cur2));
 				}
 
-				lock_rec_add_to_queue(type_mode,
-						      new_block, heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					type_mode, new_block, heap_no,
+					lock->index, lock->trx, FALSE);
 			}
 
 			page_cur_move_to_next(&cur1);
@@ -2798,7 +2972,7 @@ lock_move_rec_list_end(
 		}
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG_LOCK_VALIDATE
 	ut_ad(lock_rec_validate_page(block));
@@ -2813,7 +2987,8 @@ UNIV_INTERN
 void
 lock_move_rec_list_start(
 /*=====================*/
-	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	new_block,	/*!< in: index page to
+						move to */
 	const buf_block_t*	block,		/*!< in: index page */
 	const rec_t*		rec,		/*!< in: record on page:
 						this is the first
@@ -2830,7 +3005,7 @@ lock_move_rec_list_start(
 	ut_ad(block->frame == page_align(rec));
 	ut_ad(new_block->frame == page_align(old_end));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	for (lock = lock_rec_get_first_on_page(block); lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
@@ -2878,9 +3053,9 @@ lock_move_rec_list_start(
 						page_cur_get_rec(&cur2));
 				}
 
-				lock_rec_add_to_queue(type_mode,
-						      new_block, heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					type_mode, new_block, heap_no,
+					lock->index, lock->trx, FALSE);
 			}
 
 			page_cur_move_to_next(&cur1);
@@ -2907,7 +3082,7 @@ lock_move_rec_list_start(
 #endif /* UNIV_DEBUG */
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG_LOCK_VALIDATE
 	ut_ad(lock_rec_validate_page(block));
@@ -2925,7 +3100,7 @@ lock_update_split_right(
 {
 	ulint	heap_no = lock_get_min_heap_no(right_block);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the left page to the supremum
 	of the right page */
@@ -2939,7 +3114,7 @@ lock_update_split_right(
 	lock_rec_inherit_to_gap(left_block, right_block,
 				PAGE_HEAP_NO_SUPREMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2958,7 +3133,7 @@ lock_update_merge_right(
 						page which will be
 						discarded */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Inherit the locks from the supremum of the left page to the
 	original successor of infimum on the right page, to which the left
@@ -2976,7 +3151,7 @@ lock_update_merge_right(
 
 	lock_rec_free_all_from_discard_page(left_block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2993,14 +3168,14 @@ lock_update_root_raise(
 	const buf_block_t*	block,	/*!< in: index page to which copied */
 	const buf_block_t*	root)	/*!< in: root page */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the root to the supremum
 	of block */
 
 	lock_rec_move(block, root,
 		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3015,7 +3190,7 @@ lock_update_copy_and_discard(
 	const buf_block_t*	block)		/*!< in: index page;
 						NOT the root! */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the old page to the supremum
 	of new_page */
@@ -3024,7 +3199,7 @@ lock_update_copy_and_discard(
 		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
 	lock_rec_free_all_from_discard_page(block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3038,7 +3213,7 @@ lock_update_split_left(
 {
 	ulint	heap_no = lock_get_min_heap_no(right_block);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Inherit the locks to the supremum of the left page from the
 	successor of the infimum on the right page */
@@ -3046,7 +3221,7 @@ lock_update_split_left(
 	lock_rec_inherit_to_gap(left_block, right_block,
 				PAGE_HEAP_NO_SUPREMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3067,7 +3242,7 @@ lock_update_merge_left(
 
 	ut_ad(left_block->frame == page_align(orig_pred));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	left_next_rec = page_rec_get_next_const(orig_pred);
 
@@ -3095,7 +3270,7 @@ lock_update_merge_left(
 
 	lock_rec_free_all_from_discard_page(right_block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3116,13 +3291,13 @@ lock_rec_reset_and_inherit_gap_locks(
 	ulint			heap_no)	/*!< in: heap_no of the
 						donating record */
 {
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
 
 	lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
 
 	lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3142,12 +3317,12 @@ lock_update_discard(
 	const rec_t*	rec;
 	ulint		heap_no;
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	if (!lock_rec_get_first_on_page(block)) {
 		/* No locks exist on page, nothing to do */
 
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		return;
 	}
@@ -3185,7 +3360,7 @@ lock_update_discard(
 
 	lock_rec_free_all_from_discard_page(block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3215,10 +3390,8 @@ lock_update_insert(
 			page_rec_get_next_low(rec, FALSE));
 	}
 
-	lock_mutex_enter_kernel();
-	lock_rec_inherit_to_gap_if_gap_lock(block,
-					    receiver_heap_no, donator_heap_no);
-	lock_mutex_exit_kernel();
+	lock_rec_inherit_to_gap_if_gap_lock(
+		block, receiver_heap_no, donator_heap_no);
 }
 
 /*************************************************************//**
@@ -3248,7 +3421,7 @@ lock_update_delete(
 								       FALSE));
 	}
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Let the next record inherit the locks from rec, in gap mode */
 
@@ -3258,7 +3431,7 @@ lock_update_delete(
 
 	lock_rec_reset_and_release_wait(block, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*********************************************************************//**
@@ -3283,11 +3456,11 @@ lock_rec_store_on_page_infimum(
 
 	ut_ad(block->frame == page_align(rec));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*********************************************************************//**
@@ -3308,11 +3481,11 @@ lock_rec_restore_from_page_infimum(
 {
 	ulint	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*=========== DEADLOCK CHECKING ======================================*/
@@ -3326,10 +3499,14 @@ void
 lock_deadlock_start_print()
 /*=======================*/
 {
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
+
 	rewind(lock_latest_err_file);
 	ut_print_timestamp(lock_latest_err_file);
 
 	if (srv_print_all_deadlocks) {
+		ut_print_timestamp(stderr);
 		fprintf(stderr, "InnoDB: transactions deadlock detected, "
 			"dumping detailed information.\n");
 		ut_print_timestamp(stderr);
@@ -3344,10 +3521,12 @@ lock_deadlock_fputs(
 /*================*/
 	const char*	msg)	/*!< in: message to print */
 {
-	fputs(msg, lock_latest_err_file);
+	if (!srv_read_only_mode) {
+		fputs(msg, lock_latest_err_file);
 
-	if (srv_print_all_deadlocks) {
-		fputs(msg, stderr);
+		if (srv_print_all_deadlocks) {
+			fputs(msg, stderr);
+		}
 	}
 }
 
@@ -3357,15 +3536,28 @@ UNIV_INLINE
 void
 lock_deadlock_trx_print(
 /*====================*/
-	trx_t*	trx,		/*!< in: transaction */
-	ulint	max_query_len)	/*!< in: max query length to print, or 0 to
-				use the default max length */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
 {
-	trx_print(lock_latest_err_file, trx, max_query_len);
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
+
+	ulint	n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+	ulint	n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	ulint	heap_size = mem_heap_get_size(trx->lock.lock_heap);
+
+	mutex_enter(&trx_sys->mutex);
+
+	trx_print_low(lock_latest_err_file, trx, max_query_len,
+		      n_rec_locks, n_trx_locks, heap_size);
 
 	if (srv_print_all_deadlocks) {
-		trx_print(stderr, trx, max_query_len);
+		trx_print_low(stderr, trx, max_query_len,
+			      n_rec_locks, n_trx_locks, heap_size);
 	}
+
+	mutex_exit(&trx_sys->mutex);
 }
 
 /*********************************************************************//**
@@ -3376,6 +3568,9 @@ lock_deadlock_lock_print(
 /*=====================*/
 	const lock_t*	lock)	/*!< in: record or table type lock */
 {
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
+
 	if (lock_get_type_low(lock) == LOCK_REC) {
 		lock_rec_print(lock_latest_err_file, lock);
 
@@ -3391,297 +3586,462 @@ lock_deadlock_lock_print(
 	}
 }
 
+/** Used in deadlock tracking. Protected by lock_sys->mutex. */
+static ib_uint64_t	lock_mark_counter = 0;
+
+/** Check if the search is too deep. */
+#define lock_deadlock_too_deep(c)				\
+	(c->depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK		\
+	 || c->cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK)
+
 /********************************************************************//**
-Checks if a lock request results in a deadlock.
-@return TRUE if a deadlock was detected and we chose trx as a victim;
-FALSE if no deadlock, or there was a deadlock, but we chose other
-transaction(s) as victim(s) */
+Get the next lock in the queue that is owned by a transaction whose
+sub-tree has not already been searched.
+@return next lock or NULL if at end of queue */
 static
-ibool
-lock_deadlock_occurs(
-/*=================*/
-	lock_t*	lock,	/*!< in: lock the transaction is requesting */
-	trx_t*	trx)	/*!< in: transaction */
+const lock_t*
+lock_get_next_lock(
+/*===============*/
+	const lock_deadlock_ctx_t*
+				ctx,	/*!< in: deadlock context */
+	const lock_t*		lock,	/*!< in: lock in the queue */
+	ulint			heap_no)/*!< in: heap no if rec lock else
+					ULINT_UNDEFINED */
 {
-	trx_t*		mark_trx;
-	ulint		ret;
-	ulint		cost	= 0;
+	ut_ad(lock_mutex_own());
 
-	ut_ad(trx);
-	ut_ad(lock);
-	ut_ad(mutex_own(&kernel_mutex));
-retry:
-	/* We check that adding this trx to the waits-for graph
-	does not produce a cycle. First mark all active transactions
-	with 0: */
-
-	mark_trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	do {
+		if (lock_get_type_low(lock) == LOCK_REC) {
+			ut_ad(heap_no != ULINT_UNDEFINED);
+			lock = lock_rec_get_next_const(heap_no, lock);
+		} else {
+			ut_ad(heap_no == ULINT_UNDEFINED);
+			ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
 
-	while (mark_trx) {
-		mark_trx->deadlock_mark = 0;
-		mark_trx = UT_LIST_GET_NEXT(trx_list, mark_trx);
-	}
+			lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+		}
+	} while (lock != NULL
+		 && lock->trx->lock.deadlock_mark > ctx->mark_start);
 
-	ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
+	ut_ad(lock == NULL
+	      || lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
 
-	switch (ret) {
-	case LOCK_VICTIM_IS_OTHER:
-		/* We chose some other trx as a victim: retry if there still
-		is a deadlock */
-		goto retry;
+	return(lock);
+}
 
-	case LOCK_EXCEED_MAX_DEPTH:
-		/* If the lock search exceeds the max step
-		or the max depth, the current trx will be
-		the victim. Print its information. */
-		lock_deadlock_start_print();
+/********************************************************************//**
+Get the first lock to search. The search starts from the current
+wait_lock. What we are really interested in is an edge from the
+current wait_lock's owning transaction to another transaction that has
+a lock ahead in the queue. We skip locks where the owning transaction's
+sub-tree has already been searched.
+@return first lock or NULL */
+static
+const lock_t*
+lock_get_first_lock(
+/*================*/
+	const lock_deadlock_ctx_t*
+				ctx,	/*!< in: deadlock context */
+	ulint*			heap_no)/*!< out: heap no if rec lock,
+					else ULINT_UNDEFINED */
+{
+	const lock_t*		lock;
 
-		lock_deadlock_fputs(
-			"TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
-			" WAITS-FOR GRAPH, WE WILL ROLL BACK"
-			" FOLLOWING TRANSACTION \n\n"
-			"*** TRANSACTION:\n");
+	ut_ad(lock_mutex_own());
 
-		lock_deadlock_trx_print(trx, 3000);
+	lock = ctx->wait_lock;
 
-		lock_deadlock_fputs(
-			"*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+	if (lock_get_type_low(lock) == LOCK_REC) {
 
-		lock_deadlock_lock_print(lock);
+		*heap_no = lock_rec_find_set_bit(lock);
+		ut_ad(*heap_no != ULINT_UNDEFINED);
 
-		break;
+		lock = lock_rec_get_first_on_page_addr(
+			lock->un_member.rec_lock.space,
+			lock->un_member.rec_lock.page_no);
 
-	case LOCK_VICTIM_IS_START:
-		srv_n_lock_deadlock_count++;
-		lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (2)\n");
-		break;
+		/* Position on the first lock on the physical record. */
+		if (!lock_rec_get_nth_bit(lock, *heap_no)) {
+			lock = lock_rec_get_next_const(*heap_no, lock);
+		}
 
-	default:
-		/* No deadlock detected*/
-		return(FALSE);
+	} else {
+		*heap_no = ULINT_UNDEFINED;
+		ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
-	lock_deadlock_found = TRUE;
+	ut_a(lock != NULL);
+	ut_a(lock != ctx->wait_lock);
+	ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
 
-	return(TRUE);
+	return(lock);
 }
 
 /********************************************************************//**
-Looks recursively for a deadlock.
-@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
-deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
-deadlock was found and we chose some other trx as a victim: we must do
-the search again in this last case because there may be another
-deadlock!
-LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+Notify that a deadlock has been detected and print the conflicting
+transaction info. */
 static
-ulint
-lock_deadlock_recursive(
-/*====================*/
-	trx_t*	start,		/*!< in: recursion starting point */
-	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
-	lock_t*	wait_lock,	/*!< in: lock that is waiting to be granted */
-	ulint*	cost,		/*!< in/out: number of calculation steps thus
-				far: if this exceeds LOCK_MAX_N_STEPS_...
-				we return LOCK_EXCEED_MAX_DEPTH */
-	ulint	depth)		/*!< in: recursion depth: if this exceeds
-				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
-				return LOCK_EXCEED_MAX_DEPTH */
-{
-	ulint	ret;
-	lock_t*	lock;
-	trx_t*	lock_trx;
-	ulint	heap_no		= ULINT_UNDEFINED;
+void
+lock_deadlock_notify(
+/*=================*/
+	const lock_deadlock_ctx_t*	ctx,	/*!< in: deadlock context */
+	const lock_t*			lock)	/*!< in: lock causing
+						deadlock */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
+
+	lock_deadlock_start_print();
+
+	lock_deadlock_fputs("\n*** (1) TRANSACTION:\n");
+
+	lock_deadlock_trx_print(ctx->wait_lock->trx, 3000);
+
+	lock_deadlock_fputs("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+	lock_deadlock_lock_print(ctx->wait_lock);
+
+	lock_deadlock_fputs("*** (2) TRANSACTION:\n");
 
-	ut_a(trx);
-	ut_a(start);
-	ut_a(wait_lock);
-	ut_ad(mutex_own(&kernel_mutex));
+	lock_deadlock_trx_print(lock->trx, 3000);
 
-	if (trx->deadlock_mark == 1) {
-		/* We have already exhaustively searched the subtree starting
-		from this trx */
+	lock_deadlock_fputs("*** (2) HOLDS THE LOCK(S):\n");
 
-		return(0);
+	lock_deadlock_lock_print(lock);
+
+	/* It is possible that the joining transaction was granted its
+	lock when we rolled back some other waiting transaction. */
+
+	if (ctx->start->lock.wait_lock != 0) {
+		lock_deadlock_fputs(
+			"*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+		lock_deadlock_lock_print(ctx->start->lock.wait_lock);
+	}
+
+#ifdef UNIV_DEBUG
+	if (lock_print_waits) {
+		fputs("Deadlock detected\n", stderr);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Select the victim transaction that should be rolled back.
+@return victim transaction */
+static
+const trx_t*
+lock_deadlock_select_victim(
+/*========================*/
+	const lock_deadlock_ctx_t*	ctx)	/*!< in: deadlock context */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(ctx->start->lock.wait_lock != 0);
+	ut_ad(ctx->wait_lock->trx != ctx->start);
+
+	if (trx_weight_ge(ctx->wait_lock->trx, ctx->start)) {
+		/* The joining  transaction is 'smaller',
+		choose it as the victim and roll it back. */
+
+		return(ctx->start);
 	}
 
-	*cost = *cost + 1;
+	return(ctx->wait_lock->trx);
+}
 
-	if (lock_get_type_low(wait_lock) == LOCK_REC) {
-		ulint		space;
-		ulint		page_no;
+/********************************************************************//**
+Pop the deadlock search state from the stack.
+@return stack slot instance that was on top of the stack. */
+static
+const lock_stack_t*
+lock_deadlock_pop(
+/*==============*/
+	lock_deadlock_ctx_t*	ctx)		/*!< in/out: context */
+{
+	ut_ad(lock_mutex_own());
 
-		heap_no = lock_rec_find_set_bit(wait_lock);
-		ut_a(heap_no != ULINT_UNDEFINED);
+	ut_ad(ctx->depth > 0);
 
-		space = wait_lock->un_member.rec_lock.space;
-		page_no = wait_lock->un_member.rec_lock.page_no;
+	return(&lock_stack[--ctx->depth]);
+}
 
-		lock = lock_rec_get_first_on_page_addr(space, page_no);
+/********************************************************************//**
+Push the deadlock search state onto the stack.
+@return slot that was used in the stack */
+static
+lock_stack_t*
+lock_deadlock_push(
+/*===============*/
+	lock_deadlock_ctx_t*	ctx,		/*!< in/out: context */
+	const lock_t*		lock,		/*!< in: current lock */
+	ulint			heap_no)	/*!< in: heap number */
+{
+	ut_ad(lock_mutex_own());
 
-		/* Position the iterator on the first matching record lock. */
-		while (lock != NULL
-		       && lock != wait_lock
-		       && !lock_rec_get_nth_bit(lock, heap_no)) {
+	/* Save current search state. */
 
-			lock = lock_rec_get_next_on_page(lock);
-		}
+	if (LOCK_STACK_SIZE > ctx->depth) {
+		lock_stack_t*	stack;
 
-		if (lock == wait_lock) {
-			lock = NULL;
-		}
+		stack = &lock_stack[ctx->depth++];
 
-		ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
+		stack->lock = lock;
+		stack->heap_no = heap_no;
+		stack->wait_lock = ctx->wait_lock;
 
-	} else {
-		lock = wait_lock;
+		return(stack);
 	}
 
-	/* Look at the locks ahead of wait_lock in the lock queue */
+	return(NULL);
+}
+
+/********************************************************************//**
+Looks iteratively for a deadlock. Note: the joining transaction may
+have been granted its lock by the deadlock checks.
+@return 0 if no deadlock, else the victim transaction id. */
+static
+trx_id_t
+lock_deadlock_search(
+/*=================*/
+	lock_deadlock_ctx_t*	ctx)	/*!< in/out: deadlock context */
+{
+	const lock_t*	lock;
+	ulint		heap_no;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(!trx_mutex_own(ctx->start));
+
+	ut_ad(ctx->start != NULL);
+	ut_ad(ctx->wait_lock != NULL);
+	assert_trx_in_list(ctx->wait_lock->trx);
+	ut_ad(ctx->mark_start <= lock_mark_counter);
+
+	/* Look at the locks ahead of wait_lock in the lock queue. */
+	lock = lock_get_first_lock(ctx, &heap_no);
 
 	for (;;) {
-		/* Get previous table lock. */
-		if (heap_no == ULINT_UNDEFINED) {
 
-			lock = UT_LIST_GET_PREV(
-				un_member.tab_lock.locks, lock);
+		/* We should never visit the same sub-tree more than once. */
+		ut_ad(lock == NULL
+		      || lock->trx->lock.deadlock_mark <= ctx->mark_start);
+
+		while (ctx->depth > 0 && lock == NULL) {
+			const lock_stack_t*	stack;
+
+			/* Restore previous search state. */
+
+			stack = lock_deadlock_pop(ctx);
+
+			lock = stack->lock;
+			heap_no = stack->heap_no;
+			ctx->wait_lock = stack->wait_lock;
+
+			lock = lock_get_next_lock(ctx, lock, heap_no);
 		}
 
 		if (lock == NULL) {
+			break;
+		} else if (lock == ctx->wait_lock) {
+
 			/* We can mark this subtree as searched */
-			trx->deadlock_mark = 1;
+			ut_ad(lock->trx->lock.deadlock_mark <= ctx->mark_start);
 
-			return(FALSE);
-		}
+			lock->trx->lock.deadlock_mark = ++lock_mark_counter;
 
-		if (lock_has_to_wait(wait_lock, lock)) {
+			/* We are not prepared for an overflow. This 64-bit
+			counter should never wrap around. At 10^9 increments
+			per second, it would take 10^3 years of uptime. */
 
-			ibool	too_far
-				= depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
-				|| *cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK;
+			ut_ad(lock_mark_counter > 0);
 
-			lock_trx = lock->trx;
+			lock = NULL;
 
-			if (lock_trx == start) {
+		} else if (!lock_has_to_wait(ctx->wait_lock, lock)) {
 
-				/* We came back to the recursion starting
-				point: a deadlock detected; or we have
-				searched the waits-for graph too long */
+			/* No conflict, next lock */
+			lock = lock_get_next_lock(ctx, lock, heap_no);
 
-				lock_deadlock_start_print();
+		} else if (lock->trx == ctx->start) {
 
-				lock_deadlock_fputs("\n*** (1) TRANSACTION:\n");
+			/* Found a cycle. */
 
-				lock_deadlock_trx_print(wait_lock->trx, 3000);
+			lock_deadlock_notify(ctx, lock);
 
-				lock_deadlock_fputs(
-					"*** (1) WAITING FOR THIS LOCK"
-					" TO BE GRANTED:\n");
+			return(lock_deadlock_select_victim(ctx)->id);
 
-				lock_deadlock_lock_print(wait_lock);
+		} else if (lock_deadlock_too_deep(ctx)) {
 
-				lock_deadlock_fputs("*** (2) TRANSACTION:\n");
+			/* Search too deep to continue. */
 
-				lock_deadlock_trx_print(lock->trx, 3000);
+			ctx->too_deep = TRUE;
 
-				lock_deadlock_fputs(
-					"*** (2) HOLDS THE LOCK(S):\n");
+			/* Select the joining transaction as the victim. */
+			return(ctx->start->id);
 
-				lock_deadlock_lock_print(lock);
+		} else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
-				lock_deadlock_fputs(
-					"*** (2) WAITING FOR THIS LOCK"
-					" TO BE GRANTED:\n");
+			/* Another trx ahead has requested a lock in an
+			incompatible mode, and is itself waiting for a lock. */
 
-				lock_deadlock_lock_print(start->wait_lock);
+			++ctx->cost;
 
-#ifdef UNIV_DEBUG
-				if (lock_print_waits) {
-					fputs("Deadlock detected\n",
-					      stderr);
-				}
-#endif /* UNIV_DEBUG */
+			/* Save current search state. */
+			if (!lock_deadlock_push(ctx, lock, heap_no)) {
 
-				if (trx_weight_ge(wait_lock->trx, start)) {
-					/* Our recursion starting point
-					transaction is 'smaller', let us
-					choose 'start' as the victim and roll
-					back it */
+				/* Unable to save current search state, stack
+				size not big enough. */
 
-					return(LOCK_VICTIM_IS_START);
-				}
+				ctx->too_deep = TRUE;
 
-				lock_deadlock_found = TRUE;
+				return(ctx->start->id);
+			}
 
-				/* Let us choose the transaction of wait_lock
-				as a victim to try to avoid deadlocking our
-				recursion starting point transaction */
+			ctx->wait_lock = lock->trx->lock.wait_lock;
+			lock = lock_get_first_lock(ctx, &heap_no);
 
-				lock_deadlock_fputs(
-					"*** WE ROLL BACK TRANSACTION (1)\n");
+			if (lock->trx->lock.deadlock_mark > ctx->mark_start) {
+				lock = lock_get_next_lock(ctx, lock, heap_no);
+			}
 
-				wait_lock->trx->was_chosen_as_deadlock_victim
-					= TRUE;
+		} else {
+			lock = lock_get_next_lock(ctx, lock, heap_no);
+		}
+	}
 
-				lock_cancel_waiting_and_release(wait_lock);
+	ut_a(lock == NULL && ctx->depth == 0);
+ 
+	/* No deadlock found. */
+	return(0);
+}
 
-				/* Since trx and wait_lock are no longer
-				in the waits-for graph, we can return FALSE;
-				note that our selective algorithm can choose
-				several transactions as victims, but still
-				we may end up rolling back also the recursion
-				starting point transaction! */
+/********************************************************************//**
+Print info about transaction that was rolled back. */
+static
+void
+lock_deadlock_joining_trx_print(
+/*============================*/
+	const trx_t*	trx,		/*!< in: transaction rolled back */
+	const lock_t*	lock)		/*!< in: lock trx wants */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
 
-				return(LOCK_VICTIM_IS_OTHER);
-			}
+	/* If the lock search exceeds the max step
+	or the max depth, the current trx will be
+	the victim. Print its information. */
+	lock_deadlock_start_print();
 
-			if (too_far) {
+	lock_deadlock_fputs(
+		"TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+		" WAITS-FOR GRAPH, WE WILL ROLL BACK"
+		" FOLLOWING TRANSACTION \n\n"
+		"*** TRANSACTION:\n");
 
-#ifdef UNIV_DEBUG
-				if (lock_print_waits) {
-					fputs("Deadlock search exceeds"
-					      " max steps or depth.\n",
-					      stderr);
-				}
-#endif /* UNIV_DEBUG */
-				/* The information about transaction/lock
-				to be rolled back is available in the top
-				level. Do not print anything here. */
-				return(LOCK_EXCEED_MAX_DEPTH);
-			}
+	lock_deadlock_trx_print(trx, 3000);
 
-			if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
+	lock_deadlock_fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
 
-				/* Another trx ahead has requested lock	in an
-				incompatible mode, and is itself waiting for
-				a lock */
+	lock_deadlock_lock_print(lock);
+}
 
-				ret = lock_deadlock_recursive(
-					start, lock_trx,
-					lock_trx->wait_lock, cost, depth + 1);
+/********************************************************************//**
+Rollback transaction selected as the victim. */
+static
+void
+lock_deadlock_trx_rollback(
+/*=======================*/
+	lock_deadlock_ctx_t*	ctx)		/*!< in: deadlock context */
+{
+	trx_t*			trx;
 
-				if (ret != 0) {
+	ut_ad(lock_mutex_own());
 
-					return(ret);
-				}
-			}
-		}
-		/* Get the next record lock to check. */
-		if (heap_no != ULINT_UNDEFINED) {
+	trx = ctx->wait_lock->trx;
+
+	lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (1)\n");
+
+	trx_mutex_enter(trx);
+
+	trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+	lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+	trx_mutex_exit(trx);
+}
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found this function will resolve the deadlock by choosing a victim transaction
+and rolling it back. It will attempt to resolve all deadlocks. The returned
+transaction id will be the joining transaction id or 0 if some other
+transaction was chosen as a victim and rolled back or no deadlock found.
 
-			ut_a(lock != NULL);
+@return id of transaction chosen as victim or 0 */
+static
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*============================*/
+	const lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	trx_id_t	victim_trx_id;
+
+	ut_ad(trx != NULL);
+	ut_ad(lock != NULL);
+	ut_ad(lock_mutex_own());
+	assert_trx_in_list(trx);
+
+	/* Try and resolve as many deadlocks as possible. */
+	do {
+		lock_deadlock_ctx_t	ctx;
 
-			do {
-				lock = lock_rec_get_next_on_page(lock);
-			} while (lock != NULL
-				&& lock != wait_lock
-				&& !lock_rec_get_nth_bit(lock, heap_no));
+		/* Reset the context. */
+		ctx.cost = 0;
+		ctx.depth = 0;
+		ctx.start = trx;
+		ctx.too_deep = FALSE;
+		ctx.wait_lock = lock;
+		ctx.mark_start = lock_mark_counter;
 
-			if (lock == wait_lock) {
-				lock = NULL;
+		victim_trx_id = lock_deadlock_search(&ctx);
+
+		/* Search too deep, we rollback the joining transaction. */
+		if (ctx.too_deep) {
+
+			ut_a(trx == ctx.start);
+			ut_a(victim_trx_id == trx->id);
+
+			if (!srv_read_only_mode) {
+				lock_deadlock_joining_trx_print(trx, lock);
 			}
+
+			MONITOR_INC(MONITOR_DEADLOCK);
+
+		} else if (victim_trx_id != 0 && victim_trx_id != trx->id) {
+
+			ut_ad(victim_trx_id == ctx.wait_lock->trx->id);
+			lock_deadlock_trx_rollback(&ctx);
+
+			lock_deadlock_found = TRUE;
+
+			MONITOR_INC(MONITOR_DEADLOCK);
 		}
-	}/* end of the 'for (;;)'-loop */
+
+	} while (victim_trx_id != 0 && victim_trx_id != trx->id);
+
+	/* If the joining transaction was selected as the victim. */
+	if (victim_trx_id != 0) {
+		ut_a(victim_trx_id == trx->id);
+
+		srv_stats.lock_deadlock_count.inc();
+
+		lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (2)\n");
+
+		lock_deadlock_found = TRUE;
+	}
+
+	return(victim_trx_id);
 }
 
 /*========================= TABLE LOCKS ==============================*/
@@ -3694,7 +4054,8 @@ UNIV_INLINE
 lock_t*
 lock_table_create(
 /*==============*/
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	ulint		type_mode,/*!< in: lock mode possibly ORed with
 				LOCK_WAIT */
 	trx_t*		trx)	/*!< in: trx */
@@ -3702,9 +4063,14 @@ lock_table_create(
 	lock_t*	lock;
 
 	ut_ad(table && trx);
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
 	ut_ad(!(type_mode & LOCK_CONV_BY_OTHER));
 
+	/* Non-locking autocommit read-only transactions should not set
+	any locks. */
+	assert_trx_in_list(trx);
+
 	if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
 		++table->n_waiting_or_granted_auto_inc_locks;
 	}
@@ -3718,18 +4084,20 @@ lock_table_create(
 
 		table->autoinc_trx = trx;
 
-		ib_vector_push(trx->autoinc_locks, lock);
+		ib_vector_push(trx->autoinc_locks, &lock);
 	} else {
-		lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t));
+		lock = static_cast<lock_t*>(
+			mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock)));
 	}
 
-	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
-
 	lock->type_mode = type_mode | LOCK_TABLE;
 	lock->trx = trx;
 
 	lock->un_member.tab_lock.table = table;
 
+	ut_ad(table->n_ref_count > 0 || !table->can_be_evicted);
+
+	UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
 	UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
 
 	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
@@ -3737,6 +4105,11 @@ lock_table_create(
 		lock_set_lock_and_trx_wait(lock, trx);
 	}
 
+	ib_vector_push(lock->trx->lock.table_locks, &lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
 	return(lock);
 }
 
@@ -3750,7 +4123,7 @@ lock_table_pop_autoinc_locks(
 /*=========================*/
 	trx_t*	trx)	/*!< in/out: transaction that owns the AUTOINC locks */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
 
 	/* Skip any gaps, gaps are NULL lock entries in the
@@ -3763,7 +4136,7 @@ lock_table_pop_autoinc_locks(
 			return;
 		}
 
-	} while (ib_vector_get_last(trx->autoinc_locks) == NULL);
+	} while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
 }
 
 /*************************************************************//**
@@ -3778,7 +4151,7 @@ lock_table_remove_autoinc_lock(
 	lock_t*	autoinc_lock;
 	lint	i = ib_vector_size(trx->autoinc_locks) - 1;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC);
 	ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
 	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
@@ -3788,7 +4161,8 @@ lock_table_remove_autoinc_lock(
 	to be handled by deleting only those AUTOINC locks that were
 	held by the table being dropped. */
 
-	autoinc_lock = ib_vector_get(trx->autoinc_locks, i);
+	autoinc_lock = *static_cast<lock_t**>(
+		ib_vector_get(trx->autoinc_locks, i));
 
 	/* This is the default fast case. */
 
@@ -3801,10 +4175,12 @@ lock_table_remove_autoinc_lock(
 		/* Handle freeing the locks from within the stack. */
 
 		while (--i >= 0) {
-			autoinc_lock = ib_vector_get(trx->autoinc_locks, i);
+			autoinc_lock = *static_cast<lock_t**>(
+				ib_vector_get(trx->autoinc_locks, i));
 
 			if (UNIV_LIKELY(autoinc_lock == lock)) {
-				ib_vector_set(trx->autoinc_locks, i, NULL);
+				void*	null_var = NULL;
+				ib_vector_set(trx->autoinc_locks, i, &null_var);
 				return;
 			}
 		}
@@ -3822,18 +4198,18 @@ UNIV_INLINE
 void
 lock_table_remove_low(
 /*==================*/
-	lock_t*	lock)	/*!< in: table lock */
+	lock_t*	lock)	/*!< in/out: table lock */
 {
 	trx_t*		trx;
 	dict_table_t*	table;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	trx = lock->trx;
 	table = lock->un_member.tab_lock.table;
 
 	/* Remove the table from the transaction's AUTOINC vector, if
-	the lock that is being release is an AUTOINC lock. */
+	the lock that is being released is an AUTOINC lock. */
 	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
 
 		/* The table's AUTOINC lock can get transferred to
@@ -3844,7 +4220,7 @@ lock_table_remove_low(
 
 		/* The locks must be freed in the reverse order from
 		the one in which they were acquired. This is to avoid
-		traversing the AUTOINC lock vector unnecessarily. 
+		traversing the AUTOINC lock vector unnecessarily.
 
 		We only store locks that were granted in the
 		trx->autoinc_locks vector (see lock_table_create()
@@ -3858,11 +4234,14 @@ lock_table_remove_low(
 		}
 
 		ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
-		--table->n_waiting_or_granted_auto_inc_locks;
+		table->n_waiting_or_granted_auto_inc_locks--;
 	}
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock);
+	UT_LIST_REMOVE(trx_locks, trx->lock.trx_locks, lock);
 	UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_TABLELOCK);
 }
 
 /*********************************************************************//**
@@ -3873,20 +4252,25 @@ DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another
 transaction was chosen as a victim, and we got the lock immediately:
 no need to wait then */
 static
-ulint
+dberr_t
 lock_table_enqueue_waiting(
 /*=======================*/
 	ulint		mode,	/*!< in: lock mode this transaction is
 				requesting */
-	dict_table_t*	table,	/*!< in: table */
+	dict_table_t*	table,	/*!< in/out: table */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	lock_t*	lock;
-	trx_t*	trx;
-	ulint   sec;
-	ulint   ms;
+	trx_t*		trx;
+	lock_t*		lock;
+	trx_id_t	victim_trx_id;
+	ulint		sec;
+	ulint		ms;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(!srv_read_only_mode);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	trx = thr_get_trx(thr);
+	ut_ad(trx_mutex_own(trx));
 
 	/* Test if there already is some other reason to suspend thread:
 	we do not enqueue a lock request if the query thread should be
@@ -3898,8 +4282,6 @@ lock_table_enqueue_waiting(
 		return(DB_QUE_THR_SUSPENDED);
 	}
 
-	trx = thr_get_trx(thr);
-
 	switch (trx_get_dict_operation(trx)) {
 	case TRX_DICT_OP_NONE:
 		break;
@@ -3921,10 +4303,22 @@ lock_table_enqueue_waiting(
 
 	lock = lock_table_create(table, mode | LOCK_WAIT, trx);
 
-	/* Check if a deadlock occurs: if yes, remove the lock request and
-	return an error code */
+	/* Release the mutex to obey the latching order.
+	This is safe, because lock_deadlock_check_and_resolve()
+	is invoked when a lock wait is enqueued for the currently
+	running transaction. Because trx is a running transaction
+	(it is not currently suspended because of a lock wait),
+	its state can only be changed by this thread, which is
+	currently associated with the transaction. */
+
+	trx_mutex_exit(trx);
+
+	victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+	trx_mutex_enter(trx);
 
-	if (lock_deadlock_occurs(lock, trx)) {
+	if (victim_trx_id != 0) {
+		ut_ad(victim_trx_id == trx->id);
 
 		/* The order here is important, we don't want to
 		lose the state of the lock before calling remove. */
@@ -3932,25 +4326,27 @@ lock_table_enqueue_waiting(
 		lock_reset_lock_and_trx_wait(lock);
 
 		return(DB_DEADLOCK);
-	}
-
-	if (trx->wait_lock == NULL) {
+	} else if (trx->lock.wait_lock == NULL) {
 		/* Deadlock resolution chose another transaction as a victim,
 		and we accidentally got our lock granted! */
 
 		return(DB_SUCCESS);
 	}
 
+	trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+	trx->lock.wait_started = ut_time();
+	trx->lock.was_chosen_as_deadlock_victim = FALSE;
+
 	if (UNIV_UNLIKELY(trx->take_stats)) {
 		ut_usectime(&sec, &ms);
 		trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
 	}
-	trx->que_state = TRX_QUE_LOCK_WAIT;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	trx->wait_started = time(NULL);
 
 	ut_a(que_thr_stop(thr));
 
+	MONITOR_INC(MONITOR_TABLELOCK_WAIT);
+
 	return(DB_LOCK_WAIT);
 }
 
@@ -3972,20 +4368,18 @@ lock_table_other_has_incompatible(
 {
 	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = UT_LIST_GET_LAST(table->locks);
-
-	while (lock != NULL) {
+	for (lock = UT_LIST_GET_LAST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
 
-		if ((lock->trx != trx)
-		    && (!lock_mode_compatible(lock_get_mode(lock), mode))
-		    && (wait || !(lock_get_wait(lock)))) {
+		if (lock->trx != trx
+		    && !lock_mode_compatible(lock_get_mode(lock), mode)
+		    && (wait || !lock_get_wait(lock))) {
 
 			return(lock);
 		}
-
-		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
 	return(NULL);
@@ -3996,17 +4390,19 @@ Locks the specified database table in the mode given. If the lock cannot
 be granted immediately, the query thread is put to wait.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_table(
 /*=======*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	enum lock_mode	mode,	/*!< in: lock mode */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	trx_t*	trx;
-	ulint	err;
+	trx_t*		trx;
+	dberr_t		err;
+	const lock_t*	wait_for;
 
 	ut_ad(table && thr);
 
@@ -4023,39 +4419,73 @@ lock_table(
 		mode = LOCK_IS;
 	}
 
-	lock_mutex_enter_kernel();
-
-	/* Look for stronger locks the same trx already has on the table */
+	/* Look for equal or stronger locks the same trx already
+	has on the table. No need to acquire the lock mutex here
+	because only this transaction can add/access table locks
+	to/from trx_t::table_locks. */
 
 	if (lock_table_has(trx, table, mode)) {
 
-		lock_mutex_exit_kernel();
-
 		return(DB_SUCCESS);
 	}
 
+	lock_mutex_enter();
+
 	/* We have to check if the new lock is compatible with any locks
 	other transactions have in the table lock queue. */
 
-	if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) {
+	wait_for = lock_table_other_has_incompatible(
+		trx, LOCK_WAIT, table, mode);
 
-		/* Another trx has a request on the table in an incompatible
-		mode: this trx may have to wait */
+	trx_mutex_enter(trx);
 
+	/* Another trx has a request on the table in an incompatible
+	mode: this trx may have to wait */
+
+	if (wait_for != NULL) {
 		err = lock_table_enqueue_waiting(mode | flags, table, thr);
+	} else {
+		lock_table_create(table, mode | flags, trx);
+
+		ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
 
-		lock_mutex_exit_kernel();
+		err = DB_SUCCESS;
+	}
 
-		return(err);
+	lock_mutex_exit();
+
+	trx_mutex_exit(trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+UNIV_INTERN
+void
+lock_table_ix_resurrect(
+/*====================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction */
+{
+	ut_ad(trx->is_recovered);
+
+	if (lock_table_has(trx, table, LOCK_IX)) {
+		return;
 	}
 
-	lock_table_create(table, mode | flags, trx);
+	lock_mutex_enter();
 
-	ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+	/* We have to check if the new lock is compatible with any locks
+	other transactions have in the table lock queue. */
 
-	lock_mutex_exit_kernel();
+	ut_ad(!lock_table_other_has_incompatible(
+		      trx, LOCK_WAIT, table, LOCK_IX));
 
-	return(DB_SUCCESS);
+	trx_mutex_enter(trx);
+	lock_table_create(table, LOCK_IX, trx);
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
 }
 
 /*********************************************************************//**
@@ -4070,21 +4500,19 @@ lock_table_has_to_wait_in_queue(
 	const dict_table_t*	table;
 	const lock_t*		lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_wait(wait_lock));
 
 	table = wait_lock->un_member.tab_lock.table;
 
-	lock = UT_LIST_GET_FIRST(table->locks);
-
-	while (lock != wait_lock) {
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != wait_lock;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
 		if (lock_has_to_wait(wait_lock, lock)) {
 
 			return(TRUE);
 		}
-
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
 	}
 
 	return(FALSE);
@@ -4098,13 +4526,13 @@ static
 void
 lock_table_dequeue(
 /*===============*/
-	lock_t*	in_lock)/*!< in: table lock object; transactions waiting
+	lock_t*	in_lock)/*!< in/out: table lock object; transactions waiting
 			behind will get their lock requests granted, if
 			they are now qualified to it */
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
 
 	lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
@@ -4114,16 +4542,17 @@ lock_table_dequeue(
 	/* Check if waiting locks in the queue can now be granted: grant
 	locks if there are no conflicting locks ahead. */
 
-	while (lock != NULL) {
+	for (/* No op */;
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
 		if (lock_get_wait(lock)
 		    && !lock_table_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(in_lock->trx != lock->trx);
 			lock_grant(lock);
 		}
-
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
 	}
 }
 
@@ -4137,48 +4566,60 @@ UNIV_INTERN
 void
 lock_rec_unlock(
 /*============*/
-	trx_t*			trx,	/*!< in: transaction that has
+	trx_t*			trx,	/*!< in/out: transaction that has
 					set a record lock */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record */
 	enum lock_mode		lock_mode)/*!< in: LOCK_S or LOCK_X */
 {
-	lock_t*	first_lock;
-	lock_t*	lock;
-	ulint	heap_no;
+	lock_t*		first_lock;
+	lock_t*		lock;
+	ulint		heap_no;
+	const char*	stmt;
+	size_t		stmt_len;
 
-	ut_ad(trx && rec);
+	ut_ad(trx);
+	ut_ad(rec);
 	ut_ad(block->frame == page_align(rec));
-	ut_ad(trx->state == TRX_ACTIVE);
+	ut_ad(!trx->lock.wait_lock);
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
+	trx_mutex_enter(trx);
 
 	first_lock = lock_rec_get_first(block, heap_no);
 
 	/* Find the last lock with the same lock_mode and transaction
-	from the record. */
+	on the record. */
 
 	for (lock = first_lock; lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 		if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
-			ut_a(!lock_get_wait(lock));
-			lock_rec_reset_nth_bit(lock, heap_no);
 			goto released;
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
+
+	stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
-		"  InnoDB: Error: unlock row could not"
+		" InnoDB: Error: unlock row could not"
 		" find a %lu mode lock on the record\n",
 		(ulong) lock_mode);
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: current statement: %.*s\n",
+		(int) stmt_len, stmt);
 
 	return;
 
 released:
+	ut_a(!lock_get_wait(lock));
+	lock_rec_reset_nth_bit(lock, heap_no);
+
 	/* Check if we can now grant waiting lock requests */
 
 	for (lock = first_lock; lock != NULL;
@@ -4187,42 +4628,70 @@ released:
 		    && !lock_rec_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(trx != lock->trx);
 			lock_grant(lock);
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
 }
 
 /*********************************************************************//**
 Releases transaction locks, and releases possible other transactions waiting
 because of these locks. */
-UNIV_INTERN
+static
 void
-lock_release_off_kernel(
-/*====================*/
-	trx_t*	trx)	/*!< in: transaction */
+lock_release(
+/*=========*/
+	trx_t*	trx)	/*!< in/out: transaction */
 {
-	dict_table_t*	table;
-	ulint		count;
 	lock_t*		lock;
+	ulint		count = 0;
+	trx_id_t	max_trx_id;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = UT_LIST_GET_LAST(trx->trx_locks);
+	ut_ad(lock_mutex_own());
+	ut_ad(!trx_mutex_own(trx));
 
-	count = 0;
+	max_trx_id = trx_sys_get_max_trx_id();
 
-	while (lock != NULL) {
-
-		count++;
+	for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
 
 		if (lock_get_type_low(lock) == LOCK_REC) {
 
+#ifdef UNIV_DEBUG
+			/* Check if the transaction locked a record
+			in a system table in X mode. It should have set
+			the dict_op code correctly if it did. */
+			if (lock->index->table->id < DICT_HDR_FIRST_ID
+			    && lock_get_mode(lock) == LOCK_X) {
+
+				ut_ad(lock_get_mode(lock) != LOCK_IX);
+				ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+			}
+#endif /* UNIV_DEBUG */
+
 			lock_rec_dequeue_from_page(lock);
 		} else {
+			dict_table_t*	table;
+
+			table = lock->un_member.tab_lock.table;
+#ifdef UNIV_DEBUG
 			ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
 
+			/* Check if the transaction locked a system table
+			in X or IX mode. It should have set the dict_op code
+			correctly if it did. */
+			if (table->id < DICT_HDR_FIRST_ID
+			    && (lock_get_mode(lock) == LOCK_X
+				|| lock_get_mode(lock) == LOCK_IX)) {
+
+				ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+			}
+#endif /* UNIV_DEBUG */
+
 			if (lock_get_mode(lock) != LOCK_IS
 			    && trx->undo_no != 0) {
 
@@ -4230,80 +4699,103 @@ lock_release_off_kernel(
 				block the use of the MySQL query cache for
 				all currently active transactions. */
 
-				table = lock->un_member.tab_lock.table;
-
-				table->query_cache_inv_trx_id
-					= trx_sys->max_trx_id;
+				table->query_cache_inv_trx_id = max_trx_id;
 			}
 
 			lock_table_dequeue(lock);
 		}
 
-		if (count == LOCK_RELEASE_KERNEL_INTERVAL) {
-			/* Release the kernel mutex for a while, so that we
+		if (count == LOCK_RELEASE_INTERVAL) {
+			/* Release the lock mutex for a while, so that we
 			do not monopolize it */
 
-			lock_mutex_exit_kernel();
+			lock_mutex_exit();
 
-			lock_mutex_enter_kernel();
+			lock_mutex_enter();
 
 			count = 0;
 		}
 
-		lock = UT_LIST_GET_LAST(trx->trx_locks);
+		++count;
 	}
 
-	ut_a(ib_vector_size(trx->autoinc_locks) == 0);
+	/* We don't remove the locks one by one from the vector for
+	efficiency reasons. We simply reset it because we would have
+	released all the locks anyway. */
 
-	mem_heap_empty(trx->lock_heap);
+	ib_vector_reset(trx->lock.table_locks);
+
+	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+	mem_heap_empty(trx->lock.lock_heap);
 }
 
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+	(lock_get_mode(lock) == LOCK_S \
+	 || lock_get_mode(lock) == LOCK_X)
+
 /*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-UNIV_INTERN
+Clears the entry for a table lock in its transaction's table_locks vector. */
+static
 void
-lock_cancel_waiting_and_release(
-/*============================*/
-	lock_t*	lock)	/*!< in: waiting lock request */
+lock_trx_table_locks_remove(
+/*========================*/
+	const lock_t*	lock_to_remove)		/*!< in: lock to remove */
 {
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(!(lock->type_mode & LOCK_CONV_BY_OTHER));
+	lint		i;
+	trx_t*		trx = lock_to_remove->trx;
 
-	if (lock_get_type_low(lock) == LOCK_REC) {
+	ut_ad(lock_mutex_own());
 
-		lock_rec_dequeue_from_page(lock);
+	/* It is safe to read this because we are holding the lock mutex */
+	if (!trx->lock.cancel) {
+		trx_mutex_enter(trx);
 	} else {
-		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_ad(trx_mutex_own(trx));
+	}
 
-		if (lock->trx->autoinc_locks != NULL) {
-			/* Release the transaction's AUTOINC locks/ */
-			lock_release_autoinc_locks(lock->trx);
-		}
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
 
-		lock_table_dequeue(lock);
-	}
+		lock = *static_cast<lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
 
-	/* Reset the wait flag and the back pointer to lock in trx */
+		if (lock == NULL) {
+			continue;
+		}
 
-	lock_reset_lock_and_trx_wait(lock);
+		ut_a(trx == lock->trx);
+		ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_a(lock->un_member.tab_lock.table != NULL);
 
-	/* The following function releases the trx from lock wait */
+		if (lock == lock_to_remove) {
+			void*	null_var = NULL;
+			ib_vector_set(trx->lock.table_locks, i, &null_var);
 
-	trx_end_lock_wait(lock->trx);
-}
+			if (!trx->lock.cancel) {
+				trx_mutex_exit(trx);
+			}
 
-/* True if a lock mode is S or X */
-#define IS_LOCK_S_OR_X(lock) \
-	(lock_get_mode(lock) == LOCK_S \
-	 || lock_get_mode(lock) == LOCK_X)
+			return;
+		}
+	}
+
+	if (!trx->lock.cancel) {
+		trx_mutex_exit(trx);
+	}
 
+	/* Lock must exist in the vector. */
+	ut_error;
+}
 
 /*********************************************************************//**
 Removes locks of a transaction on a table to be dropped.
 If remove_also_table_sx_locks is TRUE then table-level S and X locks are
 also removed in addition to other table-level and record-level locks.
-No lock, that is going to be removed, is allowed to be a wait lock. */
+No lock that is going to be removed is allowed to be a wait lock. */
 static
 void
 lock_remove_all_on_table_for_trx(
@@ -4313,14 +4805,15 @@ lock_remove_all_on_table_for_trx(
 	ibool		remove_also_table_sx_locks)/*!< in: also removes
 						table S and X locks */
 {
-	lock_t*	lock;
-	lock_t*	prev_lock;
+	lock_t*		lock;
+	lock_t*		prev_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = UT_LIST_GET_LAST(trx->trx_locks);
+	for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+	     lock != NULL;
+	     lock = prev_lock) {
 
-	while (lock != NULL) {
 		prev_lock = UT_LIST_GET_PREV(trx_locks, lock);
 
 		if (lock_get_type_low(lock) == LOCK_REC
@@ -4335,11 +4828,83 @@ lock_remove_all_on_table_for_trx(
 
 			ut_a(!lock_get_wait(lock));
 
+			lock_trx_table_locks_remove(lock);
 			lock_table_remove_low(lock);
 		}
+	}
+}
+
+/*******************************************************************//**
+Remove any explicit record locks held by recovering transactions on
+the table.
+@return number of recovered transactions examined */
+static
+ulint
+lock_remove_recovered_trx_record_locks(
+/*===================================*/
+	dict_table_t*	table)	/*!< in: check if there are any locks
+				held on records in this table or on the
+				table itself */
+{
+	trx_t*		trx;
+	ulint		n_recovered_trx = 0;
+
+	ut_a(table != NULL);
+	ut_ad(lock_mutex_own());
+
+	mutex_enter(&trx_sys->mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		lock_t*	lock;
+		lock_t*	next_lock;
 
-		lock = prev_lock;
+		assert_trx_in_rw_list(trx);
+
+		if (!trx->is_recovered) {
+			continue;
+		}
+
+		/* Because we are holding the lock_sys->mutex,
+		implicit locks cannot be converted to explicit ones
+		while we are scanning the explicit locks. */
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = next_lock) {
+
+			ut_a(lock->trx == trx);
+
+			/* Recovered transactions can't wait on a lock. */
+
+			ut_a(!lock_get_wait(lock));
+
+			next_lock = UT_LIST_GET_NEXT(trx_locks, lock);
+
+			switch (lock_get_type_low(lock)) {
+			default:
+				ut_error;
+			case LOCK_TABLE:
+				if (lock->un_member.tab_lock.table == table) {
+					lock_trx_table_locks_remove(lock);
+					lock_table_remove_low(lock);
+				}
+				break;
+			case LOCK_REC:
+				if (lock->index->table == table) {
+					lock_rec_discard(lock);
+				}
+			}
+		}
+
+		++n_recovered_trx;
 	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(n_recovered_trx);
 }
 
 /*********************************************************************//**
@@ -4356,17 +4921,17 @@ lock_remove_all_on_table(
 	ibool		remove_also_table_sx_locks)/*!< in: also removes
 						table S and X locks */
 {
-	lock_t*	lock;
-	lock_t*	prev_lock;
+	lock_t*		lock;
 
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
 
-	lock = UT_LIST_GET_FIRST(table->locks);
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     /* No op */) {
 
-	while (lock != NULL) {
+		lock_t*	prev_lock;
 
-		prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
-					     lock);
+		prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 
 		/* If we should remove all locks (remove_also_table_sx_locks
 		is TRUE), or if the lock is not table-level S or X lock,
@@ -4378,8 +4943,8 @@ lock_remove_all_on_table(
 			ut_a(!lock_get_wait(lock));
 		}
 
-		lock_remove_all_on_table_for_trx(table, lock->trx,
-						 remove_also_table_sx_locks);
+		lock_remove_all_on_table_for_trx(
+			table, lock->trx, remove_also_table_sx_locks);
 
 		if (prev_lock == NULL) {
 			if (lock == UT_LIST_GET_FIRST(table->locks)) {
@@ -4404,7 +4969,18 @@ lock_remove_all_on_table(
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	/* Note: Recovered transactions don't have table level IX or IS locks
+	but can have implicit record locks that have been converted to explicit
+	record locks. Such record locks cannot be freed by traversing the
+	transaction lock list in dict_table_t (as above). */
+
+	if (!lock_sys->rollback_complete
+	    && lock_remove_recovered_trx_record_locks(table) == 0) {
+
+		lock_sys->rollback_complete = TRUE;
+	}
+
+	lock_mutex_exit();
 }
 
 /*===================== VALIDATION AND DEBUGGING  ====================*/
@@ -4418,13 +4994,13 @@ lock_table_print(
 	FILE*		file,	/*!< in: file where to print */
 	const lock_t*	lock)	/*!< in: table type lock */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(lock) == LOCK_TABLE);
 
 	fputs("TABLE LOCK table ", file);
 	ut_print_name(file, lock->trx, TRUE,
 		      lock->un_member.tab_lock.table->name);
-	fprintf(file, " trx id " TRX_ID_FMT, (ullint) lock->trx->id);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
 
 	if (lock_get_mode(lock) == LOCK_S) {
 		fputs(" lock mode S", file);
@@ -4467,7 +5043,7 @@ lock_rec_print(
 	ulint*			offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(lock) == LOCK_REC);
 
 	space = lock->un_member.rec_lock.space;
@@ -4477,7 +5053,7 @@ lock_rec_print(
 		(ulong) space, (ulong) page_no,
 		(ulong) lock_rec_get_n_bits(lock));
 	dict_index_name_print(file, lock->trx, lock->index);
-	fprintf(file, " trx id " TRX_ID_FMT, (ullint) lock->trx->id);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
 
 	if (lock_get_mode(lock) == LOCK_S) {
 		fputs(" lock mode S", file);
@@ -4558,20 +5134,21 @@ ulint
 lock_get_n_rec_locks(void)
 /*======================*/
 {
-	lock_t*	lock;
 	ulint	n_locks	= 0;
 	ulint	i;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+		const lock_t*	lock;
 
-		lock = HASH_GET_FIRST(lock_sys->rec_hash, i);
+		for (lock = static_cast<const lock_t*>(
+				HASH_GET_FIRST(lock_sys->rec_hash, i));
+		     lock != 0;
+		     lock = static_cast<const lock_t*>(
+				HASH_GET_NEXT(hash, lock))) {
 
-		while (lock) {
 			n_locks++;
-
-			lock = HASH_GET_NEXT(hash, lock);
 		}
 	}
 
@@ -4581,22 +5158,22 @@ lock_get_n_rec_locks(void)
 
 /*********************************************************************//**
 Prints info of locks for all transactions.
-@return FALSE if not able to obtain kernel mutex
+@return FALSE if not able to obtain lock mutex
 and exits without printing info */
 UNIV_INTERN
 ibool
 lock_print_info_summary(
 /*====================*/
 	FILE*	file,	/*!< in: file where to print */
-	ibool   nowait)	/*!< in: whether to wait for the kernel mutex */
+	ibool   nowait)	/*!< in: whether to wait for the lock mutex */
 {
-	/* if nowait is FALSE, wait on the kernel mutex,
+	/* if nowait is FALSE, wait on the lock mutex,
 	otherwise return immediately if fail to obtain the
 	mutex. */
 	if (!nowait) {
-		lock_mutex_enter_kernel();
-	} else if (mutex_enter_nowait(&kernel_mutex)) {
-		fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+		lock_mutex_enter();
+	} else if (lock_mutex_enter_nowait()) {
+		fputs("FAIL TO OBTAIN LOCK MUTEX, "
 		      "SKIP LOCK INFO PRINTING\n", file);
 		return(FALSE);
 	}
@@ -4606,7 +5183,9 @@ lock_print_info_summary(
 		      "LATEST DETECTED DEADLOCK\n"
 		      "------------------------\n", file);
 
-		ut_copy_file(file, lock_latest_err_file);
+		if (!srv_read_only_mode) {
+			ut_copy_file(file, lock_latest_err_file);
+		}
 	}
 
 	fputs("------------\n"
@@ -4614,13 +5193,42 @@ lock_print_info_summary(
 	      "------------\n", file);
 
 	fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
-		(ullint) trx_sys->max_trx_id);
+		trx_sys_get_max_trx_id());
 
 	fprintf(file,
 		"Purge done for trx's n:o < " TRX_ID_FMT
-		" undo n:o < " TRX_ID_FMT "\n",
-		(ullint) purge_sys->purge_trx_no,
-		(ullint) purge_sys->purge_undo_no);
+		" undo n:o < " TRX_ID_FMT " state: ",
+		purge_sys->iter.trx_no,
+		purge_sys->iter.undo_no);
+
+	/* Note: We are reading the state without the latch. One because it
+	will violate the latching order and two because we are merely querying
+	the state of the variable for display. */
+
+	switch (purge_sys->state){
+	case PURGE_STATE_EXIT:
+	case PURGE_STATE_INIT:
+		/* Should never be in this state while the system is running. */
+		ut_error;
+
+	case PURGE_STATE_DISABLED:
+		fprintf(file, "disabled");
+		break;
+
+	case PURGE_STATE_RUN:
+		fprintf(file, "running");
+		/* Check if it is waiting for more data to arrive. */
+		if (!purge_sys->running) {
+			fprintf(file, " but idle");
+		}
+		break;
+
+	case PURGE_STATE_STOP:
+		fprintf(file, "stopped");
+		break;
+	}
+
+	fprintf(file, "\n");
 
 	fprintf(file,
 		"History list length %lu\n",
@@ -4635,83 +5243,113 @@ lock_print_info_summary(
 }
 
 /*********************************************************************//**
-Prints info of locks for each transaction. */
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
 UNIV_INTERN
 void
 lock_print_info_all_transactions(
 /*=============================*/
 	FILE*	file)	/*!< in: file where to print */
 {
-	lock_t*	lock;
-	ibool	load_page_first = TRUE;
-	ulint	nth_trx		= 0;
-	ulint	nth_lock	= 0;
-	ulint	i;
-	mtr_t	mtr;
-	trx_t*	trx;
+	const lock_t*	lock;
+	ibool		load_page_first = TRUE;
+	ulint		nth_trx		= 0;
+	ulint		nth_lock	= 0;
+	ulint		i;
+	mtr_t		mtr;
+	const trx_t*	trx;
+	trx_list_t*	trx_list = &trx_sys->rw_trx_list;
 
 	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
 
+	ut_ad(lock_mutex_own());
+
+	mutex_enter(&trx_sys->mutex);
+
 	/* First print info on non-active transactions */
 
-	trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+	/* NOTE: information of auto-commit non-locking read-only
+	transactions will be omitted here. The information will be
+	available from INFORMATION_SCHEMA.INNODB_TRX. */
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) {
+
+		ut_ad(trx->in_mysql_trx_list);
+
+		/* See state transitions and locking rules in trx0trx.h */
 
-	while (trx) {
-		if (trx->state == TRX_NOT_STARTED) {
+		if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
 			fputs("---", file);
-			trx_print(file, trx, 600);
+			trx_print_latched(file, trx, 600);
 		}
-
-		trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
 	}
 
 loop:
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	i = 0;
+	/* Since we temporarily release lock_sys->mutex and
+	trx_sys->mutex when reading a database page in below,
+	variable trx may be obsolete now and we must loop
+	through the trx list to get probably the same trx,
+	or some other trx. */
 
-	/* Since we temporarily release the kernel mutex when
-	reading a database page in below, variable trx may be
-	obsolete now and we must loop through the trx list to
-	get probably the same trx, or some other trx. */
+	for (trx = UT_LIST_GET_FIRST(*trx_list), i = 0;
+	     trx && (i < nth_trx);
+	     trx = UT_LIST_GET_NEXT(trx_list, trx), i++) {
 
-	while (trx && (i < nth_trx)) {
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-		i++;
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
 	}
 
+	ut_ad(trx == NULL
+	      || trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
 	if (trx == NULL) {
-		lock_mutex_exit_kernel();
+		/* Check the read-only transaction list next. */
+		if (trx_list == &trx_sys->rw_trx_list) {
+			trx_list = &trx_sys->ro_trx_list;
+			nth_trx = 0;
+			nth_lock = 0;
+			goto loop;
+		}
+
+		lock_mutex_exit();
+		mutex_exit(&trx_sys->mutex);
 
 		ut_ad(lock_validate());
 
 		return;
 	}
 
+	assert_trx_in_list(trx);
+
 	if (nth_lock == 0) {
 		fputs("---", file);
-		trx_print(file, trx, 600);
+
+		trx_print_latched(file, trx, 600);
 
 		if (trx->read_view) {
 			fprintf(file,
 				"Trx read view will not see trx with"
 				" id >= " TRX_ID_FMT
 				", sees < " TRX_ID_FMT "\n",
-				(ullint) trx->read_view->low_limit_id,
-				(ullint) trx->read_view->up_limit_id);
+				trx->read_view->low_limit_id,
+				trx->read_view->up_limit_id);
 		}
 
-		if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
 			fprintf(file,
 				"------- TRX HAS BEEN WAITING %lu SEC"
 				" FOR THIS LOCK TO BE GRANTED:\n",
-				(ulong) difftime(time(NULL),
-						 trx->wait_started));
+				(ulong) difftime(ut_time(),
+						 trx->lock.wait_started));
 
-			if (lock_get_type_low(trx->wait_lock) == LOCK_REC) {
-				lock_rec_print(file, trx->wait_lock);
+			if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+				lock_rec_print(file, trx->lock.wait_lock);
 			} else {
-				lock_table_print(file, trx->wait_lock);
+				lock_table_print(file, trx->lock.wait_lock);
 			}
 
 			fputs("------------------\n", file);
@@ -4728,7 +5366,7 @@ loop:
 	/* Look at the note about the trx loop above why we loop here:
 	lock may be an obsolete pointer now. */
 
-	lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
 
 	while (lock && (i < nth_lock)) {
 		lock = UT_LIST_GET_NEXT(trx_locks, lock);
@@ -4762,9 +5400,11 @@ loop:
 				goto print_rec;
 			}
 
-			lock_mutex_exit_kernel();
+			lock_mutex_exit();
+			mutex_exit(&trx_sys->mutex);
 
 			if (srv_show_verbose_locks) {
+
 				mtr_start(&mtr);
 
 				buf_page_get_gen(space, zip_size, page_no,
@@ -4777,7 +5417,9 @@ loop:
 
 			load_page_first = FALSE;
 
-			lock_mutex_enter_kernel();
+			lock_mutex_enter();
+
+			mutex_enter(&trx_sys->mutex);
 
 			goto loop;
 		}
@@ -4801,8 +5443,6 @@ print_rec:
 
 		nth_trx++;
 		nth_lock = 0;
-
-		goto loop;
 	}
 
 	goto loop;
@@ -4810,6 +5450,45 @@ print_rec:
 
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return TRUE if found */
+static
+ibool
+lock_trx_table_locks_find(
+/*======================*/
+	trx_t*		trx,		/*!< in: trx to validate */
+	const lock_t*	find_lock)	/*!< in: lock to find */
+{
+	lint		i;
+	ibool		found = FALSE;
+
+	trx_mutex_enter(trx);
+
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
+
+		if (lock == NULL) {
+			continue;
+		} else if (lock == find_lock) {
+			/* Can't be duplicates. */
+			ut_a(!found);
+			found = TRUE;
+		}
+
+		ut_a(trx == lock->trx);
+		ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_a(lock->un_member.tab_lock.table != NULL);
+	}
+
+	trx_mutex_exit(trx);
+
+	return(found);
+}
+
+/*********************************************************************//**
 Validates the lock queue on a table.
 @return	TRUE if ok */
 static
@@ -4820,14 +5499,18 @@ lock_table_queue_validate(
 {
 	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	lock = UT_LIST_GET_FIRST(table->locks);
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
-	while (lock) {
-		ut_a(((lock->trx)->state == TRX_ACTIVE)
-		     || ((lock->trx)->state == TRX_PREPARED)
-		     || ((lock->trx)->state == TRX_COMMITTED_IN_MEMORY));
+		/* lock->trx->state cannot change from or to NOT_STARTED
+		while we are holding the trx_sys->mutex. It may change
+		from ACTIVE to PREPARED, but it may not change to
+		COMMITTED, because we are holding the lock_sys->mutex. */
+		ut_ad(trx_assert_started(lock->trx));
 
 		if (!lock_get_wait(lock)) {
 
@@ -4839,7 +5522,7 @@ lock_table_queue_validate(
 			ut_a(lock_table_has_to_wait_in_queue(lock));
 		}
 
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+		ut_a(lock_trx_table_locks_find(lock->trx, lock));
 	}
 
 	return(TRUE);
@@ -4852,37 +5535,39 @@ static
 ibool
 lock_rec_queue_validate(
 /*====================*/
+	ibool			locked_lock_trx_sys,
+					/*!< in: TRUE if the caller
+					holds both the lock mutex and
+					trx_sys_t::mutex. */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record to look at */
 	const dict_index_t*	index,	/*!< in: index, or NULL if not known */
 	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
 {
-	trx_t*	impl_trx;
-	lock_t*	lock;
-	ulint	heap_no;
+	const trx_t*	impl_trx;
+	const lock_t*	lock;
+	ulint		heap_no;
 
 	ut_a(rec);
 	ut_a(block->frame == page_align(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+	ut_ad(lock_mutex_own() == locked_lock_trx_sys);
+	ut_ad(!index || dict_index_is_clust(index)
+	      || !dict_index_is_online_ddl(index));
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	if (!locked_lock_trx_sys) {
+		lock_mutex_enter();
+		mutex_enter(&trx_sys->mutex);
+	}
 
 	if (!page_rec_is_user_rec(rec)) {
 
-		lock = lock_rec_get_first(block, heap_no);
-
-		while (lock) {
-			switch(lock->trx->state) {
-			case TRX_ACTIVE:
-			case TRX_PREPARED:
-			case TRX_COMMITTED_IN_MEMORY:
-				break;
-			default:
-				ut_error;
-			}
+		for (lock = lock_rec_get_first(block, heap_no);
+		     lock != NULL;
+		     lock = lock_rec_get_next_const(heap_no, lock)) {
 
 			ut_a(trx_in_trx_list(lock->trx));
 
@@ -4893,78 +5578,38 @@ lock_rec_queue_validate(
 			if (index) {
 				ut_a(lock->index == index);
 			}
-
-			lock = lock_rec_get_next(heap_no, lock);
 		}
 
-		lock_mutex_exit_kernel();
-
-		return(TRUE);
+		goto func_exit;
 	}
 
 	if (!index);
 	else if (dict_index_is_clust(index)) {
+		trx_id_t	trx_id;
 
-		impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+		/* Unlike the non-debug code, this invariant can only succeed
+		if the check and assertion are covered by the lock mutex. */
 
-		if (impl_trx
-		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
-						   block, heap_no, impl_trx)) {
+		trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+		impl_trx = trx_rw_get_active_trx_by_id(trx_id, NULL);
 
-			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
-					       block, heap_no, impl_trx));
-		}
-#if 0
-	} else {
+		ut_ad(lock_mutex_own());
+		/* impl_trx cannot be committed until lock_mutex_exit()
+		because lock_trx_release_locks() acquires lock_sys->mutex */
 
-		/* The kernel mutex may get released temporarily in the
-		next function call: we have to release lock table mutex
-		to obey the latching order */
-
-		/* If this thread is holding the file space latch
-		(fil_space_t::latch), the following check WILL break
-		latching order and may cause a deadlock of threads. */
-
-		/* NOTE: This is a bogus check that would fail in the
-		following case: Our transaction is updating a
-		row. After it has updated the clustered index record,
-		it goes to a secondary index record and finds someone
-		else holding an explicit S- or X-lock on that
-		secondary index record, presumably from a locking
-		read. Our transaction cannot update the secondary
-		index immediately, but places a waiting X-lock request
-		on the secondary index record. There is nothing
-		illegal in this. The assertion is simply too strong. */
-
-		/* From the locking point of view, each secondary
-		index is a separate table. A lock that is held on
-		secondary index rec does not give any rights to modify
-		or read the clustered index rec. Therefore, we can
-		think of the sec index as a separate 'table' from the
-		clust index 'table'. Conversely, a transaction that
-		has acquired a lock on and modified a clustered index
-		record may need to wait for a lock on the
-		corresponding record in a secondary index. */
-
-		impl_trx = lock_sec_rec_some_has_impl_off_kernel(
-			rec, index, offsets);
-
-		if (impl_trx
+		if (impl_trx != NULL
 		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
 						   block, heap_no, impl_trx)) {
 
 			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
 					       block, heap_no, impl_trx));
 		}
-#endif
 	}
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-	while (lock) {
-		ut_a(lock->trx->state == TRX_ACTIVE
-		     || lock->trx->state == TRX_PREPARED
-		     || lock->trx->state == TRX_COMMITTED_IN_MEMORY);
 		ut_a(trx_in_trx_list(lock->trx));
 
 		if (index) {
@@ -4987,11 +5632,13 @@ lock_rec_queue_validate(
 
 			ut_a(lock_rec_has_to_wait_in_queue(lock));
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
-	lock_mutex_exit_kernel();
+func_exit:
+	if (!locked_lock_trx_sys) {
+		lock_mutex_exit();
+		mutex_exit(&trx_sys->mutex);
+	}
 
 	return(TRUE);
 }
@@ -5007,7 +5654,6 @@ lock_rec_validate_page(
 {
 	const lock_t*	lock;
 	const rec_t*	rec;
-	dict_index_t*	index;
 	ulint		nth_lock	= 0;
 	ulint		nth_bit		= 0;
 	ulint		i;
@@ -5016,10 +5662,10 @@ lock_rec_validate_page(
 	ulint*		offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(!mutex_own(&kernel_mutex));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(!lock_mutex_own());
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
+	mutex_enter(&trx_sys->mutex);
 loop:
 	lock = lock_rec_get_first_on_page_addr(buf_block_get_space(block),
 					       buf_block_get_page_no(block));
@@ -5042,9 +5688,6 @@ loop:
 	}
 
 	ut_a(trx_in_trx_list(lock->trx));
-	ut_a(lock->trx->state == TRX_ACTIVE
-	     || lock->trx->state == TRX_PREPARED
-	     || lock->trx->state == TRX_COMMITTED_IN_MEMORY);
 
 # ifdef UNIV_SYNC_DEBUG
 	/* Only validate the record queues when this thread is not
@@ -5057,7 +5700,6 @@ loop:
 
 		if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
 
-			index = lock->index;
 			rec = page_find_rec_with_heap_no(block->frame, i);
 			ut_a(rec);
 			offsets = rec_get_offsets(rec, lock->index, offsets,
@@ -5067,17 +5709,13 @@ loop:
 				"Validating %u %u\n",
 				block->page.space, block->page.offset);
 #endif
-			lock_mutex_exit_kernel();
-
 			/* If this thread is holding the file space
 			latch (fil_space_t::latch), the following
 			check WILL break the latching order and may
 			cause a deadlock of threads. */
 
-			lock_rec_queue_validate(block, rec, index,
-						offsets);
-
-			lock_mutex_enter_kernel();
+			lock_rec_queue_validate(
+				TRUE, block, rec, lock->index, offsets);
 
 			nth_bit = i + 1;
 
@@ -5091,7 +5729,8 @@ loop:
 	goto loop;
 
 function_exit:
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
+	mutex_exit(&trx_sys->mutex);
 
 	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
@@ -5100,6 +5739,47 @@ function_exit:
 }
 
 /*********************************************************************//**
+Runs lock_table_queue_validate() for each table lock of each trx in a list.
+@return	TRUE always (inconsistencies trip assertions) */
+static
+ibool
+lock_validate_table_locks(
+/*======================*/
+	const trx_list_t*	trx_list)	/*!< in: rw or ro trx list */
+{
+	const trx_t*	trx;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		const lock_t*	lock;
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+			if (lock_get_type_low(lock) & LOCK_TABLE) {
+
+				lock_table_queue_validate(
+					lock->un_member.tab_lock.table);
+			}
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
 Validate record locks up to a limit.
 @return lock at limit or NULL if no more locks in the hash bucket */
 static __attribute__((nonnull, warn_unused_result))
@@ -5111,12 +5791,13 @@ lock_rec_validate(
 	ib_uint64_t*	limit)		/*!< in/out: upper limit of
 					(space, page_no) */
 {
-	lock_t*		lock;
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	for (lock = HASH_GET_FIRST(lock_sys->rec_hash, start);
+	for (const lock_t* lock = static_cast<const lock_t*>(
+			HASH_GET_FIRST(lock_sys->rec_hash, start));
 	     lock != NULL;
-	     lock = HASH_GET_NEXT(hash, lock)) {
+	     lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
 
 		ib_uint64_t	current;
 
@@ -5133,7 +5814,7 @@ lock_rec_validate(
 		}
 	}
 
-	return(NULL);
+	return(0);
 }
 
 /*********************************************************************//**
@@ -5150,8 +5831,8 @@ lock_rec_block_validate(
 	If the lock exists in lock_rec_validate_page() we assert
 	!block->page.file_page_was_freed. */
 
-	mtr_t		mtr;
 	buf_block_t*	block;
+	mtr_t		mtr;
 
 	/* Make sure that the tablespace is not deleted while we are
 	trying to access the page. */
@@ -5176,56 +5857,47 @@ lock_rec_block_validate(
 Validates the lock system.
 @return	TRUE if ok */
 static
-ibool
-lock_validate(void)
-/*===============*/
+bool
+lock_validate()
+/*===========*/
 {
-	const lock_t*	lock;
-	const trx_t*	trx;
-	ulint		i;
+	typedef	std::pair<ulint, ulint> page_addr_t;
+	typedef std::set<page_addr_t> page_addr_set;
+	page_addr_set pages;
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
+	mutex_enter(&trx_sys->mutex);
 
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		lock = UT_LIST_GET_FIRST(trx->trx_locks);
-
-		while (lock) {
-			if (lock_get_type_low(lock) & LOCK_TABLE) {
-
-				lock_table_queue_validate(
-					lock->un_member.tab_lock.table);
-			}
-
-			lock = UT_LIST_GET_NEXT(trx_locks, lock);
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
+	ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list));
+	ut_a(lock_validate_table_locks(&trx_sys->ro_trx_list));
 
 	/* Iterate over all the record locks and validate the locks. We
 	don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex.
 	Release both mutexes during the validation check. */
 
-	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+	for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
 		const lock_t*	lock;
 		ib_uint64_t	limit = 0;
 
-		while ((lock = lock_rec_validate(i, &limit)) != NULL) {
+		while ((lock = lock_rec_validate(i, &limit)) != 0) {
 
 			ulint	space = lock->un_member.rec_lock.space;
 			ulint	page_no = lock->un_member.rec_lock.page_no;
 
-			lock_mutex_exit_kernel();
-			lock_rec_block_validate(space, page_no);
-			lock_mutex_enter_kernel();
+			pages.insert(std::make_pair(space, page_no));
 		}
 	}
 
-	lock_mutex_exit_kernel();
+	mutex_exit(&trx_sys->mutex);
+	lock_mutex_exit();
 
-	return(TRUE);
+	for (page_addr_set::const_iterator it = pages.begin();
+	     it != pages.end();
+	     ++it) {
+		lock_rec_block_validate((*it).first, (*it).second);
+	}
+
+	return(true);
 }
 #endif /* UNIV_DEBUG */
 /*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
@@ -5238,7 +5910,7 @@ the query thread to the lock wait state and inserts a waiting request
 for a gap x-lock to the lock queue.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_rec_insert_check_and_lock(
 /*===========================*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is
@@ -5256,10 +5928,13 @@ lock_rec_insert_check_and_lock(
 	const rec_t*	next_rec;
 	trx_t*		trx;
 	lock_t*		lock;
-	ulint		err;
+	dberr_t		err;
 	ulint		next_rec_heap_no;
 
 	ut_ad(block->frame == page_align(rec));
+	ut_ad(!dict_index_is_online_ddl(index)
+	      || dict_index_is_clust(index)
+	      || (flags & BTR_CREATE_FLAG));
 
 	if (flags & BTR_NO_LOCKING_FLAG) {
 
@@ -5275,21 +5950,22 @@ lock_rec_insert_check_and_lock(
 	next_rec = page_rec_get_next_const(rec);
 	next_rec_heap_no = page_rec_get_heap_no(next_rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
+	/* Because this code is invoked for a running transaction by
+	the thread that is serving the transaction, it is not necessary
+	to hold trx->mutex here. */
 
 	/* When inserting a record into an index, the table must be at
-	least IX-locked or we must be building an index, in which case
-	the table must be at least S-locked. */
-	ut_ad(lock_table_has(trx, index->table, LOCK_IX)
-	      || (*index->name == TEMP_INDEX_PREFIX
-		  && lock_table_has(trx, index->table, LOCK_S)));
+	least IX-locked. When we are building an index, we would pass
+	BTR_NO_LOCKING_FLAG and skip the locking altogether. */
+	ut_ad(lock_table_has(trx, index->table, LOCK_IX));
 
 	lock = lock_rec_get_first(block, next_rec_heap_no);
 
 	if (UNIV_LIKELY(lock == NULL)) {
 		/* We optimize CPU time usage in the simplest case */
 
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		if (!dict_index_is_clust(index)) {
 			/* Update the page max trx id field */
@@ -5316,19 +5992,23 @@ lock_rec_insert_check_and_lock(
 	on the successor, which produced an unnecessary deadlock. */
 
 	if (lock_rec_other_has_conflicting(
-		    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+		    static_cast<enum lock_mode>(
+			    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION),
 		    block, next_rec_heap_no, trx)) {
 
 		/* Note that we may get DB_SUCCESS also here! */
-		err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
-					       | LOCK_INSERT_INTENTION,
-					       block, next_rec_heap_no,
-					       NULL, index, thr);
+		trx_mutex_enter(trx);
+
+		err = lock_rec_enqueue_waiting(
+			LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+			block, next_rec_heap_no, NULL, index, thr);
+
+		trx_mutex_exit(trx);
 	} else {
 		err = DB_SUCCESS;
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	switch (err) {
 	case DB_SUCCESS_LOCKED_REC:
@@ -5342,6 +6022,9 @@ lock_rec_insert_check_and_lock(
 		page_update_max_trx_id(block,
 				       buf_block_get_page_zip(block),
 				       trx->id, mtr);
+	default:
+		/* We only care about the two return values. */
+		break;
 	}
 
 #ifdef UNIV_DEBUG
@@ -5353,8 +6036,10 @@ lock_rec_insert_check_and_lock(
 
 		offsets = rec_get_offsets(next_rec, index, offsets_,
 					  ULINT_UNDEFINED, &heap);
-		ut_ad(lock_rec_queue_validate(block,
-					      next_rec, index, offsets));
+
+		ut_ad(lock_rec_queue_validate(
+				FALSE, block, next_rec, index, offsets));
+
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -5366,8 +6051,7 @@ lock_rec_insert_check_and_lock(
 
 /*********************************************************************//**
 If a transaction has an implicit x-lock on a record, but no explicit x-lock
-set on the record, sets one for it. NOTE that in the case of a secondary
-index, the kernel mutex may get temporarily released. */
+set on the record, sets one for it. */
 static
 void
 lock_rec_convert_impl_to_expl(
@@ -5377,27 +6061,44 @@ lock_rec_convert_impl_to_expl(
 	dict_index_t*		index,	/*!< in: index of record */
 	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
 {
-	trx_t*	impl_trx;
+	trx_id_t		trx_id;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!lock_mutex_own());
 	ut_ad(page_rec_is_user_rec(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
 
 	if (dict_index_is_clust(index)) {
-		impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+		trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+		/* The clustered index record was last modified by
+		this transaction. The transaction may have been
+		committed a long time ago. */
 	} else {
-		impl_trx = lock_sec_rec_some_has_impl_off_kernel(
-			rec, index, offsets);
+		ut_ad(!dict_index_is_online_ddl(index));
+		trx_id = lock_sec_rec_some_has_impl(rec, index, offsets);
+		/* The transaction can be committed before the
+		trx_rw_get_active_trx_by_id() lookup below, because
+		we are not holding lock_mutex. */
 	}
 
-	if (impl_trx) {
+	if (trx_id != 0) {
+		trx_t*	impl_trx;
 		ulint	heap_no = page_rec_get_heap_no(rec);
 
-		/* If the transaction has no explicit x-lock set on the
-		record, set one for it */
+		lock_mutex_enter();
+
+		/* If the transaction is still active and has no
+		explicit x-lock set on the record, set one for it */
 
-		if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
+		mutex_enter(&trx_sys->mutex);
+		impl_trx = trx_rw_get_active_trx_by_id(trx_id, NULL);
+		mutex_exit(&trx_sys->mutex);
+
+		/* impl_trx cannot be committed until lock_mutex_exit()
+		because lock_trx_release_locks() acquires lock_sys->mutex */
+
+		if (impl_trx != NULL
+		    && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
 				       heap_no, impl_trx)) {
 			ulint	type_mode = (LOCK_REC | LOCK_X
 					     | LOCK_REC_NOT_GAP);
@@ -5408,15 +6109,20 @@ lock_rec_convert_impl_to_expl(
 
 			if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))
 			    && lock_rec_other_has_conflicting(
-					LOCK_X | LOCK_REC_NOT_GAP, block,
+					static_cast<enum lock_mode>
+					(LOCK_X | LOCK_REC_NOT_GAP), block,
 					heap_no, impl_trx)) {
 
-				type_mode |= (LOCK_WAIT | LOCK_CONV_BY_OTHER);
+				type_mode |= (LOCK_WAIT
+					      | LOCK_CONV_BY_OTHER);
 			}
 
 			lock_rec_add_to_queue(
-				type_mode, block, heap_no, index, impl_trx);
+				type_mode, block, heap_no, index,
+				impl_trx, FALSE);
 		}
+
+		lock_mutex_exit();
 	}
 }
 
@@ -5429,7 +6135,7 @@ lock wait state and inserts a waiting request for a record x-lock to the
 lock queue.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_clust_rec_modify_check_and_lock(
 /*=================================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -5441,7 +6147,7 @@ lock_clust_rec_modify_check_and_lock(
 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	ulint	err;
+	dberr_t	err;
 	ulint	heap_no;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
@@ -5461,21 +6167,23 @@ lock_clust_rec_modify_check_and_lock(
 		? rec_get_heap_no_new(rec)
 		: rec_get_heap_no_old(rec);
 
-	lock_mutex_enter_kernel();
-
-	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
-
 	/* If a transaction has no explicit x-lock set on the record, set one
 	for it */
 
 	lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 
+	lock_mutex_enter();
+
+	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
 	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
 		err = DB_SUCCESS;
@@ -5489,7 +6197,7 @@ Checks if locks of other transactions prevent an immediate modify (delete
 mark or delete unmark) of a secondary index record.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_sec_rec_modify_check_and_lock(
 /*===============================*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -5501,13 +6209,15 @@ lock_sec_rec_modify_check_and_lock(
 				clustered index record first: see the
 				comment below */
 	dict_index_t*	index,	/*!< in: secondary index */
-	que_thr_t*	thr,	/*!< in: query thread */
+	que_thr_t*	thr,	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
-	ulint	err;
+	dberr_t	err;
 	ulint	heap_no;
 
 	ut_ad(!dict_index_is_clust(index));
+	ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
 	ut_ad(block->frame == page_align(rec));
 
 	if (flags & BTR_NO_LOCKING_FLAG) {
@@ -5526,14 +6236,16 @@ lock_sec_rec_modify_check_and_lock(
 	index record, and this would not have been possible if another active
 	transaction had modified this secondary index record. */
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
 
 	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG
 	{
@@ -5544,7 +6256,10 @@ lock_sec_rec_modify_check_and_lock(
 
 		offsets = rec_get_offsets(rec, index, offsets_,
 					  ULINT_UNDEFINED, &heap);
-		ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+		ut_ad(lock_rec_queue_validate(
+			FALSE, block, rec, index, offsets));
+
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -5571,7 +6286,7 @@ secondary index record.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-enum db_err
+dberr_t
 lock_sec_rec_read_check_and_lock(
 /*=============================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -5592,10 +6307,11 @@ lock_sec_rec_read_check_and_lock(
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	enum db_err	err;
-	ulint		heap_no;
+	dberr_t	err;
+	ulint	heap_no;
 
 	ut_ad(!dict_index_is_clust(index));
+	ut_ad(!dict_index_is_online_ddl(index));
 	ut_ad(block->frame == page_align(rec));
 	ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
@@ -5606,7 +6322,7 @@ lock_sec_rec_read_check_and_lock(
 		return(DB_SUCCESS);
 	}
 
-	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY((thr && thr_get_trx(thr)->fake_changes))) {
 		if (!srv_fake_changes_locks) {
 			return(DB_SUCCESS);
 		}
@@ -5617,30 +6333,32 @@ lock_sec_rec_read_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
-
-	ut_ad(mode != LOCK_X
-	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
-	ut_ad(mode != LOCK_S
-	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
-
 	/* Some transaction may have an implicit x-lock on the record only
 	if the max trx id for the page >= min trx id for the trx list or a
 	database recovery is running. */
 
-	if ((page_get_max_trx_id(block->frame) >= trx_list_get_min_trx_id()
+	if ((page_get_max_trx_id(block->frame) >= trx_rw_min_trx_id()
 	     || recv_recovery_is_on())
 	    && !page_rec_is_supremum(rec)) {
 
 		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 	}
 
+	lock_mutex_enter();
+
+	ut_ad(mode != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad(mode != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
 	err = lock_rec_lock(FALSE, mode | gap_mode,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	return(err);
 }
@@ -5655,7 +6373,7 @@ lock on the record.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
 or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-enum db_err
+dberr_t
 lock_clust_rec_read_check_and_lock(
 /*===============================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -5676,8 +6394,8 @@ lock_clust_rec_read_check_and_lock(
 					LOCK_REC_NOT_GAP */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	enum db_err	err;
-	ulint		heap_no;
+	dberr_t	err;
+	ulint	heap_no;
 
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(block->frame == page_align(rec));
@@ -5691,7 +6409,7 @@ lock_clust_rec_read_check_and_lock(
 		return(DB_SUCCESS);
 	}
 
-	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+	if (UNIV_UNLIKELY((thr && thr_get_trx(thr)->fake_changes))) {
 		if (!srv_fake_changes_locks) {
 			return(DB_SUCCESS);
 		}
@@ -5702,24 +6420,26 @@ lock_clust_rec_read_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
+
+		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+	}
+
+	lock_mutex_enter();
 
 	ut_ad(mode != LOCK_X
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
 	ut_ad(mode != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 
-	if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
-
-		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
-	}
-
 	err = lock_rec_lock(FALSE, mode | gap_mode,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	lock_mutex_exit();
+
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	return(err);
 }
@@ -5734,7 +6454,7 @@ lock_clust_rec_read_check_and_lock() that does not require the parameter
 "offsets".
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
 UNIV_INTERN
-ulint
+dberr_t
 lock_clust_rec_read_check_and_lock_alt(
 /*===================================*/
 	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
@@ -5757,7 +6477,7 @@ lock_clust_rec_read_check_and_lock_alt(
 	mem_heap_t*	tmp_heap	= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets		= offsets_;
-	ulint		err;
+	dberr_t		err;
 	rec_offs_init(offsets_);
 
 	offsets = rec_get_offsets(rec, index, offsets,
@@ -5786,12 +6506,12 @@ lock_release_autoinc_last_lock(
 	ulint		last;
 	lock_t*		lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(!ib_vector_is_empty(autoinc_locks));
 
 	/* The lock to be release must be the last lock acquired. */
 	last = ib_vector_size(autoinc_locks) - 1;
-	lock = ib_vector_get(autoinc_locks, last);
+	lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last));
 
 	/* Should have only AUTOINC locks in the vector. */
 	ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
@@ -5801,12 +6521,15 @@ lock_release_autoinc_last_lock(
 
 	/* This will remove the lock from the trx autoinc_locks too. */
 	lock_table_dequeue(lock);
+
+	/* Remove from the table vector too. */
+	lock_trx_table_locks_remove(lock);
 }
 
 /*******************************************************************//**
-Check if a transaction holds any autoinc locks. 
+Check if a transaction holds any autoinc locks.
 @return TRUE if the transaction holds any AUTOINC locks. */
-UNIV_INTERN
+static
 ibool
 lock_trx_holds_autoinc_locks(
 /*=========================*/
@@ -5819,13 +6542,16 @@ lock_trx_holds_autoinc_locks(
 
 /*******************************************************************//**
 Release all the transaction's autoinc locks. */
-UNIV_INTERN
+static
 void
 lock_release_autoinc_locks(
 /*=======================*/
 	trx_t*		trx)		/*!< in/out: transaction */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	/* If this is invoked for a running transaction by the thread
+	that is serving the transaction, then it is not necessary to
+	hold trx->mutex here. */
 
 	ut_a(trx->autoinc_locks != NULL);
 
@@ -5946,6 +6672,8 @@ lock_get_table(
 {
 	switch (lock_get_type_low(lock)) {
 	case LOCK_REC:
+		ut_ad(dict_index_is_clust(lock->index)
+		      || !dict_index_is_online_ddl(lock->index));
 		return(lock->index->table);
 	case LOCK_TABLE:
 		return(lock->un_member.tab_lock.table);
@@ -5998,6 +6726,8 @@ lock_rec_get_index(
 	const lock_t*	lock)	/*!< in: lock */
 {
 	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(dict_index_is_clust(lock->index)
+	      || !dict_index_is_online_ddl(lock->index));
 
 	return(lock->index);
 }
@@ -6013,6 +6743,8 @@ lock_rec_get_index_name(
 	const lock_t*	lock)	/*!< in: lock */
 {
 	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(dict_index_is_clust(lock->index)
+	      || !dict_index_is_online_ddl(lock->index));
 
 	return(lock->index->name);
 }
@@ -6044,3 +6776,401 @@ lock_rec_get_page_no(
 
 	return(lock->un_member.rec_lock.page_no);
 }
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock)	/*!< in/out: waiting lock request */
+{
+	que_thr_t*	thr;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(lock->trx));
+	ut_ad(!(lock->type_mode & LOCK_CONV_BY_OTHER));
+
+	lock->trx->lock.cancel = TRUE;
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+
+		lock_rec_dequeue_from_page(lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		if (lock->trx->autoinc_locks != NULL) {
+			/* Release the transaction's AUTOINC locks. */
+			lock_release_autoinc_locks(lock->trx);
+		}
+
+		lock_table_dequeue(lock);
+	}
+
+	/* Reset the wait flag and the back pointer to lock in trx. */
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* The following function releases the trx from lock wait. */
+
+	thr = que_thr_end_lock_wait(lock->trx);
+
+	if (thr != NULL) {
+		lock_wait_release_thread_if_suspended(thr);
+	}
+
+	lock->trx->lock.cancel = FALSE;
+}
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	ut_ad(!lock_mutex_own());
+	ut_ad(!trx_mutex_own(trx));
+	ut_ad(!trx->lock.wait_lock);
+	/* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+	but not COMMITTED transactions. */
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+	      || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+	/* This function is invoked for a running transaction by the
+	thread that is serving the transaction. Therefore it is not
+	necessary to hold trx->mutex here. */
+
+	if (lock_trx_holds_autoinc_locks(trx)) {
+		lock_mutex_enter();
+
+		lock_release_autoinc_locks(trx);
+
+		lock_mutex_exit();
+	}
+}
+
+/*********************************************************************//**
+Releases a transaction's locks, and releases possible other transactions
+waiting because of these locks. Change the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
+UNIV_INTERN
+void
+lock_trx_release_locks(
+/*===================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	assert_trx_in_list(trx);
+
+	if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
+		mutex_enter(&trx_sys->mutex);
+		ut_a(trx_sys->n_prepared_trx > 0);
+		trx_sys->n_prepared_trx--;
+		if (trx->is_recovered) {
+			ut_a(trx_sys->n_prepared_recovered_trx > 0);
+			trx_sys->n_prepared_recovered_trx--;
+		}
+		mutex_exit(&trx_sys->mutex);
+	} else {
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	}
+
+	/* The transition of trx->state to TRX_STATE_COMMITTED_IN_MEMORY
+	is protected by both the lock_sys->mutex and the trx->mutex.
+	We also lock trx_sys->mutex, because state transition to
+	TRX_STATE_COMMITTED_IN_MEMORY must be atomic with removing trx
+	from the descriptors array. */
+	lock_mutex_enter();
+	mutex_enter(&trx_sys->mutex);
+	trx_mutex_enter(trx);
+
+	/* The following assignment makes the transaction committed in memory
+	and makes its changes to data visible to other transactions.
+	NOTE that there is a small discrepancy from the strict formal
+	visibility rules here: a human user of the database can see
+	modifications made by another transaction T even before the necessary
+	log segment has been flushed to the disk. If the database happens to
+	crash before the flush, the user has seen modifications from T which
+	will never be a committed transaction. However, any transaction T2
+	which sees the modifications of the committing transaction T, and
+	which also itself makes modifications to the database, will get an lsn
+	larger than the committing transaction T. In the case where the log
+	flush fails, and T never gets committed, also T2 will never get
+	committed. */
+
+	/*--------------------------------------*/
+	trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+	/* The following also removes trx from trx_serial_list */
+	trx_release_descriptor(trx);
+	/*--------------------------------------*/
+
+	/* If the background thread trx_rollback_or_clean_recovered()
+	is still active then there is a chance that the rollback
+	thread may see this trx as COMMITTED_IN_MEMORY and goes ahead
+	to clean it up calling trx_cleanup_at_db_startup(). This can
+	happen in the case we are committing a trx here that is left
+	in PREPARED state during the crash. Note that commit or the
+	rollback of a PREPARED trx happens in the recovery thread
+	while the rollback of other transactions happens in the
+	background thread. To avoid this race we unconditionally unset
+	the is_recovered flag. */
+
+	trx->is_recovered = FALSE;
+
+	trx_mutex_exit(trx);
+
+	mutex_exit(&trx_sys->mutex);
+
+	lock_release(trx);
+
+	lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: trx lock state */
+{
+	dberr_t	err;
+
+	lock_mutex_enter();
+
+	trx_mutex_enter(trx);
+
+	if (trx->lock.was_chosen_as_deadlock_victim) {
+		err = DB_DEADLOCK;
+	} else if (trx->lock.wait_lock != NULL) {
+		lock_cancel_waiting_and_release(trx->lock.wait_lock);
+		err = DB_LOCK_WAIT;
+	} else {
+		/* The lock was probably granted before we got here. */
+		err = DB_SUCCESS;
+	}
+
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Get the length of the table->locks list, read while holding the lock mutex.
+@return number of locks in table->locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ulint		n_table_locks;
+
+	lock_mutex_enter();
+
+	n_table_locks = UT_LIST_GET_LEN(table->locks);
+
+	lock_mutex_exit();
+
+	return(n_table_locks);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Do an exhaustive check for any locks (table or rec) against the table.
+@return	lock if found */
+static
+const lock_t*
+lock_table_locks_lookup(
+/*====================*/
+	const dict_table_t*	table,		/*!< in: check if there are
+						any locks held on records in
+						this table or on the table
+						itself */
+	const trx_list_t*	trx_list)	/*!< in: trx list to check */
+{
+	trx_t*			trx;
+
+	ut_a(table != NULL);
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		const lock_t*	lock;
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+			ut_a(lock->trx == trx);
+
+			if (lock_get_type_low(lock) == LOCK_REC) {
+				ut_ad(!dict_index_is_online_ddl(lock->index)
+				      || dict_index_is_clust(lock->index));
+				if (lock->index->table == table) {
+					return(lock);
+				}
+			} else if (lock->un_member.tab_lock.table == table) {
+				return(lock);
+			}
+		}
+	}
+
+	return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return	TRUE if table has either table or record locks. */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+	const dict_table_t*	table)	/*!< in: check if there are any locks
+					held on records in this table or on the
+					table itself */
+{
+	ibool			has_locks;
+
+	lock_mutex_enter();
+
+	has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0;
+
+#ifdef UNIV_DEBUG
+	if (!has_locks) {
+		mutex_enter(&trx_sys->mutex);
+
+		ut_ad(!lock_table_locks_lookup(table, &trx_sys->rw_trx_list));
+		ut_ad(!lock_table_locks_lookup(table, &trx_sys->ro_trx_list));
+
+		mutex_exit(&trx_sys->mutex);
+	}
+#endif /* UNIV_DEBUG */
+
+	lock_mutex_exit();
+
+	return(has_locks);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return	the strongest lock found on any sys table or NULL for none */
+UNIV_INTERN
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+	const trx_t*	trx)	/*!< in: transaction to check */
+{
+	lint		i;
+	const lock_t*	strongest_lock = 0;
+	lock_mode	strongest = LOCK_NONE;
+
+	lock_mutex_enter();
+
+	/* Find a valid mode. Note: ib_vector_size() can be 0. */
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
+
+		if (lock != NULL
+		    && dict_is_sys_table(lock->un_member.tab_lock.table->id)) {
+
+			strongest = lock_get_mode(lock);
+			ut_ad(strongest != LOCK_NONE);
+			strongest_lock = lock;
+			break;
+		}
+	}
+
+	if (strongest == LOCK_NONE) {
+		lock_mutex_exit();
+		return(NULL);
+	}
+
+	for (/* No op */; i >= 0; --i) {
+		const lock_t*	lock;
+
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
+
+		if (lock == NULL) {
+			continue;
+		}
+
+		ut_ad(trx == lock->trx);
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_ad(lock->un_member.tab_lock.table != NULL);
+
+		lock_mode	mode = lock_get_mode(lock);
+
+		if (dict_is_sys_table(lock->un_member.tab_lock.table->id)
+		    && lock_mode_stronger_or_eq(mode, strongest)) {
+
+			strongest = mode;
+			strongest_lock = lock;
+		}
+	}
+
+	lock_mutex_exit();
+
+	return(strongest_lock);
+}
+
+/*******************************************************************//**
+Assert with ut_a() that trx holds the expected table and record locks.
+@return	always true (a missing lock fails the assertion instead) */
+UNIV_INTERN
+bool
+lock_trx_has_rec_x_lock(
+/*====================*/
+	const trx_t*		trx,	/*!< in: transaction to check */
+	const dict_table_t*	table,	/*!< in: table to check */
+	const buf_block_t*	block,	/*!< in: buffer block of the record */
+	ulint			heap_no)/*!< in: record heap number */
+{
+	enum lock_mode	intention_lock;
+	enum lock_mode	rec_lock;
+	ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM);
+
+	if (UNIV_UNLIKELY(trx->fake_changes)) {
+
+		intention_lock = LOCK_IS;
+		rec_lock = LOCK_S;
+	} else {
+
+		intention_lock = LOCK_IX;
+		rec_lock = LOCK_X;
+	}
+	lock_mutex_enter();
+	ut_a(lock_table_has(trx, table, intention_lock));
+	if (UNIV_LIKELY(srv_fake_changes_locks)) {
+
+		ut_a(lock_rec_has_expl(rec_lock | LOCK_REC_NOT_GAP,
+				       block, heap_no, trx));
+	}
+	lock_mutex_exit();
+	return(true);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/lock/lock0wait.cc b/storage/xtradb/lock/lock0wait.cc
new file mode 100644
index 00000000000..a1c35e20ead
--- /dev/null
+++ b/storage/xtradb/lock/lock0wait.cc
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0wait.cc
+The transaction lock system: suspending, waking and timing out lock waiters
+
+Created 25/5/2010 Sunny Bains
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "srv0mon.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "ha_prototypes.h"
+#include "lock0priv.h"
+
+/*********************************************************************//**
+Print the contents of the lock_sys_t::waiting_threads array. */
+static
+void
+lock_wait_table_print(void)
+/*=======================*/
+{
+	ulint			i;
+	const srv_slot_t*	slot;
+
+	ut_ad(lock_wait_mutex_own());
+
+	slot = lock_sys->waiting_threads;
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++, ++slot) {
+
+		fprintf(stderr,
+			"Slot %lu: thread type %lu,"
+			" in use %lu, susp %lu, timeout %lu, time %lu\n",
+			(ulong) i,
+			(ulong) slot->type,
+			(ulong) slot->in_use,
+			(ulong) slot->suspended,
+			slot->wait_timeout,
+			(ulong) difftime(ut_time(), slot->suspend_time));
+	}
+}
+
+/*********************************************************************//**
+Release a slot in the lock_sys_t::waiting_threads. Adjust the array last pointer
+if there are empty slots towards the end of the table. */
+static
+void
+lock_wait_table_release_slot(
+/*=========================*/
+	srv_slot_t*	slot)		/*!< in: slot to release */
+{
+#ifdef UNIV_DEBUG
+	srv_slot_t*	upper = lock_sys->waiting_threads + OS_THREAD_MAX_N;
+#endif /* UNIV_DEBUG */
+
+	lock_wait_mutex_enter();
+
+	ut_ad(slot->in_use);
+	ut_ad(slot->thr != NULL);
+	ut_ad(slot->thr->slot != NULL);
+	ut_ad(slot->thr->slot == slot);
+
+	/* Must be within the array boundaries. */
+	ut_ad(slot >= lock_sys->waiting_threads);
+	ut_ad(slot < upper);
+
+	/* Note: When we reserve the slot we use the trx_t::mutex to update
+	the slot values to change the state to reserved. Here we are using the
+	lock mutex to change the state of the slot to free. This is by design,
+	because when we query the slot state we always hold both the lock and
+	trx_t::mutex. To reduce contention on the lock mutex when reserving the
+	slot we avoid acquiring the lock mutex. */
+
+	lock_mutex_enter();
+
+	slot->thr->slot = NULL;
+	slot->thr = NULL;
+	slot->in_use = FALSE;
+
+	lock_mutex_exit();
+
+	/* Scan backwards and adjust the last free slot pointer. */
+	for (slot = lock_sys->last_slot;
+	     slot > lock_sys->waiting_threads && !slot->in_use;
+	     --slot) {
+		/* No op */
+	}
+
+	/* Either the array is empty or the last scanned slot is in use. */
+	ut_ad(slot->in_use || slot == lock_sys->waiting_threads);
+
+	lock_sys->last_slot = slot + 1;
+
+	/* The last slot is either outside of the array boundary or it's
+	on an empty slot. */
+	ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use);
+
+	ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads);
+	ut_ad(lock_sys->last_slot <= upper);
+
+	lock_wait_mutex_exit();
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current user OS thread.
+@return	reserved slot */
+static
+srv_slot_t*
+lock_wait_table_reserve_slot(
+/*=========================*/
+	que_thr_t*	thr,		/*!< in: query thread associated
+					with the user OS thread */
+	ulong		wait_timeout)	/*!< in: lock wait timeout value */
+{
+	ulint		i;
+	srv_slot_t*	slot;
+
+	ut_ad(lock_wait_mutex_own());
+	ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+	slot = lock_sys->waiting_threads;
+
+	for (i = OS_THREAD_MAX_N; i--; ++slot) {
+		if (!slot->in_use) {
+			slot->in_use = TRUE;
+			slot->thr = thr;
+			slot->thr->slot = slot;
+
+			if (slot->event == NULL) {
+				slot->event = os_event_create();
+				ut_a(slot->event);
+			}
+
+			os_event_reset(slot->event);
+			slot->suspended = TRUE;
+			slot->suspend_time = ut_time();
+			slot->wait_timeout = wait_timeout;
+
+			if (slot == lock_sys->last_slot) {
+				++lock_sys->last_slot;
+			}
+
+			ut_ad(lock_sys->last_slot
+			      <= lock_sys->waiting_threads + OS_THREAD_MAX_N);
+
+			return(slot);
+		}
+	}
+
+	ut_print_timestamp(stderr);
+
+	fprintf(stderr,
+		"  InnoDB: There appear to be %lu user"
+		" threads currently waiting\n"
+		"InnoDB: inside InnoDB, which is the"
+		" upper limit. Cannot continue operation.\n"
+		"InnoDB: As a last thing, we print"
+		" a list of waiting threads.\n", (ulong) OS_THREAD_MAX_N);
+
+	lock_wait_table_print();
+
+	ut_error;
+	return(NULL);
+}
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the
+				user OS thread */
+{
+	srv_slot_t*	slot;
+	double		wait_time;
+	trx_t*		trx;
+	ulint		had_dict_lock;
+	ibool		was_declared_inside_innodb;
+	ib_int64_t	start_time			= 0;
+	ib_int64_t	finish_time;
+	ulint		sec;
+	ulint		ms;
+	ulong		lock_wait_timeout;
+
+	trx = thr_get_trx(thr);
+
+	if (trx->mysql_thd != 0) {
+		DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
+	}
+
+	/* InnoDB system transactions (such as the purge, and
+	incomplete transactions that are being rolled back after crash
+	recovery) will use the global value of
+	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+	lock_wait_timeout = trx_lock_wait_timeout_get(trx);
+
+	lock_wait_mutex_enter();
+
+	trx_mutex_enter(trx);
+
+	trx->error_state = DB_SUCCESS;
+
+	if (thr->state == QUE_THR_RUNNING) {
+
+		ut_ad(thr->is_active);
+
+		/* The lock has already been released or this transaction
+		was chosen as a deadlock victim: no need to suspend */
+
+		if (trx->lock.was_chosen_as_deadlock_victim) {
+
+			trx->error_state = DB_DEADLOCK;
+			trx->lock.was_chosen_as_deadlock_victim = FALSE;
+		}
+
+		lock_wait_mutex_exit();
+		trx_mutex_exit(trx);
+		return;
+	}
+
+	ut_ad(!thr->is_active);
+
+	slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		srv_stats.n_lock_wait_count.inc();
+		srv_stats.n_lock_wait_current_count.inc();
+
+		if (ut_usectime(&sec, &ms) == -1) {
+			start_time = -1;
+		} else {
+			start_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+	}
+
+	/* Wake the lock timeout monitor thread, if it is suspended */
+
+	os_event_set(lock_sys->timeout_event);
+
+	lock_wait_mutex_exit();
+	trx_mutex_exit(trx);
+
+	ulint	lock_type = ULINT_UNDEFINED;
+
+	lock_mutex_enter();
+
+	if (const lock_t* wait_lock = trx->lock.wait_lock) {
+		lock_type = lock_get_type_low(wait_lock);
+	}
+
+	lock_mutex_exit();
+
+	had_dict_lock = trx->dict_operation_lock_mode;
+
+	switch (had_dict_lock) {
+	case 0:
+		break;
+	case RW_S_LATCH:
+		/* Release foreign key check latch */
+		row_mysql_unfreeze_data_dictionary(trx);
+
+		DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep");
+		break;
+	default:
+		/* There should never be a lock wait when the
+		dictionary latch is reserved in X mode.  Dictionary
+		transactions should only acquire locks on dictionary
+		tables, not other tables. All access to dictionary
+		tables should be covered by dictionary
+		transactions. */
+		ut_error;
+	}
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	/* Suspend this thread and wait for the event. */
+
+	was_declared_inside_innodb = trx->declared_to_be_inside_innodb;
+
+	if (was_declared_inside_innodb) {
+		/* We must declare this OS thread to exit InnoDB, since a
+		possible other thread holding a lock which this thread waits
+		for must be allowed to enter, sooner or later */
+
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	/* Unknown is also treated like a record lock */
+	if (lock_type == ULINT_UNDEFINED || lock_type == LOCK_REC) {
+		thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
+	} else {
+		ut_ad(lock_type == LOCK_TABLE);
+		thd_wait_begin(trx->mysql_thd, THD_WAIT_TABLE_LOCK);
+	}
+
+	os_event_wait(slot->event);
+
+	thd_wait_end(trx->mysql_thd);
+
+	/* After resuming, reacquire the data dictionary latch if
+	necessary. */
+
+	if (was_declared_inside_innodb) {
+
+		/* Return back inside InnoDB */
+
+		srv_conc_force_enter_innodb(trx);
+	}
+
+	if (had_dict_lock) {
+
+		row_mysql_freeze_data_dictionary(trx);
+	}
+
+	wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+	/* Release the slot for others to use */
+
+	lock_wait_table_release_slot(slot);
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		ulint	diff_time;
+
+		if (ut_usectime(&sec, &ms) == -1) {
+			finish_time = -1;
+		} else {
+			finish_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+
+		diff_time = (finish_time > start_time) ?
+			    (ulint) (finish_time - start_time) : 0;
+
+		srv_stats.n_lock_wait_current_count.dec();
+		srv_stats.n_lock_wait_time.add(diff_time);
+
+		/* Only update the variable if we successfully
+		retrieved the start and finish times. See Bug#36819. */
+		if (diff_time > lock_sys->n_lock_max_wait_time
+		    && start_time != -1
+		    && finish_time != -1) {
+
+			lock_sys->n_lock_max_wait_time = diff_time;
+		}
+
+		/* Record the lock wait time for this thread */
+		thd_set_lock_wait_time(trx->mysql_thd, diff_time);
+
+	}
+
+	if (lock_wait_timeout < 100000000
+	    && wait_time > (double) lock_wait_timeout) {
+
+		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+
+		MONITOR_INC(MONITOR_TIMEOUT);
+	}
+
+	if (trx_is_interrupted(trx)) {
+
+		trx->error_state = DB_INTERRUPTED;
+	}
+}
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the
+				user OS thread	 */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+	/* We own both the lock mutex and the trx_t::mutex but not the
+	lock wait mutex. This is OK because other threads will see the state
+	of this slot as being in use and no other thread can change the state
+	of the slot to free unless that thread also owns the lock mutex. */
+
+	if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
+		trx_t*	trx = thr_get_trx(thr);
+
+		if (trx->lock.was_chosen_as_deadlock_victim) {
+
+			trx->error_state = DB_DEADLOCK;
+			trx->lock.was_chosen_as_deadlock_victim = FALSE;
+		}
+
+		os_event_set(thr->slot->event);
+	}
+}
+
+/*********************************************************************//**
+Check if the thread lock wait has timed out. Release its locks if the
+wait has actually timed out. */
+static
+void
+lock_wait_check_and_cancel(
+/*=======================*/
+	const srv_slot_t*	slot)	/*!< in: slot reserved by a user
+					thread when the wait started */
+{
+	trx_t*		trx;
+	double		wait_time;
+	ib_time_t	suspend_time = slot->suspend_time;
+
+	ut_ad(lock_wait_mutex_own());
+
+	ut_ad(slot->in_use);
+
+	ut_ad(slot->suspended);
+
+	wait_time = ut_difftime(ut_time(), suspend_time);
+
+	trx = thr_get_trx(slot->thr);
+
+	if (trx_is_interrupted(trx)
+	    || (slot->wait_timeout < 100000000
+		&& (wait_time > (double) slot->wait_timeout
+		   || wait_time < 0))) {
+
+		/* Timeout exceeded or a wrap-around in system
+		time counter: cancel the lock request queued
+		by the transaction and release possible
+		other transactions waiting behind; it is
+		possible that the lock has already been
+		granted: in that case do nothing */
+
+		lock_mutex_enter();
+
+		trx_mutex_enter(trx);
+
+		if (trx->lock.wait_lock) {
+
+			ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+
+			lock_cancel_waiting_and_release(trx->lock.wait_lock);
+		}
+
+		lock_mutex_exit();
+
+		trx_mutex_exit(trx);
+	}
+
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+	void*	arg __attribute__((unused)))
+			/* in: a dummy parameter required by
+			os_thread_create */
+{
+	ib_int64_t	sig_count = 0;
+	os_event_t	event = lock_sys->timeout_event;
+
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_lock_timeout_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+	lock_sys->timeout_thread_active = true;
+
+	do {
+		srv_slot_t*	slot;
+
+		/* When someone is waiting for a lock, we wake up every second
+		and check if a timeout has passed for a lock wait */
+
+		os_event_wait_time_low(event, 1000000, sig_count);
+		sig_count = os_event_reset(event);
+
+		if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+			break;
+		}
+
+		lock_wait_mutex_enter();
+
+		/* Check all slots for user threads that are waiting
+		on locks, and if they have exceeded the time limit. */
+
+		for (slot = lock_sys->waiting_threads;
+		     slot < lock_sys->last_slot;
+		     ++slot) {
+
+			/* We are doing a read without the lock mutex
+			and/or the trx mutex. This is OK because a slot
+			can't be freed or reserved without the lock wait
+			mutex. */
+
+			if (slot->in_use) {
+				lock_wait_check_and_cancel(slot);
+			}
+		}
+
+		sig_count = os_event_reset(event);
+
+		lock_wait_mutex_exit();
+
+	} while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+
+	lock_sys->timeout_thread_active = false;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.cc
index 9b09d52e576..403ceda7a10 100644
--- a/storage/xtradb/log/log0log.c
+++ b/storage/xtradb/log/log0log.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file log/log0log.c
+@file log/log0log.cc
 Database log
 
 Created 12/9/1995 Heikki Tuuri
@@ -48,7 +48,7 @@ Created 12/9/1995 Heikki Tuuri
 #include "srv0start.h"
 #include "trx0sys.h"
 #include "trx0trx.h"
-#include "ha_prototypes.h"
+#include "srv0mon.h"
 
 /*
 General philosophy of InnoDB redo-logs:
@@ -76,13 +76,13 @@ reduce the size of the log.
 
 */
 
-/* Current free limit of space 0; protected by the log sys mutex; 0 means
-uninitialized */
-UNIV_INTERN ulint	log_fsp_current_free_limit		= 0;
-
 /* Global log system variable */
 UNIV_INTERN log_t*	log_sys	= NULL;
 
+/** Pointer to the log checksum calculation function */
+UNIV_INTERN log_checksum_func_t log_checksum_algorithm_ptr	=
+	log_block_calc_checksum_innodb;
+
 #ifdef UNIV_PFS_RWLOCK
 UNIV_INTERN mysql_pfs_key_t	checkpoint_lock_key;
 # ifdef UNIV_LOG_ARCHIVE
@@ -167,42 +167,15 @@ log_io_complete_archive(void);
 #endif /* UNIV_LOG_ARCHIVE */
 
 /****************************************************************//**
-Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
-so that we know that the limit has been written to a log checkpoint field
-on disk. */
-UNIV_INTERN
-void
-log_fsp_current_free_limit_set_and_checkpoint(
-/*==========================================*/
-	ulint	limit)	/*!< in: limit to set */
-{
-	ibool	success;
-
-	mutex_enter(&(log_sys->mutex));
-
-	log_fsp_current_free_limit = limit;
-
-	mutex_exit(&(log_sys->mutex));
-
-	/* Try to make a synchronous checkpoint */
-
-	success = FALSE;
-
-	while (!success) {
-          success = log_checkpoint(TRUE, TRUE, FALSE);
-	}
-}
-
-/****************************************************************//**
 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
 exists.
 @return	LSN of oldest modification */
 static
-ib_uint64_t
+lsn_t
 log_buf_pool_get_oldest_modification(void)
 /*======================================*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
@@ -224,7 +197,7 @@ log0online.c.
 
 @return log_sys->tracked_lsn value. */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_get_tracked_lsn()
 {
 #ifdef HAVE_ATOMIC_BUILTINS_64
@@ -247,8 +220,8 @@ log_check_tracking_margin(
 				plan to write.  If zero, the margin will be
 				checked for the already-written log. */
 {
-	ib_uint64_t	tracked_lsn;
-	ib_uint64_t	tracked_lsn_age;
+	lsn_t	tracked_lsn;
+	lsn_t	tracked_lsn_age;
 
 	if (!srv_track_changed_pages) {
 		return FALSE;
@@ -268,7 +241,7 @@ log_check_tracking_margin(
 Opens the log for log_write_low. The log must be closed with log_close.
 @return	start lsn of the log record */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_open(
 /*=====*/
 	ulint	len)	/*!< in: length of data to be catenated */
@@ -299,7 +272,7 @@ loop:
 
 		log_buffer_flush_to_disk();
 
-		srv_log_waits++;
+		srv_stats.log_waits.inc();
 
 		ut_ad(++count < 50);
 
@@ -394,8 +367,10 @@ part_loop:
 	str_len -= len;
 	str = str + len;
 
-	log_block = ut_align_down(log->buf + log->buf_free,
-				  OS_FILE_LOG_BLOCK_SIZE);
+	log_block = static_cast<byte*>(
+		ut_align_down(
+			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
 	log_block_set_data_len(log_block, data_len);
 
 	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
@@ -421,60 +396,35 @@ part_loop:
 		goto part_loop;
 	}
 
-	srv_log_write_requests++;
-}
-
-/************************************************************//**
-*/
-UNIV_INLINE
-ulint
-log_max_modified_age_async()
-{
-	if (srv_checkpoint_age_target) {
-		return(ut_min(log_sys->max_modified_age_async,
-				srv_checkpoint_age_target
-				- srv_checkpoint_age_target / 8));
-	} else {
-		return(log_sys->max_modified_age_async);
-	}
-}
-
-UNIV_INLINE
-ulint
-log_max_checkpoint_age_async()
-{
-	if (srv_checkpoint_age_target) {
-		return(ut_min(log_sys->max_checkpoint_age_async,
-				srv_checkpoint_age_target));
-	} else {
-		return(log_sys->max_checkpoint_age_async);
-	}
+	srv_stats.log_write_requests.inc();
 }
 
 /************************************************************//**
 Closes the log.
 @return	lsn */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_close(void)
 /*===========*/
 {
 	byte*		log_block;
 	ulint		first_rec_group;
-	ib_uint64_t	oldest_lsn;
-	ib_uint64_t	lsn;
-	ib_uint64_t	tracked_lsn;
-	ib_uint64_t	tracked_lsn_age;
+	lsn_t		oldest_lsn;
+	lsn_t		lsn;
+	lsn_t		tracked_lsn;
+	lsn_t		tracked_lsn_age;
 	log_t*		log	= log_sys;
-	ib_uint64_t	checkpoint_age;
+	lsn_t		checkpoint_age;
 
 	ut_ad(mutex_own(&(log->mutex)));
 	ut_ad(!recv_no_log_write);
 
 	lsn = log->lsn;
 
-	log_block = ut_align_down(log->buf + log->buf_free,
-				  OS_FILE_LOG_BLOCK_SIZE);
+	log_block = static_cast<byte*>(
+		ut_align_down(
+			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
 	first_rec_group = log_block_get_first_rec_group(log_block);
 
 	if (first_rec_group == 0) {
@@ -502,7 +452,8 @@ log_close(void)
 				"oldest untracked record exceeds the log "
 				"group capacity!\n");
 			fprintf(stderr, "InnoDB: Error: stopping the log "
-				"tracking thread at LSN %llu\n", tracked_lsn);
+				"tracking thread at LSN " LSN_PF "\n",
+				tracked_lsn);
 			srv_track_changed_pages = FALSE;
 		}
 	}
@@ -524,21 +475,21 @@ log_close(void)
 
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: ERROR: the age of the last"
-				" checkpoint is %lu,\n"
+				" InnoDB: ERROR: the age of the last"
+				" checkpoint is " LSN_PF ",\n"
 				"InnoDB: which exceeds the log group"
-				" capacity %lu.\n"
+				" capacity " LSN_PF ".\n"
 				"InnoDB: If you are using big"
 				" BLOB or TEXT rows, you must set the\n"
 				"InnoDB: combined size of log files"
 				" at least 10 times bigger than the\n"
 				"InnoDB: largest such row.\n",
-				(ulong) checkpoint_age,
-				(ulong) log->log_group_capacity);
+				checkpoint_age,
+				log->log_group_capacity);
 		}
 	}
 
-	if (checkpoint_age <= log_max_modified_age_async()) {
+	if (checkpoint_age <= log->max_modified_age_sync) {
 
 		goto function_exit;
 	}
@@ -546,8 +497,8 @@ log_close(void)
 	oldest_lsn = buf_pool_get_oldest_modification();
 
 	if (!oldest_lsn
-	    || lsn - oldest_lsn > log_max_modified_age_async()
-	    || checkpoint_age > log_max_checkpoint_age_async()) {
+	    || lsn - oldest_lsn > log->max_modified_age_sync
+	    || checkpoint_age > log->max_checkpoint_age_async) {
 
 		log->check_flush_or_checkpoint = TRUE;
 	}
@@ -600,7 +551,7 @@ Calculates the data capacity of a log group, when the log file headers are not
 included.
 @return	capacity in bytes */
 UNIV_INTERN
-ulint
+lsn_t
 log_group_get_capacity(
 /*===================*/
 	const log_group_t*	group)	/*!< in: log group */
@@ -615,10 +566,10 @@ Calculates the offset within a log group, when the log file headers are not
 included.
 @return	size offset (<= offset) */
 UNIV_INLINE
-ulint
+lsn_t
 log_group_calc_size_offset(
 /*=======================*/
-	ulint			offset,	/*!< in: real offset within the
+	lsn_t			offset,	/*!< in: real offset within the
 					log group */
 	const log_group_t*	group)	/*!< in: log group */
 {
@@ -632,10 +583,10 @@ Calculates the offset within a log group, when the log file headers are
 included.
 @return	real offset (>= offset) */
 UNIV_INLINE
-ulint
+lsn_t
 log_group_calc_real_offset(
 /*=======================*/
-	ulint			offset,	/*!< in: size offset within the
+	lsn_t			offset,	/*!< in: size offset within the
 					log group */
 	const log_group_t*	group)	/*!< in: log group */
 {
@@ -649,36 +600,31 @@ log_group_calc_real_offset(
 Calculates the offset of an lsn within a log group.
 @return	offset within the log group */
 static
-ulint
+lsn_t
 log_group_calc_lsn_offset(
 /*======================*/
-	ib_uint64_t		lsn,	/*!< in: lsn, must be within 4 GB of
-					group->lsn */
+	lsn_t			lsn,	/*!< in: lsn */
 	const log_group_t*	group)	/*!< in: log group */
 {
-	ib_uint64_t	gr_lsn;
-	ib_int64_t	gr_lsn_size_offset;
-	ib_int64_t	difference;
-	ib_int64_t	group_size;
-	ib_int64_t	offset;
+	lsn_t	gr_lsn;
+	lsn_t	gr_lsn_size_offset;
+	lsn_t	difference;
+	lsn_t	group_size;
+	lsn_t	offset;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
-	/* If total log file size is > 2 GB we can easily get overflows
-	with 32-bit integers. Use 64-bit integers instead. */
-
 	gr_lsn = group->lsn;
 
-	gr_lsn_size_offset = (ib_int64_t)
-		log_group_calc_size_offset(group->lsn_offset, group);
+	gr_lsn_size_offset = log_group_calc_size_offset(group->lsn_offset, group);
 
-	group_size = (ib_int64_t) log_group_get_capacity(group);
+	group_size = log_group_get_capacity(group);
 
 	if (lsn >= gr_lsn) {
 
-		difference = (ib_int64_t) (lsn - gr_lsn);
+		difference = lsn - gr_lsn;
 	} else {
-		difference = (ib_int64_t) (gr_lsn - lsn);
+		difference = gr_lsn - lsn;
 
 		difference = difference % group_size;
 
@@ -687,16 +633,13 @@ log_group_calc_lsn_offset(
 
 	offset = (gr_lsn_size_offset + difference) % group_size;
 
-	if (sizeof(ulint) == 4) {
-	ut_a(offset < (((ib_int64_t) 1) << 32)); /* offset must be < 4 GB */
-	}
-
 	/* fprintf(stderr,
-	"Offset is %lu gr_lsn_offset is %lu difference is %lu\n",
-	(ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference);
+	"Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
+	" difference is " LSN_PF "\n",
+	offset, gr_lsn_size_offset, difference);
 	*/
 
-	return(log_group_calc_real_offset((ulint)offset, group));
+	return(log_group_calc_real_offset(offset, group));
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -728,9 +671,9 @@ log_calc_where_lsn_is(
 
 	if (lsn < first_header_lsn) {
 		add_this_many = 1 + (first_header_lsn - lsn)
-			/ (capacity * (ib_int64_t)n_log_files);
+			/ (capacity * (ib_int64_t) n_log_files);
 		lsn += add_this_many
-			* capacity * (ib_int64_t)n_log_files;
+			* capacity * (ib_int64_t) n_log_files;
 	}
 
 	ut_a(lsn >= first_header_lsn);
@@ -754,7 +697,7 @@ void
 log_group_set_fields(
 /*=================*/
 	log_group_t*	group,	/*!< in/out: group */
-	ib_uint64_t	lsn)	/*!< in: lsn for which the values should be
+	lsn_t		lsn)	/*!< in: lsn for which the values should be
 				set */
 {
 	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
@@ -772,12 +715,12 @@ log_calc_max_ages(void)
 /*===================*/
 {
 	log_group_t*	group;
-	ulint		margin;
+	lsn_t		margin;
 	ulint		free;
 	ibool		success		= TRUE;
-	ulint		smallest_capacity;
-	ulint		archive_margin;
-	ulint		smallest_archive_margin;
+	lsn_t		smallest_capacity;
+	lsn_t		archive_margin;
+	lsn_t		smallest_archive_margin;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -785,8 +728,8 @@ log_calc_max_ages(void)
 
 	ut_ad(group);
 
-	smallest_capacity = ULINT_MAX;
-	smallest_archive_margin = ULINT_MAX;
+	smallest_capacity = LSN_MAX;
+	smallest_archive_margin = LSN_MAX;
 
 	while (group) {
 		if (log_group_get_capacity(group) < smallest_capacity) {
@@ -824,8 +767,6 @@ log_calc_max_ages(void)
 		margin = smallest_capacity - free;
 	}
 
-	margin = ut_min(margin, log_sys->adm_checkpoint_interval);
-
 	margin = margin - margin / 10;	/* Add still some extra safety */
 
 	log_sys->log_group_capacity = smallest_capacity;
@@ -864,7 +805,7 @@ failure:
 			"InnoDB: " REFMAN "adding-and-removing.html\n"
 			"InnoDB: Cannot continue operation."
 			" Calling exit(1).\n",
-			(ulong)srv_thread_concurrency);
+			(ulong) srv_thread_concurrency);
 
 		exit(1);
 	}
@@ -879,7 +820,7 @@ void
 log_init(void)
 /*==========*/
 {
-	log_sys = mem_alloc(sizeof(log_t));
+	log_sys = static_cast<log_t*>(mem_alloc(sizeof(log_t)));
 
 	mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG);
 
@@ -897,12 +838,13 @@ log_init(void)
 	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
 	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
 
-	log_sys->buf_ptr = mem_alloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-	log_sys->buf = ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->buf_ptr = static_cast<byte*>(
+		mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-	log_sys->buf_size = LOG_BUFFER_SIZE;
+	log_sys->buf = static_cast<byte*>(
+		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
 
-	memset(log_sys->buf, '\0', LOG_BUFFER_SIZE);
+	log_sys->buf_size = LOG_BUFFER_SIZE;
 
 	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
 		- LOG_BUF_FLUSH_MARGIN;
@@ -926,28 +868,30 @@ log_init(void)
 
 	log_sys->n_pending_writes = 0;
 
-	log_sys->no_flush_event = os_event_create(NULL);
+	log_sys->no_flush_event = os_event_create();
 
 	os_event_set(log_sys->no_flush_event);
 
-	log_sys->one_flushed_event = os_event_create(NULL);
+	log_sys->one_flushed_event = os_event_create();
 
 	os_event_set(log_sys->one_flushed_event);
 
 	/*----------------------------*/
-	log_sys->adm_checkpoint_interval = ULINT_MAX;
 
 	log_sys->next_checkpoint_no = 0;
 	log_sys->last_checkpoint_lsn = log_sys->lsn;
 	log_sys->n_pending_checkpoint_writes = 0;
 
+
 	rw_lock_create(checkpoint_lock_key, &log_sys->checkpoint_lock,
 		       SYNC_NO_ORDER_CHECK);
 
-	log_sys->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
-	log_sys->checkpoint_buf = ut_align(log_sys->checkpoint_buf_ptr,
-					   OS_FILE_LOG_BLOCK_SIZE);
-	memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
+		mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+	log_sys->checkpoint_buf = static_cast<byte*>(
+		ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
 	/*----------------------------*/
 
 #ifdef UNIV_LOG_ARCHIVE
@@ -961,17 +905,15 @@ log_init(void)
 	rw_lock_create(archive_lock_key, &log_sys->archive_lock,
 		       SYNC_NO_ORDER_CHECK);
 
-	log_sys->archive_buf = NULL;
+	log_sys->archive_buf_ptr = static_cast<byte*>(
+		mem_zalloc(LOG_ARCHIVE_BUF_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-	/* ut_align(
-	ut_malloc(LOG_ARCHIVE_BUF_SIZE
-	+ OS_FILE_LOG_BLOCK_SIZE),
-	OS_FILE_LOG_BLOCK_SIZE); */
-	log_sys->archive_buf_size = 0;
+	log_sys->archive_buf = static_cast<byte*>(
+		ut_align(log_sys->archive_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
 
-	/* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
+	log_sys->archive_buf_size = LOG_ARCHIVE_BUF_SIZE;
 
-	log_sys->archiving_on = os_event_create(NULL);
+	log_sys->archiving_on = os_event_create();
 #endif /* UNIV_LOG_ARCHIVE */
 
 	log_sys->tracked_lsn = 0;
@@ -984,6 +926,9 @@ log_init(void)
 	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
 	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 	mutex_exit(&(log_sys->mutex));
 
 #ifdef UNIV_LOG_DEBUG
@@ -994,7 +939,7 @@ log_init(void)
 	recv_sys->scanned_lsn = log_sys->lsn;
 	recv_sys->scanned_checkpoint_no = 0;
 	recv_sys->recovered_lsn = log_sys->lsn;
-	recv_sys->limit_lsn = IB_ULONGLONG_MAX;
+	recv_sys->limit_lsn = LSN_MAX;
 #endif
 }
 
@@ -1006,7 +951,7 @@ log_group_init(
 /*===========*/
 	ulint	id,			/*!< in: group id */
 	ulint	n_files,		/*!< in: number of log files */
-	ulint	file_size,		/*!< in: log file size in bytes */
+	lsn_t	file_size,		/*!< in: log file size in bytes */
 	ulint	space_id,		/*!< in: space id of the file space
 					which contains the log files of this
 					group */
@@ -1021,7 +966,7 @@ log_group_init(
 
 	log_group_t*	group;
 
-	group = mem_alloc(sizeof(log_group_t));
+	group = static_cast<log_group_t*>(mem_alloc(sizeof(log_group_t)));
 
 	group->id = id;
 	group->n_files = n_files;
@@ -1032,50 +977,50 @@ log_group_init(
 	group->lsn_offset = LOG_FILE_HDR_SIZE;
 	group->n_pending_writes = 0;
 
-	group->file_header_bufs_ptr = mem_alloc(sizeof(byte*) * n_files);
-	group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+	group->file_header_bufs_ptr = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+
+	group->file_header_bufs = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+
 #ifdef UNIV_LOG_ARCHIVE
-	group->archive_file_header_bufs_ptr = mem_alloc(
-		sizeof(byte*) * n_files);
-	group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+	group->archive_file_header_bufs_ptr = static_cast<byte**>(
+		mem_zalloc( sizeof(byte*) * n_files));
+
+	group->archive_file_header_bufs = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
 #endif /* UNIV_LOG_ARCHIVE */
 
 	for (i = 0; i < n_files; i++) {
-		group->file_header_bufs_ptr[i] = mem_alloc(
-			LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-
-		group->file_header_bufs[i] = ut_align(
-			group->file_header_bufs_ptr[i],
-			OS_FILE_LOG_BLOCK_SIZE);
+		group->file_header_bufs_ptr[i] = static_cast<byte*>(
+			mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-		memset(*(group->file_header_bufs + i), '\0',
-		       LOG_FILE_HDR_SIZE);
+		group->file_header_bufs[i] = static_cast<byte*>(
+			ut_align(group->file_header_bufs_ptr[i],
+				 OS_FILE_LOG_BLOCK_SIZE));
 
 #ifdef UNIV_LOG_ARCHIVE
-		group->archive_file_header_bufs_ptr[i] = mem_alloc(
-			LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+		group->archive_file_header_bufs_ptr[i] = static_cast<byte*>(
+			mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-		group->archive_file_header_bufs[i] = ut_align(
-			group->archive_file_header_bufs_ptr[i],
-			OS_FILE_LOG_BLOCK_SIZE);
-
-		memset(*(group->archive_file_header_bufs + i), '\0',
-		       LOG_FILE_HDR_SIZE);
+		group->archive_file_header_bufs[i] = static_cast<byte*>(
+			ut_align(group->archive_file_header_bufs_ptr[i],
+				 OS_FILE_LOG_BLOCK_SIZE));
 #endif /* UNIV_LOG_ARCHIVE */
 	}
 
 #ifdef UNIV_LOG_ARCHIVE
 	group->archive_space_id = archive_space_id;
 
-	group->archived_file_no = 0;
+	group->archived_file_no = LOG_START_LSN;
 	group->archived_offset = 0;
 #endif /* UNIV_LOG_ARCHIVE */
 
-	group->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
-	group->checkpoint_buf = ut_align(group->checkpoint_buf_ptr,
-					 OS_FILE_LOG_BLOCK_SIZE);
+	group->checkpoint_buf_ptr = static_cast<byte*>(
+		mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
 
-	memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+	group->checkpoint_buf = static_cast<byte*>(
+		ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE));
 
 	UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);
 
@@ -1201,7 +1146,7 @@ log_io_complete(
 	ulint	unlock;
 
 #ifdef UNIV_LOG_ARCHIVE
-	if ((byte*)group == &log_archive_io) {
+	if ((byte*) group == &log_archive_io) {
 		/* It was an archive write */
 
 		log_io_complete_archive();
@@ -1210,15 +1155,15 @@ log_io_complete(
 	}
 #endif /* UNIV_LOG_ARCHIVE */
 
-	if ((ulint)group & 0x1UL) {
+	if ((ulint) group & 0x1UL) {
 		/* It was a checkpoint write */
-		group = (log_group_t*)((ulint)group - 1);
+		group = (log_group_t*)((ulint) group - 1);
 
 		if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
 		    && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
 		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
 
-			fil_flush(group->space_id, FALSE);
+			fil_flush(group->space_id);
 		}
 
 #ifdef UNIV_DEBUG
@@ -1241,7 +1186,7 @@ log_io_complete(
 	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
 	    && thd_flush_log_at_trx_commit(NULL) != 2) {
 
-		fil_flush(group->space_id, FALSE);
+		fil_flush(group->space_id);
 	}
 
 	mutex_enter(&(log_sys->mutex));
@@ -1252,6 +1197,7 @@ log_io_complete(
 
 	group->n_pending_writes--;
 	log_sys->n_pending_writes--;
+	MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
 
 	unlock = log_group_check_flush_completion(group);
 	unlock = unlock | log_sys_check_flush_completion();
@@ -1270,11 +1216,11 @@ log_group_file_header_flush(
 	log_group_t*	group,		/*!< in: log group */
 	ulint		nth_file,	/*!< in: header to the nth file in the
 					log file space */
-	ib_uint64_t	start_lsn)	/*!< in: log file data starts at this
+	lsn_t		start_lsn)	/*!< in: log file data starts at this
 					lsn */
 {
 	byte*	buf;
-	ulint	dest_offset;
+	lsn_t	dest_offset;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(!recv_no_log_write);
@@ -1303,15 +1249,17 @@ log_group_file_header_flush(
 	if (log_do_write) {
 		log_sys->n_log_ios++;
 
-		srv_os_log_pending_writes++;
+		MONITOR_INC(MONITOR_LOG_IO);
+
+		srv_stats.os_log_pending_writes.inc();
 
-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
-		       dest_offset / UNIV_PAGE_SIZE,
-		       dest_offset % UNIV_PAGE_SIZE,
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
+		       (ulint) (dest_offset / UNIV_PAGE_SIZE),
+		       (ulint) (dest_offset % UNIV_PAGE_SIZE),
 		       OS_FILE_LOG_BLOCK_SIZE,
 		       buf, group);
 
-		srv_os_log_pending_writes--;
+		srv_stats.os_log_pending_writes.dec();
 	}
 }
 
@@ -1338,7 +1286,7 @@ log_group_write_buf(
 	byte*		buf,		/*!< in: buffer */
 	ulint		len,		/*!< in: buffer len; must be divisible
 					by OS_FILE_LOG_BLOCK_SIZE */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
 					be divisible by
 					OS_FILE_LOG_BLOCK_SIZE */
 	ulint		new_data_offset)/*!< in: start offset of new data in
@@ -1346,15 +1294,15 @@ log_group_write_buf(
 					if we have to write a new log file
 					header */
 {
-	ulint	write_len;
-	ibool	write_header;
-	ulint	next_offset;
-	ulint	i;
+	ulint		write_len;
+	ibool		write_header;
+	lsn_t		next_offset;
+	ulint		i;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(!recv_no_log_write);
 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_a(((ulint) start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
 
 	if (new_data_offset == 0) {
 		write_header = TRUE;
@@ -1373,17 +1321,22 @@ loop:
 	    && write_header) {
 		/* We start to write a new log file instance in the group */
 
-		log_group_file_header_flush(group,
-					    next_offset / group->file_size,
+		ut_a(next_offset / group->file_size <= ULINT_MAX);
+
+		log_group_file_header_flush(group, (ulint)
+					    (next_offset / group->file_size),
 					    start_lsn);
-		srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE;
-		srv_log_writes++;
+		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
+
+		srv_stats.log_writes.inc();
 	}
 
 	if ((next_offset % group->file_size) + len > group->file_size) {
 
-		write_len = group->file_size
-			- (next_offset % group->file_size);
+		/* if the above condition holds, then the below expression
+		is < len which is ulint, so the typecast is ok */
+		write_len = (ulint)
+			(group->file_size - (next_offset % group->file_size));
 	} else {
 		write_len = len;
 	}
@@ -1393,11 +1346,11 @@ loop:
 
 		fprintf(stderr,
 			"Writing log file segment to group %lu"
-			" offset %lu len %lu\n"
-			"start lsn %llu\n"
+			" offset " LSN_PF " len %lu\n"
+			"start lsn " LSN_PF "\n"
 			"First block n:o %lu last block n:o %lu\n",
-			(ulong) group->id, (ulong) next_offset,
-			(ulong) write_len,
+			(ulong) group->id, next_offset,
+			(ulong) write_len,
 			start_lsn,
 			(ulong) log_block_get_hdr_no(buf),
 			(ulong) log_block_get_hdr_no(
@@ -1423,16 +1376,21 @@ loop:
 	if (log_do_write) {
 		log_sys->n_log_ios++;
 
-		srv_os_log_pending_writes++;
+		MONITOR_INC(MONITOR_LOG_IO);
+
+		srv_stats.os_log_pending_writes.inc();
 
-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
-		       next_offset / UNIV_PAGE_SIZE,
-		       next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
+		ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
 
-		srv_os_log_pending_writes--;
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
+		       (ulint) (next_offset / UNIV_PAGE_SIZE),
+		       (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
+		       group);
 
-		srv_os_log_written+= write_len;
-		srv_log_writes++;
+		srv_stats.os_log_pending_writes.dec();
+
+		srv_stats.os_log_written.add(write_len);
+		srv_stats.log_writes.inc();
 	}
 
 	if (write_len < len) {
@@ -1455,14 +1413,14 @@ UNIV_INTERN
 void
 log_write_up_to(
 /*============*/
-	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
-				the log should be written,
-				IB_ULONGLONG_MAX if not specified */
-	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
-				or LOG_WAIT_ALL_GROUPS */
-	ibool		flush_to_disk)
-				/*!< in: TRUE if we want the written log
-				also to be flushed to disk */
+	lsn_t	lsn,	/*!< in: log sequence number up to which
+			the log should be written,
+			LSN_MAX if not specified */
+	ulint	wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+			or LOG_WAIT_ALL_GROUPS */
+	ibool	flush_to_disk)
+			/*!< in: TRUE if we want the written log
+			also to be flushed to disk */
 {
 	log_group_t*	group;
 	ulint		start_offset;
@@ -1476,6 +1434,8 @@ log_write_up_to(
 	ib_uint64_t	write_lsn;
 	ib_uint64_t	flush_lsn;
 
+	ut_ad(!srv_read_only_mode);
+
 	if (recv_no_ibuf_operations) {
 		/* Recovery is running and no operations on the log files are
 		allowed yet (the variable name .._no_ibuf_.. is misleading) */
@@ -1523,7 +1483,7 @@ loop:
 		if (flush_to_disk
 		    && log_sys->current_flush_lsn >= lsn) {
 			/* The write + flush will write enough: wait for it to
-			complete  */
+			complete */
 
 			goto do_waits;
 		}
@@ -1531,7 +1491,7 @@ loop:
 		if (!flush_to_disk
 		    && log_sys->write_lsn >= lsn) {
 			/* The write will write enough: wait for it to
-			complete  */
+			complete */
 
 			goto do_waits;
 		}
@@ -1558,12 +1518,13 @@ loop:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Writing log from %llu up to lsn %llu\n",
+			"Writing log from " LSN_PF " up to lsn " LSN_PF "\n",
 			log_sys->written_to_all_lsn,
 			log_sys->lsn);
 	}
 #endif /* UNIV_DEBUG */
 	log_sys->n_pending_writes++;
+	MONITOR_INC(MONITOR_PENDING_LOG_WRITE);
 
 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
 	group->n_pending_writes++;	/*!< We assume here that we have only
@@ -1625,8 +1586,9 @@ loop:
 
 	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
 	    || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
-		/* O_DSYNC means the OS did not buffer the log file at all:
-		so we have also flushed to disk what we have written */
+		/* O_DSYNC or ALL_O_DIRECT means the OS did not buffer the log
+		file at all: so we have also flushed to disk what we have
+		written */
 
 		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
 
@@ -1634,7 +1596,7 @@ loop:
 
 		group = UT_LIST_GET_FIRST(log_sys->log_groups);
 
-		fil_flush(group->space_id, FALSE);
+		fil_flush(group->space_id);
 		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
 	}
 
@@ -1647,6 +1609,7 @@ loop:
 
 	group->n_pending_writes--;
 	log_sys->n_pending_writes--;
+	MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
 
 	unlock = log_group_check_flush_completion(group);
 	unlock = unlock | log_sys_check_flush_completion();
@@ -1688,8 +1651,9 @@ void
 log_buffer_flush_to_disk(void)
 /*==========================*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
+	ut_ad(!srv_read_only_mode);
 	mutex_enter(&(log_sys->mutex));
 
 	lsn = log_sys->lsn;
@@ -1710,7 +1674,7 @@ log_buffer_sync_in_background(
 /*==========================*/
 	ibool	flush)	/*!< in: flush the logs to disk */
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -1730,8 +1694,8 @@ void
 log_flush_margin(void)
 /*==================*/
 {
-	log_t*		log	= log_sys;
-	ib_uint64_t	lsn	= 0;
+	log_t*	log	= log_sys;
+	lsn_t	lsn	= 0;
 
 	mutex_enter(&(log->mutex));
 
@@ -1756,19 +1720,17 @@ log_flush_margin(void)
 Advances the smallest lsn for which there are unflushed dirty blocks in the
 buffer pool. NOTE: this function may only be called if the calling thread owns
 no synchronization objects!
-@return FALSE if there was a flush batch of the same type running,
+@return false if there was a flush batch of the same type running,
 which means that we could not start this flush batch */
-UNIV_INTERN
-ibool
+static
+bool
 log_preflush_pool_modified_pages(
 /*=============================*/
-	ib_uint64_t	new_oldest,	/*!< in: try to advance
-					oldest_modified_lsn at least
-					to this lsn */
-	ibool		sync)		/*!< in: TRUE if synchronous
-					operation is desired */
+	lsn_t	new_oldest)	/*!< in: try to advance oldest_modified_lsn
+				at least to this lsn */
 {
-	ulint	n_pages;
+	lsn_t	current_oldest;
+	ulint	i;
 
 	if (recv_recovery_on) {
 		/* If the recovery is running, we must first apply all
@@ -1783,18 +1745,51 @@ log_preflush_pool_modified_pages(
 		recv_apply_hashed_log_recs(TRUE);
 	}
 
-	n_pages = buf_flush_list(ULINT_MAX, new_oldest);
+	if (!buf_page_cleaner_is_active
+	    || (srv_foreground_preflush
+		== SRV_FOREGROUND_PREFLUSH_SYNC_PREFLUSH)
+	    || (new_oldest == LSN_MAX)) {
+
+		ulint n_pages;
+
+		bool success = buf_flush_list(ULINT_MAX, new_oldest, &n_pages);
 
-	if (sync) {
 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+		if (!success) {
+			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+		}
+
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+			MONITOR_FLUSH_SYNC_COUNT,
+			MONITOR_FLUSH_SYNC_PAGES,
+			n_pages);
+
+		return(success);
 	}
 
-	if (n_pages == ULINT_UNDEFINED) {
+	ut_ad(srv_foreground_preflush == SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF);
 
-		return(FALSE);
+	current_oldest = buf_pool_get_oldest_modification();
+	i = 0;
+
+	while (current_oldest < new_oldest && current_oldest) {
+
+		while (!buf_flush_flush_list_in_progress()) {
+
+			/* If a flush list flush by the cleaner thread is not
+			running, back off until one is started. */
+			os_thread_sleep(ut_rnd_interval(0, 1 << i));
+			i++;
+			i %= 16;
+		}
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+		current_oldest = buf_pool_get_oldest_modification();
 	}
 
-	return(TRUE);
+	return(current_oldest >= new_oldest || !current_oldest);
 }
 
 /******************************************************//**
@@ -1810,6 +1805,8 @@ log_complete_checkpoint(void)
 	log_sys->next_checkpoint_no++;
 
 	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
 
 	rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
 }
@@ -1826,6 +1823,7 @@ log_io_complete_checkpoint(void)
 	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
 
 	log_sys->n_pending_checkpoint_writes--;
+	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
 
 	if (log_sys->n_pending_checkpoint_writes == 0) {
 		log_complete_checkpoint();
@@ -1848,15 +1846,13 @@ log_checkpoint_set_nth_group_info(
 /*==============================*/
 	byte*	buf,	/*!< in: buffer for checkpoint info */
 	ulint	n,	/*!< in: nth slot */
-	ulint	file_no,/*!< in: archived file number */
-	ulint	offset)	/*!< in: archived file offset */
+	lsn_t	file_no)/*!< in: archived file number */
 {
 	ut_ad(n < LOG_MAX_N_GROUPS);
 
-	mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
-			+ 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no);
-	mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
-			+ 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset);
+	mach_write_to_8(buf + LOG_CHECKPOINT_GROUP_ARRAY +
+			8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO,
+			file_no);
 }
 
 /*******************************************************************//**
@@ -1867,15 +1863,12 @@ log_checkpoint_get_nth_group_info(
 /*==============================*/
 	const byte*	buf,	/*!< in: buffer containing checkpoint info */
 	ulint		n,	/*!< in: nth slot */
-	ulint*		file_no,/*!< out: archived file number */
-	ulint*		offset)	/*!< out: archived file offset */
+	lsn_t*		file_no)/*!< out: archived file number */
 {
 	ut_ad(n < LOG_MAX_N_GROUPS);
 
-	*file_no = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
-				    + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
-	*offset = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
-				   + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET);
+	*file_no = mach_read_from_8(buf + LOG_CHECKPOINT_GROUP_ARRAY +
+				8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
 }
 
 /******************************************************//**
@@ -1889,13 +1882,14 @@ log_group_checkpoint(
 	log_group_t*	group2;
 #ifdef UNIV_LOG_ARCHIVE
 	ib_uint64_t	archived_lsn;
-	ib_uint64_t	next_archived_lsn;
 #endif /* UNIV_LOG_ARCHIVE */
+	lsn_t		lsn_offset;
 	ulint		write_offset;
 	ulint		fold;
 	byte*		buf;
 	ulint		i;
 
+	ut_ad(!srv_read_only_mode);
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_a(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE);
 
@@ -1904,34 +1898,29 @@ log_group_checkpoint(
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
 
-	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
-			log_group_calc_lsn_offset(
-				log_sys->next_checkpoint_lsn, group));
+	lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
+					       group);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
+			lsn_offset & 0xFFFFFFFFUL);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32,
+			lsn_offset >> 32);
 
 	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
 
 #ifdef UNIV_LOG_ARCHIVE
-#error "UNIV_LOG_ARCHIVE could not be enabled"
 	if (log_sys->archiving_state == LOG_ARCH_OFF) {
-		archived_lsn = IB_ULONGLONG_MAX;
+		archived_lsn = LSN_MAX;
 	} else {
 		archived_lsn = log_sys->archived_lsn;
-
-		if (archived_lsn != log_sys->next_archived_lsn) {
-			next_archived_lsn = log_sys->next_archived_lsn;
-			/* For debugging only */
-		}
 	}
 
 	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn);
 #else /* UNIV_LOG_ARCHIVE */
-	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN,
-			(ib_uint64_t)log_group_calc_lsn_offset(
-				log_sys->next_checkpoint_lsn, group));
+	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX);
 #endif /* UNIV_LOG_ARCHIVE */
 
 	for (i = 0; i < LOG_MAX_N_GROUPS; i++) {
-		log_checkpoint_set_nth_group_info(buf, i, 0, 0);
+		log_checkpoint_set_nth_group_info(buf, i, 0);
 	}
 
 	group2 = UT_LIST_GET_FIRST(log_sys->log_groups);
@@ -1939,10 +1928,9 @@ log_group_checkpoint(
 	while (group2) {
 		log_checkpoint_set_nth_group_info(buf, group2->id,
 #ifdef UNIV_LOG_ARCHIVE
-						  group2->archived_file_no,
-						  group2->archived_offset
+						  group2->archived_file_no
 #else /* UNIV_LOG_ARCHIVE */
-						  0, 0
+						  0
 #endif /* UNIV_LOG_ARCHIVE */
 						  );
 
@@ -1956,15 +1944,6 @@ log_group_checkpoint(
 			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
 	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
 
-	/* Starting from InnoDB-3.23.50, we also write info on allocated
-	size in the tablespace */
-
-	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT,
-			log_fsp_current_free_limit);
-
-	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N,
-			LOG_CHECKPOINT_FSP_MAGIC_N_VAL);
-
 	/* We alternate the physical place of the checkpoint info in the first
 	log file */
 
@@ -1982,20 +1961,23 @@ log_group_checkpoint(
 		}
 
 		log_sys->n_pending_checkpoint_writes++;
+		MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
 
 		log_sys->n_log_ios++;
 
+		MONITOR_INC(MONITOR_LOG_IO);
+
 		/* We send as the last parameter the group machine address
 		added with 1, as we want to distinguish between a normal log
 		file write and a checkpoint field write */
 
-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->space_id, 0,
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->space_id, 0,
 		       write_offset / UNIV_PAGE_SIZE,
 		       write_offset % UNIV_PAGE_SIZE,
 		       OS_FILE_LOG_BLOCK_SIZE,
-		       buf, ((byte*)group + 1));
+		       buf, ((byte*) group + 1));
 
-		ut_ad(((ulint)group & 0x1UL) == 0);
+		ut_ad(((ulint) group & 0x1UL) == 0);
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
@@ -2034,12 +2016,13 @@ log_reset_first_header_and_checkpoint(
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
 
-	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
 			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, 0);
 
 	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
 
-	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, IB_ULONGLONG_MAX);
+	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX);
 
 	fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
 	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
@@ -2068,7 +2051,9 @@ log_group_read_checkpoint_info(
 
 	log_sys->n_log_ios++;
 
-	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0,
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0,
 	       field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
 	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
 }
@@ -2084,12 +2069,13 @@ log_groups_write_checkpoint_info(void)
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
-	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	if (!srv_read_only_mode) {
+		for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
+		     group;
+		     group = UT_LIST_GET_NEXT(log_groups, group)) {
 
-	while (group) {
-		log_group_checkpoint(group);
-
-		group = UT_LIST_GET_NEXT(log_groups, group);
+			log_group_checkpoint(group);
+		}
 	}
 }
 
@@ -2114,13 +2100,16 @@ log_checkpoint(
         ibool   safe_to_ignore) /*!< in: TRUE if checkpoint can be ignored in
                                   the case checkpoint's are disabled */
 {
-	ib_uint64_t	oldest_lsn;
+	lsn_t	oldest_lsn;
+
+	ut_ad(!srv_read_only_mode);
 
 	if (recv_recovery_is_on()) {
 		recv_apply_hashed_log_recs(TRUE);
 	}
 
-	if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+	if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC &&
+	    srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT) {
 		fil_flush_file_spaces(FIL_TABLESPACE);
 	}
 
@@ -2184,14 +2173,17 @@ log_checkpoint(
 
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
-		fprintf(stderr, "Making checkpoint no %lu at lsn %llu\n",
-			(ulong) log_sys->next_checkpoint_no,
+		fprintf(stderr, "Making checkpoint no "
+			LSN_PF " at lsn " LSN_PF "\n",
+			log_sys->next_checkpoint_no,
 			oldest_lsn);
 	}
 #endif /* UNIV_DEBUG */
 
 	log_groups_write_checkpoint_info();
 
+	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
+
 	mutex_exit(&(log_sys->mutex));
 
 	if (sync) {
@@ -2209,22 +2201,26 @@ UNIV_INTERN
 void
 log_make_checkpoint_at(
 /*===================*/
-	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
-					later lsn, if IB_ULONGLONG_MAX, makes
-					a checkpoint at the latest lsn */
-	ibool		write_always)	/*!< in: the function normally checks if
-					the new checkpoint would have a
-					greater lsn than the previous one: if
-					not, then no physical write is done;
-					by setting this parameter TRUE, a
-					physical write will always be made to
-					log files */
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if LSN_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always)	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
 {
 	/* Preflush pages synchronously */
 
-	while (!log_preflush_pool_modified_pages(lsn, TRUE));
+	while (!log_preflush_pool_modified_pages(lsn)) {
+		/* Flush as much as we can */
+	}
 
-	while (!log_checkpoint(TRUE, write_always, FALSE));
+	while (!log_checkpoint(TRUE, write_always, FALSE)) {
+		/* Force a checkpoint */
+	}
 }
 
 /****************************************************************//**
@@ -2304,18 +2300,17 @@ log_checkpoint_margin(void)
 /*=======================*/
 {
 	log_t*		log		= log_sys;
-	ib_uint64_t	age;
-	ib_uint64_t	checkpoint_age;
+	lsn_t		age;
+	lsn_t		checkpoint_age;
 	ib_uint64_t	advance;
-	ib_uint64_t	oldest_lsn;
-	ibool		sync;
+	lsn_t		oldest_lsn;
 	ibool		checkpoint_sync;
 	ibool		do_checkpoint;
-	ibool		success;
+	bool		success;
 loop:
-	sync = FALSE;
 	checkpoint_sync = FALSE;
 	do_checkpoint = FALSE;
+	advance = 0;
 
 	mutex_enter(&(log->mutex));
 	ut_ad(!recv_no_log_write);
@@ -2333,15 +2328,7 @@ loop:
 	if (age > log->max_modified_age_sync) {
 
 		/* A flush is urgent: we have to do a synchronous preflush */
-
-		sync = TRUE;
 		advance = 2 * (age - log->max_modified_age_sync);
-	} else if (age > log_max_modified_age_async()) {
-
-		/* A flush is not urgent: we do an asynchronous preflush */
-		advance = age - log_max_modified_age_async();
-	} else {
-		advance = 0;
 	}
 
 	checkpoint_age = log->lsn - log->last_checkpoint_lsn;
@@ -2353,7 +2340,7 @@ loop:
 
 		do_checkpoint = TRUE;
 
-	} else if (checkpoint_age > log_max_checkpoint_age_async()) {
+	} else if (checkpoint_age > log->max_checkpoint_age_async) {
 		/* A checkpoint is not urgent: do it asynchronously */
 
 		do_checkpoint = TRUE;
@@ -2366,17 +2353,14 @@ loop:
 	mutex_exit(&(log->mutex));
 
 	if (advance) {
-		ib_uint64_t	new_oldest = oldest_lsn + advance;
+		lsn_t	new_oldest = oldest_lsn + advance;
 
-		success = log_preflush_pool_modified_pages(new_oldest, sync);
+		success = log_preflush_pool_modified_pages(new_oldest);
 
 		/* If the flush succeeded, this thread has done its part
 		and can proceed. If it did not succeed, there was another
-		thread doing a flush at the same time. If sync was FALSE,
-		the flush was not urgent, and we let this thread proceed.
-		Otherwise, we let it start from the beginning again. */
-
-		if (sync && !success) {
+		thread doing a flush at the same time. */
+		if (!success) {
 			mutex_enter(&(log->mutex));
 
 			log->check_flush_or_checkpoint = TRUE;
@@ -2406,14 +2390,14 @@ log_group_read_log_seg(
 	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
 	byte*		buf,		/*!< in: buffer where to read */
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	start_lsn,	/*!< in: read area start */
-	ib_uint64_t	end_lsn,	/*!< in: read area end */
+	lsn_t		start_lsn,	/*!< in: read area start */
+	lsn_t		end_lsn,	/*!< in: read area end */
 	ibool		release_mutex)	/*!< in: whether the log_sys->mutex
 					should be released before the read */
 {
 	ulint	len;
-	ulint	source_offset;
-	ibool	sync;
+	lsn_t	source_offset;
+	bool	sync;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
@@ -2421,13 +2405,17 @@ log_group_read_log_seg(
 loop:
 	source_offset = log_group_calc_lsn_offset(start_lsn, group);
 
+	ut_a(end_lsn - start_lsn <= ULINT_MAX);
 	len = (ulint) (end_lsn - start_lsn);
 
 	ut_ad(len != 0);
 
 	if ((source_offset % group->file_size) + len > group->file_size) {
 
-		len = group->file_size - (source_offset % group->file_size);
+		/* If the above condition is true then len (which is ulint)
+		is > the expression below, so the typecast is ok */
+		len = (ulint) (group->file_size -
+			(source_offset % group->file_size));
 	}
 
 #ifdef UNIV_LOG_ARCHIVE
@@ -2439,13 +2427,18 @@ loop:
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+
 	if (release_mutex) {
 		mutex_exit(&(log_sys->mutex));
 	}
 
 	fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
-	       source_offset / UNIV_PAGE_SIZE, source_offset % UNIV_PAGE_SIZE,
-	       len, buf, NULL);
+	       (ulint) (source_offset / UNIV_PAGE_SIZE),
+	       (ulint) (source_offset % UNIV_PAGE_SIZE),
+	       len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL);
 
 	start_lsn += len;
 	buf += len;
@@ -2467,12 +2460,68 @@ void
 log_archived_file_name_gen(
 /*=======================*/
 	char*	buf,	/*!< in: buffer where to write */
+	ulint	buf_len,/*!< in: buffer length */
 	ulint	id __attribute__((unused)),
 			/*!< in: group id;
 			currently we only archive the first group */
-	ulint	file_no)/*!< in: file number */
+	lsn_t	file_no)/*!< in: file number */
 {
-	sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no);
+	/* Length of the directory part written to buf so far */
+	ulint	dirnamelen = strlen(srv_arch_dir);
+
+	ut_a(buf_len > dirnamelen +
+		       IB_ARCHIVED_LOGS_SERIAL_LEN +
+		       IB_ARCHIVED_LOGS_PREFIX_LEN + 2);
+
+	strcpy(buf, srv_arch_dir);
+
+	/* Append a separator only to a non-empty directory that lacks one */
+	if (dirnamelen > 0 && buf[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+		buf[dirnamelen++] = SRV_PATH_SEPARATOR;
+	}
+	sprintf(buf + dirnamelen, IB_ARCHIVED_LOGS_PREFIX
+		"%0" IB_TO_STR(IB_ARCHIVED_LOGS_SERIAL_LEN) "llu",
+		(unsigned long long) file_no);
+}
+
+/******************************************************//**
+Get offset within archived log file to continue to write
+with. The offset is 0 when the file does not exist or when
+archived_lsn is LSN_MAX. */
+UNIV_INTERN
+void
+log_archived_get_offset(
+/*=====================*/
+	log_group_t*	group,		/*!< in: log group */
+	lsn_t		file_no,	/*!< in: archive log file number */
+	lsn_t		archived_lsn,	/*!< in: last archived LSN */
+	lsn_t*		offset)		/*!< out: offset within archived file */
+{
+	char		file_name[OS_FILE_MAX_PATH];
+	ibool		exists;
+	os_file_type_t	type;
+
+	log_archived_file_name_gen(file_name,
+		sizeof(file_name), group->id, file_no);
+
+	/* Find out whether the archive file already exists */
+	ut_a(os_file_status(file_name, &exists, &type));
+
+	if (!exists) {
+		*offset = 0;
+		return;
+	}
+
+	if (archived_lsn != LSN_MAX) {
+		/* Distance from file_no to archived_lsn, plus the file header */
+		*offset = archived_lsn - file_no + LOG_FILE_HDR_SIZE;
+	} else {
+		/* Archiving was OFF prior startup */
+		*offset = 0;
+	}
+
+	/* The offset plus a file header must fit within file_size */
+	ut_a(group->file_size >= *offset + LOG_FILE_HDR_SIZE);
 }
 
 /******************************************************//**
@@ -2484,7 +2533,7 @@ log_group_archive_file_header_write(
 	log_group_t*	group,		/*!< in: log group */
 	ulint		nth_file,	/*!< in: header to the nth file in the
 					archive log file space */
-	ulint		file_no,	/*!< in: archived file number */
+	lsn_t		file_no,	/*!< in: archived file number */
 	ib_uint64_t	start_lsn)	/*!< in: log file data starts at this
 					lsn */
 {
@@ -2507,7 +2556,10 @@ log_group_archive_file_header_write(
 
 	log_sys->n_log_ios++;
 
-	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id,
+	       0,
 	       dest_offset / UNIV_PAGE_SIZE,
 	       dest_offset % UNIV_PAGE_SIZE,
 	       2 * OS_FILE_LOG_BLOCK_SIZE,
@@ -2540,7 +2592,10 @@ log_group_archive_completed_header_write(
 
 	log_sys->n_log_ios++;
 
-	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id,
+	       0,
 	       dest_offset / UNIV_PAGE_SIZE,
 	       dest_offset % UNIV_PAGE_SIZE,
 	       OS_FILE_LOG_BLOCK_SIZE,
@@ -2556,14 +2611,14 @@ log_group_archive(
 /*==============*/
 	log_group_t*	group)	/*!< in: log group */
 {
-	os_file_t	 file_handle;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	char		name[1024];
+	os_file_t	file_handle;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	char		name[OS_FILE_MAX_PATH];
 	byte*		buf;
 	ulint		len;
 	ibool		ret;
-	ulint		next_offset;
+	lsn_t		next_offset;
 	ulint		n_files;
 	ulint		open_mode;
 
@@ -2591,12 +2646,19 @@ loop:
 
 		if (next_offset % group->file_size == 0) {
 			open_mode = OS_FILE_CREATE;
+			if (n_files == 0) {
+				/* Adjust archived_file_no to match start_lsn
+				   which is written in file header as well */
+				group->archived_file_no = start_lsn;
+			}
 		} else {
 			open_mode = OS_FILE_OPEN;
 		}
 
-		log_archived_file_name_gen(name, group->id,
-					   group->archived_file_no + n_files);
+		log_archived_file_name_gen(name, sizeof(name), group->id,
+					   group->archived_file_no +
+					   n_files * (group->file_size -
+					   LOG_FILE_HDR_SIZE));
 
 		file_handle = os_file_create(innodb_file_log_key,
 					     name, open_mode,
@@ -2633,13 +2695,14 @@ loop:
 
 		/* Add the archive file as a node to the space */
 
-		fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
-				group->archive_space_id, FALSE);
+		ut_a(fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
+				     group->archive_space_id, FALSE));
 
 		if (next_offset % group->file_size == 0) {
 			log_group_archive_file_header_write(
 				group, n_files,
-				group->archived_file_no + n_files,
+				group->archived_file_no +
+				n_files * (group->file_size - LOG_FILE_HDR_SIZE),
 				start_lsn);
 
 			next_offset += LOG_FILE_HDR_SIZE;
@@ -2656,7 +2719,7 @@ loop:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Archiving starting at lsn %llu, len %lu"
+			"Archiving starting at lsn " LSN_PF ", len %lu"
 			" to group %lu\n",
 			start_lsn,
 			(ulong) len, (ulong) group->id);
@@ -2667,8 +2730,12 @@ loop:
 
 	log_sys->n_log_ios++;
 
-	fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id,
-	       next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE,
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->archive_space_id,
+	       0,
+	       (ulint) (next_offset / UNIV_PAGE_SIZE),
+	       (ulint) (next_offset % UNIV_PAGE_SIZE),
 	       ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
 	       &log_archive_io);
 
@@ -2685,7 +2752,8 @@ loop:
 		goto loop;
 	}
 
-	group->next_archived_file_no = group->archived_file_no + n_files;
+	group->next_archived_file_no = group->archived_file_no +
+			n_files * (group->file_size - LOG_FILE_HDR_SIZE);
 	group->next_archived_offset = next_offset % group->file_size;
 
 	ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
@@ -2717,7 +2785,7 @@ log_archive_write_complete_groups(void)
 /*===================================*/
 {
 	log_group_t*	group;
-	ulint		end_offset;
+	lsn_t		end_offset;
 	ulint		trunc_files;
 	ulint		n_files;
 	ib_uint64_t	start_lsn;
@@ -2833,7 +2901,7 @@ log_io_complete_archive(void)
 
 	mutex_exit(&(log_sys->mutex));
 
-	fil_flush(group->archive_space_id, TRUE);
+	fil_flush(group->archive_space_id);
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -2937,7 +3005,7 @@ arch_none:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Archiving from lsn %llu to lsn %llu\n",
+			"Archiving from lsn " LSN_PF " to lsn " LSN_PF "\n",
 			log_sys->archived_lsn, limit_lsn);
 	}
 #endif /* UNIV_DEBUG */
@@ -3038,18 +3106,8 @@ log_archive_close_groups(
 					 trunc_len);
 		if (increment_file_count) {
 			group->archived_offset = 0;
-			group->archived_file_no += 2;
 		}
 
-#ifdef UNIV_DEBUG
-		if (log_debug_writes) {
-			fprintf(stderr,
-				"Incrementing arch file no to %lu"
-				" in log group %lu\n",
-				(ulong) group->archived_file_no + 2,
-				(ulong) group->id);
-		}
-#endif /* UNIV_DEBUG */
 	}
 }
 
@@ -3108,7 +3166,7 @@ log_archive_stop(void)
 	success = FALSE;
 
 	while (!success) {
-		success = log_checkpoint(TRUE, TRUE);
+		success = log_checkpoint(TRUE, TRUE, FALSE);
 	}
 
 	mutex_enter(&(log_sys->mutex));
@@ -3311,18 +3369,16 @@ void
 logs_empty_and_mark_files_at_shutdown(void)
 /*=======================================*/
 {
-	ib_uint64_t		lsn;
-	ib_uint64_t		tracked_lsn;
-	ulint			arch_log_no;
-	ibool			server_busy;
+	lsn_t			lsn;
+	lsn_t			tracked_lsn;
 	ulint			count = 0;
+	ulint			total_trx;
 	ulint			pending_io;
-	ulint			active_thd;
+	enum srv_thread_type	active_thd;
+	const char*		thread_name;
+	ibool			server_busy;
 
-	if (srv_print_verbose_log) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB: Starting shutdown...\n");
-	}
+	ib_logf(IB_LOG_LEVEL_INFO, "Starting shutdown...");
 
         /* Enable checkpoints if someone had turned them off */
 	if (log_disable_checkpoint_active)
@@ -3332,46 +3388,24 @@ logs_empty_and_mark_files_at_shutdown(void)
 	algorithm only works if the server is idle at shutdown */
 
 	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
-	os_event_set(srv_shutdown_event);
 loop:
 	os_thread_sleep(100000);
 
 	count++;
 
-	mutex_enter(&kernel_mutex);
-
 	/* We need the monitor threads to stop before we proceed with
 	a shutdown. */
 
-	if (srv_error_monitor_active
-	    || srv_lock_timeout_active
-	    || srv_monitor_active) {
-		const char*	thread_active = NULL;
+	thread_name = srv_any_background_threads_are_active();
 
+	if (thread_name != NULL) {
 		/* Print a message every 60 seconds if we are waiting
-		for the monitor thread to exit. Master and worker threads
-		check will be done later. */
-		if (srv_print_verbose_log && count > 600) {
+		for the monitor thread to exit. Master and worker
+		threads check will be done later. */
 
-		       if (srv_error_monitor_active) {
-			       thread_active = "srv_error_monitor_thread";
-		       } else if (srv_lock_timeout_active) {
-			       thread_active = "srv_lock_timeout thread";
-		       } else if (srv_monitor_active) {
-			       thread_active = "srv_monitor_thread";
-		       }
-		}
-
-		mutex_exit(&kernel_mutex);
-
-		os_event_set(srv_error_event);
-		os_event_set(srv_monitor_event);
-		os_event_set(srv_timeout_event);
-
-		if (thread_active) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %s to exit\n",
-				thread_active);
+		if (srv_print_verbose_log && count > 600) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for %s to exit", thread_name);
 			count = 0;
 		}
 
@@ -3383,32 +3417,30 @@ loop:
 	shutdown, because the InnoDB layer may have committed or
 	prepared transactions and we don't want to lose them. */
 
-	server_busy = trx_n_mysql_transactions > 0
-		|| UT_LIST_GET_LEN(trx_sys->trx_list) > trx_n_prepared;
-
-	if (server_busy) {
-		ulint	total_trx = UT_LIST_GET_LEN(trx_sys->trx_list)
-				    + trx_n_mysql_transactions;
+	total_trx = trx_sys_any_active_transactions();
 
-		mutex_exit(&kernel_mutex);
+	if (total_trx > 0) {
 
 		if (srv_print_verbose_log && count > 600) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %lu "
-				"active transactions to finish\n",
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for %lu active transactions to finish",
 				(ulong) total_trx);
+
 			count = 0;
 		}
 
 		goto loop;
 	}
 
-	mutex_exit(&kernel_mutex);
-
 	/* Check that the background threads are suspended */
+
 	active_thd = srv_get_active_thread_type();
 
-	if (active_thd != ULINT_UNDEFINED) {
+	if (active_thd != SRV_NONE) {
+
+		if (active_thd == SRV_PURGE) {
+			srv_purge_wakeup();
+		}
 
 		/* The srv_lock_timeout_thread, srv_error_monitor_thread
 		and srv_monitor_thread should already exit by now. The
@@ -3416,26 +3448,51 @@ loop:
 		and worker threads (purge threads). Print the thread
 		type if any of such threads not in suspended mode */
 		if (srv_print_verbose_log && count > 600) {
-			const char*     thread_type = "<null>";
+			const char*	thread_type = "<null>";
 
 			switch (active_thd) {
+			case SRV_NONE:
+				/* This shouldn't happen because we've
+				already checked for this case before
+				entering the if(). We handle it here
+				to avoid a compiler warning. */
+				ut_error;
 			case SRV_WORKER:
 				thread_type = "worker threads";
 				break;
 			case SRV_MASTER:
 				thread_type = "master thread";
 				break;
+			case SRV_PURGE:
+				thread_type = "purge thread";
+				break;
 			}
 
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %s "
-				"to be suspended\n", thread_type);
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for %s to be suspended",
+				thread_type);
 			count = 0;
 		}
 
 		goto loop;
 	}
 
+	/* At this point only page_cleaner should be active. We wait
+	here to let it complete the flushing of the buffer pools
+	before proceeding further. */
+	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
+	count = 0;
+	while (buf_page_cleaner_is_active) {
+		++count;
+		os_thread_sleep(100000);
+		if (srv_print_verbose_log && count > 600) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for page_cleaner to "
+				"finish flushing of buffer pool");
+			count = 0;
+		}
+	}
+
 	mutex_enter(&log_sys->mutex);
 	server_busy = log_sys->n_pending_checkpoint_writes
 #ifdef UNIV_LOG_ARCHIVE
@@ -3446,25 +3503,22 @@ loop:
 
 	if (server_busy) {
 		if (srv_print_verbose_log && count > 600) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Pending checkpoint_writes: %lu\n"
-				"  InnoDB: Pending log flush writes: %lu\n",
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Pending checkpoint_writes: %lu. "
+				"Pending log flush writes: %lu",
 				(ulong) log_sys->n_pending_checkpoint_writes,
 				(ulong) log_sys->n_pending_writes);
 			count = 0;
 		}
-
 		goto loop;
 	}
 
-	pending_io = buf_pool_check_num_pending_io();
+	pending_io = buf_pool_check_no_pending_io();
 
 	if (pending_io) {
 		if (srv_print_verbose_log && count > 600) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %lu buffer page "
-				"I/Os to complete\n",
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for %lu buffer page I/Os to complete",
 				(ulong) pending_io);
 			count = 0;
 		}
@@ -3472,50 +3526,61 @@ loop:
 		goto loop;
 	}
 
-
 #ifdef UNIV_LOG_ARCHIVE
 	log_archive_all();
 #endif /* UNIV_LOG_ARCHIVE */
 	if (srv_fast_shutdown == 2) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: MySQL has requested a very fast shutdown"
-			" without flushing "
-			"the InnoDB buffer pool to data files."
-			" At the next mysqld startup "
-			"InnoDB will do a crash recovery!\n");
-
-		/* In this fastest shutdown we do not flush the buffer
-		pool: it is essentially a 'crash' of the InnoDB
-		server. Make sure that the log is all flushed to disk,
-		so that we can recover all committed transactions in a
-		crash recovery. We must not write the lsn stamps to
-		the data files, since at a startup InnoDB deduces from
-		the stamps if the previous shutdown was clean. */
-
-		log_buffer_flush_to_disk();
-
-		/* Check that the background threads stay suspended */
-		if (srv_get_active_thread_type() != ULINT_UNDEFINED) {
-			fprintf(stderr,
-				"InnoDB: Warning: some background thread"
-				" woke up during shutdown\n");
-			goto loop;
+		if (!srv_read_only_mode) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"MySQL has requested a very fast shutdown "
+				"without flushing the InnoDB buffer pool to "
+				"data files. At the next mysqld startup "
+				"InnoDB will do a crash recovery!");
+
+			/* In this fastest shutdown we do not flush the
+			buffer pool:
+
+			it is essentially a 'crash' of the InnoDB server.
+			Make sure that the log is all flushed to disk, so
+			that we can recover all committed transactions in
+			a crash recovery. We must not write the lsn stamps
+			to the data files, since at a startup InnoDB deduces
+			from the stamps if the previous shutdown was clean. */
+
+			log_buffer_flush_to_disk();
+
+			/* Check that the background threads stay suspended */
+			thread_name = srv_any_background_threads_are_active();
+
+			if (thread_name != NULL) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Background thread %s woke up "
+					"during shutdown", thread_name);
+				goto loop;
+			}
 		}
 
 		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
 		/* Wake the log tracking thread which will then immediatelly
 		quit because of srv_shutdown_state value */
 		if (srv_track_changed_pages) {
 			os_event_set(srv_checkpoint_completed_event);
 			os_event_wait(srv_redo_log_thread_finished_event);
 		}
+
 		fil_close_all_files();
-		ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+
+		thread_name = srv_any_background_threads_are_active();
+
+		ut_a(!thread_name);
+
 		return;
 	}
 
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+	if (!srv_read_only_mode) {
+		log_make_checkpoint_at(LSN_MAX, TRUE);
+	}
 
 	mutex_enter(&log_sys->mutex);
 
@@ -3536,15 +3601,7 @@ loop:
 		goto loop;
 	}
 
-	arch_log_no = 0;
-
 #ifdef UNIV_LOG_ARCHIVE
-	UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
-
-	if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
-
-		arch_log_no--;
-	}
 
 	log_archive_close_groups(TRUE);
 #endif /* UNIV_LOG_ARCHIVE */
@@ -3552,16 +3609,19 @@ loop:
 	mutex_exit(&log_sys->mutex);
 
 	/* Check that the background threads stay suspended */
-	if (srv_get_active_thread_type() != ULINT_UNDEFINED) {
-		fprintf(stderr,
-			"InnoDB: Warning: some background thread woke up"
-			" during shutdown\n");
+	thread_name = srv_any_background_threads_are_active();
+	if (thread_name != NULL) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Background thread %s woke up during shutdown",
+			thread_name);
 
 		goto loop;
 	}
 
-	fil_flush_file_spaces(FIL_TABLESPACE);
-	fil_flush_file_spaces(FIL_LOG);
+	if (!srv_read_only_mode) {
+		fil_flush_file_spaces(FIL_TABLESPACE);
+		fil_flush_file_spaces(FIL_LOG);
+	}
 
 	/* The call fil_write_flushed_lsn_to_data_files() will pass the buffer
 	pool: therefore it is essential that the buffer pool has been
@@ -3571,9 +3631,8 @@ loop:
 	if (!buf_all_freed()) {
 
 		if (srv_print_verbose_log && count > 600) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, " InnoDB: Waiting for dirty buffer "
-				"pages to be flushed\n");
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for dirty buffer pages to be flushed");
 			count = 0;
 		}
 
@@ -3588,24 +3647,28 @@ loop:
 	}
 
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+	srv_thread_type	type = srv_get_active_thread_type();
+	ut_a(type == SRV_NONE);
+
+	bool	freed = buf_all_freed();
+	ut_a(freed);
 
-	ut_a(buf_all_freed());
 	ut_a(lsn == log_sys->lsn);
 
 	if (lsn < srv_start_lsn) {
-		fprintf(stderr,
-			"InnoDB: Error: log sequence number"
-			" at shutdown %llu\n"
-			"InnoDB: is lower than at startup %llu!\n",
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Log sequence number at shutdown " LSN_PF " "
+			"is lower than at startup " LSN_PF "!",
 			lsn, srv_start_lsn);
 	}
 
 	srv_shutdown_lsn = lsn;
 
-	fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
+	if (!srv_read_only_mode) {
+		fil_write_flushed_lsn_to_data_files(lsn, 0);
 
-	fil_flush_file_spaces(FIL_TABLESPACE);
+		fil_flush_file_spaces(FIL_TABLESPACE);
+	}
 
 	if (srv_track_changed_pages) {
 		os_event_wait(srv_redo_log_thread_finished_event);
@@ -3614,9 +3677,12 @@ loop:
 	fil_close_all_files();
 
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+	type = srv_get_active_thread_type();
+	ut_a(type == SRV_NONE);
+
+	freed = buf_all_freed();
+	ut_a(freed);
 
-	ut_a(buf_all_freed());
 	ut_a(lsn == log_sys->lsn);
 }
 
@@ -3679,7 +3745,7 @@ UNIV_INTERN
 ibool
 log_peek_lsn(
 /*=========*/
-	ib_uint64_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
+	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
 {
 	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
 		*lsn = log_sys->lsn;
@@ -3706,35 +3772,41 @@ log_print(
 	mutex_enter(&(log_sys->mutex));
 
 	fprintf(file,
-		"Log sequence number %llu\n"
-		"Log flushed up to   %llu\n"
-		"Last checkpoint at  %llu\n",
+		"Log sequence number " LSN_PF "\n"
+		"Log flushed up to   " LSN_PF "\n"
+		"Pages flushed up to " LSN_PF "\n"
+		"Last checkpoint at  " LSN_PF "\n",
 		log_sys->lsn,
 		log_sys->flushed_to_disk_lsn,
+		log_buf_pool_get_oldest_modification(),
 		log_sys->last_checkpoint_lsn);
 
 	fprintf(file,
-		"Max checkpoint age    %lu\n"
-		"Checkpoint age target %lu\n"
-		"Modified age          %lu\n"
-		"Checkpoint age        %lu\n",
-		(ulong) log_sys->max_checkpoint_age,
-		(ulong) log_max_checkpoint_age_async(),
-		(ulong) (log_sys->lsn -
-				log_buf_pool_get_oldest_modification()),
-		(ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
+		"Max checkpoint age    " LSN_PF "\n"
+		"Checkpoint age target " LSN_PF "\n"
+		"Modified age          " LSN_PF "\n"
+		"Checkpoint age        " LSN_PF "\n",
+		log_sys->max_checkpoint_age,
+		log_sys->max_checkpoint_age_async,
+		log_sys->lsn -log_buf_pool_get_oldest_modification(),
+		log_sys->lsn - log_sys->last_checkpoint_lsn);
 
 	current_time = time(NULL);
 
-	time_elapsed = 0.001 + difftime(current_time,
-					log_sys->last_printout_time);
+	time_elapsed = difftime(current_time,
+				log_sys->last_printout_time);
+
+	if (time_elapsed <= 0) {
+		time_elapsed = 1;
+	}
+
 	fprintf(file,
 		"%lu pending log writes, %lu pending chkp writes\n"
 		"%lu log i/o's done, %.2f log i/o's/second\n",
 		(ulong) log_sys->n_pending_writes,
 		(ulong) log_sys->n_pending_checkpoint_writes,
 		(ulong) log_sys->n_log_ios,
-		((log_sys->n_log_ios - log_sys->n_log_ios_old)
+		((double)(log_sys->n_log_ios - log_sys->n_log_ios_old)
 		 / time_elapsed));
 
 	if (srv_track_changed_pages) {
@@ -3743,8 +3815,8 @@ log_print(
 		checkpoint age */
 		fprintf(file,
 			"Log tracking enabled\n"
-			"Log tracked up to   %llu\n"
-			"Max tracked LSN age %lu\n",
+			"Log tracked up to   " LSN_PF "\n"
+			"Max tracked LSN age " LSN_PF "\n",
 			log_get_tracked_lsn(),
 			log_sys->max_checkpoint_age);
 	}
@@ -3766,7 +3838,7 @@ log_refresh_stats(void)
 	log_sys->last_printout_time = time(NULL);
 }
 
-/**********************************************************************
+/********************************************************//**
 Closes a log group. */
 static
 void
@@ -3796,12 +3868,12 @@ log_group_close(
 	mem_free(group);
 }
 
-/**********************************************************
-Shutdown the log system but do not release all the memory. */
+/********************************************************//**
+Closes all log groups. */
 UNIV_INTERN
 void
-log_shutdown(void)
-/*==============*/
+log_group_close_all(void)
+/*=====================*/
 {
 	log_group_t*	group;
 
@@ -3815,6 +3887,16 @@ log_shutdown(void)
 
 		log_group_close(prev_group);
 	}
+}
+
+/********************************************************//**
+Shutdown the log system but do not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void)
+/*==============*/
+{
+	log_group_close_all();
 
 	mem_free(log_sys->buf_ptr);
 	log_sys->buf_ptr = NULL;
@@ -3822,6 +3904,9 @@ log_shutdown(void)
 	mem_free(log_sys->checkpoint_buf_ptr);
 	log_sys->checkpoint_buf_ptr = NULL;
 	log_sys->checkpoint_buf = NULL;
+	mem_free(log_sys->archive_buf_ptr);
+	log_sys->archive_buf_ptr = NULL;
+	log_sys->archive_buf = NULL;
 
 	os_event_free(log_sys->no_flush_event);
 	os_event_free(log_sys->one_flushed_event);
@@ -3832,7 +3917,7 @@ log_shutdown(void)
 
 #ifdef UNIV_LOG_ARCHIVE
 	rw_lock_free(&log_sys->archive_lock);
-	os_event_create(log_sys->archiving_on);
+	os_event_free(log_sys->archiving_on);
 #endif /* UNIV_LOG_ARCHIVE */
 
 #ifdef UNIV_LOG_DEBUG
@@ -3842,7 +3927,7 @@ log_shutdown(void)
 	recv_sys_close();
 }
 
-/**********************************************************
+/********************************************************//**
 Free the log system data structures. */
 UNIV_INTERN
 void
diff --git a/storage/xtradb/log/log0online.c b/storage/xtradb/log/log0online.cc
index 5dfe08a4b65..8c2bc5602a9 100644
--- a/storage/xtradb/log/log0online.c
+++ b/storage/xtradb/log/log0online.cc
@@ -17,7 +17,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 *****************************************************************************/
 
 /**************************************************//**
-@file log/log0online.c
+@file log/log0online.cc
 Online database log parsing for changed page tracking
 
 *******************************************************/
@@ -65,13 +65,13 @@ struct log_bitmap_struct {
 					/*!< directory for bitmap files */
 	log_online_bitmap_file_t out;	/*!< The current bitmap file */
 	ulint		out_seq_num;	/*!< the bitmap file sequence number */
-	ib_uint64_t	start_lsn;	/*!< the LSN of the next unparsed
+	lsn_t		start_lsn;	/*!< the LSN of the next unparsed
 					record and the start of the next LSN
 					interval to be parsed.  */
-	ib_uint64_t	end_lsn;	/*!< the end of the LSN interval to be
+	lsn_t		end_lsn;	/*!< the end of the LSN interval to be
 					parsed, equal to the next checkpoint
 					LSN at the time of parse */
-	ib_uint64_t	next_parse_lsn;	/*!< the LSN of the next unparsed
+	lsn_t		next_parse_lsn;	/*!< the LSN of the next unparsed
 					record in the current parse */
 	ib_rbt_t*	modified_pages; /*!< the current modified page set,
 					organized as the RB-tree with the keys
@@ -84,7 +84,7 @@ struct log_bitmap_struct {
 					both the correct type and the tree does
 					not mind its overwrite during
 					rbt_next() tree traversal. */
-	mutex_t		mutex;		/*!< mutex protecting all the fields.*/
+	ib_mutex_t	mutex;		/*!< mutex protecting all the fields.*/
 };
 
 /* The log parsing and bitmap output struct instance */
@@ -215,8 +215,9 @@ log_online_set_page_bit(
 			log_bmp_sys->page_free_list = new_node->left;
 		}
 		else {
-			new_node = ut_malloc(SIZEOF_NODE(
-				  log_bmp_sys->modified_pages));
+			new_node = static_cast<ib_rbt_node_t *>
+				(ut_malloc
+				 (SIZEOF_NODE(log_bmp_sys->modified_pages)));
 		}
 		memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages));
 
@@ -280,8 +281,6 @@ log_online_read_bitmap_page(
 	ibool				*checksum_ok)	/*!<out: TRUE if page
 							checksum OK */
 {
-	ulint	offset_low	= (ulint)(bitmap_file->offset & 0xFFFFFFFF);
-	ulint	offset_high	= (ulint)(bitmap_file->offset >> 32);
 	ulint	checksum;
 	ulint	actual_checksum;
 	ibool	success;
@@ -291,16 +290,16 @@ log_online_read_bitmap_page(
 	     <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
 	ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
 
-	success = os_file_read(bitmap_file->file, page, offset_low,
-			       offset_high, MODIFIED_PAGE_BLOCK_SIZE);
+	success = os_file_read(bitmap_file->file, page, bitmap_file->offset,
+			       MODIFIED_PAGE_BLOCK_SIZE);
 
 	if (UNIV_UNLIKELY(!success)) {
 
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-		fprintf(stderr,
-			"InnoDB: Warning: failed reading changed page bitmap "
-			"file \'%s\'\n", bitmap_file->name);
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"failed reading changed page bitmap file \'%s\'\n",
+			bitmap_file->name);
 		return FALSE;
 	}
 
@@ -329,15 +328,15 @@ its name is correct and use it for (re-)tracking start.
 
 @return the last fully tracked LSN */
 static
-ib_uint64_t
+lsn_t
 log_online_read_last_tracked_lsn(void)
 /*==================================*/
 {
 	byte		page[MODIFIED_PAGE_BLOCK_SIZE];
 	ibool		is_last_page	= FALSE;
 	ibool		checksum_ok	= FALSE;
-	ib_uint64_t	result;
-	ib_uint64_t	read_offset	= log_bmp_sys->out.offset;
+	lsn_t		result;
+	os_offset_t	read_offset	= log_bmp_sys->out.offset;
 
 	while (!checksum_ok && read_offset > 0 && !is_last_page)
 	{
@@ -357,9 +356,9 @@ log_online_read_last_tracked_lsn(void)
 				(page + MODIFIED_PAGE_IS_LAST_BLOCK);
 		} else {
 
-			fprintf(stderr,
-				"InnoDB: Warning: corruption detected in "
-				"\'%s\' at offset %llu\n",
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"corruption detected in \'%s\' at offset "
+				UINT64PF "\n",
 				log_bmp_sys->out.name, read_offset);
 		}
 	};
@@ -371,8 +370,9 @@ log_online_read_last_tracked_lsn(void)
 	any */
 	if (!os_file_set_eof_at(log_bmp_sys->out.file,
 				log_bmp_sys->out.offset)) {
-		fprintf(stderr, "InnoDB: Warning: failed truncating "
-			"changed page bitmap file \'%s\' to %llu bytes\n",
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"failed truncating changed page bitmap file \'%s\' to "
+			UINT64PF " bytes\n",
 			log_bmp_sys->out.name, log_bmp_sys->out.offset);
 		result = 0;
 	}
@@ -388,12 +388,11 @@ UNIV_INLINE
 void
 log_set_tracked_lsn(
 /*================*/
-	ib_uint64_t	tracked_lsn)	/*!<in: new value */
+	lsn_t	tracked_lsn)	/*!<in: new value */
 {
 #ifdef HAVE_ATOMIC_BUILTINS_64
 	/* Single writer, no data race here */
-	ib_uint64_t old_value
-		= os_atomic_increment_uint64(&log_sys->tracked_lsn, 0);
+	lsn_t old_value = os_atomic_increment_uint64(&log_sys->tracked_lsn, 0);
 	(void) os_atomic_increment_uint64(&log_sys->tracked_lsn,
 					  tracked_lsn - old_value);
 #else
@@ -414,19 +413,19 @@ static
 ibool
 log_online_can_track_missing(
 /*=========================*/
-	ib_uint64_t	last_tracked_lsn,	/*!<in: last tracked LSN */
-	ib_uint64_t	tracking_start_lsn)	/*!<in:	current LSN */
+	lsn_t	last_tracked_lsn,	/*!<in: last tracked LSN */
+	lsn_t	tracking_start_lsn)	/*!<in:	current LSN */
 {
 	/* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty
 	bitmap file, handle this too. */
-	last_tracked_lsn = ut_max_uint64(last_tracked_lsn, MIN_TRACKED_LSN);
+	last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN);
 
 	if (last_tracked_lsn > tracking_start_lsn) {
-		fprintf(stderr,
-			"InnoDB: Error: last tracked LSN %llu is ahead of "
-			"tracking start LSN %llu.  This can be caused by "
-			"mismatched bitmap files.\n", last_tracked_lsn,
-			tracking_start_lsn);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"last tracked LSN " LSN_PF " is ahead of tracking "
+			"start LSN " LSN_PF ".  This can be caused by "
+			"mismatched bitmap files.\n",
+			last_tracked_lsn, tracking_start_lsn);
 		exit(1);
 	}
 
@@ -444,40 +443,40 @@ static
 void
 log_online_track_missing_on_startup(
 /*================================*/
-	ib_uint64_t	last_tracked_lsn,	/*!<in: last tracked LSN read
-						from the bitmap file */
-	ib_uint64_t	tracking_start_lsn)	/*!<in: last checkpoint LSN of
-						the current server startup */
+	lsn_t	last_tracked_lsn,	/*!<in: last tracked LSN read from the
+					bitmap file */
+	lsn_t	tracking_start_lsn)	/*!<in: last checkpoint LSN of the
+					current server startup */
 {
 	ut_ad(last_tracked_lsn != tracking_start_lsn);
 
-	fprintf(stderr, "InnoDB: last tracked LSN in \'%s\' is %llu, but the "
-		"last checkpoint LSN is %llu.  This might be due to a server "
-		"crash or a very fast shutdown.  ", log_bmp_sys->out.name,
-		last_tracked_lsn, tracking_start_lsn);
+	ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF
+		", but the last checkpoint LSN is " LSN_PF ".  This might be "
+		"due to a server crash or a very fast shutdown.  ",
+		log_bmp_sys->out.name, last_tracked_lsn, tracking_start_lsn);
 
 	/* See if we can fully recover the missing interval */
 	if (log_online_can_track_missing(last_tracked_lsn,
 					 tracking_start_lsn)) {
 
-		fprintf(stderr,
-			"Reading the log to advance the last tracked LSN.\n");
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"reading the log to advance the last tracked LSN.\n");
 
-		log_bmp_sys->start_lsn = ut_max_uint64(last_tracked_lsn,
-						       MIN_TRACKED_LSN);
+		log_bmp_sys->start_lsn = ut_max(last_tracked_lsn,
+						MIN_TRACKED_LSN);
 		log_set_tracked_lsn(log_bmp_sys->start_lsn);
 		if (!log_online_follow_redo_log()) {
 			exit(1);
 		}
 		ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn);
 
-		fprintf(stderr,
-			"InnoDB: continuing tracking changed pages from LSN "
-			"%llu\n", log_bmp_sys->end_lsn);
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"continuing tracking changed pages from LSN " LSN_PF
+			"\n", log_bmp_sys->end_lsn);
 	}
 	else {
-		fprintf(stderr,
-			"The age of last tracked LSN exceeds log capacity, "
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"the age of last tracked LSN exceeds log capacity, "
 			"tracking-based incremental backups will work only "
 			"from the higher LSN!\n");
 
@@ -485,9 +484,9 @@ log_online_track_missing_on_startup(
 			= tracking_start_lsn;
 		log_set_tracked_lsn(log_bmp_sys->start_lsn);
 
-		fprintf(stderr,
-			"InnoDB: starting tracking changed pages from LSN "
-			"%llu\n", log_bmp_sys->end_lsn);
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"starting tracking changed pages from LSN " LSN_PF
+			"\n", log_bmp_sys->end_lsn);
 	}
 }
 
@@ -497,7 +496,7 @@ static
 void
 log_online_make_bitmap_name(
 /*=========================*/
-	ib_uint64_t	start_lsn)	/*!< in: the start LSN name part */
+	lsn_t	start_lsn)	/*!< in: the start LSN name part */
 {
 	ut_snprintf(log_bmp_sys->out.name, FN_REFLEN, bmp_file_name_template,
 		    log_bmp_sys->bmp_file_home, bmp_file_name_stem,
@@ -513,12 +512,12 @@ log_online_should_overwrite(
 /*========================*/
 	const char	*path)	/*!< in: path to file */
 {
-	ibool		success;
+	dberr_t		err;
 	os_file_stat_t	file_info;
 
 	/* Currently, it's OK to overwrite 0-sized files only */
-	success = os_file_get_status(path, &file_info);
-	return success && file_info.type == OS_FILE_TYPE_FILE
+	err = os_file_get_status(path, &file_info, false);
+	return err == DB_SUCCESS && file_info.type == OS_FILE_TYPE_FILE
 		&& file_info.size == 0LL;
 }
 
@@ -535,7 +534,10 @@ log_online_start_bitmap_file(void)
 
 	/* Check for an old file that should be deleted first */
 	if (log_online_should_overwrite(log_bmp_sys->out.name)) {
-		success = os_file_delete_if_exists(log_bmp_sys->out.name);
+
+		success = static_cast<ibool>(
+			os_file_delete_if_exists(innodb_file_bmp_key,
+						 log_bmp_sys->out.name));
 	}
 
 	if (UNIV_LIKELY(success)) {
@@ -551,9 +553,8 @@ log_online_start_bitmap_file(void)
 
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-		fprintf(stderr,
-			"InnoDB: Error: Cannot create \'%s\'\n",
-			log_bmp_sys->out.name);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"cannot create \'%s\'\n", log_bmp_sys->out.name);
 		return FALSE;
 	}
 
@@ -569,8 +570,8 @@ static
 ibool
 log_online_rotate_bitmap_file(
 /*===========================*/
-	ib_uint64_t	next_file_start_lsn)	/*!<in: the start LSN name
-						part */
+	lsn_t	next_file_start_lsn)	/*!<in: the start LSN name
+					part */
 {
 	if (log_bmp_sys->out.file != os_file_invalid) {
 		os_file_close(log_bmp_sys->out.file);
@@ -595,7 +596,7 @@ log_online_is_bitmap_file(
 							check */
 	ulong*			bitmap_file_seq_num,	/*!<out: bitmap file
 							sequence number */
-	ib_uint64_t*		bitmap_file_start_lsn)	/*!<out: bitmap file
+	lsn_t*			bitmap_file_start_lsn)	/*!<out: bitmap file
 							start LSN */
 {
 	char	stem[FN_REFLEN];
@@ -605,7 +606,8 @@ log_online_is_bitmap_file(
 	return ((file_info->type == OS_FILE_TYPE_FILE
 		 || file_info->type == OS_FILE_TYPE_LINK)
 		&& (sscanf(file_info->name, "%[a-z_]%lu_%llu.xdb", stem,
-			   bitmap_file_seq_num, bitmap_file_start_lsn) == 3)
+			   bitmap_file_seq_num,
+			   (unsigned long long *)bitmap_file_start_lsn) == 3)
 		&& (!strcmp(stem, bmp_file_name_stem)));
 }
 
@@ -616,24 +618,25 @@ void
 log_online_read_init(void)
 /*======================*/
 {
-	ibool		success;
-	ib_uint64_t	tracking_start_lsn
-		= ut_max_uint64(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
+	ibool	success;
+	lsn_t	tracking_start_lsn
+		= ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
 	os_file_dir_t	bitmap_dir;
 	os_file_stat_t	bitmap_dir_file_info;
-	ib_uint64_t	last_file_start_lsn	= MIN_TRACKED_LSN;
-	size_t		srv_data_home_len;
+	lsn_t	last_file_start_lsn	= MIN_TRACKED_LSN;
+	size_t	srv_data_home_len;
 
 	/* Bitmap data start and end in a bitmap block must be 8-byte
 	aligned. */
 	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
 	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
 
-	log_bmp_sys = ut_malloc(sizeof(*log_bmp_sys));
-	log_bmp_sys->read_buf_ptr = ut_malloc(FOLLOW_SCAN_SIZE
-					      + OS_FILE_LOG_BLOCK_SIZE);
-	log_bmp_sys->read_buf = ut_align(log_bmp_sys->read_buf_ptr,
-					 OS_FILE_LOG_BLOCK_SIZE);
+	log_bmp_sys = static_cast<log_bitmap_struct *>
+		(ut_malloc(sizeof(*log_bmp_sys)));
+	log_bmp_sys->read_buf_ptr = static_cast<byte *>
+		(ut_malloc(FOLLOW_SCAN_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+	log_bmp_sys->read_buf = static_cast<byte *>
+		(ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
 
 	mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys->mutex,
 		     SYNC_LOG_ONLINE);
@@ -664,8 +667,8 @@ log_online_read_init(void)
 	while (!os_file_readdir_next_file(log_bmp_sys->bmp_file_home,
 					  bitmap_dir, &bitmap_dir_file_info)) {
 
-		ulong		file_seq_num;
-		ib_uint64_t	file_start_lsn;
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
 
 		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 					      &file_seq_num,
@@ -687,7 +690,7 @@ log_online_read_init(void)
 
 	if (os_file_closedir(bitmap_dir)) {
 		os_file_get_last_error(TRUE);
-		fprintf(stderr, "InnoDB: Error: cannot close \'%s\'\n",
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n",
 			log_bmp_sys->bmp_file_home);
 		exit(1);
 	}
@@ -716,24 +719,18 @@ log_online_read_init(void)
 	else {
 
 		/* Read the last tracked LSN from the last file */
-		ulint		size_low;
-		ulint		size_high;
-		ib_uint64_t	last_tracked_lsn;
-		ib_uint64_t	file_start_lsn;
-
-		success = os_file_get_size(log_bmp_sys->out.file, &size_low,
-					   &size_high);
-		ut_a(success);
+		lsn_t	last_tracked_lsn;
+		lsn_t	file_start_lsn;
 
 		log_bmp_sys->out.size
-			= ((ib_uint64_t)size_high << 32) | size_low;
+			= os_file_get_size(log_bmp_sys->out.file);
 		log_bmp_sys->out.offset	= log_bmp_sys->out.size;
 
 		if (log_bmp_sys->out.offset % MODIFIED_PAGE_BLOCK_SIZE != 0) {
 
-			fprintf(stderr,
-				"InnoDB: Warning: truncated block detected "
-				"in \'%s\' at offset %llu\n",
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"truncated block detected in \'%s\' at offset "
+				UINT64PF "\n",
 				log_bmp_sys->out.name,
 				log_bmp_sys->out.offset);
 			log_bmp_sys->out.offset -=
@@ -767,17 +764,18 @@ log_online_read_init(void)
 
 		if (last_tracked_lsn > tracking_start_lsn) {
 
-			fprintf(stderr, "InnoDB: last tracked LSN is %llu, "
-				"but last the checkpoint LSN is %llu. "
-				"The tracking-based incremental backups will "
-				"work only from the latter LSN!\n",
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"last tracked LSN is " LSN_PF ", but the last "
+				"checkpoint LSN is " LSN_PF ". The "
+				"tracking-based incremental backups will work "
+				"only from the latter LSN!\n",
 				last_tracked_lsn, tracking_start_lsn);
 		}
 
 	}
 
-	fprintf(stderr, "InnoDB: starting tracking changed pages from "
-		"LSN %llu\n", tracking_start_lsn);
+	ib_logf(IB_LOG_LEVEL_INFO, "starting tracking changed pages from LSN "
+		LSN_PF "\n", tracking_start_lsn);
 	log_bmp_sys->start_lsn = tracking_start_lsn;
 	log_set_tracked_lsn(tracking_start_lsn);
 }
@@ -875,8 +873,7 @@ log_online_parse_redo_log(void)
 					 &body);
 		if (len > 0) {
 
-			if (log_online_rec_page_means_page(type)
-			    && (space != TRX_DOUBLEWRITE_SPACE)) {
+			if (log_online_rec_page_means_page(type)) {
 
 				ut_a(len >= 3);
 				log_online_set_page_bit(space, page_no);
@@ -920,11 +917,11 @@ log_online_is_valid_log_seg(
 
 	if (!checksum_is_ok) {
 
-		fprintf(stderr,
-			"InnoDB Error: log block checksum mismatch"
-			"expected %lu, calculated checksum %lu\n",
-			(ulong) log_block_get_checksum(log_block),
-			(ulong) log_block_calc_checksum(log_block));
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"log block checksum mismatch: expected " ULINTPF ", "
+			"calculated checksum " ULINTPF "\n",
+			log_block_get_checksum(log_block),
+			log_block_calc_checksum(log_block));
 	}
 
 	return checksum_is_ok;
@@ -995,8 +992,8 @@ void
 log_online_follow_log_seg(
 /*======================*/
 	log_group_t*	group,		       /*!< in: the log group to use */
-	ib_uint64_t	block_start_lsn,       /*!< in: the LSN to read from */
-	ib_uint64_t	block_end_lsn)	       /*!< in: the LSN to read to */
+	lsn_t		block_start_lsn,       /*!< in: the LSN to read from */
+	lsn_t		block_end_lsn)	       /*!< in: the LSN to read to */
 {
 	/* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log
 	data to parse */
@@ -1062,11 +1059,11 @@ void
 log_online_follow_log_group(
 /*========================*/
 	log_group_t*	group,		/*!< in: the log group to use */
-	ib_uint64_t	contiguous_lsn)	/*!< in: the LSN of log block start
+	lsn_t		contiguous_lsn)	/*!< in: the LSN of log block start
 					containing the log_parse_start_lsn */
 {
-	ib_uint64_t block_start_lsn = contiguous_lsn;
-	ib_uint64_t block_end_lsn;
+	lsn_t	block_start_lsn = contiguous_lsn;
+	lsn_t	block_end_lsn;
 
 	ut_ad(mutex_own(&log_bmp_sys->mutex));
 
@@ -1114,27 +1111,24 @@ log_online_write_bitmap_page(
 	DBUG_EXECUTE_IF("bitmap_page_write_error", return FALSE;);
 
 	success = os_file_write(log_bmp_sys->out.name, log_bmp_sys->out.file,
-				block,
-				(ulint)(log_bmp_sys->out.offset & 0xFFFFFFFF),
-				(ulint)(log_bmp_sys->out.offset >> 32),
+				block, log_bmp_sys->out.offset,
 				MODIFIED_PAGE_BLOCK_SIZE);
 	if (UNIV_UNLIKELY(!success)) {
 
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-		fprintf(stderr, "InnoDB: Error: failed writing changed page "
+		ib_logf(IB_LOG_LEVEL_ERROR, "failed writing changed page "
 			"bitmap file \'%s\'\n", log_bmp_sys->out.name);
 		return FALSE;
 	}
 
-	success = os_file_flush(log_bmp_sys->out.file, FALSE);
+	success = os_file_flush(log_bmp_sys->out.file);
 	if (UNIV_UNLIKELY(!success)) {
 
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-		fprintf(stderr, "InnoDB: Error: failed flushing "
-			"changed page bitmap file \'%s\'\n",
-			log_bmp_sys->out.name);
+		ib_logf(IB_LOG_LEVEL_ERROR, "failed flushing changed page "
+			"bitmap file \'%s\'\n",	log_bmp_sys->out.name);
 		return FALSE;
 	}
 
@@ -1218,7 +1212,7 @@ ibool
 log_online_follow_redo_log(void)
 /*============================*/
 {
-	ib_uint64_t	contiguous_start_lsn;
+	lsn_t		contiguous_start_lsn;
 	log_group_t*	group;
 	ibool		result;
 
@@ -1229,6 +1223,8 @@ log_online_follow_redo_log(void)
 		return FALSE;
 	}
 
+	ut_ad(!srv_read_only_mode);
+
 	/* Grab the LSN of the last checkpoint, we will parse up to it */
 	mutex_enter(&(log_sys->mutex));
 	log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
@@ -1272,7 +1268,7 @@ log_online_diagnose_inconsistent_dir(
 	log_online_bitmap_file_range_t	*bitmap_files)	/*!<in/out: bitmap file
 							range */
 {
-	fprintf(stderr,
+	ib_logf(IB_LOG_LEVEL_WARN,
 		"InnoDB: Warning: inconsistent bitmap file "
 		"directory for a "
 		"INFORMATION_SCHEMA.INNODB_CHANGED_PAGES query"
@@ -1298,14 +1294,14 @@ log_online_setup_bitmap_file_range(
 /*===============================*/
 	log_online_bitmap_file_range_t	*bitmap_files,	/*!<in/out: bitmap file
 							range */
-	ib_uint64_t			range_start,	/*!<in: start LSN */
-	ib_uint64_t			range_end)	/*!<in: end LSN */
+	lsn_t				range_start,	/*!<in: start LSN */
+	lsn_t				range_end)	/*!<in: end LSN */
 {
 	os_file_dir_t	bitmap_dir;
 	os_file_stat_t	bitmap_dir_file_info;
 	ulong		first_file_seq_num	= ULONG_MAX;
 	ulong		last_file_seq_num	= 0;
-	ib_uint64_t	first_file_start_lsn	= IB_ULONGLONG_MAX;
+	lsn_t		first_file_start_lsn	= LSN_MAX;
 
 	ut_ad(range_end >= range_start);
 
@@ -1316,8 +1312,8 @@ log_online_setup_bitmap_file_range(
 
 	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
 	if (UNIV_UNLIKELY(!bitmap_dir)) {
-		fprintf(stderr,
-			"InnoDB: Error: "
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
 			"failed to open bitmap directory \'%s\'\n",
 			srv_data_home);
 		return FALSE;
@@ -1326,8 +1322,8 @@ log_online_setup_bitmap_file_range(
 	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
 					  &bitmap_dir_file_info)) {
 
-		ulong		file_seq_num;
-		ib_uint64_t	file_start_lsn;
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
 
 		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 					       &file_seq_num,
@@ -1368,7 +1364,7 @@ log_online_setup_bitmap_file_range(
 	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
 
 		os_file_get_last_error(TRUE);
-		fprintf(stderr, "InnoDB: Error: cannot close \'%s\'\n",
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n",
 			srv_data_home);
 		return FALSE;
 	}
@@ -1388,23 +1384,25 @@ log_online_setup_bitmap_file_range(
 	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
 	if (UNIV_UNLIKELY(!bitmap_dir)) {
 
-		fprintf(stderr, "InnoDB: Error: "
+		ib_logf(IB_LOG_LEVEL_ERROR,
 			"failed to open bitmap directory \'%s\'\n",
 			srv_data_home);
 		return FALSE;
 	}
 
-	bitmap_files->files = ut_malloc(bitmap_files->count
-					* sizeof(bitmap_files->files[0]));
+	bitmap_files->files
+		= static_cast<log_online_bitmap_file_range_struct::files_t *>
+		(ut_malloc(bitmap_files->count
+			   * sizeof(bitmap_files->files[0])));
 	memset(bitmap_files->files, 0,
 	       bitmap_files->count * sizeof(bitmap_files->files[0]));
 
 	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
 					  &bitmap_dir_file_info)) {
 
-		ulong		file_seq_num;
-		ib_uint64_t	file_start_lsn;
-		size_t		array_pos;
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+		size_t	array_pos;
 
 		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 					       &file_seq_num,
@@ -1438,7 +1436,7 @@ log_online_setup_bitmap_file_range(
 	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
 
 		os_file_get_last_error(TRUE);
-		fprintf(stderr, "InnoDB: Error: cannot close \'%s\'\n",
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n",
 			srv_data_home);
 		free(bitmap_files->files);
 		return FALSE;
@@ -1484,8 +1482,6 @@ log_online_open_bitmap_file_read_only(
 							file */
 {
 	ibool	success	= FALSE;
-	ulint	size_low;
-	ulint	size_high;
 
 	ut_ad(name[0] != '\0');
 
@@ -1500,14 +1496,13 @@ log_online_open_bitmap_file_read_only(
 
 		/* Here and below assume that bitmap file names do not
 		contain apostrophes, thus no need for ut_print_filename(). */
-		fprintf(stderr,
-			"InnoDB: Warning: error opening the changed page "
-			"bitmap \'%s\'\n", bitmap_file->name);
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"error opening the changed page bitmap \'%s\'\n",
+			bitmap_file->name);
 		return FALSE;
 	}
 
-	success = os_file_get_size(bitmap_file->file, &size_low, &size_high);
-	bitmap_file->size = (((ib_uint64_t)size_high) << 32) | size_low;
+	bitmap_file->size = os_file_get_size(bitmap_file->file);
 	bitmap_file->offset = 0;
 
 #ifdef UNIV_LINUX
@@ -1546,10 +1541,9 @@ log_online_diagnose_bitmap_eof(
 			to read, it's junk.  This error is not fatal in
 			itself. */
 
-			fprintf(stderr,
-				"InnoDB: Warning: junk at the end of changed "
-				"page bitmap file \'%s\'.\n",
-				bitmap_file->name);
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"junk at the end of changed page bitmap file "
+				"\'%s\'.\n", bitmap_file->name);
 		}
 
 		if (UNIV_UNLIKELY(!last_page_in_run)) {
@@ -1558,10 +1552,10 @@ log_online_diagnose_bitmap_eof(
 			a run */
 			/* It's a "Warning" here because it's not a fatal error
 			for the whole server */
-			fprintf(stderr,
-				"InnoDB: Warning: changed page bitmap "
-				"file \'%s\' does not contain a complete run "
-				"at the end.\n", bitmap_file->name);
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"changed page bitmap file \'%s\' does not "
+				"contain a complete run at the end.\n",
+				bitmap_file->name);
 			return FALSE;
 		}
 	}
@@ -1585,8 +1579,8 @@ ibool
 log_online_bitmap_iterator_init(
 /*============================*/
 	log_bitmap_iterator_t	*i,	/*!<in/out:  iterator */
-	ib_uint64_t		min_lsn,/*!< in: start LSN */
-	ib_uint64_t		max_lsn)/*!< in: end LSN */
+	lsn_t			min_lsn,/*!< in: start LSN */
+	lsn_t			max_lsn)/*!< in: end LSN */
 {
 	ut_a(i);
 
@@ -1619,7 +1613,7 @@ log_online_bitmap_iterator_init(
 		return FALSE;
 	}
 
-	i->page = ut_malloc(MODIFIED_PAGE_BLOCK_SIZE);
+	i->page = static_cast<byte *>(ut_malloc(MODIFIED_PAGE_BLOCK_SIZE));
 	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
 	i->start_lsn = i->end_lsn = 0;
 	i->space_id = 0;
@@ -1740,10 +1734,9 @@ log_online_bitmap_iterator_next(
 		if (UNIV_UNLIKELY(!success)) {
 
 			os_file_get_last_error(TRUE);
-			fprintf(stderr,
-				"InnoDB: Warning: failed reading "
-				"changed page bitmap file \'%s\'\n",
-				i->in_files.files[i->in_i].name);
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"failed reading changed page bitmap file "
+				"\'%s\'\n", i->in_files.files[i->in_i].name);
 			i->failed = TRUE;
 			return FALSE;
 		}
@@ -1765,23 +1758,22 @@ log_online_bitmap_iterator_next(
 
 /************************************************************//**
 Delete all the bitmap files for data less than the specified LSN.
-If called with lsn == 0 (i.e. set by RESET request) or
-IB_ULONGLONG_MAX, restart the bitmap file sequence, otherwise
-continue it.
+If called with lsn == 0 (i.e. set by RESET request) or LSN_MAX,
+restart the bitmap file sequence, otherwise continue it.
 
 @return FALSE to indicate success, TRUE for failure. */
 UNIV_INTERN
 ibool
 log_online_purge_changed_page_bitmaps(
 /*==================================*/
-	ib_uint64_t lsn)	/*!< in: LSN to purge files up to */
+	lsn_t	lsn)	/*!< in: LSN to purge files up to */
 {
 	log_online_bitmap_file_range_t	bitmap_files;
 	size_t				i;
 	ibool				result = FALSE;
 
 	if (lsn == 0) {
-		lsn = IB_ULONGLONG_MAX;
+		lsn = LSN_MAX;
 	}
 
 	if (srv_track_changed_pages) {
@@ -1807,9 +1799,12 @@ log_online_purge_changed_page_bitmaps(
 	for (i = 0; i < bitmap_files.count; i++) {
 		if (bitmap_files.files[i].seq_num == 0
 		    || bitmap_files.files[i].start_lsn >= lsn) {
+
 			break;
 		}
-		if (!os_file_delete_if_exists(bitmap_files.files[i].name)) {
+		if (!os_file_delete_if_exists(innodb_file_bmp_key,
+					      bitmap_files.files[i].name)) {
+
 			os_file_get_last_error(TRUE);
 			result = TRUE;
 			break;
@@ -1818,8 +1813,8 @@ log_online_purge_changed_page_bitmaps(
 
 	if (srv_track_changed_pages) {
 		if (lsn > log_bmp_sys->end_lsn) {
-			ib_uint64_t	new_file_lsn;
-			if (lsn == IB_ULONGLONG_MAX) {
+			lsn_t	new_file_lsn;
+			if (lsn == LSN_MAX) {
 				/* RESET restarts the sequence */
 				log_bmp_sys->out_seq_num = 0;
 				new_file_lsn = 0;
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.cc
index 61239dfb25d..d0b833f2bba 100644
--- a/storage/xtradb/log/log0recv.c
+++ b/storage/xtradb/log/log0recv.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +12,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file log/log0recv.c
+@file log/log0recv.cc
 Recovery
 
 Created 9/20/1997 Heikki Tuuri
@@ -153,7 +154,7 @@ UNIV_INTERN ulint	recv_n_pool_free_frames;
 /** The maximum lsn we see for a page during the recovery process. If this
 is bigger than the lsn we are able to scan up to, that is an indication that
 the recovery failed and the database may be corrupt. */
-UNIV_INTERN ib_uint64_t	recv_max_page_lsn;
+UNIV_INTERN lsn_t	recv_max_page_lsn;
 
 #ifdef UNIV_PFS_THREAD
 UNIV_INTERN mysql_pfs_key_t	trx_rollback_clean_thread_key;
@@ -163,6 +164,20 @@ UNIV_INTERN mysql_pfs_key_t	trx_rollback_clean_thread_key;
 UNIV_INTERN mysql_pfs_key_t	recv_sys_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
+#ifndef UNIV_HOTBACKUP
+# ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t	recv_writer_thread_key;
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	recv_writer_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+/** Flag indicating if recv_writer thread is active. */
+UNIV_INTERN bool		recv_writer_thread_active = false;
+UNIV_INTERN os_thread_t		recv_writer_thread_handle = 0;
+#endif /* !UNIV_HOTBACKUP */
+
 /* prototypes */
 
 #ifndef UNIV_HOTBACKUP
@@ -187,16 +202,17 @@ recv_sys_create(void)
 		return;
 	}
 
-	recv_sys = mem_alloc(sizeof(*recv_sys));
-	memset(recv_sys, 0x0, sizeof(*recv_sys));
+	recv_sys = static_cast<recv_sys_t*>(mem_zalloc(sizeof(*recv_sys)));
 
 	mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV);
 
+#ifndef UNIV_HOTBACKUP
+	mutex_create(recv_writer_mutex_key, &recv_sys->writer_mutex,
+		     SYNC_LEVEL_VARYING);
+#endif /* !UNIV_HOTBACKUP */
+
 	recv_sys->heap = NULL;
 	recv_sys->addr_hash = NULL;
-
-	recv_sys->stats_recv_start_time = time(NULL);
-	recv_sys->stats_oldest_modified_lsn = IB_ULONGLONG_MAX;
 }
 
 /********************************************************//**
@@ -223,6 +239,11 @@ recv_sys_close(void)
 			mem_free(recv_sys->last_block_buf_start);
 		}
 
+#ifndef UNIV_HOTBACKUP
+		ut_ad(!recv_writer_thread_active);
+		mutex_free(&recv_sys->writer_mutex);
+#endif /* !UNIV_HOTBACKUP */
+
 		mutex_free(&recv_sys->mutex);
 
 		mem_free(recv_sys);
@@ -299,6 +320,58 @@ recv_sys_var_init(void)
 
 	recv_max_page_lsn = 0;
 }
+
+/******************************************************************//**
+recv_writer thread: while srv_shutdown_state is SRV_SHUTDOWN_NONE and
+recv_recovery_on is set, it sleeps, then calls buf_flush_LRU_tail() while holding recv_sys->writer_mutex.
+@return a dummy parameter; the thread leaves through os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(recv_writer_thread)(
+/*===============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(recv_writer_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "InnoDB: recv_writer thread running, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+	/* Reset to false before exit; recv_sys_close() asserts it is false. */
+	recv_writer_thread_active = true;
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		/* NOTE(review): presumably microseconds (0.1 s) - confirm. */
+		os_thread_sleep(100000);
+
+		mutex_enter(&recv_sys->writer_mutex);
+		/* Recovery is no longer on: drop the mutex and stop. */
+		if (!recv_recovery_on) {
+			mutex_exit(&recv_sys->writer_mutex);
+			break;
+		}
+
+		/* Flush pages from end of LRU if required */
+		buf_flush_LRU_tail();
+
+		mutex_exit(&recv_sys->writer_mutex);
+	}
+
+	recv_writer_thread_active = false;
+
+	/* We count the number of threads in os_thread_exit().
+	A created thread should always use that to exit and not
+	use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
 #endif /* !UNIV_HOTBACKUP */
 
 /************************************************************
@@ -319,13 +392,12 @@ recv_sys_init(
 	flush_list during recovery process.
 	As this initialization is done while holding the buffer pool
 	mutex we perform it before acquiring recv_sys->mutex. */
-#ifndef UNIV_HOTBACKUP
 	buf_flush_init_flush_rbt();
-#endif /* !UNIV_HOTBACKUP */
 
 	mutex_enter(&(recv_sys->mutex));
 
-	recv_sys->heap = mem_heap_create_in_buffer(256);
+	recv_sys->heap = mem_heap_create_typed(256,
+					MEM_HEAP_FOR_RECV_SYS);
 #else /* !UNIV_HOTBACKUP */
 	recv_sys->heap = mem_heap_create(256);
 	recv_is_from_backup = TRUE;
@@ -337,12 +409,7 @@ recv_sys_init(
 		recv_n_pool_free_frames = 512;
 	}
 
-	if (buf_pool_get_curr_size() >= (32 * 1024 * 1024)) {
-		/* Buffer pool of size greater than 32 MB. */
-		recv_n_pool_free_frames = 1024;
-	}
-
-	recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
+	recv_sys->buf = static_cast<byte*>(ut_malloc(RECV_PARSING_BUF_SIZE));
 	recv_sys->len = 0;
 	recv_sys->recovered_offset = 0;
 
@@ -352,10 +419,12 @@ recv_sys_init(
 	recv_sys->apply_log_recs = FALSE;
 	recv_sys->apply_batch_on = FALSE;
 
-	recv_sys->last_block_buf_start = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+	recv_sys->last_block_buf_start = static_cast<byte*>(
+		mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+	recv_sys->last_block = static_cast<byte*>(ut_align(
+		recv_sys->last_block_buf_start, OS_FILE_LOG_BLOCK_SIZE));
 
-	recv_sys->last_block = ut_align(recv_sys->last_block_buf_start,
-					OS_FILE_LOG_BLOCK_SIZE);
 	recv_sys->found_corrupt_log = FALSE;
 
 	recv_max_page_lsn = 0;
@@ -417,6 +486,7 @@ recv_sys_debug_free(void)
 }
 # endif /* UNIV_LOG_DEBUG */
 
+# ifdef UNIV_LOG_ARCHIVE
 /********************************************************//**
 Truncates possible corrupted or extra records from a log group. */
 static
@@ -424,24 +494,22 @@ void
 recv_truncate_group(
 /*================*/
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	recovered_lsn,	/*!< in: recovery succeeded up to this
+	lsn_t		recovered_lsn,	/*!< in: recovery succeeded up to this
 					lsn */
-	ib_uint64_t	limit_lsn,	/*!< in: this was the limit for
+	lsn_t		limit_lsn,	/*!< in: this was the limit for
 					recovery */
-	ib_uint64_t	checkpoint_lsn,	/*!< in: recovery was started from this
+	lsn_t		checkpoint_lsn,	/*!< in: recovery was started from this
 					checkpoint */
-	ib_uint64_t	archived_lsn)	/*!< in: the log has been archived up to
+	lsn_t		archived_lsn)	/*!< in: the log has been archived up to
 					this lsn */
 {
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	finish_lsn1;
-	ib_uint64_t	finish_lsn2;
-	ib_uint64_t	finish_lsn;
-	ulint		len;
-	ulint		i;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		finish_lsn1;
+	lsn_t		finish_lsn2;
+	lsn_t		finish_lsn;
 
-	if (archived_lsn == IB_ULONGLONG_MAX) {
+	if (archived_lsn == LSN_MAX) {
 		/* Checkpoint was taken in the NOARCHIVELOG mode */
 		archived_lsn = checkpoint_lsn;
 	}
@@ -454,7 +522,7 @@ recv_truncate_group(
 					 OS_FILE_LOG_BLOCK_SIZE)
 		+ recv_sys->last_log_buf_size;
 
-	if (limit_lsn != IB_ULONGLONG_MAX) {
+	if (limit_lsn != LSN_MAX) {
 		/* We do not know how far we should erase log records: erase
 		as much as possible */
 
@@ -467,11 +535,7 @@ recv_truncate_group(
 
 	ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
 
-	/* Write the log buffer full of zeros */
-	for (i = 0; i < RECV_SCAN_SIZE; i++) {
-
-		*(log_sys->buf + i) = '\0';
-	}
+	memset(log_sys->buf, 0, RECV_SCAN_SIZE);
 
 	start_lsn = ut_uint64_align_down(recovered_lsn,
 					 OS_FILE_LOG_BLOCK_SIZE);
@@ -479,11 +543,13 @@ recv_truncate_group(
 	if (start_lsn != recovered_lsn) {
 		/* Copy the last incomplete log block to the log buffer and
 		edit its data length: */
+		lsn_t	diff = recovered_lsn - start_lsn;
+
+		ut_a(diff <= 0xFFFFUL);
 
 		ut_memcpy(log_sys->buf, recv_sys->last_block,
 			  OS_FILE_LOG_BLOCK_SIZE);
-		log_block_set_data_len(log_sys->buf,
-				       (ulint) (recovered_lsn - start_lsn));
+		log_block_set_data_len(log_sys->buf, (ulint) diff);
 	}
 
 	if (start_lsn >= finish_lsn) {
@@ -492,6 +558,8 @@ recv_truncate_group(
 	}
 
 	for (;;) {
+		ulint	len;
+
 		end_lsn = start_lsn + RECV_SCAN_SIZE;
 
 		if (end_lsn > finish_lsn) {
@@ -507,11 +575,7 @@ recv_truncate_group(
 			return;
 		}
 
-		/* Write the log buffer full of zeros */
-		for (i = 0; i < RECV_SCAN_SIZE; i++) {
-
-			*(log_sys->buf + i) = '\0';
-		}
+		memset(log_sys->buf, 0, RECV_SCAN_SIZE);
 
 		start_lsn = end_lsn;
 	}
@@ -528,12 +592,11 @@ recv_copy_group(
 						group */
 	log_group_t*	group,			/*!< in: copy to this log
 						group */
-	ib_uint64_t	recovered_lsn)		/*!< in: recovery succeeded up
+	lsn_t		recovered_lsn)		/*!< in: recovery succeeded up
 						to this lsn */
 {
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ulint		len;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
 
 	if (group->scanned_lsn >= recovered_lsn) {
 
@@ -545,6 +608,8 @@ recv_copy_group(
 	start_lsn = ut_uint64_align_down(group->scanned_lsn,
 					 OS_FILE_LOG_BLOCK_SIZE);
 	for (;;) {
+		ulint	len;
+
 		end_lsn = start_lsn + RECV_SCAN_SIZE;
 
 		if (end_lsn > recovered_lsn) {
@@ -568,6 +633,7 @@ recv_copy_group(
 		start_lsn = end_lsn;
 	}
 }
+# endif /* UNIV_LOG_ARCHIVE */
 
 /********************************************************//**
 Copies a log segment from the most up-to-date log group to the other log
@@ -578,13 +644,15 @@ static
 void
 recv_synchronize_groups(
 /*====================*/
-	log_group_t*	up_to_date_group)	/*!< in: the most up-to-date
+#ifdef UNIV_LOG_ARCHIVE
+	log_group_t*	up_to_date_group	/*!< in: the most up-to-date
 						log group */
+#endif
+	)
 {
-	log_group_t*	group;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	recovered_lsn;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		recovered_lsn;
 
 	recovered_lsn = recv_sys->recovered_lsn;
 
@@ -598,11 +666,17 @@ recv_synchronize_groups(
 	ut_a(start_lsn != end_lsn);
 
 	log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block,
-			       up_to_date_group, start_lsn, end_lsn, FALSE);
-
-	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+#ifdef UNIV_LOG_ARCHIVE
+			       up_to_date_group,
+#else /* UNIV_LOG_ARCHIVE */
+			       UT_LIST_GET_FIRST(log_sys->log_groups),
+#endif /* UNIV_LOG_ARCHIVE */
+			       start_lsn, end_lsn, FALSE);
 
-	while (group) {
+	for (log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	     group;
+	     group = UT_LIST_GET_NEXT(log_groups, group)) {
+#ifdef UNIV_LOG_ARCHIVE
 		if (group != up_to_date_group) {
 
 			/* Copy log data if needed */
@@ -610,13 +684,11 @@ recv_synchronize_groups(
 			recv_copy_group(group, up_to_date_group,
 					recovered_lsn);
 		}
-
+#endif /* UNIV_LOG_ARCHIVE */
 		/* Update the fields in the group struct to correspond to
 		recovered_lsn */
 
 		log_group_set_fields(group, recovered_lsn);
-
-		group = UT_LIST_GET_NEXT(log_groups, group);
 	}
 
 	/* Copy the checkpoint info to the groups; remember that we have
@@ -669,8 +741,8 @@ recv_check_cp_is_consistent(
 /********************************************************//**
 Looks for the maximum consistent checkpoint from the log groups.
 @return	error code or DB_SUCCESS */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 recv_find_max_checkpoint(
 /*=====================*/
 	log_group_t**	max_group,	/*!< out: max group */
@@ -720,22 +792,10 @@ recv_find_max_checkpoint(
 
 			group->lsn = mach_read_from_8(
 				buf + LOG_CHECKPOINT_LSN);
-
-#ifdef UNIV_LOG_ARCHIVE
-#error "UNIV_LOG_ARCHIVE could not be enabled"
-#endif
-			{
-			ib_uint64_t tmp_lsn_offset = mach_read_from_8(
-					buf + LOG_CHECKPOINT_ARCHIVED_LSN);
-				if (sizeof(ulint) != 4
-				    && tmp_lsn_offset != IB_ULONGLONG_MAX) {
-					group->lsn_offset = (ulint) tmp_lsn_offset;
-				} else {
 			group->lsn_offset = mach_read_from_4(
-				buf + LOG_CHECKPOINT_OFFSET);
-				}
-			}
-
+				buf + LOG_CHECKPOINT_OFFSET_LOW32);
+			group->lsn_offset |= ((lsn_t) mach_read_from_4(
+				buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
 			checkpoint_no = mach_read_from_8(
 				buf + LOG_CHECKPOINT_NO);
 
@@ -785,17 +845,14 @@ Reads the checkpoint info needed in hot backup.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
-recv_read_cp_info_for_backup(
-/*=========================*/
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
 	const byte*	hdr,	/*!< in: buffer containing the log group
 				header */
-	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
-	ulint*		offset,	/*!< out: checkpoint offset in the log group */
-	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
-				1000000000 if the database is running
-				with < version 3.23.50 of InnoDB */
-	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
-	ib_uint64_t*	first_header_lsn)
+	lsn_t*		lsn,	/*!< out: checkpoint lsn */
+	lsn_t*		offset,	/*!< out: checkpoint offset in the log group */
+	lsn_t*		cp_no,	/*!< out: checkpoint number */
+	lsn_t*		first_header_lsn)
 				/*!< out: lsn of of the start of the
 				first log file */
 {
@@ -825,24 +882,10 @@ recv_read_cp_info_for_backup(
 	cp_buf = hdr + max_cp;
 
 	*lsn = mach_read_from_8(cp_buf + LOG_CHECKPOINT_LSN);
-	*offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET);
-
-	/* If the user is running a pre-3.23.50 version of InnoDB, its
-	checkpoint data does not contain the fsp limit info */
-	if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N)
-	    == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) {
-
-		*fsp_limit = mach_read_from_4(
-			cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT);
-
-		if (*fsp_limit == 0) {
-			*fsp_limit = 1000000000;
-		}
-	} else {
-		*fsp_limit = 1000000000;
-	}
-
-	/*	fprintf(stderr, "fsp limit %lu MB\n", *fsp_limit); */
+	*offset = mach_read_from_4(
+		cp_buf + LOG_CHECKPOINT_OFFSET_LOW32);
+	*offset |= ((lsn_t) mach_read_from_4(
+			    cp_buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
 
 	*cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO);
 
@@ -867,12 +910,72 @@ log_block_checksum_is_ok_or_old_format(
 #ifdef UNIV_LOG_DEBUG
 	return(TRUE);
 #endif /* UNIV_LOG_DEBUG */
-	if (log_block_calc_checksum(block) == log_block_get_checksum(block)) {
+
+	ulint block_checksum = log_block_get_checksum(block);
+
+	if (UNIV_LIKELY(srv_log_checksum_algorithm ==
+			SRV_CHECKSUM_ALGORITHM_NONE ||
+			log_block_calc_checksum(block) == block_checksum)) {
+
+		return(TRUE);
+	}
+
+	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 ||
+	    srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB ||
+	    srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
+
+		const char*	algo = NULL;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"log block checksum mismatch: expected " ULINTPF ", "
+			"calculated checksum " ULINTPF,
+			block_checksum,
+			log_block_calc_checksum(block));
+
+		if (block_checksum == LOG_NO_CHECKSUM_MAGIC) {
+
+			algo = "none";
+		} else if (block_checksum ==
+			   log_block_calc_checksum_crc32(block)) {
+
+			algo = "crc32";
+		} else if (block_checksum ==
+			   log_block_calc_checksum_innodb(block)) {
+
+			algo = "innodb";
+		}
+
+		if (algo) {
+
+			const char*	current_algo;
+
+			current_algo = buf_checksum_algorithm_name(
+				(srv_checksum_algorithm_t)
+				srv_log_checksum_algorithm);
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"current InnoDB log checksum type: %s, "
+				"detected log checksum type: %s",
+				current_algo,
+				algo);
+		}
+
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"STRICT method was specified for innodb_log_checksum, "
+			"so we intentionally assert here.");
+	}
+
+	ut_ad(srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_CRC32 ||
+	      srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+	if (block_checksum == LOG_NO_CHECKSUM_MAGIC ||
+	    block_checksum == log_block_calc_checksum_crc32(block) ||
+	    block_checksum == log_block_calc_checksum_innodb(block)) {
 
 		return(TRUE);
 	}
 
-	if (log_block_get_hdr_no(block) == log_block_get_checksum(block)) {
+	if (log_block_get_hdr_no(block) == block_checksum) {
 
 		/* We assume the log block is in the format of
 		InnoDB version < 3.23.52 and the block is ok */
@@ -898,7 +1001,7 @@ recv_scan_log_seg_for_backup(
 /*=========================*/
 	byte*		buf,		/*!< in: buffer containing log data */
 	ulint		buf_len,	/*!< in: data length in that buffer */
-	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+	lsn_t*		scanned_lsn,	/*!< in/out: lsn of buffer start,
 					we return scanned lsn */
 	ulint*		scanned_checkpoint_no,
 					/*!< in/out: 4 lowest bytes of the
@@ -1191,18 +1294,22 @@ recv_parse_or_apply_log_rec_body(
 				ptr, end_ptr, block, index, mtr);
 		}
 		break;
-	case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE:
+	case MLOG_PAGE_REORGANIZE:
+	case MLOG_COMP_PAGE_REORGANIZE:
+	case MLOG_ZIP_PAGE_REORGANIZE:
 		ut_ad(!page || page_type == FIL_PAGE_INDEX);
 
 		if (NULL != (ptr = mlog_parse_index(
 				     ptr, end_ptr,
-				     type == MLOG_COMP_PAGE_REORGANIZE,
+				     type != MLOG_PAGE_REORGANIZE,
 				     &index))) {
 			ut_a(!page
 			     || (ibool)!!page_is_comp(page)
 			     == dict_table_is_comp(index->table));
-			ptr = btr_parse_page_reorganize(ptr, end_ptr, index,
-							block, mtr);
+			ptr = btr_parse_page_reorganize(
+				ptr, end_ptr, index,
+				type == MLOG_ZIP_PAGE_REORGANIZE,
+				block, mtr);
 		}
 		break;
 	case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
@@ -1301,6 +1408,16 @@ recv_parse_or_apply_log_rec_body(
 		ptr = page_zip_parse_compress(ptr, end_ptr,
 					      page, page_zip);
 		break;
+	case MLOG_ZIP_PAGE_COMPRESS_NO_DATA:
+		if (NULL != (ptr = mlog_parse_index(
+				ptr, end_ptr, TRUE, &index))) {
+
+			ut_a(!page || ((ibool)!!page_is_comp(page)
+				== dict_table_is_comp(index->table)));
+			ptr = page_zip_parse_compress_no_data(
+				ptr, end_ptr, page, page_zip, index);
+		}
+		break;
 	default:
 		ptr = NULL;
 		recv_sys->found_corrupt_log = TRUE;
@@ -1356,19 +1473,21 @@ recv_get_fil_addr_struct(
 {
 	recv_addr_t*	recv_addr;
 
-	recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
-				   recv_hash(space, page_no));
-	while (recv_addr) {
-		if ((recv_addr->space == space)
-		    && (recv_addr->page_no == page_no)) {
+	for (recv_addr = static_cast<recv_addr_t*>(
+			HASH_GET_FIRST(recv_sys->addr_hash,
+				       recv_hash(space, page_no)));
+	     recv_addr != 0;
+	     recv_addr = static_cast<recv_addr_t*>(
+		     HASH_GET_NEXT(addr_hash, recv_addr))) {
 
-			break;
-		}
+		if (recv_addr->space == space
+		    && recv_addr->page_no == page_no) {
 
-		recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+			return(recv_addr);
+		}
 	}
 
-	return(recv_addr);
+	return(NULL);
 }
 
 /*******************************************************************//**
@@ -1377,13 +1496,13 @@ static
 void
 recv_add_to_hash_table(
 /*===================*/
-	byte		type,		/*!< in: log record type */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no,	/*!< in: page number */
-	byte*		body,		/*!< in: log record body */
-	byte*		rec_end,	/*!< in: log record end */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the mtr */
-	ib_uint64_t	end_lsn)	/*!< in: end lsn of the mtr */
+	byte	type,		/*!< in: log record type */
+	ulint	space,		/*!< in: space id */
+	ulint	page_no,	/*!< in: page number */
+	byte*	body,		/*!< in: log record body */
+	byte*	rec_end,	/*!< in: log record end */
+	lsn_t	start_lsn,	/*!< in: start lsn of the mtr */
+	lsn_t	end_lsn)	/*!< in: end lsn of the mtr */
 {
 	recv_t*		recv;
 	ulint		len;
@@ -1400,12 +1519,9 @@ recv_add_to_hash_table(
 
 	len = rec_end - body;
 
-	if (srv_recovery_stats) {
-		recv_sys->stats_log_recs++;
-		recv_sys->stats_log_len_sum += len;
-	}
+	recv = static_cast<recv_t*>(
+		mem_heap_alloc(recv_sys->heap, sizeof(recv_t)));
 
-	recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t));
 	recv->type = type;
 	recv->len = rec_end - body;
 	recv->start_lsn = start_lsn;
@@ -1414,8 +1530,9 @@ recv_add_to_hash_table(
 	recv_addr = recv_get_fil_addr_struct(space, page_no);
 
 	if (recv_addr == NULL) {
-		recv_addr = mem_heap_alloc(recv_sys->heap,
-					   sizeof(recv_addr_t));
+		recv_addr = static_cast<recv_addr_t*>(
+			mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t)));
+
 		recv_addr->space = space;
 		recv_addr->page_no = page_no;
 		recv_addr->state = RECV_NOT_PROCESSED;
@@ -1447,8 +1564,10 @@ recv_add_to_hash_table(
 			len = RECV_DATA_BLOCK_SIZE;
 		}
 
-		recv_data = mem_heap_alloc(recv_sys->heap,
-					   sizeof(recv_data_t) + len);
+		recv_data = static_cast<recv_data_t*>(
+			mem_heap_alloc(recv_sys->heap,
+				       sizeof(recv_data_t) + len));
+
 		*prev_field = recv_data;
 
 		memcpy(recv_data + 1, body, len);
@@ -1484,7 +1603,7 @@ recv_data_copy_to_buf(
 			part_len = len;
 		}
 
-		ut_memcpy(buf, ((byte*)recv_data) + sizeof(recv_data_t),
+		ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t),
 			  part_len);
 		buf += part_len;
 		len -= part_len;
@@ -1513,11 +1632,10 @@ recv_recover_page_func(
 	recv_addr_t*	recv_addr;
 	recv_t*		recv;
 	byte*		buf;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	page_lsn;
-	ib_uint64_t	page_lsn_orig;
-	ib_uint64_t	page_newest_lsn;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		page_lsn;
+	lsn_t		page_newest_lsn;
 	ibool		modification_to_page;
 #ifndef UNIV_HOTBACKUP
 	ibool		success;
@@ -1556,14 +1674,6 @@ recv_recover_page_func(
 
 	recv_addr->state = RECV_BEING_PROCESSED;
 
-	if (srv_recovery_stats) {
-		if (just_read_in) {
-			recv_sys->stats_recover_pages_with_read++;
-		} else {
-			recv_sys->stats_recover_pages_without_read++;
-		}
-	}
-
 	mutex_exit(&(recv_sys->mutex));
 
 	mtr_start(&mtr);
@@ -1593,7 +1703,6 @@ recv_recover_page_func(
 
 	/* Read the newest modification lsn from the page */
 	page_lsn = mach_read_from_8(page + FIL_PAGE_LSN);
-	page_lsn_orig = page_lsn;
 
 #ifndef UNIV_HOTBACKUP
 	/* It may be that the page has been modified in the buffer
@@ -1613,21 +1722,6 @@ recv_recover_page_func(
 	modification_to_page = FALSE;
 	start_lsn = end_lsn = 0;
 
-	if (srv_recovery_stats) {
-		mutex_enter(&(recv_sys->mutex));
-		if (page_lsn_orig && recv_sys->stats_oldest_modified_lsn > page_lsn_orig) {
-			recv_sys->stats_oldest_modified_lsn = page_lsn_orig;
-		}
-		if (page_lsn_orig && recv_sys->stats_newest_modified_lsn < page_lsn_orig) {
-			recv_sys->stats_newest_modified_lsn = page_lsn_orig;
-		}
-		if (UT_LIST_GET_LAST(recv_addr->rec_list)->start_lsn
-		    < page_lsn_orig) {
-			recv_sys->stats_pages_already_new++;
-		}
-		mutex_exit(&(recv_sys->mutex));
-	}
-
 	recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
 
 	while (recv) {
@@ -1637,7 +1731,7 @@ recv_recover_page_func(
 			/* We have to copy the record body to a separate
 			buffer */
 
-			buf = mem_alloc(recv->len);
+			buf = static_cast<byte*>(mem_alloc(recv->len));
 
 			recv_data_copy_to_buf(buf, recv);
 		} else {
@@ -1658,7 +1752,7 @@ recv_recover_page_func(
 
 		if (recv->start_lsn >= page_lsn) {
 
-			ib_uint64_t	end_lsn;
+			lsn_t	end_lsn;
 
 			if (!modification_to_page) {
 
@@ -1666,30 +1760,19 @@ recv_recover_page_func(
 				start_lsn = recv->start_lsn;
 			}
 
-#ifdef UNIV_DEBUG
-			if (log_debug_writes) {
-				fprintf(stderr,
-					"InnoDB: Applying log rec"
-					" type %lu len %lu"
-					" to space %lu page no %lu\n",
-					(ulong) recv->type, (ulong) recv->len,
-					(ulong) recv_addr->space,
-					(ulong) recv_addr->page_no);
-			}
-#endif /* UNIV_DEBUG */
+			DBUG_PRINT("ib_log",
+				   ("apply " DBUG_LSN_PF ": %u len %u "
+				    "page %u:%u", recv->start_lsn,
+				    (unsigned) recv->type,
+				    (unsigned) recv->len,
+				    (unsigned) recv_addr->space,
+				    (unsigned) recv_addr->page_no));
 
 			recv_parse_or_apply_log_rec_body(recv->type, buf,
 							 buf + recv->len,
 							 block, &mtr,
 							 recv_addr->space);
 
-			if (srv_recovery_stats) {
-				mutex_enter(&(recv_sys->mutex));
-				recv_sys->stats_applied_log_recs++;
-				recv_sys->stats_applied_log_len_sum += recv->len;
-				mutex_exit(&(recv_sys->mutex));
-			}
-
 			end_lsn = recv->start_lsn + recv->len;
 			mach_write_to_8(FIL_PAGE_LSN + page, end_lsn);
 			mach_write_to_8(UNIV_PAGE_SIZE
@@ -1792,13 +1875,6 @@ recv_read_in_area(
 		}
 	}
 
-	if (srv_recovery_stats && n) {
-		mutex_enter(&(recv_sys->mutex));
-		recv_sys->stats_read_requested_pages += n;
-		recv_sys->stats_read_in_area[n - 1]++;
-		mutex_exit(&(recv_sys->mutex));
-	}
-
 	buf_read_recv_pages(FALSE, space, zip_size, page_nos, n);
 	/*
 	fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n);
@@ -1825,7 +1901,6 @@ recv_apply_hashed_log_recs(
 {
 	recv_addr_t* recv_addr;
 	ulint	i;
-	ulint	n_pages;
 	ibool	has_printed	= FALSE;
 	mtr_t	mtr;
 loop:
@@ -1851,20 +1926,23 @@ loop:
 
 	for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) {
 
-		recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, i);
+		for (recv_addr = static_cast<recv_addr_t*>(
+				HASH_GET_FIRST(recv_sys->addr_hash, i));
+		     recv_addr != 0;
+		     recv_addr = static_cast<recv_addr_t*>(
+				HASH_GET_NEXT(addr_hash, recv_addr))) {
 
-		while (recv_addr) {
 			ulint	space = recv_addr->space;
 			ulint	zip_size = fil_space_get_zip_size(space);
 			ulint	page_no = recv_addr->page_no;
 
 			if (recv_addr->state == RECV_NOT_PROCESSED) {
 				if (!has_printed) {
-					ut_print_timestamp(stderr);
-					fputs("  InnoDB: Starting an"
-					      " apply batch of log records"
-					      " to the database...\n"
-					      "InnoDB: Progress in percents: ",
+					ib_logf(IB_LOG_LEVEL_INFO,
+						"Starting an apply batch"
+						" of log records"
+						" to the database...");
+					fputs("InnoDB: Progress in percent: ",
 					      stderr);
 					has_printed = TRUE;
 				}
@@ -1891,8 +1969,6 @@ loop:
 
 				mutex_enter(&(recv_sys->mutex));
 			}
-
-			recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
 		}
 
 		if (has_printed
@@ -1923,6 +1999,8 @@ loop:
 	}
 
 	if (!allow_ibuf) {
+		bool	success;
+
 		/* Flush all the file pages to disk and invalidate them in
 		the buffer pool */
 
@@ -1930,13 +2008,24 @@ loop:
 		mutex_exit(&(recv_sys->mutex));
 		mutex_exit(&(log_sys->mutex));
 
- 		n_pages = buf_flush_list(ULINT_MAX, IB_ULONGLONG_MAX);
-  		ut_a(n_pages != ULINT_UNDEFINED);
-  
- 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+		/* Stop the recv_writer thread from issuing any LRU
+		flush batches. */
+		mutex_enter(&recv_sys->writer_mutex);
+
+		/* Wait for any currently run batch to end. */
+		buf_flush_wait_LRU_batch_end();
+
+		success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+
+		ut_a(success);
+
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
 
 		buf_pool_invalidate();
 
+		/* Allow batches from recv_writer thread. */
+		mutex_exit(&recv_sys->writer_mutex);
+
 		mutex_enter(&(log_sys->mutex));
 		mutex_enter(&(recv_sys->mutex));
 		ut_d(recv_no_log_write = FALSE);
@@ -1951,10 +2040,6 @@ loop:
 
 	if (has_printed) {
 		fprintf(stderr, "InnoDB: Apply batch completed\n");
-
-		if (srv_recovery_stats) {
-			recv_sys->stats_recv_turns++;
-		}
 	}
 
 	mutex_exit(&(recv_sys->mutex));
@@ -1980,9 +2065,10 @@ recv_apply_log_recs_for_backup(void)
 
 	block = back_block1;
 
-	fputs("InnoDB: Starting an apply batch of log records"
-	      " to the database...\n"
-	      "InnoDB: Progress in percents: ", stderr);
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Starting an apply batch of log records to the database...");
+
+	fputs("InnoDB: Progress in percent: ", stderr);
 
 	n_hash_cells = hash_get_n_cells(recv_sys->addr_hash);
 
@@ -2032,17 +2118,17 @@ recv_apply_log_recs_for_backup(void)
 			if (!success) {
 				fprintf(stderr,
 					"InnoDB: Fatal error: cannot extend"
-					" tablespace %lu to hold %lu pages\n",
+					" tablespace %u to hold %u pages\n",
 					recv_addr->space, recv_addr->page_no);
 
 				exit(1);
 			}
 
 			/* Read the page from the tablespace file using the
-			fil0fil.c routines */
+			fil0fil.cc routines */
 
 			if (zip_size) {
-				error = fil_io(OS_FILE_READ, TRUE,
+				error = fil_io(OS_FILE_READ, true,
 					       recv_addr->space, zip_size,
 					       recv_addr->page_no, 0, zip_size,
 					       block->page.zip.data, NULL);
@@ -2051,7 +2137,7 @@ recv_apply_log_recs_for_backup(void)
 					exit(1);
 				}
 			} else {
-				error = fil_io(OS_FILE_READ, TRUE,
+				error = fil_io(OS_FILE_READ, true,
 					       recv_addr->space, 0,
 					       recv_addr->page_no, 0,
 					       UNIV_PAGE_SIZE,
@@ -2073,20 +2159,20 @@ recv_apply_log_recs_for_backup(void)
 			recv_recover_page(FALSE, block);
 
 			/* Write the page back to the tablespace file using the
-			fil0fil.c routines */
+			fil0fil.cc routines */
 
 			buf_flush_init_for_writing(
 				block->frame, buf_block_get_page_zip(block),
 				mach_read_from_8(block->frame + FIL_PAGE_LSN));
 
 			if (zip_size) {
-				error = fil_io(OS_FILE_WRITE, TRUE,
+				error = fil_io(OS_FILE_WRITE, true,
 					       recv_addr->space, zip_size,
 					       recv_addr->page_no, 0,
 					       zip_size,
 					       block->page.zip.data, NULL);
 			} else {
-				error = fil_io(OS_FILE_WRITE, TRUE,
+				error = fil_io(OS_FILE_WRITE, true,
 					       recv_addr->space, 0,
 					       recv_addr->page_no, 0,
 					       UNIV_PAGE_SIZE,
@@ -2157,7 +2243,7 @@ recv_parse_log_rec(
 
 #ifdef UNIV_LOG_LSN_DEBUG
 	if (*type == MLOG_LSN) {
-		ib_uint64_t	lsn = (ib_uint64_t) *space << 32 | *page_no;
+		lsn_t	lsn = (lsn_t) *space << 32 | *page_no;
 # ifdef UNIV_LOG_DEBUG
 		ut_a(lsn == log_sys->old_lsn);
 # else /* UNIV_LOG_DEBUG */
@@ -2183,21 +2269,20 @@ recv_parse_log_rec(
 /*******************************************************//**
 Calculates the new value for lsn when more data is added to the log. */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 recv_calc_lsn_on_data_add(
 /*======================*/
-	ib_uint64_t	lsn,	/*!< in: old lsn */
+	lsn_t		lsn,	/*!< in: old lsn */
 	ib_uint64_t	len)	/*!< in: this many bytes of data is
 				added, log block headers not included */
 {
-	ulint	frag_len;
-	ulint	lsn_len;
+	ulint		frag_len;
+	ib_uint64_t	lsn_len;
 
-	frag_len = (((ulint) lsn) % OS_FILE_LOG_BLOCK_SIZE)
-		- LOG_BLOCK_HDR_SIZE;
+	frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
 	ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
 	      - LOG_BLOCK_TRL_SIZE);
-	lsn_len = (ulint) len;
+	lsn_len = len;
 	lsn_len += (lsn_len + frag_len)
 		/ (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
 		   - LOG_BLOCK_TRL_SIZE)
@@ -2244,7 +2329,7 @@ recv_report_corrupt_log(
 	fprintf(stderr,
 		"InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
 		"InnoDB: Log record type %lu, space id %lu, page number %lu\n"
-		"InnoDB: Log parsing proceeded successfully up to %llu\n"
+		"InnoDB: Log parsing proceeded successfully up to " LSN_PF "\n"
 		"InnoDB: Previous log record type %lu, is multi %lu\n"
 		"InnoDB: Recv offset %lu, prev %lu\n",
 		(ulong) type, (ulong) space, (ulong) page_no,
@@ -2305,18 +2390,18 @@ recv_parse_log_recs(
 				to the hash table; this is set to FALSE if just
 				debug checking is needed */
 {
-	byte*		ptr;
-	byte*		end_ptr;
-	ulint		single_rec;
-	ulint		len;
-	ulint		total_len;
-	ib_uint64_t	new_recovered_lsn;
-	ib_uint64_t	old_lsn;
-	byte		type;
-	ulint		space;
-	ulint		page_no;
-	byte*		body;
-	ulint		n_recs;
+	byte*	ptr;
+	byte*	end_ptr;
+	ulint	single_rec;
+	ulint	len;
+	ulint	total_len;
+	lsn_t	new_recovered_lsn;
+	lsn_t	old_lsn;
+	byte	type;
+	ulint	space;
+	ulint	page_no;
+	byte*	body;
+	ulint	n_recs;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(recv_sys->parse_start_lsn != 0);
@@ -2363,22 +2448,18 @@ loop:
 			return(FALSE);
 		}
 
-		recv_previous_parsed_rec_type = (ulint)type;
+		recv_previous_parsed_rec_type = (ulint) type;
 		recv_previous_parsed_rec_offset = recv_sys->recovered_offset;
 		recv_previous_parsed_rec_is_multi = 0;
 
 		recv_sys->recovered_offset += len;
 		recv_sys->recovered_lsn = new_recovered_lsn;
 
-#ifdef UNIV_DEBUG
-		if (log_debug_writes) {
-			fprintf(stderr,
-				"InnoDB: Parsed a single log rec"
-				" type %lu len %lu space %lu page no %lu\n",
-				(ulong) type, (ulong) len, (ulong) space,
-				(ulong) page_no);
-		}
-#endif /* UNIV_DEBUG */
+		DBUG_PRINT("ib_log",
+			   ("scan " DBUG_LSN_PF ": log rec %u len %u "
+			    "page %u:%u", old_lsn,
+			    (unsigned) type, (unsigned) len,
+			    (unsigned) space, (unsigned) page_no));
 
 		if (type == MLOG_DUMMY_RECORD) {
 			/* Do nothing */
@@ -2413,7 +2494,7 @@ loop:
 						" space %lu not complete in\n"
 						"InnoDB: the replay phase."
 						" Path %s\n",
-						(ulint)type, space,
+						(ulint) type, space,
 						(char*)(body + 2));
 
 					ut_error;
@@ -2454,7 +2535,7 @@ loop:
 				return(FALSE);
 			}
 
-			recv_previous_parsed_rec_type = (ulint)type;
+			recv_previous_parsed_rec_type = (ulint) type;
 			recv_previous_parsed_rec_offset
 				= recv_sys->recovered_offset + total_len;
 			recv_previous_parsed_rec_is_multi = 1;
@@ -2465,16 +2546,12 @@ loop:
 			}
 #endif /* UNIV_LOG_DEBUG */
 
-#ifdef UNIV_DEBUG
-			if (log_debug_writes) {
-				fprintf(stderr,
-					"InnoDB: Parsed a multi log rec"
-					" type %lu len %lu"
-					" space %lu page no %lu\n",
-					(ulong) type, (ulong) len,
-					(ulong) space, (ulong) page_no);
-			}
-#endif /* UNIV_DEBUG */
+			DBUG_PRINT("ib_log",
+				   ("scan " DBUG_LSN_PF ": multi-log rec %u "
+				    "len %u page %u:%u",
+				    recv_sys->recovered_lsn,
+				    (unsigned) type, (unsigned) len,
+				    (unsigned) space, (unsigned) page_no));
 
 			total_len += len;
 			n_recs++;
@@ -2554,7 +2631,7 @@ ibool
 recv_sys_add_to_parsing_buf(
 /*========================*/
 	const byte*	log_block,	/*!< in: log block */
-	ib_uint64_t	scanned_lsn)	/*!< in: lsn of how far we were able
+	lsn_t		scanned_lsn)	/*!< in: lsn of how far we were able
 					to find data in this log block */
 {
 	ulint	more_len;
@@ -2655,16 +2732,16 @@ recv_scan_log_recs(
 	const byte*	buf,		/*!< in: buffer containing a log
 					segment or garbage */
 	ulint		len,		/*!< in: buffer length */
-	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t		start_lsn,	/*!< in: buffer start lsn */
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn)/*!< out: scanning succeeded up to
 					this lsn */
 {
 	const byte*	log_block;
 	ulint		no;
-	ib_uint64_t	scanned_lsn;
+	lsn_t		scanned_lsn;
 	ibool		finished;
 	ulint		data_len;
 	ibool		more_data;
@@ -2696,7 +2773,7 @@ recv_scan_log_recs(
 				    log_block)) {
 				fprintf(stderr,
 					"InnoDB: Log block no %lu at"
-					" lsn %llu has\n"
+					" lsn " LSN_PF " has\n"
 					"InnoDB: ok header, but checksum field"
 					" contains %lu, should be %lu\n",
 					(ulong) no,
@@ -2775,11 +2852,21 @@ recv_scan_log_recs(
 			if (recv_log_scan_is_startup_type
 			    && !recv_needed_recovery) {
 
-				fprintf(stderr,
-					"InnoDB: Log scan progressed"
-					" past the checkpoint lsn %llu\n",
-					recv_sys->scanned_lsn);
-				recv_init_crash_recovery();
+				if (!srv_read_only_mode) {
+					ib_logf(IB_LOG_LEVEL_INFO,
+						"Log scan progressed past the "
+						"checkpoint lsn " LSN_PF "",
+						recv_sys->scanned_lsn);
+
+					recv_init_crash_recovery();
+				} else {
+
+					ib_logf(IB_LOG_LEVEL_WARN,
+						"Recovery skipped, "
+						"--innodb-read-only set!");
+
+					return(TRUE);
+				}
 			}
 #endif /* !UNIV_HOTBACKUP */
 
@@ -2836,7 +2923,7 @@ recv_scan_log_recs(
 
 			fprintf(stderr,
 				"InnoDB: Doing recovery: scanned up to"
-				" log sequence number %llu\n",
+				" log sequence number " LSN_PF "\n",
 				*group_scanned_lsn);
 		}
 	}
@@ -2879,15 +2966,15 @@ void
 recv_group_scan_log_recs(
 /*=====================*/
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn)/*!< out: scanning succeeded up to
 					this lsn */
 {
-	ibool		finished;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
+	ibool	finished;
+	lsn_t	start_lsn;
+	lsn_t	end_lsn;
 
 	finished = FALSE;
 
@@ -2912,7 +2999,7 @@ recv_group_scan_log_recs(
 	if (log_debug_writes) {
 		fprintf(stderr,
 			"InnoDB: Scanned group %lu up to"
-			" log sequence number %llu\n",
+			" log sequence number " LSN_PF "\n",
 			(ulong) group->id,
 			*group_scanned_lsn);
 	}
@@ -2927,20 +3014,15 @@ void
 recv_init_crash_recovery(void)
 /*==========================*/
 {
+	ut_ad(!srv_read_only_mode);
 	ut_a(!recv_needed_recovery);
 
 	recv_needed_recovery = TRUE;
 
-	ut_print_timestamp(stderr);
-
-	fprintf(stderr,
-		"  InnoDB: Database was not"
-		" shut down normally!\n"
-		"InnoDB: Starting crash recovery.\n");
-
-	fprintf(stderr,
-		"InnoDB: Reading tablespace information"
-		" from the .ibd files...\n");
+	ib_logf(IB_LOG_LEVEL_INFO, "Database was not shutdown normally!");
+	ib_logf(IB_LOG_LEVEL_INFO, "Starting crash recovery.");
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Reading tablespace information from the .ibd files...");
 
 	fil_load_single_table_tablespaces();
 
@@ -2951,12 +3033,18 @@ recv_init_crash_recovery(void)
 
 	if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
 
-		fprintf(stderr,
-			"InnoDB: Restoring possible"
-			" half-written data pages from"
-			" the doublewrite\n"
-			"InnoDB: buffer...\n");
-		trx_sys_doublewrite_init_or_restore_pages(TRUE);
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Restoring possible half-written data pages ");
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"from the doublewrite buffer...");
+
+		buf_dblwr_init_or_restore_pages(TRUE);
+
+		/* Spawn the background thread to flush dirty pages
+		from the buffer pools. */
+		recv_writer_thread_handle = os_thread_create(
+			recv_writer_thread, 0, 0);
 	}
 }
 
@@ -2967,43 +3055,38 @@ recv_recovery_from_checkpoint_finish should be called later to complete
 the recovery and free the resources used in it.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 recv_recovery_from_checkpoint_start_func(
 /*=====================================*/
 #ifdef UNIV_LOG_ARCHIVE
-	ulint		type,		/*!< in: LOG_CHECKPOINT or
-					LOG_ARCHIVE */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
-					if possible */
+	ulint	type,		/*!< in: LOG_CHECKPOINT or LOG_ARCHIVE */
+	lsn_t	limit_lsn,	/*!< in: recover up to this lsn if possible */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
-					data files */
-	ib_uint64_t	max_flushed_lsn)/*!< in: max flushed lsn from
-					data files */
+	lsn_t	min_flushed_lsn,/*!< in: min flushed lsn from data files */
+	lsn_t	max_flushed_lsn)/*!< in: max flushed lsn from data files */
 {
 	log_group_t*	group;
 	log_group_t*	max_cp_group;
-	log_group_t*	up_to_date_group;
 	ulint		max_cp_field;
 	ulint		log_hdr_log_block_size;
-	ib_uint64_t	checkpoint_lsn;
+	lsn_t		checkpoint_lsn;
 	ib_uint64_t	checkpoint_no;
-	ib_uint64_t	old_scanned_lsn;
-	ib_uint64_t	group_scanned_lsn= 0;
-	ib_uint64_t	contiguous_lsn;
+	lsn_t		group_scanned_lsn = 0;
+	lsn_t		contiguous_lsn;
 #ifdef UNIV_LOG_ARCHIVE
-	ib_uint64_t	archived_lsn;
+	log_group_t*	up_to_date_group;
+	lsn_t		archived_lsn;
 #endif /* UNIV_LOG_ARCHIVE */
 	byte*		buf;
 	byte*		log_hdr_buf;
-	byte		*log_hdr_buf_base;
-	ulint		err;
+	byte*		log_hdr_buf_base = (byte*)alloca(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+	dberr_t		err;
 
-        log_hdr_buf_base= alloca(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-	log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
+	log_hdr_buf = static_cast<byte *>
+		(ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE));
 
 #ifdef UNIV_LOG_ARCHIVE
-	ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
+	ut_ad(type != LOG_CHECKPOINT || limit_lsn == LSN_MAX);
 /** TRUE when recovering from a checkpoint */
 # define TYPE_CHECKPOINT	(type == LOG_CHECKPOINT)
 /** Recover up to this log sequence number */
@@ -3012,7 +3095,7 @@ recv_recovery_from_checkpoint_start_func(
 /** TRUE when recovering from a checkpoint */
 # define TYPE_CHECKPOINT	1
 /** Recover up to this log sequence number */
-# define LIMIT_LSN		IB_ULONGLONG_MAX
+# define LIMIT_LSN		LSN_MAX
 #endif /* UNIV_LOG_ARCHIVE */
 
 	if (TYPE_CHECKPOINT) {
@@ -3021,10 +3104,10 @@ recv_recovery_from_checkpoint_start_func(
 	}
 
 	if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
-		fprintf(stderr,
-			"InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n");
-		fprintf(stderr,
-			"InnoDB: Skipping log redo\n");
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"The user has set SRV_FORCE_NO_LOG_REDO on, "
+			"skipping log redo");
 
 		return(DB_SUCCESS);
 	}
@@ -3059,30 +3142,37 @@ recv_recovery_from_checkpoint_start_func(
 	/* Read the first log file header to print a note if this is
 	a recovery from a restored InnoDB Hot Backup */
 
-	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0,
+	fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0,
 	       0, 0, LOG_FILE_HDR_SIZE,
 	       log_hdr_buf, max_cp_group);
 
 	if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
 			   (byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
+
+		if (srv_read_only_mode) {
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Cannot restore from ibbackup, InnoDB running "
+				"in read-only mode!");
+
+			return(DB_ERROR);
+		}
+
 		/* This log file was created by ibbackup --restore: print
 		a note to the user about it */
 
-		fprintf(stderr,
-			"InnoDB: The log file was created by"
-			" ibbackup --apply-log at\n"
-			"InnoDB: %s\n",
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"The log file was created by ibbackup --apply-log "
+			"at %s. The following crash recovery is part of a "
+			"normal restore.",
 			log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
-		fprintf(stderr,
-			"InnoDB: NOTE: the following crash recovery"
-			" is part of a normal restore.\n");
 
 		/* Wipe over the label now */
 
 		memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
 		       ' ', 4);
 		/* Write to the log file to wipe over the label */
-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE,
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, true,
 		       max_cp_group->space_id, 0,
 		       0, 0, OS_FILE_LOG_BLOCK_SIZE,
 		       log_hdr_buf, max_cp_group);
@@ -3094,10 +3184,10 @@ recv_recovery_from_checkpoint_start_func(
 		/* 0 means default value */
 		log_hdr_log_block_size = 512;
 	}
-	if (log_hdr_log_block_size != srv_log_block_size) {
+	if (UNIV_UNLIKELY(log_hdr_log_block_size != srv_log_block_size)) {
 		fprintf(stderr,
-			"InnoDB: Error: The block size of ib_logfile (%lu) "
-			"is not equal to innodb_log_block_size.\n"
+			"InnoDB: Error: The block size of ib_logfile (" ULINTPF
+			") is not equal to innodb_log_block_size.\n"
 			"InnoDB: Error: Suggestion - Recreate log files.\n",
 			log_hdr_log_block_size);
 		return(DB_ERROR);
@@ -3108,8 +3198,10 @@ recv_recovery_from_checkpoint_start_func(
 
 	while (group) {
 		log_checkpoint_get_nth_group_info(buf, group->id,
-						  &(group->archived_file_no),
-						  &(group->archived_offset));
+						  &(group->archived_file_no));
+
+		log_archived_get_offset(group, group->archived_file_no,
+			archived_lsn, &(group->archived_offset));
 
 		group = UT_LIST_GET_NEXT(log_groups, group);
 	}
@@ -3130,9 +3222,9 @@ recv_recovery_from_checkpoint_start_func(
 
 	contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn,
 					      OS_FILE_LOG_BLOCK_SIZE);
+#ifdef UNIV_LOG_ARCHIVE
 	if (TYPE_CHECKPOINT) {
 		up_to_date_group = max_cp_group;
-#ifdef UNIV_LOG_ARCHIVE
 	} else {
 		ulint	capacity;
 
@@ -3168,8 +3260,8 @@ recv_recovery_from_checkpoint_start_func(
 
 		group->scanned_lsn = group_scanned_lsn;
 		up_to_date_group = group;
-#endif /* UNIV_LOG_ARCHIVE */
 	}
+#endif /* UNIV_LOG_ARCHIVE */
 
 	ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size);
 
@@ -3184,19 +3276,21 @@ recv_recovery_from_checkpoint_start_func(
 	/* Set the flag to publish that we are doing startup scan. */
 	recv_log_scan_is_startup_type = TYPE_CHECKPOINT;
 	while (group) {
-		old_scanned_lsn = recv_sys->scanned_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+		lsn_t	old_scanned_lsn	= recv_sys->scanned_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
 
 		recv_group_scan_log_recs(group, &contiguous_lsn,
 					 &group_scanned_lsn);
 		group->scanned_lsn = group_scanned_lsn;
 
+#ifdef UNIV_LOG_ARCHIVE
 		if (old_scanned_lsn < group_scanned_lsn) {
 			/* We found a more up-to-date group */
 
 			up_to_date_group = group;
 		}
 
-#ifdef UNIV_LOG_ARCHIVE
 		if ((type == LOG_ARCHIVE)
 		    && (group == recv_sys->archive_group)) {
 			group = UT_LIST_GET_NEXT(log_groups, group);
@@ -3217,69 +3311,63 @@ recv_recovery_from_checkpoint_start_func(
 		    || checkpoint_lsn != min_flushed_lsn) {
 
 			if (checkpoint_lsn < max_flushed_lsn) {
-				fprintf(stderr,
-					"InnoDB: #########################"
-					"#################################\n"
-					"InnoDB:                          "
-					"WARNING!\n"
-					"InnoDB: The log sequence number"
-					" in ibdata files is higher\n"
-					"InnoDB: than the log sequence number"
-					" in the ib_logfiles! Are you sure\n"
-					"InnoDB: you are using the right"
-					" ib_logfiles to start up"
-					" the database?\n"
-					"InnoDB: Log sequence number in"
-					" ib_logfiles is %llu, log\n"
-					"InnoDB: sequence numbers stamped"
-					" to ibdata file headers are between\n"
-					"InnoDB: %llu and %llu.\n"
-					"InnoDB: #########################"
-					"#################################\n",
+
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"The log sequence number "
+					"in the ibdata files is higher "
+					"than the log sequence number "
+					"in the ib_logfiles! Are you sure "
+					"you are using the right "
+					"ib_logfiles to start up the database. "
+					"Log sequence number in the "
+					"ib_logfiles is " LSN_PF ", log"
+					"sequence numbers stamped "
+					"to ibdata file headers are between "
+					"" LSN_PF " and " LSN_PF ".",
 					checkpoint_lsn,
 					min_flushed_lsn,
 					max_flushed_lsn);
 			}
 
 			if (!recv_needed_recovery) {
-				fprintf(stderr,
-					"InnoDB: The log sequence number"
-					" in ibdata files does not match\n"
-					"InnoDB: the log sequence number"
-					" in the ib_logfiles!\n");
-				recv_init_crash_recovery();
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"The log sequence numbers "
+					LSN_PF " and " LSN_PF
+					" in ibdata files do not match"
+					" the log sequence number "
+					LSN_PF
+					" in the ib_logfiles!",
+					min_flushed_lsn,
+					max_flushed_lsn,
+					checkpoint_lsn);
+
+				if (!srv_read_only_mode) {
+					recv_init_crash_recovery();
+				} else {
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Can't initiate database "
+						"recovery, running "
+						"in read-only-mode.");
+					return(DB_READ_ONLY);
+				}
 			}
 		}
 
-		if (!recv_needed_recovery) {
+		if (!recv_needed_recovery && !srv_read_only_mode) {
 			/* Init the doublewrite buffer memory structure */
-			trx_sys_doublewrite_init_or_restore_pages(FALSE);
+			buf_dblwr_init_or_restore_pages(FALSE);
 		}
 	}
 
 	/* We currently have only one log group */
-	if (group_scanned_lsn < checkpoint_lsn) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: ERROR: We were only able to scan the log"
-			" up to\n"
-			"InnoDB: %llu, but a checkpoint was at %llu.\n"
-			"InnoDB: It is possible that"
-			" the database is now corrupt!\n",
-			group_scanned_lsn,
-			checkpoint_lsn);
-	}
-
-	if (group_scanned_lsn < recv_max_page_lsn) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: ERROR: We were only able to scan the log"
-			" up to %llu\n"
-			"InnoDB: but a database page a had an lsn %llu."
-			" It is possible that the\n"
-			"InnoDB: database is now corrupt!\n",
-			group_scanned_lsn,
-			recv_max_page_lsn);
+	if (group_scanned_lsn < checkpoint_lsn
+	    || group_scanned_lsn < recv_max_page_lsn) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"We scanned the log up to "
+			LSN_PF ". A checkpoint was at " LSN_PF
+			" and the maximum LSN on a database page was " LSN_PF
+			". It is possible that the database is now corrupt!",
+			group_scanned_lsn, checkpoint_lsn, recv_max_page_lsn);
 	}
 
 	if (recv_sys->recovered_lsn < checkpoint_lsn) {
@@ -3291,7 +3379,10 @@ recv_recovery_from_checkpoint_start_func(
 			return(DB_SUCCESS);
 		}
 
-		ut_error;
+		/* No harm in trying to do RO access. */
+		if (!srv_read_only_mode) {
+			ut_error;
+		}
 
 		return(DB_ERROR);
 	}
@@ -3304,9 +3395,11 @@ recv_recovery_from_checkpoint_start_func(
 
 #ifdef UNIV_LOG_ARCHIVE
 	log_sys->archived_lsn = archived_lsn;
-#endif /* UNIV_LOG_ARCHIVE */
 
 	recv_synchronize_groups(up_to_date_group);
+#else /* UNIV_LOG_ARCHIVE */
+	recv_synchronize_groups();
+#endif /* UNIV_LOG_ARCHIVE */
 
 	if (!recv_needed_recovery) {
 		ut_a(checkpoint_lsn == recv_sys->recovered_lsn);
@@ -3325,22 +3418,25 @@ recv_recovery_from_checkpoint_start_func(
 
 	log_sys->last_checkpoint_lsn = checkpoint_lsn;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 	log_sys->next_checkpoint_no = checkpoint_no + 1;
 
 #ifdef UNIV_LOG_ARCHIVE
-	if (archived_lsn == IB_ULONGLONG_MAX) {
+	if (archived_lsn == LSN_MAX) {
 
 		log_sys->archiving_state = LOG_ARCH_OFF;
 	}
 #endif /* UNIV_LOG_ARCHIVE */
 
-	mutex_enter(&(recv_sys->mutex));
+	mutex_enter(&recv_sys->mutex);
 
 	recv_sys->apply_log_recs = TRUE;
 
-	mutex_exit(&(recv_sys->mutex));
+	mutex_exit(&recv_sys->mutex);
 
-	mutex_exit(&(log_sys->mutex));
+	mutex_exit(&log_sys->mutex);
 
 	recv_lsn_checks_on = TRUE;
 
@@ -3368,96 +3464,7 @@ recv_recovery_from_checkpoint_finish(void)
 		recv_apply_hashed_log_recs(TRUE);
 	}
 
-#ifdef UNIV_DEBUG
-	if (log_debug_writes) {
-		fprintf(stderr,
-			"InnoDB: Log records applied to the database\n");
-	}
-#endif /* UNIV_DEBUG */
-
-	if (recv_needed_recovery && srv_recovery_stats) {
-		ulint	flush_list_len = 0;
-		ulint	i;
-
-		fprintf(stderr,
-			"InnoDB: Log records have been applied. The statistics that were gathered follow.\n");
-
-		fprintf(stderr,
-			"============================================================\n"
-			"-------------------\n"
-			"RECOVERY STATISTICS\n"
-			"-------------------\n");
-		fprintf(stderr,
-			"Recovery time: %g sec. (%lu turns)\n",
-			difftime(time(NULL), recv_sys->stats_recv_start_time),
-			recv_sys->stats_recv_turns);
-
-		for (i = 0; i < srv_buf_pool_instances; i++) {
-			buf_pool_t*	buf_pool;
-
-			buf_pool = buf_pool_from_array(i);
-			flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
-		}
-		fprintf(stderr,
-			"\n"
-			"Data page IO statistics\n"
-			"  Requested pages: %lu\n"
-			"  Read pages:      %lu\n"
-			"  Written pages:   %lu\n"
-			"  (Dirty blocks):  %lu\n",
-			recv_sys->stats_read_requested_pages,
-			recv_sys->stats_read_io_pages,
-			recv_sys->stats_write_io_pages,
-			flush_list_len);
-
-		fprintf(stderr,
-			"  Grouping IO [times]:\n"
-			"\tnumber of pages,\n"
-			"\t\tread request neighbors (in %d pages chunk),\n"
-			"\t\t\tcombined read IO,\n"
-			"\t\t\t\tcombined write IO\n",
-			RECV_READ_AHEAD_AREA);
-		for (i = 0; i < ut_max(RECV_READ_AHEAD_AREA,
-					OS_AIO_MERGE_N_CONSECUTIVE); i++) {
-			fprintf(stderr,
-				"\t%3lu,\t%lu,\t%lu,\t%lu\n", i + 1,
-				(i < RECV_READ_AHEAD_AREA) ?
-					recv_sys->stats_read_in_area[i] : 0,
-				(i < OS_AIO_MERGE_N_CONSECUTIVE) ?
-					recv_sys->stats_read_io_consecutive[i] : 0,
-				(i < OS_AIO_MERGE_N_CONSECUTIVE) ?
-					recv_sys->stats_write_io_consecutive[i] : 0);
-		}
-
-		fprintf(stderr,
-			"\n"
-			"Recovery process statistics\n"
-			"  Checked pages by doublewrite buffer: %lu\n"
-			"  Overwritten pages from doublewrite:  %lu\n"
-			"  Recovered pages by io_thread:        %lu\n"
-			"  Recovered pages by main thread:      %lu\n"
-			"  Parsed log records to apply:         %lu\n"
-			"            Sum of the length:         %lu\n"
-			"  Applied log records:                 %lu\n"
-			"            Sum of the length:         %lu\n"
-			"  Pages which are already new enough:  %lu (It may not be accurate, if turns > 1)\n"
-			"  Oldest page's LSN:                   %llu\n"
-			"  Newest page's LSN:                   %llu\n",
-			recv_sys->stats_doublewrite_check_pages,
-			recv_sys->stats_doublewrite_overwrite_pages,
-			recv_sys->stats_recover_pages_with_read,
-			recv_sys->stats_recover_pages_without_read,
-			recv_sys->stats_log_recs,
-			recv_sys->stats_log_len_sum,
-			recv_sys->stats_applied_log_recs,
-			recv_sys->stats_applied_log_len_sum,
-			recv_sys->stats_pages_already_new,
-			recv_sys->stats_oldest_modified_lsn,
-			recv_sys->stats_newest_modified_lsn);
-
-		fprintf(stderr,
-			"============================================================\n");
-	}
+	DBUG_PRINT("ib_log", ("apply completed"));
 
 	if (recv_needed_recovery) {
 		trx_sys_print_mysql_master_log_pos();
@@ -3480,10 +3487,40 @@ recv_recovery_from_checkpoint_finish(void)
 			"InnoDB: a backup!\n");
 	}
 
-	/* Free the resources of the recovery system */
+	/* Make sure that the recv_writer thread is done. This is
+	required because it grabs various mutexes and we want to
+	ensure that when we enable sync_order_checks there is no
+	mutex currently held by any thread. */
+	mutex_enter(&recv_sys->writer_mutex);
 
+	/* Free the resources of the recovery system */
 	recv_recovery_on = FALSE;
 
+	/* By acquiring the mutex we ensure that the recv_writer thread
+	won't trigger any more LRU batches. Now wait for currently
+	in progress batches to finish. */
+	buf_flush_wait_LRU_batch_end();
+
+	mutex_exit(&recv_sys->writer_mutex);
+
+	ulint count = 0;
+	while (recv_writer_thread_active) {
+		++count;
+		os_thread_sleep(100000);
+		if (srv_print_verbose_log && count > 600) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for recv_writer to "
+				"finish flushing of buffer pool");
+			count = 0;
+		}
+	}
+
+#ifdef __WIN__
+	if (recv_writer_thread_handle) {
+		CloseHandle(recv_writer_thread_handle);
+	}
+#endif /* __WIN__ */
+
 #ifndef UNIV_LOG_DEBUG
 	recv_sys_debug_free();
 #endif
@@ -3491,7 +3528,9 @@ recv_recovery_from_checkpoint_finish(void)
 	that the data dictionary tables will be free of any locks.
 	The data dictionary latch should guarantee that there is at
 	most one data dictionary transaction active at a time. */
-	trx_rollback_or_clean_recovered(FALSE);
+	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+		trx_rollback_or_clean_recovered(FALSE);
+	}
 }
 
 /********************************************************//**
@@ -3501,27 +3540,37 @@ void
 recv_recovery_rollback_active(void)
 /*===============================*/
 {
-	int		i;
-
 #ifdef UNIV_SYNC_DEBUG
 	/* Wait for a while so that created threads have time to suspend
 	themselves before we switch the latching order checks on */
 	os_thread_sleep(1000000);
 
-	/* Switch latching order checks on in sync0sync.c */
+	ut_ad(!recv_writer_thread_active);
+
+	/* Switch latching order checks on in sync0sync.cc */
 	sync_order_checks_on = TRUE;
 #endif
-	/* Drop partially created indexes. */
-	row_merge_drop_temp_indexes();
-	/* Drop temporary tables. */
-	row_mysql_drop_temp_tables();
+	/* We can't start any (DDL) transactions if UNDO logging
+	has been disabled, additionally disable ROLLBACK of recovered
+	user transactions. */
+	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+	    && !srv_read_only_mode) {
+
+		/* Drop partially created indexes. */
+		row_merge_drop_temp_indexes();
+		/* Drop temporary tables. */
+		row_mysql_drop_temp_tables();
+
+		/* Drop any auxiliary tables that were not dropped when the
+		parent table was dropped. This can happen if the parent table
+		was dropped but the server crashed before the auxiliary tables
+		were dropped. */
+		fts_drop_orphaned_tables();
 
-	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
 		/* Rollback the uncommitted transactions which have no user
 		session */
 
-		os_thread_create(trx_rollback_or_clean_all_recovered,
-				 (void *)&i, NULL);
+		os_thread_create(trx_rollback_or_clean_all_recovered, 0, 0);
 	}
 }
 
@@ -3531,18 +3580,18 @@ UNIV_INTERN
 void
 recv_reset_logs(
 /*============*/
-	ib_uint64_t	lsn,		/*!< in: reset to this lsn
-					rounded up to be divisible by
-					OS_FILE_LOG_BLOCK_SIZE, after
-					which we add
-					LOG_BLOCK_HDR_SIZE */
 #ifdef UNIV_LOG_ARCHIVE
 	ulint		arch_log_no,	/*!< in: next archived log file number */
-#endif /* UNIV_LOG_ARCHIVE */
-	ibool		new_logs_created)/*!< in: TRUE if resetting logs
+	ibool		new_logs_created,/*!< in: TRUE if resetting logs
 					is done at the log creation;
 					FALSE if it is done after
 					archive recovery */
+#endif /* UNIV_LOG_ARCHIVE */
+	lsn_t		lsn)		/*!< in: reset to this lsn
+					rounded up to be divisible by
+					OS_FILE_LOG_BLOCK_SIZE, after
+					which we add
+					LOG_BLOCK_HDR_SIZE */
 {
 	log_group_t*	group;
 
@@ -3558,12 +3607,12 @@ recv_reset_logs(
 #ifdef UNIV_LOG_ARCHIVE
 		group->archived_file_no = arch_log_no;
 		group->archived_offset = 0;
-#endif /* UNIV_LOG_ARCHIVE */
 
 		if (!new_logs_created) {
 			recv_truncate_group(group, group->lsn, group->lsn,
 					    group->lsn, group->lsn);
 		}
+#endif /* UNIV_LOG_ARCHIVE */
 
 		group = UT_LIST_GET_NEXT(log_groups, group);
 	}
@@ -3587,12 +3636,14 @@ recv_reset_logs(
 	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
 	log_sys->lsn += LOG_BLOCK_HDR_SIZE;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    (log_sys->lsn - log_sys->last_checkpoint_lsn));
+
 	mutex_exit(&(log_sys->mutex));
 
 	/* Reset the checkpoint fields in logs */
 
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+	log_make_checkpoint_at(LSN_MAX, TRUE);
 
 	mutex_enter(&(log_sys->mutex));
 }
@@ -3607,8 +3658,8 @@ recv_reset_log_files_for_backup(
 /*============================*/
 	const char*	log_dir,	/*!< in: log file directory path */
 	ulint		n_log_files,	/*!< in: number of log files */
-	ulint		log_file_size,	/*!< in: log file size */
-	ib_uint64_t	lsn)		/*!< in: new start lsn, must be
+	lsn_t		log_file_size,	/*!< in: log file size */
+	lsn_t		lsn)		/*!< in: new start lsn, must be
 					divisible by OS_FILE_LOG_BLOCK_SIZE */
 {
 	os_file_t	log_file;
@@ -3631,7 +3682,7 @@ recv_reset_log_files_for_backup(
 	for (i = 0; i < n_log_files; i++) {
 
 		sprintf(name, "%s%s%lu", log_dir,
-			ib_logfile_basename, (ulong)i);
+			ib_logfile_basename, (ulong) i);
 
 		log_file = os_file_create_simple(innodb_file_log_key,
 						 name, OS_FILE_CREATE,
@@ -3646,23 +3697,19 @@ recv_reset_log_files_for_backup(
 		}
 
 		fprintf(stderr,
-			"Setting log file size to %lu %lu\n",
-			(ulong) ut_get_high32(log_file_size),
-			(ulong) log_file_size & 0xFFFFFFFFUL);
+			"Setting log file size to %llu\n",
+			log_file_size);
 
-		success = os_file_set_size(name, log_file,
-					   log_file_size & 0xFFFFFFFFUL,
-					   ut_get_high32(log_file_size));
+		success = os_file_set_size(name, log_file, log_file_size);
 
 		if (!success) {
 			fprintf(stderr,
-				"InnoDB: Cannot set %s size to %lu %lu\n",
-				name, (ulong) ut_get_high32(log_file_size),
-				(ulong) (log_file_size & 0xFFFFFFFFUL));
+				"InnoDB: Cannot set %s size to %llu\n",
+				name, log_file_size);
 			exit(1);
 		}
 
-		os_file_flush(log_file, TRUE);
+		os_file_flush(log_file);
 		os_file_close(log_file);
 	}
 
@@ -3684,9 +3731,9 @@ recv_reset_log_files_for_backup(
 		exit(1);
 	}
 
-	os_file_write(name, log_file, buf, 0, 0,
+	os_file_write(name, log_file, buf, 0,
 		      LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-	os_file_flush(log_file, TRUE);
+	os_file_flush(log_file);
 	os_file_close(log_file);
 
 	ut_free(buf);
@@ -3694,7 +3741,6 @@ recv_reset_log_files_for_backup(
 #endif /* UNIV_HOTBACKUP */
 
 #ifdef UNIV_LOG_ARCHIVE
-/* Dead code */
 /******************************************************//**
 Reads from the archive of a log group and performs recovery.
 @return	TRUE if no more complete consistent archive files */
@@ -3712,11 +3758,10 @@ log_group_recover_from_archive_file(
 	ulint		len;
 	ibool		ret;
 	byte*		buf;
-	ulint		read_offset;
-	ulint		file_size;
-	ulint		file_size_high;
+	os_offset_t	read_offset;
+	os_offset_t	file_size;
 	int		input_char;
-	char		name[10000];
+	char		name[OS_FILE_MAX_PATH];
 
 	ut_a(0);
 
@@ -3725,7 +3770,8 @@ try_open_again:
 
 	/* Add the file to the archive file space; open the file */
 
-	log_archived_file_name_gen(name, group->id, group->archived_file_no);
+	log_archived_file_name_gen(name, sizeof(name),
+				   group->id, group->archived_file_no);
 
 	file_handle = os_file_create(innodb_file_log_key,
 				     name, OS_FILE_OPEN,
@@ -3756,10 +3802,8 @@ ask_again:
 		}
 	}
 
-	ret = os_file_get_size(file_handle, &file_size, &file_size_high);
-	ut_a(ret);
-
-	ut_a(file_size_high == 0);
+	file_size = os_file_get_size(file_handle);
+	ut_a(file_size != (os_offset_t) -1);
 
 	fprintf(stderr, "InnoDB: Opened archived log file %s\n", name);
 
@@ -3776,20 +3820,19 @@ ask_again:
 
 	/* Add the archive file as a node to the space */
 
-	fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
-			group->archive_space_id, FALSE);
-#if RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE
-# error "RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE"
-#endif
+	ut_a(fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
+			     group->archive_space_id, FALSE));
+	ut_a(RECV_SCAN_SIZE >= LOG_FILE_HDR_SIZE);
 
 	/* Read the archive file header */
-	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, 0, 0,
+	fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0,
+	       0, 0,
 	       LOG_FILE_HDR_SIZE, buf, NULL);
 
 	/* Check if the archive file header is consistent */
 
 	if (mach_read_from_4(buf + LOG_GROUP_ID) != group->id
-	    || mach_read_from_4(buf + LOG_FILE_NO)
+	    || mach_read_from_8(buf + LOG_FILE_START_LSN)
 	    != group->archived_file_no) {
 		fprintf(stderr,
 			"InnoDB: Archive file header inconsistent %s\n", name);
@@ -3849,14 +3892,15 @@ ask_again:
 		if (log_debug_writes) {
 			fprintf(stderr,
 				"InnoDB: Archive read starting at"
-				" lsn %llu, len %lu from file %s\n",
+				" lsn " LSN_PF ", len %lu from file %s\n",
 				start_lsn,
 				(ulong) len, name);
 		}
 #endif /* UNIV_DEBUG */
 
-		fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE,
-		       group->archive_space_id, read_offset / UNIV_PAGE_SIZE,
+		fil_io(OS_FILE_READ | OS_FILE_LOG, true,
+		       group->archive_space_id, 0,
+		       read_offset / UNIV_PAGE_SIZE,
 		       read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
 
 		ret = recv_scan_log_recs(
@@ -3891,14 +3935,14 @@ ask_again:
 Recovers from archived log files, and also from log files, if they exist.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 recv_recovery_from_archive_start(
 /*=============================*/
 	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn field from the
 					data files */
 	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn if
 					possible */
-	ulint		first_log_no)	/*!< in: number of the first archived
+	lsn_t		first_log_no)	/*!< in: number of the first archived
 					log file to use in the recovery; the
 					file will be searched from
 					INNOBASE_LOG_ARCH_DIR specified in
@@ -3908,7 +3952,7 @@ recv_recovery_from_archive_start(
 	ulint		group_id;
 	ulint		trunc_len;
 	ibool		ret;
-	ulint		err;
+	dberr_t		err;
 
 	ut_a(0);
 
@@ -3967,7 +4011,7 @@ recv_recovery_from_archive_start(
 						 trunc_len);
 		}
 
-		group->archived_file_no++;
+		group->archived_file_no += group->file_size - LOG_FILE_HDR_SIZE;
 	}
 
 	if (recv_sys->recovered_lsn < limit_lsn) {
@@ -3981,8 +4025,8 @@ recv_recovery_from_archive_start(
 
 		err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE,
 							  limit_lsn,
-							  IB_ULONGLONG_MAX,
-							  IB_ULONGLONG_MAX);
+							  LSN_MAX,
+							  LSN_MAX);
 		if (err != DB_SUCCESS) {
 
 			return(err);
@@ -3991,11 +4035,11 @@ recv_recovery_from_archive_start(
 		mutex_enter(&(log_sys->mutex));
 	}
 
-	if (limit_lsn != IB_ULONGLONG_MAX) {
+	if (limit_lsn != LSN_MAX) {
 
 		recv_apply_hashed_log_recs(FALSE);
 
-		recv_reset_logs(recv_sys->recovered_lsn, 0, FALSE);
+		recv_reset_logs(0, FALSE, recv_sys->recovered_lsn);
 	}
 
 	mutex_exit(&(log_sys->mutex));
diff --git a/storage/xtradb/mach/mach0data.c b/storage/xtradb/mach/mach0data.cc
index 95b135b0954..df68aab8a18 100644
--- a/storage/xtradb/mach/mach0data.c
+++ b/storage/xtradb/mach/mach0data.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file mach/mach0data.c
+@file mach/mach0data.cc
 Utilities for converting data from the database file
 to the machine format.
 
diff --git a/storage/xtradb/mem/mem0dbg.c b/storage/xtradb/mem/mem0dbg.cc
index 007610c01b7..308c2979551 100644
--- a/storage/xtradb/mem/mem0dbg.c
+++ b/storage/xtradb/mem/mem0dbg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,29 +11,26 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0dbg.c
+@file mem/mem0dbg.cc
 The memory management: the debug code. This is not a compilation module,
 but is included in mem0mem.* !
 
 Created 6/9/1994 Heikki Tuuri
 *************************************************************************/
 
-#ifndef UNIV_HOTBACKUP
-# include "ha_prototypes.h"
-#endif /* !UNIV_HOTBACKUP */
-
 #ifdef UNIV_MEM_DEBUG
 # ifndef UNIV_HOTBACKUP
+#  include "ha_prototypes.h"
 /* The mutex which protects in the debug version the hash table
 containing the list of live memory heaps, and also the global
 variables below. */
-UNIV_INTERN mutex_t		mem_hash_mutex;
+UNIV_INTERN ib_mutex_t		mem_hash_mutex;
 
 #ifdef UNIV_PFS_MUTEX
 /* Key to register mem_hash_mutex with performance schema */
@@ -61,8 +58,7 @@ static ibool		mem_hash_initialized		= FALSE;
 
 /* The node of the list containing currently allocated memory heaps */
 
-typedef struct mem_hash_node_struct mem_hash_node_t;
-struct mem_hash_node_struct {
+struct mem_hash_node_t {
 	UT_LIST_NODE_T(mem_hash_node_t)
 				list;	/*!< hash list node */
 	mem_heap_t*		heap;	/*!< memory heap */
@@ -264,7 +260,7 @@ mem_field_erase(
 	mutex_exit(&mem_hash_mutex);
 
 	/* Check that the field lengths agree */
-	ut_ad(n == (ulint)mem_field_header_get_len(usr_buf));
+	ut_ad(n == (ulint) mem_field_header_get_len(usr_buf));
 
 	/* In the debug version, set the freed space to a random
 	combination of 0xDE and 0xAD */
@@ -341,10 +337,10 @@ mem_hash_insert(
 
 	mutex_enter(&mem_hash_mutex);
 
-	cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+	cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
 
 	/* Allocate a new node to the list */
-	new_node = ut_malloc(sizeof(mem_hash_node_t));
+	new_node = static_cast<mem_hash_node_t*>(ut_malloc(sizeof(*new_node)));
 
 	new_node->heap = heap;
 	new_node->file_name = file_name;
@@ -386,7 +382,7 @@ mem_hash_remove(
 
 	mutex_enter(&mem_hash_mutex);
 
-	cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+	cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
 
 	/* Look for the heap in the hash table list */
 	node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no));
@@ -426,7 +422,7 @@ mem_hash_remove(
 			node->nth_heap,
 			innobase_basename(node->file_name), (ulong) node->line,
 			innobase_basename(file_name), (ulong) line);
-		ut_print_buf(stderr, (byte*)node->heap - 200, 400);
+		ut_print_buf(stderr, (byte*) node->heap - 200, 400);
 		fputs("\nDump of the mem heap:\n", stderr);
 		mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
 					   &size, NULL, NULL);
@@ -530,14 +526,14 @@ mem_heap_validate_or_print(
 			fprintf(stderr, " Block %ld:", block_count);
 		}
 
-		field = (byte*)block + mem_block_get_start(block);
+		field = (byte*) block + mem_block_get_start(block);
 
 		if (top && (field == top)) {
 
 			goto completed;
 		}
 
-		while (field < (byte*)block + mem_block_get_free(block)) {
+		while (field < (byte*) block + mem_block_get_free(block)) {
 
 			/* Calculate the pointer to the storage
 			which was given to the user */
@@ -563,8 +559,8 @@ mem_heap_validate_or_print(
 					" field %lx len %lu\n"
 					"InnoDB: header check field is"
 					" %lx but trailer %lx\n",
-					(ulint)block,
-					(ulint)field, len, check_field,
+					(ulint) block,
+					(ulint) field, len, check_field,
 					mem_field_trailer_get_check(
 						user_field));
 
@@ -584,15 +580,15 @@ mem_heap_validate_or_print(
 		/* At the end check that we have arrived to the first free
 		position */
 
-		if (field != (byte*)block + mem_block_get_free(block)) {
+		if (field != (byte*) block + mem_block_get_free(block)) {
 			/* error */
 
 			fprintf(stderr,
 				"InnoDB: Error: block %lx end of"
 				" mem fields %lx\n"
 				"InnoDB: but block free at %lx\n",
-				(ulint)block, (ulint)field,
-				(ulint)((byte*)block
+				(ulint) block, (ulint) field,
+				(ulint)((byte*) block
 					+ mem_block_get_free(block)));
 
 			return;
@@ -830,19 +826,19 @@ mem_analyze_corruption(
 	ulint	dist;
 
 	fputs("InnoDB: Apparent memory corruption: mem dump ", stderr);
-	ut_print_buf(stderr, (byte*)ptr - 250, 500);
+	ut_print_buf(stderr, (byte*) ptr - 250, 500);
 
 	fputs("\nInnoDB: Scanning backward trying to find"
 	      " previous allocated mem blocks\n", stderr);
 
-	p = (byte*)ptr;
+	p = (byte*) ptr;
 	dist = 0;
 
 	for (i = 0; i < 10; i++) {
 		for (;;) {
-			if (((ulint)p) % 4 == 0) {
+			if (((ulint) p) % 4 == 0) {
 
-				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Mem block at - %lu,"
 						" file %s, line %lu\n",
@@ -855,7 +851,7 @@ mem_analyze_corruption(
 					break;
 				}
 
-				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Freed mem block at - %lu,"
 						" file %s, line %lu\n",
@@ -881,14 +877,14 @@ mem_analyze_corruption(
 		"InnoDB: Scanning forward trying to find next"
 		" allocated mem blocks\n");
 
-	p = (byte*)ptr;
+	p = (byte*) ptr;
 	dist = 0;
 
 	for (i = 0; i < 10; i++) {
 		for (;;) {
-			if (((ulint)p) % 4 == 0) {
+			if (((ulint) p) % 4 == 0) {
 
-				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Mem block at + %lu, file %s,"
 						" line %lu\n",
@@ -901,7 +897,7 @@ mem_analyze_corruption(
 					break;
 				}
 
-				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Freed mem block at + %lu,"
 						" file %s, line %lu\n",
diff --git a/storage/xtradb/mem/mem0mem.c b/storage/xtradb/mem/mem0mem.cc
index 159e9fc6b3c..e0e6220f4d8 100644
--- a/storage/xtradb/mem/mem0mem.c
+++ b/storage/xtradb/mem/mem0mem.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0mem.c
+@file mem/mem0mem.cc
 The memory management
 
 Created 6/9/1994 Heikki Tuuri
@@ -30,7 +30,7 @@ Created 6/9/1994 Heikki Tuuri
 
 #include "buf0buf.h"
 #include "srv0srv.h"
-#include "mem0dbg.c"
+#include "mem0dbg.cc"
 #include <stdarg.h>
 
 /*
@@ -108,7 +108,7 @@ mem_heap_strdup(
 	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
 	const char*	str)	/*!< in: string to be copied */
 {
-	return(mem_heap_dup(heap, str, strlen(str) + 1));
+	return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1)));
 }
 
 /**********************************************************************//**
@@ -140,7 +140,7 @@ mem_heap_strcat(
 	ulint	s1_len = strlen(s1);
 	ulint	s2_len = strlen(s2);
 
-	s = mem_heap_alloc(heap, s1_len + s2_len + 1);
+	s = static_cast<char*>(mem_heap_alloc(heap, s1_len + s2_len + 1));
 
 	memcpy(s, s1, s1_len);
 	memcpy(s + s1_len, s2, s2_len);
@@ -261,7 +261,7 @@ mem_heap_printf_low(
 }
 
 /****************************************************************//**
-A simple (s)printf replacement that dynamically allocates the space for the
+A simple sprintf replacement that dynamically allocates the space for the
 formatted string from the given heap. This supports a very limited set of
 the printf syntax: types 's' and 'u' and length modifier 'l' (which is
 required for the 'u' type).
@@ -285,7 +285,7 @@ mem_heap_printf(
 	va_end(ap);
 
 	/* Now create it for real. */
-	str = mem_heap_alloc(heap, len);
+	str = static_cast<char*>(mem_heap_alloc(heap, len));
 	va_start(ap, format);
 	mem_heap_printf_low(str, format, ap);
 	va_end(ap);
@@ -330,7 +330,8 @@ mem_heap_create_block(
 
 		ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
 
-		block = mem_area_alloc(&len, mem_comm_pool);
+		block = static_cast<mem_block_t*>(
+			mem_area_alloc(&len, mem_comm_pool));
 	} else {
 		len = UNIV_PAGE_SIZE;
 
@@ -339,7 +340,7 @@ mem_heap_create_block(
 			buffer pool, but must get the free block from
 			the heap header free block field */
 
-			buf_block = heap->free_block;
+			buf_block = static_cast<buf_block_t*>(heap->free_block);
 			heap->free_block = NULL;
 
 			if (UNIV_UNLIKELY(!buf_block)) {
@@ -354,11 +355,9 @@ mem_heap_create_block(
 	}
 
 	if(!block) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
+		ib_logf(IB_LOG_LEVEL_FATAL,
 			" InnoDB: Unable to allocate memory of size %lu.\n",
 			len);
-		ut_error;
 	}
 	block->buf_block = buf_block;
 	block->free_block = NULL;
@@ -475,7 +474,9 @@ mem_heap_block_free(
 	ulint		type;
 	ulint		len;
 #ifndef UNIV_HOTBACKUP
-	buf_block_t*	buf_block	= block->buf_block;
+	buf_block_t*	buf_block;
+
+	buf_block = static_cast<buf_block_t*>(block->buf_block);
 #endif /* !UNIV_HOTBACKUP */
 
 	if (block->magic_n != MEM_BLOCK_MAGIC_N) {
@@ -505,7 +506,7 @@ mem_heap_block_free(
 		/* In the debug version we set the memory to a random
 		combination of hex 0xDE and 0xAD. */
 
-		mem_erase_buf((byte*)block, len);
+		mem_erase_buf((byte*) block, len);
 #else /* UNIV_MEM_DEBUG */
 		UNIV_MEM_ASSERT_AND_FREE(block, len);
 #endif /* UNIV_MEM_DEBUG */
@@ -525,7 +526,7 @@ mem_heap_block_free(
 	/* In the debug version we set the memory to a random
 	combination of hex 0xDE and 0xAD. */
 
-	mem_erase_buf((byte*)block, len);
+	mem_erase_buf((byte*) block, len);
 #else /* UNIV_MEM_DEBUG */
 	UNIV_MEM_ASSERT_AND_FREE(block, len);
 #endif /* UNIV_MEM_DEBUG */
@@ -544,7 +545,7 @@ mem_heap_free_block_free(
 {
 	if (UNIV_LIKELY_NULL(heap->free_block)) {
 
-		buf_block_free(heap->free_block);
+		buf_block_free(static_cast<buf_block_t*>(heap->free_block));
 
 		heap->free_block = NULL;
 	}
diff --git a/storage/xtradb/mem/mem0pool.c b/storage/xtradb/mem/mem0pool.cc
index 709367266c6..fe9a84d21fa 100644
--- a/storage/xtradb/mem/mem0pool.c
+++ b/storage/xtradb/mem/mem0pool.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0pool.c
+@file mem/mem0pool.cc
 The lowest-level memory management
 
 Created 5/12/1997 Heikki Tuuri
@@ -100,12 +100,12 @@ pool, and after that its locks will grow into the buffer pool. */
 
 /** Data structure for a memory pool. The space is allocated using the buddy
 algorithm, where free list i contains areas of size 2 to power i. */
-struct mem_pool_struct{
+struct mem_pool_t{
 	byte*		buf;		/*!< memory pool */
 	ulint		size;		/*!< memory common pool size */
 	ulint		reserved;	/*!< amount of currently allocated
 					memory */
-	mutex_t		mutex;		/*!< mutex protecting this struct */
+	ib_mutex_t		mutex;		/*!< mutex protecting this struct */
 	UT_LIST_BASE_NODE_T(mem_area_t)
 			free_list[64];	/*!< lists of free memory areas: an
 					area is put to the list whose number
@@ -116,7 +116,7 @@ struct mem_pool_struct{
 UNIV_INTERN mem_pool_t*	mem_comm_pool	= NULL;
 
 #ifdef UNIV_PFS_MUTEX
-/* Key to register mutex in mem_pool_struct with performance schema */
+/* Key to register mutex in mem_pool_t with performance schema */
 UNIV_INTERN mysql_pfs_key_t	mem_pool_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
@@ -226,9 +226,9 @@ mem_pool_create(
 	ulint		i;
 	ulint		used;
 
-	pool = ut_malloc(sizeof(mem_pool_t));
+	pool = static_cast<mem_pool_t*>(ut_malloc(sizeof(mem_pool_t)));
 
-	pool->buf = ut_malloc_low(size, TRUE);
+	pool->buf = static_cast<byte*>(ut_malloc_low(size, TRUE));
 	pool->size = size;
 
 	mutex_create(mem_pool_mutex_key, &pool->mutex, SYNC_MEM_POOL);
@@ -340,7 +340,7 @@ mem_pool_fill_free_list(
 
 	UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);
 
-	area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
+	area2 = (mem_area_t*)(((byte*) area) + ut_2_exp(i));
 	UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);
 
 	mem_area_set_size(area2, ut_2_exp(i));
@@ -454,9 +454,9 @@ mem_area_alloc(
 	ut_ad(mem_pool_validate(pool));
 
 	*psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE;
-	UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*)area, *psize);
+	UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*) area, *psize);
 
-	return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*)area)));
+	return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*) area)));
 }
 
 /********************************************************************//**
@@ -474,13 +474,13 @@ mem_area_get_buddy(
 
 	ut_ad(size != 0);
 
-	if (((((byte*)area) - pool->buf) % (2 * size)) == 0) {
+	if (((((byte*) area) - pool->buf) % (2 * size)) == 0) {
 
 		/* The buddy is in a higher address */
 
-		buddy = (mem_area_t*)(((byte*)area) + size);
+		buddy = (mem_area_t*)(((byte*) area) + size);
 
-		if ((((byte*)buddy) - pool->buf) + size > pool->size) {
+		if ((((byte*) buddy) - pool->buf) + size > pool->size) {
 
 			/* The buddy is not wholly contained in the pool:
 			there is no buddy */
@@ -493,7 +493,7 @@ mem_area_get_buddy(
 		the upper branch in this if-clause: the remainder would be
 		0 */
 
-		buddy = (mem_area_t*)(((byte*)area) - size);
+		buddy = (mem_area_t*)(((byte*) area) - size);
 	}
 
 	return(buddy);
@@ -524,13 +524,13 @@ mem_area_free(
 	/* It may be that the area was really allocated from the OS with
 	regular malloc: check if ptr points within our memory pool */
 
-	if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) {
+	if ((byte*) ptr < pool->buf || (byte*) ptr >= pool->buf + pool->size) {
 		ut_free(ptr);
 
 		return;
 	}
 
-	area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE);
+	area = (mem_area_t*) (((byte*) ptr) - MEM_AREA_EXTRA_SIZE);
 
 	if (mem_area_get_free(area)) {
 		fprintf(stderr,
@@ -556,12 +556,12 @@ mem_area_free(
 	}
 
 #ifdef UNIV_LIGHT_MEM_DEBUG
-	if (((byte*)area) + size < pool->buf + pool->size) {
+	if (((byte*) area) + size < pool->buf + pool->size) {
 
 		ulint	next_size;
 
 		next_size = mem_area_get_size(
-			(mem_area_t*)(((byte*)area) + size));
+			(mem_area_t*)(((byte*) area) + size));
 		if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
 			fprintf(stderr,
 				"InnoDB: Error: Memory area size %lu,"
@@ -589,8 +589,8 @@ mem_area_free(
 
 		/* The buddy is in a free list */
 
-		if ((byte*)buddy < (byte*)area) {
-			new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE;
+		if ((byte*) buddy < (byte*) area) {
+			new_ptr = ((byte*) buddy) + MEM_AREA_EXTRA_SIZE;
 
 			mem_area_set_size(buddy, 2 * size);
 			mem_area_set_free(buddy, FALSE);
@@ -648,12 +648,12 @@ mem_pool_validate(
 
 	for (i = 0; i < 64; i++) {
 
-		UT_LIST_VALIDATE(free_list, mem_area_t, pool->free_list[i],
-				 (void) 0);
+		UT_LIST_CHECK(free_list, mem_area_t, pool->free_list[i]);
 
-		area = UT_LIST_GET_FIRST(pool->free_list[i]);
+		for (area = UT_LIST_GET_FIRST(pool->free_list[i]);
+		     area != 0;
+		     area = UT_LIST_GET_NEXT(free_list, area)) {
 
-		while (area != NULL) {
 			ut_a(mem_area_get_free(area));
 			ut_a(mem_area_get_size(area) == ut_2_exp(i));
 
@@ -662,8 +662,6 @@ mem_pool_validate(
 			ut_a(!buddy || !mem_area_get_free(buddy)
 			     || (ut_2_exp(i) != mem_area_get_size(buddy)));
 
-			area = UT_LIST_GET_NEXT(free_list, area);
-
 			free += ut_2_exp(i);
 		}
 	}
diff --git a/storage/xtradb/mtr/mtr0log.c b/storage/xtradb/mtr/mtr0log.cc
index 091fabf732c..5335cb4c9ef 100644
--- a/storage/xtradb/mtr/mtr0log.c
+++ b/storage/xtradb/mtr/mtr0log.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file mtr/mtr0log.c
+@file mtr/mtr0log.cc
 Mini-transaction log routines
 
 Created 12/7/1995 Heikki Tuuri
@@ -175,7 +175,7 @@ mlog_parse_nbytes(
 		}
 
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_8
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, dval);
@@ -199,7 +199,7 @@ mlog_parse_nbytes(
 			goto corrupt;
 		}
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_1
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -212,7 +212,7 @@ mlog_parse_nbytes(
 			goto corrupt;
 		}
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_2
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -222,7 +222,7 @@ mlog_parse_nbytes(
 		break;
 	case MLOG_4BYTES:
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_4
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -240,8 +240,8 @@ mlog_parse_nbytes(
 }
 
 /********************************************************//**
-Writes 1 - 4 bytes to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
+Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log if mtr is not NULL. */
 UNIV_INTERN
 void
 mlog_write_ulint(
@@ -251,8 +251,6 @@ mlog_write_ulint(
 	byte	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
 	mtr_t*	mtr)	/*!< in: mini-transaction handle */
 {
-	byte*	log_ptr;
-
 	switch (type) {
 	case MLOG_1BYTE:
 		mach_write_to_1(ptr, val);
@@ -267,27 +265,29 @@ mlog_write_ulint(
 		ut_error;
 	}
 
-	log_ptr = mlog_open(mtr, 11 + 2 + 5);
+	if (mtr != 0) {
+		byte*	log_ptr = mlog_open(mtr, 11 + 2 + 5);
 
-	/* If no logging is requested, we may return now */
-	if (log_ptr == NULL) {
+		/* If no logging is requested, we may return now */
 
-		return;
-	}
+		if (log_ptr != 0) {
 
-	log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
+			log_ptr = mlog_write_initial_log_record_fast(
+				ptr, type, log_ptr, mtr);
 
-	mach_write_to_2(log_ptr, page_offset(ptr));
-	log_ptr += 2;
+			mach_write_to_2(log_ptr, page_offset(ptr));
+			log_ptr += 2;
 
-	log_ptr += mach_write_compressed(log_ptr, val);
+			log_ptr += mach_write_compressed(log_ptr, val);
 
-	mlog_close(mtr, log_ptr);
+			mlog_close(mtr, log_ptr);
+		}
+	}
 }
 
 /********************************************************//**
-Writes 8 bytes to a file page buffered in the buffer pool.
-Writes the corresponding log record to the mini-transaction log. */
+Writes 8 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log, only if mtr is not NULL */
 UNIV_INTERN
 void
 mlog_write_ull(
@@ -296,29 +296,25 @@ mlog_write_ull(
 	ib_uint64_t	val,	/*!< in: value to write */
 	mtr_t*		mtr)	/*!< in: mini-transaction handle */
 {
-	byte*	log_ptr;
-
-	ut_ad(ptr && mtr);
-
 	mach_write_to_8(ptr, val);
 
-	log_ptr = mlog_open(mtr, 11 + 2 + 9);
-
-	/* If no logging is requested, we may return now */
-	if (log_ptr == NULL) {
+	if (mtr != 0) {
+		byte*	log_ptr = mlog_open(mtr, 11 + 2 + 9);
 
-		return;
-	}
+		/* If no logging is requested, we may return now */
+		if (log_ptr != 0) {
 
-	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_8BYTES,
-						     log_ptr, mtr);
+			log_ptr = mlog_write_initial_log_record_fast(
+				ptr, MLOG_8BYTES, log_ptr, mtr);
 
-	mach_write_to_2(log_ptr, page_offset(ptr));
-	log_ptr += 2;
+			mach_write_to_2(log_ptr, page_offset(ptr));
+			log_ptr += 2;
 
-	log_ptr += mach_ull_write_compressed(log_ptr, val);
+			log_ptr += mach_ull_write_compressed(log_ptr, val);
 
-	mlog_close(mtr, log_ptr);
+			mlog_close(mtr, log_ptr);
+		}
+	}
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -420,7 +416,7 @@ mlog_parse_string(
 	}
 
 	if (page) {
-		if (UNIV_LIKELY_NULL(page_zip)) {
+		if (page_zip) {
 			memcpy(((page_zip_des_t*) page_zip)->data
 				+ offset, ptr, len);
 		}
@@ -439,12 +435,13 @@ UNIV_INTERN
 byte*
 mlog_open_and_write_index(
 /*======================*/
-	mtr_t*		mtr,	/*!< in: mtr */
-	const byte*	rec,	/*!< in: index record or page */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	byte		type,	/*!< in: log item type */
-	ulint		size)	/*!< in: requested buffer size in bytes
-				(if 0, calls mlog_close() and returns NULL) */
+	mtr_t*			mtr,	/*!< in: mtr */
+	const byte*		rec,	/*!< in: index record or page */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	byte			type,	/*!< in: log item type */
+	ulint			size)	/*!< in: requested buffer size in bytes
+					(if 0, calls mlog_close() and
+					returns NULL) */
 {
 	byte*		log_ptr;
 	const byte*	log_start;
@@ -538,7 +535,7 @@ mlog_parse_index(
 /*=============*/
 	byte*		ptr,	/*!< in: buffer */
 	const byte*	end_ptr,/*!< in: buffer end */
-	ibool		comp,	/*!< in: TRUE=compact record format */
+	ibool		comp,	/*!< in: TRUE=compact row format */
 	dict_index_t**	index)	/*!< out, own: dummy index */
 {
 	ulint		i, n, n_uniq;
@@ -563,7 +560,7 @@ mlog_parse_index(
 		n = n_uniq = 1;
 	}
 	table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n,
-				      comp ? DICT_TF_COMPACT : 0);
+				      comp ? DICT_TF_COMPACT : 0, 0);
 	ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY",
 				    DICT_HDR_SPACE, 0, n);
 	ind->table = table;
diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.cc
index 083692efd77..02e8cbdbfcc 100644
--- a/storage/xtradb/mtr/mtr0mtr.c
+++ b/storage/xtradb/mtr/mtr0mtr.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file mtr/mtr0mtr.c
+@file mtr/mtr0mtr.cc
 Mini-transaction buffer
 
 Created 11/26/1995 Heikki Tuuri
@@ -83,10 +83,10 @@ mtr_memo_slot_release_func(
 		buf_page_release((buf_block_t*) object, slot->type);
 		break;
 	case MTR_MEMO_S_LOCK:
-		rw_lock_s_unlock((rw_lock_t*) object);
+		rw_lock_s_unlock((prio_rw_lock_t*) object);
 		break;
 	case MTR_MEMO_X_LOCK:
-		rw_lock_x_unlock((rw_lock_t*) object);
+		rw_lock_x_unlock((prio_rw_lock_t*) object);
 		break;
 #ifdef UNIV_DEBUG
 	default:
@@ -112,20 +112,20 @@ mtr_memo_pop_all(
 /*=============*/
 	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
-	const dyn_block_t*	block;
-
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
 					     commit */
 
-	for (block = dyn_array_get_last_block(&mtr->memo);
+	for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
 	     block;
 	     block = dyn_array_get_prev_block(&mtr->memo, block)) {
 		const mtr_memo_slot_t*	start
-			= (mtr_memo_slot_t*) dyn_block_get_data(block);
+			= reinterpret_cast<mtr_memo_slot_t*>(
+				dyn_block_get_data(block));
 		mtr_memo_slot_t*	slot
-			= (mtr_memo_slot_t*) (dyn_block_get_data(block)
-					      + dyn_block_get_used(block));
+			= reinterpret_cast<mtr_memo_slot_t*>(
+				dyn_block_get_data(block)
+				+ dyn_block_get_used(block));
 
 		ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
 
@@ -146,16 +146,14 @@ mtr_memo_slot_note_modification(
 	mtr_t*			mtr,	/*!< in: mtr */
 	mtr_memo_slot_t*	slot)	/*!< in: memo slot */
 {
-	ut_ad(mtr);
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->modifications);
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 
 	if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) {
 		buf_block_t*	block = (buf_block_t*) slot->object;
 
-#ifdef UNIV_DEBUG
 		ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
-#endif /* UNIV_DEBUG */
 		buf_flush_note_modification(block, mtr);
 	}
 }
@@ -176,7 +174,7 @@ mtr_memo_note_modifications(
 	dyn_array_t*	memo;
 	ulint		offset;
 
-	ut_ad(mtr);
+	ut_ad(!srv_read_only_mode);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
 					     commit */
@@ -188,26 +186,60 @@ mtr_memo_note_modifications(
 		mtr_memo_slot_t* slot;
 
 		offset -= sizeof(mtr_memo_slot_t);
-		slot = dyn_array_get_element(memo, offset);
+
+		slot = static_cast<mtr_memo_slot_t*>(
+			dyn_array_get_element(memo, offset));
 
 		mtr_memo_slot_note_modification(mtr, slot);
 	}
 }
 
 /************************************************************//**
+Append the dirty pages to the flush list. */
+static
+void
+mtr_add_dirtied_pages_to_flush_list(
+/*================================*/
+	mtr_t*	mtr)	/*!< in/out: mtr */
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* No need to acquire log_flush_order_mutex if this mtr has
+	not dirtied a clean page. log_flush_order_mutex is used to
+	ensure ordered insertions in the flush_list. We need to
+	insert in the flush_list iff the page in question was clean
+	before modifications. */
+	if (mtr->made_dirty) {
+		log_flush_order_mutex_enter();
+	}
+
+	/* It is now safe to release the log mutex because the
+	flush_order mutex will ensure that we are the first one
+	to insert into the flush list. */
+	log_release();
+
+	if (mtr->modifications) {
+		mtr_memo_note_modifications(mtr);
+	}
+
+	if (mtr->made_dirty) {
+		log_flush_order_mutex_exit();
+	}
+}
+
+/************************************************************//**
 Writes the contents of a mini-transaction log, if any, to the database log. */
 static
 void
 mtr_log_reserve_and_write(
 /*======================*/
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mtr */
 {
 	dyn_array_t*	mlog;
-	dyn_block_t*	block;
 	ulint		data_size;
 	byte*		first_data;
 
-	ut_ad(mtr);
+	ut_ad(!srv_read_only_mode);
 
 	mlog = &(mtr->log);
 
@@ -221,14 +253,21 @@ mtr_log_reserve_and_write(
 	}
 
 	if (mlog->heap == NULL) {
+		ulint	len;
+
+		len = mtr->log_mode != MTR_LOG_NO_REDO
+			? dyn_block_get_used(mlog) : 0;
+
 		mtr->end_lsn = log_reserve_and_write_fast(
-			first_data, dyn_block_get_used(mlog),
-			&mtr->start_lsn);
+			first_data, len, &mtr->start_lsn);
+
 		if (mtr->end_lsn) {
 
 			/* Success. We have the log mutex.
 			Add pages to flush list and exit */
-			goto func_exit;
+			mtr_add_dirtied_pages_to_flush_list(mtr);
+
+			return;
 		}
 	} else {
 		mutex_enter(&log_sys->mutex);
@@ -241,43 +280,24 @@ mtr_log_reserve_and_write(
 
 	if (mtr->log_mode == MTR_LOG_ALL) {
 
-		block = mlog;
+		for (dyn_block_t* block = mlog;
+		     block != 0;
+		     block = dyn_array_get_next_block(mlog, block)) {
 
-		while (block != NULL) {
-			log_write_low(dyn_block_get_data(block),
-				      dyn_block_get_used(block));
-			block = dyn_array_get_next_block(mlog, block);
+			log_write_low(
+				dyn_block_get_data(block),
+				dyn_block_get_used(block));
 		}
+
 	} else {
-		ut_ad(mtr->log_mode == MTR_LOG_NONE);
+		ut_ad(mtr->log_mode == MTR_LOG_NONE
+		      || mtr->log_mode == MTR_LOG_NO_REDO);
 		/* Do nothing */
 	}
 
 	mtr->end_lsn = log_close();
 
-func_exit:
-
-	/* No need to acquire log_flush_order_mutex if this mtr has
-	not dirtied a clean page. log_flush_order_mutex is used to
-	ensure ordered insertions in the flush_list. We need to
-	insert in the flush_list iff the page in question was clean
-	before modifications. */
-	if (mtr->made_dirty) {
-		log_flush_order_mutex_enter();
-	}
-
-	/* It is now safe to release the log mutex because the
-	flush_order mutex will ensure that we are the first one
-	to insert into the flush list. */
-	log_release();
-
-	if (mtr->modifications) {
-		mtr_memo_note_modifications(mtr);
-	}
-
-	if (mtr->made_dirty) {
-		log_flush_order_mutex_exit();
-	}
+	mtr_add_dirtied_pages_to_flush_list(mtr);
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -300,6 +320,7 @@ mtr_commit(
 	ut_ad(!recv_no_log_write);
 
 	if (mtr->modifications && mtr->n_log_recs) {
+		ut_ad(!srv_read_only_mode);
 		mtr_log_reserve_and_write(mtr);
 	}
 
@@ -312,8 +333,8 @@ mtr_commit(
 	/* Declare everything uninitialized except
 	mtr->start_lsn, mtr->end_lsn and mtr->state. */
 	{
-		ib_uint64_t	start_lsn	= mtr->start_lsn;
-		ib_uint64_t	end_lsn		= mtr->end_lsn;
+		lsn_t	start_lsn	= mtr->start_lsn;
+		lsn_t	end_lsn		= mtr->end_lsn;
 		UNIV_MEM_INVALID(mtr, sizeof *mtr);
 		mtr->start_lsn = start_lsn;
 		mtr->end_lsn = end_lsn;
@@ -324,41 +345,44 @@ mtr_commit(
 
 #ifndef UNIV_HOTBACKUP
 /***************************************************//**
-Releases an object in the memo stack. */
+Releases an object in the memo stack.
+@return true if released */
 UNIV_INTERN
-void
+bool
 mtr_memo_release(
 /*=============*/
 	mtr_t*	mtr,	/*!< in/out: mini-transaction */
 	void*	object,	/*!< in: object */
 	ulint	type)	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
 {
-	const dyn_block_t*	block;
-
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_ACTIVE);
 	/* We cannot release a page that has been written to in the
 	middle of a mini-transaction. */
 	ut_ad(!mtr->modifications || type != MTR_MEMO_PAGE_X_FIX);
 
-	for (block = dyn_array_get_last_block(&mtr->memo);
+	for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
 	     block;
 	     block = dyn_array_get_prev_block(&mtr->memo, block)) {
 		const mtr_memo_slot_t*	start
-			= (mtr_memo_slot_t*) dyn_block_get_data(block);
+			= reinterpret_cast<mtr_memo_slot_t*>(
+				dyn_block_get_data(block));
 		mtr_memo_slot_t*	slot
-			= (mtr_memo_slot_t*) (dyn_block_get_data(block)
-					      + dyn_block_get_used(block));
+			= reinterpret_cast<mtr_memo_slot_t*>(
+				dyn_block_get_data(block)
+				+ dyn_block_get_used(block));
 
 		ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
 
 		while (slot-- != start) {
 			if (object == slot->object && type == slot->type) {
 				mtr_memo_slot_release(mtr, slot);
-				return;
+				return(true);
 			}
 		}
 	}
+
+	return(false);
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -377,14 +401,8 @@ mtr_read_ulint(
 	ut_ad(mtr->state == MTR_ACTIVE);
 	ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
 	      || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
-	if (type == MLOG_1BYTE) {
-		return(mach_read_from_1(ptr));
-	} else if (type == MLOG_2BYTES) {
-		return(mach_read_from_2(ptr));
-	} else {
-		ut_ad(type == MLOG_4BYTES);
-		return(mach_read_from_4(ptr));
-	}
+
+	return(mach_read_ulint(ptr, type));
 }
 
 #ifdef UNIV_DEBUG
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.cc
index 43b0c2f5adc..38eb5241da1 100644
--- a/storage/xtradb/os/os0file.c
+++ b/storage/xtradb/os/os0file.cc
@@ -1,6 +1,6 @@
 /***********************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Percona Inc.
 
 Portions of this file contain modifications contributed and copyrighted
@@ -19,14 +19,14 @@ WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 ***********************************************************************/
 
 /**************************************************//**
-@file os/os0file.c
+@file os/os0file.cc
 The interface to the operating system file i/o primitives
 
 Created 10/21/1995 Heikki Tuuri
@@ -43,9 +43,9 @@ Created 10/21/1995 Heikki Tuuri
 #include "srv0start.h"
 #include "fil0fil.h"
 #include "buf0buf.h"
-#include "trx0sys.h"
+#include "btr0types.h"
 #include "trx0trx.h"
-#include "log0recv.h"
+#include "srv0mon.h"
 #ifndef UNIV_HOTBACKUP
 # include "os0sync.h"
 # include "os0thread.h"
@@ -73,32 +73,29 @@ Created 10/21/1995 Heikki Tuuri
 # endif
 #endif
 
+/** Insert buffer segment id */
+static const ulint IO_IBUF_SEGMENT = 0;
+
+/** Log segment id */
+static const ulint IO_LOG_SEGMENT = 1;
+
 /* This specifies the file permissions InnoDB uses when it creates files in
 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
 my_umask */
 
 #ifndef __WIN__
 /** Umask for creating files */
-UNIV_INTERN ulint	os_innodb_umask
-			= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+UNIV_INTERN ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 #else
 /** Umask for creating files */
-UNIV_INTERN ulint	os_innodb_umask		= 0;
-#endif
-
-#ifdef UNIV_DO_FLUSH
-/* If the following is set to TRUE, we do not call os_file_flush in every
-os_file_write. We can set this TRUE when the doublewrite buffer is used. */
-UNIV_INTERN ibool	os_do_not_call_flush_at_each_write	= FALSE;
-#else
-/* We do not call os_file_flush in every os_file_write. */
-#endif /* UNIV_DO_FLUSH */
+UNIV_INTERN ulint	os_innodb_umask	= 0;
+#endif /* __WIN__ */
 
 #ifndef UNIV_HOTBACKUP
 /* We use these mutexes to protect lseek + file i/o operation, if the
 OS does not provide an atomic pread or pwrite, or similar */
 #define OS_FILE_N_SEEK_MUTEXES	16
-UNIV_INTERN os_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+UNIV_INTERN os_ib_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
 
 /* In simulated aio, merge at most this many consecutive i/os */
 #define OS_AIO_MERGE_N_CONSECUTIVE	64
@@ -119,7 +116,7 @@ of the high level design.
 There are four io-threads (for ibuf, log, read, write).
 All synchronous IO requests are serviced by the calling thread using
 os_file_write/os_file_read. The Asynchronous requests are queued up
-in an array (there are four such arrays) by the calling thread. 
+in an array (there are four such arrays) by the calling thread.
 Later these requests are picked up by the io-thread and are serviced
 synchronously.
 
@@ -169,10 +166,7 @@ UNIV_INTERN mysql_pfs_key_t  innodb_file_bmp_key;
 #endif /* UNIV_PFS_IO */
 
 /** The asynchronous i/o array slot structure */
-typedef struct os_aio_slot_struct	os_aio_slot_t;
-
-/** The asynchronous i/o array slot structure */
-struct os_aio_slot_struct{
+struct os_aio_slot_t{
 #ifdef WIN_ASYNC_IO
 	OVERLAPPED	control;	/*!< Windows control block for the
 					aio request, MUST be first element in the structure*/
@@ -188,9 +182,7 @@ struct os_aio_slot_struct{
 					write */
 	byte*		buf;		/*!< buffer used in i/o */
 	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
-	ulint		offset;		/*!< 32 low bits of file offset in
-					bytes */
-	ulint		offset_high;	/*!< 32 high bits of file offset */
+	os_offset_t	offset;		/*!< file offset in bytes */
 	os_file_t	file;		/*!< file where to read or write */
 	const char*	name;		/*!< file name or path */
 	ibool		io_already_done;/*!< used only in simulated aio:
@@ -208,15 +200,12 @@ struct os_aio_slot_struct{
 	struct iocb	control;	/* Linux control block for aio */
 	int		n_bytes;	/* bytes written/read. */
 	int		ret;		/* AIO return code */
-#endif
+#endif /* WIN_ASYNC_IO */
 };
 
 /** The asynchronous i/o array structure */
-typedef struct os_aio_array_struct	os_aio_array_t;
-
-/** The asynchronous i/o array structure */
-struct os_aio_array_struct{
-	os_mutex_t	mutex;	/*!< the mutex protecting the aio array */
+struct os_aio_array_t{
+	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
 	os_event_t	not_full;
 				/*!< The event which is set to the
 				signaled state when there is space in
@@ -244,7 +233,7 @@ struct os_aio_array_struct{
 
 #if defined(LINUX_NATIVE_AIO)
 	io_context_t*		aio_ctx;
-				/* completion queue for IO. There is 
+				/* completion queue for IO. There is
 				one such queue per segment. Each thread
 				will work on one ctx exclusively. */
 	struct io_event*	aio_events;
@@ -252,7 +241,7 @@ struct os_aio_array_struct{
 				There is one such event for each
 				possible pending IO. The size of the
 				array is equal to n_slots. */
-#endif
+#endif /* LINUX_NATIVE_AIO */
 };
 
 #if defined(LINUX_NATIVE_AIO)
@@ -267,7 +256,7 @@ struct os_aio_array_struct{
 #endif
 
 /** Array of events used in simulated aio */
-static os_event_t*	os_aio_segment_wait_events	= NULL;
+static os_event_t*	os_aio_segment_wait_events = NULL;
 
 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 are NULL when the module has not yet been initialized. @{ */
@@ -297,10 +286,12 @@ UNIV_INTERN time_t	os_last_printout;
 
 UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
 
-#ifndef UNIV_HOTBACKUP
+#if !defined(UNIV_HOTBACKUP)	\
+    && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
 /** The mutex protecting the following counts of pending I/O operations */
-static os_mutex_t	os_file_count_mutex;
-#endif /* !UNIV_HOTBACKUP */
+static os_ib_mutex_t	os_file_count_mutex;
+#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
+
 /** Number of pending os_file_pread() operations */
 UNIV_INTERN ulint	os_file_n_pending_preads  = 0;
 /** Number of pending os_file_pwrite() operations */
@@ -360,7 +351,7 @@ ulint
 os_get_os_version(void)
 /*===================*/
 {
-	OSVERSIONINFO	  os_info;
+	OSVERSIONINFO	os_info;
 
 	os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
 
@@ -374,15 +365,15 @@ os_get_os_version(void)
 		switch (os_info.dwMajorVersion) {
 		case 3:
 		case 4:
-			return OS_WINNT;
+			return(OS_WINNT);
 		case 5:
-			return (os_info.dwMinorVersion == 0) ? OS_WIN2000
-							     : OS_WINXP;
+			return (os_info.dwMinorVersion == 0)
+				? OS_WIN2000 : OS_WINXP;
 		case 6:
-			return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
-							     : OS_WIN7;
+			return (os_info.dwMinorVersion == 0)
+				? OS_WINVISTA : OS_WIN7;
 		default:
-			return OS_WIN7;
+			return(OS_WIN7);
 		}
 	} else {
 		ut_error;
@@ -455,6 +446,7 @@ static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved
 		win_free_syncio_event();
 }
 
+extern "C" {
 #ifdef _WIN64
 #pragma comment(linker, "/INCLUDE:_tls_used")
 #pragma comment(linker, "/INCLUDE:p_thread_callback_base")
@@ -469,6 +461,7 @@ const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
 PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
 #pragma data_seg()
 #endif 
+}
 #endif /*_WIN32 */
 
 /***********************************************************************//**
@@ -477,21 +470,26 @@ The number should be retrieved before any other OS calls (because they may
 overwrite the error number). If the number is not known to this program,
 the OS error number + 100 is returned.
 @return	error number, or OS error number + 100 */
-UNIV_INTERN
+static
 ulint
-os_file_get_last_error(
-/*===================*/
-	ibool	report_all_errors)	/*!< in: TRUE if we want an error message
-					printed of all errors */
+os_file_get_last_error_low(
+/*=======================*/
+	bool	report_all_errors,	/*!< in: TRUE if we want an error
+					message printed of all errors */
+	bool	on_error_silent)	/*!< in: if TRUE then don't print any
+					diagnostic to the log */
 {
-	ulint	err;
-
 #ifdef __WIN__
 
-	err = (ulint) GetLastError();
+	ulint	err = (ulint) GetLastError();
+	if (err == ERROR_SUCCESS) {
+		return(0);
+	}
 
 	if (report_all_errors
-	    || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
+	    || (!on_error_silent
+		&& err != ERROR_DISK_FULL
+		&& err != ERROR_FILE_EXISTS)) {
 
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -569,15 +567,18 @@ os_file_get_last_error(
 		return(100 + err);
 	}
 #else
-	err = (ulint) errno;
+	int err = errno;
+	if (err == 0) {
+		return(0);
+	}
 
 	if (report_all_errors
-	    || (err != ENOSPC && err != EEXIST)) {
+	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
 
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Operating system error number %lu"
-			" in a file operation.\n", (ulong) err);
+			"  InnoDB: Operating system error number %d"
+			" in a file operation.\n", err);
 
 		if (err == ENOENT) {
 			fprintf(stderr,
@@ -597,13 +598,14 @@ os_file_get_last_error(
 				" the access rights to\n"
 				"InnoDB: the directory.\n");
 		} else {
-			if (strerror((int)err) != NULL) {
+			if (strerror(err) != NULL) {
 				fprintf(stderr,
-					"InnoDB: Error number %lu"
+					"InnoDB: Error number %d"
 					" means '%s'.\n",
-					err, strerror((int)err));
+					err, strerror(err));
 			}
 
+
 			fprintf(stderr,
 				"InnoDB: Some operating system"
 				" error numbers are described at\n"
@@ -641,10 +643,26 @@ os_file_get_last_error(
 #endif
 }
 
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return	error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	bool	report_all_errors)	/*!< in: TRUE if we want an error
+					message printed of all errors */
+{
+	return(os_file_get_last_error_low(report_all_errors, false));
+}
+
 /****************************************************************//**
 Does error handling when a file operation fails.
 Conditionally exits (calling exit(3)) based on should_exit value and the
-error type
+error type. If should_exit is TRUE then on_error_silent is ignored.
 @return	TRUE if we should retry the operation */
 static
 ibool
@@ -652,14 +670,18 @@ os_file_handle_error_cond_exit(
 /*===========================*/
 	const char*	name,		/*!< in: name of a file or NULL */
 	const char*	operation,	/*!< in: operation */
-	ibool		should_exit)	/*!< in: call exit(3) if unknown error
+	ibool		should_exit,	/*!< in: call exit(3) if unknown error
 					and this parameter is TRUE */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log iff it is
+					an unknown non-fatal error */
 {
 	ulint	err;
 
-	err = os_file_get_last_error(FALSE);
+	err = os_file_get_last_error_low(false, on_error_silent);
 
-	if (err == OS_FILE_DISK_FULL) {
+	switch (err) {
+	case OS_FILE_DISK_FULL:
 		/* We only print a warning about disk full once */
 
 		if (os_has_said_disk_full) {
@@ -667,6 +689,9 @@ os_file_handle_error_cond_exit(
 			return(FALSE);
 		}
 
+		/* Disk full error is reported irrespective of the
+		on_error_silent setting. */
+
 		if (name) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
@@ -684,41 +709,42 @@ os_file_handle_error_cond_exit(
 		fflush(stderr);
 
 		return(FALSE);
-	} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
 
-		return(TRUE);
-	} else if (err == OS_FILE_AIO_INTERRUPTED) {
+	case OS_FILE_AIO_RESOURCES_RESERVED:
+	case OS_FILE_AIO_INTERRUPTED:
 
 		return(TRUE);
-	} else if (err == OS_FILE_ALREADY_EXISTS
-		   || err == OS_FILE_PATH_ERROR) {
+
+	case OS_FILE_PATH_ERROR:
+	case OS_FILE_ALREADY_EXISTS:
 
 		return(FALSE);
-	} else if (err == OS_FILE_SHARING_VIOLATION) {
+
+	case OS_FILE_SHARING_VIOLATION:
 
 		os_thread_sleep(10000000);  /* 10 sec */
 		return(TRUE);
-	} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
 
-		os_thread_sleep(100000);	/* 100 ms */
-		return(TRUE);
-	} else if (err == OS_FILE_OPERATION_ABORTED) {
+	case OS_FILE_OPERATION_ABORTED:
+	case OS_FILE_INSUFFICIENT_RESOURCE:
 
 		os_thread_sleep(100000);	/* 100 ms */
 		return(TRUE);
-	} else {
-		if (name) {
-			fprintf(stderr, "InnoDB: File name %s\n", name);
-		}
 
-		fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
-			operation);
+	default:
 
-		if (should_exit) {
-			fprintf(stderr, "InnoDB: Cannot continue operation.\n");
+		/* If it is an operation that can crash on error then it
+		is better to ignore on_error_silent and print an error message
+		to the log. */
 
-			fflush(stderr);
+		if (should_exit || !on_error_silent) {
+			ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
+				"error " ULINTPF ".%s", name ? name : "(unknown)",
+				operation, err, should_exit
+				? " Cannot continue operation" : "");
+		}
 
+		if (should_exit) {
 			exit(1);
 		}
 	}
@@ -733,11 +759,11 @@ static
 ibool
 os_file_handle_error(
 /*=================*/
-	const char*	name,	/*!< in: name of a file or NULL */
-	const char*	operation)/*!< in: operation */
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation)	/*!< in: operation */
 {
 	/* exit in case of unknown error */
-	return(os_file_handle_error_cond_exit(name, operation, TRUE));
+	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
 }
 
 /****************************************************************//**
@@ -747,11 +773,14 @@ static
 ibool
 os_file_handle_error_no_exit(
 /*=========================*/
-	const char*	name,	/*!< in: name of a file or NULL */
-	const char*	operation)/*!< in: operation */
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log. */
 {
 	/* don't exit in case of unknown error */
-	return(os_file_handle_error_cond_exit(name, operation, FALSE));
+	return(os_file_handle_error_cond_exit(
+			name, operation, FALSE, on_error_silent));
 }
 
 #undef USE_FILE_LOCK
@@ -774,19 +803,23 @@ os_file_lock(
 	const char*	name)	/*!< in: file name */
 {
 	struct flock lk;
+
+	ut_ad(!srv_read_only_mode);
+
 	lk.l_type = F_WRLCK;
 	lk.l_whence = SEEK_SET;
 	lk.l_start = lk.l_len = 0;
+
 	if (fcntl(fd, F_SETLK, &lk) == -1) {
-		fprintf(stderr,
-			"InnoDB: Unable to lock %s, error: %d\n", name, errno);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unable to lock %s, error: %d", name, errno);
 
 		if (errno == EAGAIN || errno == EACCES) {
-			fprintf(stderr,
-				"InnoDB: Check that you do not already have"
-				" another mysqld process\n"
-				"InnoDB: using the same InnoDB data"
-				" or log files.\n");
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Check that you do not already have "
+				"another mysqld process using the "
+				"same InnoDB data or log files.");
 		}
 
 		return(-1);
@@ -804,11 +837,11 @@ void
 os_io_init_simple(void)
 /*===================*/
 {
-	ulint	i;
-
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_file_count_mutex = os_mutex_create();
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
 
-	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
+	for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
 		os_file_seek_mutexes[i] = os_mutex_create();
 	}
 #ifdef _WIN32
@@ -828,6 +861,8 @@ os_file_create_tmpfile(void)
 	FILE*	file	= NULL;
 	int	fd	= innobase_mysql_tmpfile();
 
+	ut_ad(!srv_read_only_mode);
+
 	if (fd >= 0) {
 		file = fdopen(fd, "w+b");
 	}
@@ -878,7 +913,8 @@ os_file_opendir(
 	the first entry in the directory. Since it is '.', that is no problem,
 	as we will skip over the '.' and '..' entries anyway. */
 
-	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+		ut_malloc(sizeof(WIN32_FIND_DATA)));
 
 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
 
@@ -902,7 +938,7 @@ os_file_opendir(
 	}
 
 	return(dir);
-#endif
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
@@ -920,7 +956,7 @@ os_file_closedir(
 	ret = FindClose(dir);
 
 	if (!ret) {
-		os_file_handle_error_no_exit(NULL, "closedir");
+		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 
 		return(-1);
 	}
@@ -932,11 +968,11 @@ os_file_closedir(
 	ret = closedir(dir);
 
 	if (ret) {
-		os_file_handle_error_no_exit(NULL, "closedir");
+		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 	}
 
 	return(ret);
-#endif
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
@@ -955,21 +991,22 @@ os_file_readdir_next_file(
 	LPWIN32_FIND_DATA	lpFindFileData;
 	BOOL			ret;
 
-	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+		ut_malloc(sizeof(WIN32_FIND_DATA)));
 next_file:
 	ret = FindNextFile(dir, lpFindFileData);
 
 	if (ret) {
-		ut_a(strlen((char *) lpFindFileData->cFileName)
+		ut_a(strlen((char*) lpFindFileData->cFileName)
 		     < OS_FILE_MAX_PATH);
 
-		if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
-		    || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
+		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
+		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
 
 			goto next_file;
 		}
 
-		strcpy(info->name, (char *) lpFindFileData->cFileName);
+		strcpy(info->name, (char*) lpFindFileData->cFileName);
 
 		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
 			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
@@ -1003,8 +1040,7 @@ next_file:
 
 		return(1);
 	} else {
-		os_file_handle_error_no_exit(dirname,
-					     "readdir_next_file");
+		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
 		return(-1);
 	}
 #else
@@ -1023,7 +1059,7 @@ next_file:
 next_file:
 
 #ifdef HAVE_READDIR_R
-	ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
+	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
 
 	if (ret != 0
 #ifdef UNIV_AIX
@@ -1036,7 +1072,7 @@ next_file:
 	   ) {
 		fprintf(stderr,
 			"InnoDB: cannot read directory %s, error %lu\n",
-			dirname, (ulong)ret);
+			dirname, (ulong) ret);
 
 		return(-1);
 	}
@@ -1065,7 +1101,8 @@ next_file:
 
 	strcpy(info->name, ent->d_name);
 
-	full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
+	full_path = static_cast<char*>(
+		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
 
 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
 
@@ -1089,14 +1126,14 @@ next_file:
 			goto next_file;
 		}
 
-		os_file_handle_error_no_exit(full_path, "stat");
+		os_file_handle_error_no_exit(full_path, "stat", FALSE);
 
 		ut_free(full_path);
 
 		return(-1);
 	}
 
-	info->size = (ib_int64_t)statinfo.st_size;
+	info->size = (ib_int64_t) statinfo.st_size;
 
 	if (S_ISDIR(statinfo.st_mode)) {
 		info->type = OS_FILE_TYPE_DIR;
@@ -1115,10 +1152,12 @@ next_file:
 }
 
 /*****************************************************************//**
-This function attempts to create a directory named pathname. The new directory
-gets default permissions. On Unix the permissions are (0770 & ~umask). If the
-directory exists already, nothing is done and the call succeeds, unless the
-fail_if_exists arguments is true.
+This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists argument is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns FALSE.
 @return	TRUE if call succeeds, FALSE on error */
 UNIV_INTERN
 ibool
@@ -1136,13 +1175,14 @@ os_file_create_directory(
 	if (!(rcode != 0
 	      || (GetLastError() == ERROR_ALREADY_EXISTS
 		  && !fail_if_exists))) {
-		/* failure */
-		os_file_handle_error(pathname, "CreateDirectory");
+
+		os_file_handle_error_no_exit(
+			pathname, "CreateDirectory", FALSE);
 
 		return(FALSE);
 	}
 
-	return (TRUE);
+	return(TRUE);
 #else
 	int	rcode;
 
@@ -1150,13 +1190,13 @@ os_file_create_directory(
 
 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
 		/* failure */
-		os_file_handle_error(pathname, "mkdir");
+		os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
 
 		return(FALSE);
 	}
 
 	return (TRUE);
-#endif
+#endif /* __WIN__ */
 }
 
 /****************************************************************//**
@@ -1171,135 +1211,186 @@ os_file_create_simple_func(
 /*=======================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 {
-#ifdef __WIN__
 	os_file_t	file;
-	DWORD		create_flag;
+	ibool		retry;
+
+	*success = FALSE;
+#ifdef __WIN__
 	DWORD		access;
+	DWORD		create_flag;
 	DWORD		attributes	= 0;
-	ibool		retry;
 
-try_again:
-	ut_a(name);
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 
 	if (create_mode == OS_FILE_OPEN) {
+
 		create_flag = OPEN_EXISTING;
+
+	} else if (srv_read_only_mode) {
+
+		create_flag = OPEN_EXISTING;
+
 	} else if (create_mode == OS_FILE_CREATE) {
+
 		create_flag = CREATE_NEW;
+
 	} else if (create_mode == OS_FILE_CREATE_PATH) {
-		/* create subdirs along the path if needed  */
+
+		ut_a(!srv_read_only_mode);
+
+		/* Create subdirs along the path if needed  */
 		*success = os_file_create_subdirs_if_needed(name);
+
 		if (!*success) {
-			ut_error;
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to create subdirectories '%s'",
+				name);
+
+			return((os_file_t) -1);
 		}
+
 		create_flag = CREATE_NEW;
 		create_mode = OS_FILE_CREATE;
+
 	} else {
-		create_flag = 0;
-		ut_error;
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
 	if (access_type == OS_FILE_READ_ONLY) {
 		access = GENERIC_READ;
+	} else if (srv_read_only_mode) {
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"read only mode set. Unable to "
+			"open file '%s' in RW mode, trying RO mode", name);
+
+		access = GENERIC_READ;
+
 	} else if (access_type == OS_FILE_READ_WRITE) {
 		access = GENERIC_READ | GENERIC_WRITE;
 	} else {
-		access = 0;
-		ut_error;
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file access type (%lu) for file '%s'",
+			access_type, name);
+
+		return((os_file_t) -1);
 	}
 
-	file = CreateFile((LPCTSTR) name,
-			  access,
-			  FILE_SHARE_READ | FILE_SHARE_WRITE,
-			  /* file can be read and written also
-			  by other processes */
-			  NULL,	/* default security attributes */
-			  create_flag,
-			  attributes,
-			  NULL);	/*!< no template file */
+	do {
+		/* Use default security attributes and no template file. */
 
-	if (file == INVALID_HANDLE_VALUE) {
-		*success = FALSE;
+		file = CreateFile(
+			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
+			create_flag, attributes, NULL);
+
+		if (file == INVALID_HANDLE_VALUE) {
+
+			*success = FALSE;
+
+			retry = os_file_handle_error(
+				name, create_mode == OS_FILE_OPEN ?
+				"open" : "create");
 
-		retry = os_file_handle_error(name,
-					     create_mode == OS_FILE_OPEN ?
-					     "open" : "create");
-		if (retry) {
-			goto try_again;
+		} else {
+			*success = TRUE;
+			retry = false;
 		}
-	} else {
-		*success = TRUE;
-	}
 
-	return(file);
+	} while (retry);
+
 #else /* __WIN__ */
-	os_file_t	file;
 	int		create_flag;
-	ibool		retry;
 
-try_again:
-	ut_a(name);
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 
 	if (create_mode == OS_FILE_OPEN) {
+
 		if (access_type == OS_FILE_READ_ONLY) {
 			create_flag = O_RDONLY;
+		} else if (srv_read_only_mode) {
+			create_flag = O_RDONLY;
 		} else {
 			create_flag = O_RDWR;
 		}
+
+	} else if (srv_read_only_mode) {
+
+		create_flag = O_RDONLY;
+
 	} else if (create_mode == OS_FILE_CREATE) {
+
 		create_flag = O_RDWR | O_CREAT | O_EXCL;
+
 	} else if (create_mode == OS_FILE_CREATE_PATH) {
-		/* create subdirs along the path if needed  */
+
+		/* Create subdirs along the path if needed  */
+
 		*success = os_file_create_subdirs_if_needed(name);
+
 		if (!*success) {
-			return (-1);
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to create subdirectories '%s'",
+				name);
+
+			return((os_file_t) -1);
 		}
+
 		create_flag = O_RDWR | O_CREAT | O_EXCL;
 		create_mode = OS_FILE_CREATE;
 	} else {
-		create_flag = 0;
-		ut_error;
-	}
 
-	if (create_mode == OS_FILE_CREATE) {
-		file = open(name, create_flag, S_IRUSR | S_IWUSR
-			    | S_IRGRP | S_IWGRP);
-	} else {
-		file = open(name, create_flag);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
-	if (file == -1) {
-		*success = FALSE;
+	do {
+		file = ::open(name, create_flag, os_innodb_umask);
 
-		retry = os_file_handle_error(name,
-					     create_mode == OS_FILE_OPEN ?
-					     "open" : "create");
-		if (retry) {
-			goto try_again;
+		if (file == -1) {
+			*success = FALSE;
+
+			retry = os_file_handle_error(
+				name,
+				create_mode == OS_FILE_OPEN
+				?  "open" : "create");
+		} else {
+			*success = TRUE;
+			retry = false;
 		}
+
+	} while (retry);
+
 #ifdef USE_FILE_LOCK
-	} else if (access_type == OS_FILE_READ_WRITE
-		   && os_file_lock(file, name)) {
+	if (!srv_read_only_mode
+	    && *success
+	    && access_type == OS_FILE_READ_WRITE
+	    && os_file_lock(file, name)) {
+
 		*success = FALSE;
 		close(file);
 		file = -1;
-#endif
-	} else {
-		*success = TRUE;
 	}
+#endif /* USE_FILE_LOCK */
 
-	return(file);
 #endif /* __WIN__ */
+
+	return(file);
 }
 
 /****************************************************************//**
@@ -1314,106 +1405,137 @@ os_file_create_simple_no_error_handling_func(
 /*=========================================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
 				used by a backup program reading the file */
 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 {
-#ifdef __WIN__
 	os_file_t	file;
-	DWORD		create_flag;
+
+	*success = FALSE;
+#ifdef __WIN__
 	DWORD		access;
+	DWORD		create_flag;
 	DWORD		attributes	= 0;
-	DWORD		share_mode	= FILE_SHARE_READ | FILE_SHARE_WRITE;
+	DWORD		share_mode	= FILE_SHARE_READ;
 
 	ut_a(name);
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
 	if (create_mode == OS_FILE_OPEN) {
 		create_flag = OPEN_EXISTING;
+	} else if (srv_read_only_mode) {
+		create_flag = OPEN_EXISTING;
 	} else if (create_mode == OS_FILE_CREATE) {
 		create_flag = CREATE_NEW;
 	} else {
-		create_flag = 0;
-		ut_error;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
 	if (access_type == OS_FILE_READ_ONLY) {
 		access = GENERIC_READ;
+	} else if (srv_read_only_mode) {
+		access = GENERIC_READ;
 	} else if (access_type == OS_FILE_READ_WRITE) {
 		access = GENERIC_READ | GENERIC_WRITE;
 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+		ut_a(!srv_read_only_mode);
+
 		access = GENERIC_READ;
-		share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
-			| FILE_SHARE_WRITE;	/*!< A backup program has to give
-						mysqld the maximum freedom to
-						do what it likes with the
-						file */
+
+		/* A backup program has to give mysqld the maximum
+		freedom to do what it likes with the file */
+
+		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
 	} else {
-		access = 0;
-		ut_error;
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file access type (%lu) for file '%s'",
+			access_type, name);
+
+		return((os_file_t) -1);
 	}
 
 	file = CreateFile((LPCTSTR) name,
 			  access,
 			  share_mode,
-			  NULL,	/* default security attributes */
+			  NULL,			// Security attributes
 			  create_flag,
 			  attributes,
-			  NULL);	/*!< no template file */
-
-	if (file == INVALID_HANDLE_VALUE) {
-		*success = FALSE;
-	} else {
-		*success = TRUE;
-	}
+			  NULL);		// No template file
 
-	return(file);
+	*success = (file != INVALID_HANDLE_VALUE);
 #else /* __WIN__ */
-	os_file_t	file;
 	int		create_flag;
 
 	ut_a(name);
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
 	if (create_mode == OS_FILE_OPEN) {
+
 		if (access_type == OS_FILE_READ_ONLY) {
+
+			create_flag = O_RDONLY;
+
+		} else if (srv_read_only_mode) {
+
 			create_flag = O_RDONLY;
+
 		} else {
+
+			ut_a(access_type == OS_FILE_READ_WRITE
+			     || access_type == OS_FILE_READ_ALLOW_DELETE);
+
 			create_flag = O_RDWR;
 		}
+
+	} else if (srv_read_only_mode) {
+
+		create_flag = O_RDONLY;
+
 	} else if (create_mode == OS_FILE_CREATE) {
+
 		create_flag = O_RDWR | O_CREAT | O_EXCL;
-	} else {
-		create_flag = 0;
-		ut_error;
-	}
 
-	if (create_mode == OS_FILE_CREATE) {
-		file = open(name, create_flag, S_IRUSR | S_IWUSR
-			    | S_IRGRP | S_IWGRP);
 	} else {
-		file = open(name, create_flag);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
-	if (file == -1) {
-		*success = FALSE;
+	file = ::open(name, create_flag, os_innodb_umask);
+
+	*success = file == -1 ? FALSE : TRUE;
+
 #ifdef USE_FILE_LOCK
-	} else if (access_type == OS_FILE_READ_WRITE
-		   && os_file_lock(file, name)) {
+	if (!srv_read_only_mode
+	    && *success
+	    && access_type == OS_FILE_READ_WRITE
+	    && os_file_lock(file, name)) {
+
 		*success = FALSE;
 		close(file);
 		file = -1;
-#endif
-	} else {
-		*success = TRUE;
+
 	}
+#endif /* USE_FILE_LOCK */
 
-	return(file);
 #endif /* __WIN__ */
+
+	return(file);
 }
 
 /****************************************************************//**
@@ -1423,42 +1545,41 @@ void
 os_file_set_nocache(
 /*================*/
 	int		fd		/*!< in: file descriptor to alter */
-	__attribute__((unused)),
-	const char*	file_name	/*!< in: used in the diagnostic message */
-	__attribute__((unused)),
+					__attribute__((unused)),
+	const char*	file_name	/*!< in: used in the diagnostic
+					message */
+					__attribute__((unused)),
 	const char*	operation_name __attribute__((unused)))
-					/*!< in: "open" or "create"; used in the
-					diagnostic message */
+					/*!< in: "open" or "create"; used
+					in the diagnostic message */
 {
 	/* some versions of Solaris may not have DIRECTIO_ON */
 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
 	if (directio(fd, DIRECTIO_ON) == -1) {
-		int	errno_save;
-		errno_save = (int)errno;
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Failed to set DIRECTIO_ON "
-			"on file %s: %s: %s, continuing anyway\n",
+		int	errno_save = errno;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
+			"continuing anyway.",
 			file_name, operation_name, strerror(errno_save));
 	}
 #elif defined(O_DIRECT)
 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
-		int	errno_save;
-		errno_save = (int)errno;
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Failed to set O_DIRECT "
-			"on file %s: %s: %s, continuing anyway\n",
+		int	errno_save = errno;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Failed to set O_DIRECT on file %s: %s: %s, "
+			"continuing anyway",
 			file_name, operation_name, strerror(errno_save));
+
 		if (errno_save == EINVAL) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: O_DIRECT is known to result in "
-				"'Invalid argument' on Linux on tmpfs, "
-				"see MySQL Bug#26662\n");
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"O_DIRECT is known to result in 'Invalid "
+				"argument' on Linux on tmpfs, see MySQL "
+				"Bug#26662");
 		}
 	}
-#endif
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
 }
 
 
@@ -1481,15 +1602,15 @@ os_file_set_atomic_writes(
 
 	if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) {
 
-		os_file_handle_error_no_exit(name, "ioctl");
+		os_file_handle_error_no_exit(name, "ioctl", FALSE);
 		return(FALSE);
 	}
 
 	return(TRUE);
 #else
-	fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on "
-		"non-supported platform! Please restart with "
-		"innodb_use_atomic_writes disabled.\n");
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"trying to enable atomic writes on non-supported platform! "
+		"Please restart with innodb_use_atomic_writes disabled.\n");
 	return(FALSE);
 #endif
 }
@@ -1506,14 +1627,7 @@ os_file_create_func(
 /*================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -1524,79 +1638,123 @@ os_file_create_func(
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 {
-#ifdef __WIN__
 	os_file_t	file;
-	DWORD		share_mode	= FILE_SHARE_READ;
-	DWORD		create_flag;
-	DWORD		attributes;
 	ibool		retry;
+	ibool		on_error_no_exit;
+	ibool		on_error_silent;
 
+#ifdef __WIN__
 	DBUG_EXECUTE_IF(
 		"ib_create_table_fail_disk_full",
 		*success = FALSE;
 		SetLastError(ERROR_DISK_FULL);
 		return((os_file_t) -1);
 	);
-try_again:
-	ut_a(name);
+#else /* __WIN__ */
+	DBUG_EXECUTE_IF(
+		"ib_create_table_fail_disk_full",
+		*success = FALSE;
+		errno = ENOSPC;
+		return((os_file_t) -1);
+	);
+#endif /* __WIN__ */
+
+#ifdef __WIN__
+	DWORD		create_flag;
+	DWORD		share_mode	= FILE_SHARE_READ;
+
+	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+		? TRUE : FALSE;
+
+	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+		? TRUE : FALSE;
+
+	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 
 	if (create_mode == OS_FILE_OPEN_RAW) {
+
+		ut_a(!srv_read_only_mode);
+
 		create_flag = OPEN_EXISTING;
-		share_mode = FILE_SHARE_WRITE;
+
+		/* On Windows Physical devices require admin privileges and
+		have to have the write-share mode set. See the remarks
+		section for the CreateFile() function documentation in MSDN. */
+
+		share_mode |= FILE_SHARE_WRITE;
+
 	} else if (create_mode == OS_FILE_OPEN
 		   || create_mode == OS_FILE_OPEN_RETRY) {
+
 		create_flag = OPEN_EXISTING;
+
+	} else if (srv_read_only_mode) {
+
+		create_flag = OPEN_EXISTING;
+
 	} else if (create_mode == OS_FILE_CREATE) {
+
 		create_flag = CREATE_NEW;
+
 	} else if (create_mode == OS_FILE_OVERWRITE) {
+
 		create_flag = CREATE_ALWAYS;
+
 	} else {
-		create_flag = 0;
-		ut_error;
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
+	DWORD		attributes = 0;
+
+#ifdef UNIV_HOTBACKUP
+	attributes |= FILE_FLAG_NO_BUFFERING;
+#else
 	if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
 		/* If specified, use asynchronous (overlapped) io and no
 		buffering of writes in the OS */
-		attributes = 0;
-#ifdef WIN_ASYNC_IO
+
 		if (srv_use_native_aio) {
-			attributes = attributes | FILE_FLAG_OVERLAPPED;
-		}
-#endif
-#ifdef UNIV_NON_BUFFERED_IO
-# ifndef UNIV_HOTBACKUP
-		if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
-			/* Do not use unbuffered i/o to log files because
-			value 2 denotes that we do not flush the log at every
-			commit, but only once per second */
-		} else if (srv_win_file_flush_method
-			   == SRV_WIN_IO_UNBUFFERED) {
-			attributes = attributes | FILE_FLAG_NO_BUFFERING;
+			attributes |= FILE_FLAG_OVERLAPPED;
 		}
-# else /* !UNIV_HOTBACKUP */
-		attributes = attributes | FILE_FLAG_NO_BUFFERING;
-# endif /* !UNIV_HOTBACKUP */
-#endif /* UNIV_NON_BUFFERED_IO */
+#endif /* WIN_ASYNC_IO */
+
 	} else if (purpose == OS_FILE_NORMAL) {
-		attributes = 0;
+		/* Use default setting. */
+	} else {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown purpose flag (%lu) while opening file '%s'",
+			purpose, name);
+
+		return((os_file_t)(-1));
+	}
+
 #ifdef UNIV_NON_BUFFERED_IO
-# ifndef UNIV_HOTBACKUP
-		if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
-			/* Do not use unbuffered i/o to log files because
-			value 2 denotes that we do not flush the log at every
-			commit, but only once per second */
-		} else if (srv_win_file_flush_method
-			   == SRV_WIN_IO_UNBUFFERED) {
-			attributes = attributes | FILE_FLAG_NO_BUFFERING;
-		}
-# else /* !UNIV_HOTBACKUP */
-		attributes = attributes | FILE_FLAG_NO_BUFFERING;
-# endif /* !UNIV_HOTBACKUP */
+	// TODO: Create a bug, this looks wrong. The flush log
+	// parameter is dynamic.
+	if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
+
+		/* Do not use unbuffered i/o for the log files because
+		value 2 denotes that we do not flush the log at every
+		commit, but only once per second */
+
+	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
+
+		attributes |= FILE_FLAG_NO_BUFFERING;
+	}
 #endif /* UNIV_NON_BUFFERED_IO */
-	} else {
-		attributes = 0;
-		ut_error;
+
+#endif /* UNIV_HOTBACKUP */
+	DWORD	access = GENERIC_READ;
+
+	if (!srv_read_only_mode) {
+		access |= GENERIC_WRITE;
 	}
 
 	if (type == OS_LOG_FILE) {
@@ -1609,91 +1767,83 @@ try_again:
 		}
 	}
 
-	file = CreateFile((LPCTSTR) name,
-			  GENERIC_READ | GENERIC_WRITE, /* read and write
-							access */
-			  share_mode,	/* File can be read also by other
-					processes; we must give the read
-					permission because of ibbackup. We do
-					not give the write permission to
-					others because if one would succeed to
-					start 2 instances of mysqld on the
-					SAME files, that could cause severe
-					database corruption! When opening
-					raw disk partitions, Microsoft manuals
-					say that we must give also the write
-					permission. */
-			  NULL,	/* default security attributes */
-			  create_flag,
-			  attributes,
-			  NULL);	/*!< no template file */
+	do {
+		/* Use default security attributes and no template file. */
+		file = CreateFile(
+			(LPCTSTR) name, access, share_mode, NULL,
+			create_flag, attributes, NULL);
 
-	if (file == INVALID_HANDLE_VALUE) {
-		*success = FALSE;
+		if (file == INVALID_HANDLE_VALUE) {
+			const char*	operation;
+
+			operation = (create_mode == OS_FILE_CREATE
+				     && !srv_read_only_mode)
+				? "create" : "open";
 
-		/* When srv_file_per_table is on, file creation failure may not
-		be critical to the whole instance. Do not crash the server in
-		case of unknown errors.
-		Please note "srv_file_per_table" is a global variable with
-		no explicit synchronization protection. It could be
-		changed during this execution path. It might not have the
-		same value as the one when building the table definition */
-		if (srv_file_per_table) {
-			retry = os_file_handle_error_no_exit(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+			*success = FALSE;
+
+			if (on_error_no_exit) {
+				retry = os_file_handle_error_no_exit(
+					name, operation, on_error_silent);
+			} else {
+				retry = os_file_handle_error(name, operation);
+			}
 		} else {
-			retry = os_file_handle_error(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+			*success = TRUE;
+			retry = FALSE;
 		}
 
-		if (retry) {
-			goto try_again;
-		}
-	} else {
-		*success = TRUE;
-		if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
-			ut_a(CreateIoCompletionPort(file, completion_port, 0, 0));
-		}
-	}
+	} while (retry);
 
 	if (srv_use_atomic_writes && type == OS_DATA_FILE &&
-		!os_file_set_atomic_writes(file, name)) {
+		!os_file_set_atomic_writes(name, file)) {
 			 CloseHandle(file);
 			*success = FALSE;
 			file = INVALID_HANDLE_VALUE;
 	}
 
-	return(file);
 #else /* __WIN__ */
-	os_file_t	file;
 	int		create_flag;
-	ibool		retry;
 	const char*	mode_str	= NULL;
 
-	DBUG_EXECUTE_IF(
-		"ib_create_table_fail_disk_full",
-		*success = FALSE;
-		errno = ENOSPC;
-		return((os_file_t) -1);
-	);
-try_again:
-	ut_a(name);
+	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+		? TRUE : FALSE;
+	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+		? TRUE : FALSE;
+
+	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 
-	if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
+	if (create_mode == OS_FILE_OPEN
+	    || create_mode == OS_FILE_OPEN_RAW
 	    || create_mode == OS_FILE_OPEN_RETRY) {
+
+		mode_str = "OPEN";
+
+		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
+
+	} else if (srv_read_only_mode) {
+
 		mode_str = "OPEN";
-		create_flag = O_RDWR;
+
+		create_flag = O_RDONLY;
+
 	} else if (create_mode == OS_FILE_CREATE) {
+
 		mode_str = "CREATE";
 		create_flag = O_RDWR | O_CREAT | O_EXCL;
+
 	} else if (create_mode == OS_FILE_OVERWRITE) {
+
 		mode_str = "OVERWRITE";
 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
 	} else {
-		create_flag = 0;
-		ut_error;
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unknown file create mode (%lu) for file '%s'",
+			create_mode, name);
+
+		return((os_file_t) -1);
 	}
 
 	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
@@ -1703,80 +1853,77 @@ try_again:
 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
 	O_SYNC because the datasync options seemed to corrupt files in 2001
 	in both Linux and Solaris */
-	if (type == OS_LOG_FILE
-	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
 
-# if 0
-		fprintf(stderr, "Using O_SYNC for file %s\n", name);
-# endif
+	if (!srv_read_only_mode
+	    && type == OS_LOG_FILE
+	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
 
-		create_flag = create_flag | O_SYNC;
+		create_flag |= O_SYNC;
 	}
 #endif /* O_SYNC */
 
-	file = open(name, create_flag, os_innodb_umask);
+	do {
+		file = ::open(name, create_flag, os_innodb_umask);
 
-	if (file == -1) {
-		*success = FALSE;
+		if (file == -1) {
+			const char*	operation;
 
-		/* When srv_file_per_table is on, file creation failure may not
-		be critical to the whole instance. Do not crash the server in
-		case of unknown errors.
-		Please note "srv_file_per_table" is a global variable with
-		no explicit synchronization protection. It could be
-		changed during this execution path. It might not have the
-		same value as the one when building the table definition */
-		if (srv_file_per_table) {
-			retry = os_file_handle_error_no_exit(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
-		} else {
-			retry = os_file_handle_error(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
-		}
+			operation = (create_mode == OS_FILE_CREATE
+				     && !srv_read_only_mode)
+				? "create" : "open";
+
+			*success = FALSE;
 
-		if (retry) {
-			goto try_again;
+			if (on_error_no_exit) {
+				retry = os_file_handle_error_no_exit(
+					name, operation, on_error_silent);
+			} else {
+				retry = os_file_handle_error(name, operation);
+			}
 		} else {
-			return(file /* -1 */);
+			*success = TRUE;
+			retry = false;
 		}
-	}
-	/* else */
 
-	*success = TRUE;
+	} while (retry);
 
-	/* We disable OS caching (O_DIRECT) only on data files */
-	if (type != OS_LOG_FILE
-	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
-		
-		os_file_set_nocache(file, name, mode_str);
-	}
+	if (!srv_read_only_mode
+	    && *success
+	    && type != OS_LOG_FILE
+	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
+		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
 
-	/* ALL_O_DIRECT: O_DIRECT also for transaction log file */
-	if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+		os_file_set_nocache(file, name, mode_str);
+	} else if (!srv_read_only_mode
+	    && *success
+	    && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
 		os_file_set_nocache(file, name, mode_str);
 	}
 
 #ifdef USE_FILE_LOCK
-	if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
+	if (!srv_read_only_mode
+	    && *success
+	    && create_mode != OS_FILE_OPEN_RAW
+	    && os_file_lock(file, name)) {
 
 		if (create_mode == OS_FILE_OPEN_RETRY) {
-			int i;
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Retrying to lock"
-			      " the first data file\n",
-			      stderr);
-			for (i = 0; i < 100; i++) {
+
+			ut_a(!srv_read_only_mode);
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Retrying to lock the first data file");
+
+			for (int i = 0; i < 100; i++) {
 				os_thread_sleep(1000000);
+
 				if (!os_file_lock(file, name)) {
 					*success = TRUE;
 					return(file);
 				}
 			}
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Unable to open the first data file\n",
-			      stderr);
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Unable to open the first data file");
 		}
 
 		*success = FALSE;
@@ -1793,54 +1940,55 @@ try_again:
 		file = -1;
 	}
 
-	return(file);
 #endif /* __WIN__ */
+
+	return(file);
 }
 
 /***********************************************************************//**
 Deletes a file if it exists. The file has to be closed before calling this.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
-os_file_delete_if_exists(
-/*=====================*/
-	const char*	name)	/*!< in: file path as a null-terminated string */
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
 {
 #ifdef __WIN__
-	BOOL	ret;
+	bool	ret;
 	ulint	count	= 0;
 loop:
 	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
 	it */
 
-	ret = DeleteFile((LPCTSTR)name);
+	ret = DeleteFile((LPCTSTR) name);
 
 	if (ret) {
-		return(TRUE);
+		return(true);
 	}
 
-	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+	DWORD lasterr = GetLastError();
+	if (lasterr == ERROR_FILE_NOT_FOUND
+	    || lasterr == ERROR_PATH_NOT_FOUND) {
 		/* the file does not exist, this is not an error */
 
-		return(TRUE);
+		return(true);
 	}
 
 	count++;
 
 	if (count > 100 && 0 == (count % 10)) {
-		fprintf(stderr,
-			"InnoDB: Warning: cannot delete file %s\n"
-			"InnoDB: Are you running ibbackup"
-			" to back up the file?\n", name);
+		os_file_get_last_error(true); /* print error information */
 
-		os_file_get_last_error(TRUE); /* print error information */
+		ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
 	}
 
 	os_thread_sleep(1000000);	/* sleep for a second */
 
 	if (count > 2000) {
 
-		return(FALSE);
+		return(false);
 	}
 
 	goto loop;
@@ -1850,23 +1998,24 @@ loop:
 	ret = unlink(name);
 
 	if (ret != 0 && errno != ENOENT) {
-		os_file_handle_error_no_exit(name, "delete");
+		os_file_handle_error_no_exit(name, "delete", FALSE);
 
-		return(FALSE);
+		return(false);
 	}
 
-	return(TRUE);
-#endif
+	return(true);
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
 Deletes a file. The file has to be closed before calling this.
 @return	TRUE if success */
 UNIV_INTERN
-ibool
-os_file_delete(
-/*===========*/
-	const char*	name)	/*!< in: file path as a null-terminated string */
+bool
+os_file_delete_func(
+/*================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
 {
 #ifdef __WIN__
 	BOOL	ret;
@@ -1875,35 +2024,35 @@ loop:
 	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
 	it */
 
-	ret = DeleteFile((LPCTSTR)name);
+	ret = DeleteFile((LPCTSTR) name);
 
 	if (ret) {
-		return(TRUE);
+		return(true);
 	}
 
 	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
 		/* If the file does not exist, we classify this as a 'mild'
 		error and return */
 
-		return(FALSE);
+		return(false);
 	}
 
 	count++;
 
 	if (count > 100 && 0 == (count % 10)) {
+		os_file_get_last_error(true); /* print error information */
+
 		fprintf(stderr,
 			"InnoDB: Warning: cannot delete file %s\n"
 			"InnoDB: Are you running ibbackup"
 			" to back up the file?\n", name);
-
-		os_file_get_last_error(TRUE); /* print error information */
 	}
 
 	os_thread_sleep(1000000);	/* sleep for a second */
 
 	if (count > 2000) {
 
-		return(FALSE);
+		return(false);
 	}
 
 	goto loop;
@@ -1913,12 +2062,12 @@ loop:
 	ret = unlink(name);
 
 	if (ret != 0) {
-		os_file_handle_error_no_exit(name, "delete");
+		os_file_handle_error_no_exit(name, "delete", FALSE);
 
-		return(FALSE);
+		return(false);
 	}
 
-	return(TRUE);
+	return(true);
 #endif
 }
 
@@ -1935,6 +2084,19 @@ os_file_rename_func(
 				string */
 	const char*	newpath)/*!< in: new file path */
 {
+#ifdef UNIV_DEBUG
+	os_file_type_t	type;
+	ibool		exists;
+
+	/* New path must not exist. */
+	ut_ad(os_file_status(newpath, &exists, &type));
+	ut_ad(!exists);
+
+	/* Old path must exist. */
+	ut_ad(os_file_status(oldpath, &exists, &type));
+	ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
 #ifdef __WIN__
 	BOOL	ret;
 
@@ -1944,7 +2106,7 @@ os_file_rename_func(
 		return(TRUE);
 	}
 
-	os_file_handle_error_no_exit(oldpath, "rename");
+	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 
 	return(FALSE);
 #else
@@ -1953,13 +2115,13 @@ os_file_rename_func(
 	ret = rename(oldpath, newpath);
 
 	if (ret != 0) {
-		os_file_handle_error_no_exit(oldpath, "rename");
+		os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 
 		return(FALSE);
 	}
 
 	return(TRUE);
-#endif
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
@@ -1999,7 +2161,7 @@ os_file_close_func(
 	}
 
 	return(TRUE);
-#endif
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
@@ -2034,78 +2196,35 @@ os_file_close_no_error_handling(
 	}
 
 	return(TRUE);
-#endif
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
 Gets a file size.
-@return	TRUE if success */
+@return	file size, or (os_offset_t) -1 on failure */
 UNIV_INTERN
-ibool
+os_offset_t
 os_file_get_size(
 /*=============*/
-	os_file_t	file,	/*!< in: handle to a file */
-	ulint*		size,	/*!< out: least significant 32 bits of file
-				size */
-	ulint*		size_high)/*!< out: most significant 32 bits of size */
+	os_file_t	file)	/*!< in: handle to a file */
 {
 #ifdef __WIN__
-	DWORD	high;
-	DWORD	low;
+	os_offset_t	offset;
+	DWORD		high;
+	DWORD		low;
 
 	low = GetFileSize(file, &high);
 
 	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
-		return(FALSE);
+		return((os_offset_t) -1);
 	}
 
-	*size = low;
-	*size_high = high;
+	offset = (os_offset_t) low | ((os_offset_t) high << 32);
 
-	return(TRUE);
+	return(offset);
 #else
-	off_t	offs;
-
-	offs = lseek(file, 0, SEEK_END);
-
-	if (offs == ((off_t)-1)) {
-
-		return(FALSE);
-	}
-
-	if (sizeof(off_t) > 4) {
-		*size = (ulint)(offs & 0xFFFFFFFFUL);
-		*size_high = (ulint)(offs >> 32);
-	} else {
-		*size = (ulint) offs;
-		*size_high = 0;
-	}
-
-	return(TRUE);
-#endif
-}
-
-/***********************************************************************//**
-Gets file size as a 64-bit integer ib_int64_t.
-@return	size in bytes, -1 if error */
-UNIV_INTERN
-ib_int64_t
-os_file_get_size_as_iblonglong(
-/*===========================*/
-	os_file_t	file)	/*!< in: handle to a file */
-{
-	ulint	size;
-	ulint	size_high;
-	ibool	success;
-
-	success = os_file_get_size(file, &size, &size_high);
-
-	if (!success) {
-
-		return(-1);
-	}
-
-	return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
+	return((os_offset_t) lseek(file, 0, SEEK_END));
+#endif /* __WIN__ */
 }
 
 /***********************************************************************//**
@@ -2118,32 +2237,27 @@ os_file_set_size(
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
-	ulint		size,	/*!< in: least significant 32 bits of file
-				size */
-	ulint		size_high)/*!< in: most significant 32 bits of size */
+	os_offset_t	size)	/*!< in: file size */
 {
-	ib_int64_t	current_size;
-	ib_int64_t	desired_size;
+	os_offset_t	current_size;
 	ibool		ret;
 	byte*		buf;
 	byte*		buf2;
 	ulint		buf_size;
 
-	ut_a(size == (size & 0xFFFFFFFF));
-
 	current_size = 0;
-	desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
 
 #ifdef HAVE_POSIX_FALLOCATE
 	if (srv_use_posix_fallocate) {
 
-		if (posix_fallocate(file, current_size, desired_size) == -1) {
+		if (posix_fallocate(file, current_size, size) == -1) {
 
-			fprintf(stderr, "InnoDB: Error: preallocating file "
+			ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file "
 				"space for file \'%s\' failed.  Current size "
-				"%lld, desired size %lld\n",
-				name, current_size, desired_size);
-			os_file_handle_error_no_exit(name, "posix_fallocate");
+				INT64PF ", desired size " INT64PF "\n",
+				name, current_size, size);
+			os_file_handle_error_no_exit (name, "posix_fallocate",
+						      FALSE);
 			return(FALSE);
 		}
 		return(TRUE);
@@ -2151,59 +2265,56 @@ os_file_set_size(
 #endif
 
 	/* Write up to 1 megabyte at a time. */
-	buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
+	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
 		* UNIV_PAGE_SIZE;
-	buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
 
 	/* Align the buffer for possible raw i/o */
-	buf = ut_align(buf2, UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 	/* Write buffer full of zeros */
 	memset(buf, 0, buf_size);
 
-	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+	if (size >= (os_offset_t) 100 << 20) {
 
 		fprintf(stderr, "InnoDB: Progress in MB:");
 	}
 
-	while (current_size < desired_size) {
+	while (current_size < size) {
 		ulint	n_bytes;
 
-		if (desired_size - current_size < (ib_int64_t) buf_size) {
-			n_bytes = (ulint) (desired_size - current_size);
+		if (size - current_size < (os_offset_t) buf_size) {
+			n_bytes = (ulint) (size - current_size);
 		} else {
 			n_bytes = buf_size;
 		}
 
-		ret = os_file_write(name, file, buf,
-				    (ulint)(current_size & 0xFFFFFFFF),
-				    (ulint)(current_size >> 32),
-				    n_bytes);
+		ret = os_file_write(name, file, buf, current_size, n_bytes);
 		if (!ret) {
 			ut_free(buf2);
 			goto error_handling;
 		}
 
 		/* Print about progress for each 100 MB written */
-		if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
-		    != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
+		if ((current_size + n_bytes) / (100 << 20)
+		    != current_size / (100 << 20)) {
 
 			fprintf(stderr, " %lu00",
 				(ulong) ((current_size + n_bytes)
-					 / (ib_int64_t)(100 * 1024 * 1024)));
+					 / (100 << 20)));
 		}
 
 		current_size += n_bytes;
 	}
 
-	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+	if (size >= (os_offset_t) 100 << 20) {
 
 		fprintf(stderr, "\n");
 	}
 
 	ut_free(buf2);
 
-	ret = os_file_flush(file, TRUE);
+	ret = os_file_flush(file);
 
 	if (ret) {
 		return(TRUE);
@@ -2262,8 +2373,7 @@ static
 int
 os_file_fsync(
 /*==========*/
-	os_file_t	file,	/*!< in: handle to a file */
-	ibool		metadata)
+	os_file_t	file)	/*!< in: handle to a file */
 {
 	int	ret;
 	int	failures;
@@ -2272,16 +2382,7 @@ os_file_fsync(
 	failures = 0;
 
 	do {
-#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
-		if (metadata) {
-			ret = fsync(file);
-		} else {
-			ret = fdatasync(file);
-		}
-#else
-		(void) metadata;
 		ret = fsync(file);
-#endif
 
 		os_n_fsyncs++;
 
@@ -2291,7 +2392,7 @@ os_file_fsync(
 
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
-					"  InnoDB: fsync(): "
+					" InnoDB: fsync(): "
 					"No locks available; retrying\n");
 			}
 
@@ -2321,8 +2422,7 @@ UNIV_INTERN
 ibool
 os_file_flush_func(
 /*===============*/
-	os_file_t	file,	/*!< in, own: handle to a file */
-	ibool		metadata)
+	os_file_t	file)	/*!< in, own: handle to a file */
 {
 #ifdef __WIN__
 	BOOL	ret;
@@ -2372,18 +2472,18 @@ os_file_flush_func(
 		/* If we are not on an operating system that supports this,
 		then fall back to a plain fsync. */
 
-		ret = os_file_fsync(file, metadata);
+		ret = os_file_fsync(file);
 	} else {
 		ret = fcntl(file, F_FULLFSYNC, NULL);
 
 		if (ret) {
 			/* If we are not on a file system that supports this,
 			then fall back to a plain fsync. */
-			ret = os_file_fsync(file, metadata);
+			ret = os_file_fsync(file);
 		}
 	}
 #else
-	ret = os_file_fsync(file, metadata);
+	ret = os_file_fsync(file);
 #endif
 
 	if (ret == 0) {
@@ -2398,10 +2498,7 @@ os_file_flush_func(
 		return(TRUE);
 	}
 
-	ut_print_timestamp(stderr);
-
-	fprintf(stderr,
-		"  InnoDB: Error: the OS said file flush did not succeed\n");
+	ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
 
 	os_file_handle_error(NULL, "flush");
 
@@ -2417,17 +2514,14 @@ os_file_flush_func(
 /*******************************************************************//**
 Does a synchronous read operation in Posix.
 @return	number of bytes read, -1 if error */
-static
+static __attribute__((nonnull(2), warn_unused_result))
 ssize_t
 os_file_pread(
 /*==========*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
 	ulint		n,	/*!< in: number of bytes to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset from where to read */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset from where to read */
 	trx_t*		trx)
 {
 	off_t	offs;
@@ -2440,20 +2534,16 @@ os_file_pread(
 	ib_uint64_t	start_time;
 	ib_uint64_t	finish_time;
 
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
+	ut_ad(n);
 
 	/* If off_t is > 4 bytes in size, then we assume we can pass a
 	64-bit address */
+	offs = (off_t) offset;
 
-	if (sizeof(off_t) > 4) {
-		offs = (off_t)offset + (((off_t)offset_high) << 32);
-
-	} else {
-		offs = (off_t)offset;
-
-		if (offset_high > 0) {
-			fprintf(stderr,
-				"InnoDB: Error: file read at offset > 4 GB\n");
+	if (sizeof(off_t) <= 4) {
+		if (offset != (os_offset_t) offs) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"File read at offset > 4 GB");
 		}
 	}
 
@@ -2469,10 +2559,17 @@ os_file_pread(
 		start_time = 0;
 	}
 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+	(void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
+	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_preads++;
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
 
 	/* Handle signal interruptions correctly */
 	for (n_bytes = 0; n_bytes < (ssize_t) n; ) {
@@ -2487,10 +2584,17 @@ os_file_pread(
 		}
 	}
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+	(void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
+	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
+#else
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_preads--;
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
 
 	if (UNIV_UNLIKELY(start_time != 0))
 	{
@@ -2509,10 +2613,15 @@ os_file_pread(
 		ulint	i;
 #endif /* !UNIV_HOTBACKUP */
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+		(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads++;
+		MONITOR_INC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
-
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
 #ifndef UNIV_HOTBACKUP
 		/* Protect the seek / read operation with a mutex */
 		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
@@ -2542,9 +2651,15 @@ os_file_pread(
 		os_mutex_exit(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+		(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
+#else
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads--;
+		MONITOR_DEC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
 
 		if (UNIV_UNLIKELY(start_time != 0)
 		{
@@ -2561,46 +2676,47 @@ os_file_pread(
 /*******************************************************************//**
 Does a synchronous write operation in Posix.
 @return	number of bytes written, -1 if error */
-static
+static __attribute__((nonnull, warn_unused_result))
 ssize_t
 os_file_pwrite(
 /*===========*/
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from where to write */
 	ulint		n,	/*!< in: number of bytes to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high) /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset)	/*!< in: file offset where to write */
 {
 	ssize_t	ret;
 	ssize_t n_written;
 	off_t	offs;
 
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
+	ut_ad(n);
+	ut_ad(!srv_read_only_mode);
 
 	/* If off_t is > 4 bytes in size, then we assume we can pass a
 	64-bit address */
+	offs = (off_t) offset;
 
-	if (sizeof(off_t) > 4) {
-		offs = (off_t)offset + (((off_t)offset_high) << 32);
-	} else {
-		offs = (off_t)offset;
-
-		if (offset_high > 0) {
-			fprintf(stderr,
-				"InnoDB: Error: file write"
-				" at offset > 4 GB\n");
+	if (sizeof(off_t) <= 4) {
+		if (offset != (os_offset_t) offs) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"File write at offset > 4 GB.");
 		}
 	}
 
 	os_n_file_writes++;
 
 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_pwrites++;
 	os_n_pending_writes++;
+	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
+#else
+	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
 
 	/* Handle signal interruptions correctly */
 	for (ret = 0; ret < (ssize_t) n; ) {
@@ -2615,23 +2731,17 @@ os_file_pwrite(
 		}
 	}
 
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_pwrites--;
 	os_n_pending_writes--;
+	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
-
-# ifdef UNIV_DO_FLUSH
-	if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
-	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
-	    && !os_do_not_call_flush_at_each_write) {
-
-		/* Always do fsync to reduce the probability that when
-		the OS crashes, a database page is only partially
-		physically written to disk. */
-
-		ut_a(TRUE == os_file_flush(file, TRUE));
-	}
-# endif /* UNIV_DO_FLUSH */
+#else
+	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
 
 	return(ret);
 #else
@@ -2643,6 +2753,7 @@ os_file_pwrite(
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_writes++;
+		MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 		os_mutex_exit(os_file_count_mutex);
 
 # ifndef UNIV_HOTBACKUP
@@ -2672,19 +2783,6 @@ os_file_pwrite(
 			}
 		}
 
-# ifdef UNIV_DO_FLUSH
-		if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
-		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
-		    && !os_do_not_call_flush_at_each_write) {
-
-			/* Always do fsync to reduce the probability that when
-			the OS crashes, a database page is only partially
-			physically written to disk. */
-
-			ut_a(TRUE == os_file_flush(file, TRUE));
-		}
-# endif /* UNIV_DO_FLUSH */
-
 func_exit:
 # ifndef UNIV_HOTBACKUP
 		os_mutex_exit(os_file_seek_mutexes[i]);
@@ -2692,11 +2790,12 @@ func_exit:
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_writes--;
+		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 		os_mutex_exit(os_file_count_mutex);
 
 		return(ret);
 	}
-#endif
+#endif /* HAVE_PWRITE && !HAVE_BROKEN_PREAD */
 }
 #endif
 
@@ -2711,10 +2810,7 @@ os_file_read_func(
 /*==============*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	trx_t*		trx)
 {
@@ -2727,7 +2823,6 @@ os_file_read_func(
 
 	/* On 64-bit Windows, ulint is 64 bits. But n should be no more than
 	32 bits; offset is a full 64-bit os_offset_t. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_reads++;
@@ -2740,11 +2835,12 @@ try_again:
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	memset (&overlapped, 0, sizeof (overlapped));
-	overlapped.Offset = (DWORD)offset;
-	overlapped.OffsetHigh = (DWORD)offset_high;
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 	overlapped.hEvent = win_get_syncio_event();
 	ret = ReadFile(file, buf, n, NULL, &overlapped);
 	if (ret) {
@@ -2752,9 +2848,10 @@ try_again:
 	}
 	else if(GetLastError() == ERROR_IO_PENDING) {
 		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
-  }
+        }
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2767,18 +2864,16 @@ try_again:
 	os_bytes_read_since_printout += n;
 
 try_again:
-	ret = os_file_pread(file, buf, n, offset, offset_high, trx);
+	ret = os_file_pread(file, buf, n, offset, trx);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
 
-	fprintf(stderr,
-		"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
-		"InnoDB: Was only able to read %ld.\n",
-		(ulong)n, (ulong)offset_high,
-		(ulong)offset, (long)ret);
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"Tried to read "ULINTPF" bytes at offset " UINT64PF". "
+		"Was only able to read %ld.", n, offset, (lint) ret);
 #endif /* __WIN__ */
 	retry = os_file_handle_error(NULL, "read");
 
@@ -2793,7 +2888,7 @@ try_again:
 		(ulong) GetLastError()
 #else
 		(ulong) errno
-#endif
+#endif /* __WIN__ */
 		);
 	fflush(stderr);
 
@@ -2814,10 +2909,7 @@ os_file_read_no_error_handling_func(
 /*================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n)	/*!< in: number of bytes to read */
 {
 #ifdef __WIN__
@@ -2825,13 +2917,12 @@ os_file_read_no_error_handling_func(
 	DWORD		len;
 	ibool		retry;
 	OVERLAPPED overlapped;
-	overlapped.Offset = (DWORD)offset;
-	overlapped.OffsetHigh = (DWORD)offset_high;
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 
 
 	/* On 64-bit Windows, ulint is 64 bits. But n should be no more than
 	32 bits; offset is a full 64-bit os_offset_t. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_reads++;
@@ -2844,11 +2935,12 @@ try_again:
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	memset (&overlapped, 0, sizeof (overlapped));
-	overlapped.Offset = (DWORD)offset;
-	overlapped.OffsetHigh = (DWORD)offset_high;
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 	overlapped.hEvent = win_get_syncio_event();
 	ret = ReadFile(file, buf, n, NULL, &overlapped);
 	if (ret) {
@@ -2856,9 +2948,10 @@ try_again:
 	}
 	else if(GetLastError() == ERROR_IO_PENDING) {
 		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
-  }
+	}
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2871,14 +2964,14 @@ try_again:
 	os_bytes_read_since_printout += n;
 
 try_again:
-	ret = os_file_pread(file, buf, n, offset, offset_high, NULL);
+	ret = os_file_pread(file, buf, n, offset, NULL);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
 #endif /* __WIN__ */
-	retry = os_file_handle_error_no_exit(NULL, "read");
+	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
 
 	if (retry) {
 		goto try_again;
@@ -2923,12 +3016,11 @@ os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n)	/*!< in: number of bytes to write */
 {
+	ut_ad(!srv_read_only_mode);
+
 #ifdef __WIN__
 	BOOL		ret;
 	DWORD		len;
@@ -2938,7 +3030,6 @@ os_file_write_func(
 
 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 	no more than 32 bits. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_writes++;
@@ -2950,11 +3041,12 @@ retry:
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_writes++;
+	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
 
 	memset (&overlapped, 0, sizeof (overlapped));
-	overlapped.Offset = (DWORD)offset;
-	overlapped.OffsetHigh = (DWORD)offset_high;
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 
 	overlapped.hEvent = win_get_syncio_event();
 	ret = WriteFile(file, buf, n, NULL, &overlapped);
@@ -2965,14 +3057,9 @@ retry:
 		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
 	}
 
-# ifdef UNIV_DO_FLUSH
-	if (!os_do_not_call_flush_at_each_write) {
-		ut_a(TRUE == os_file_flush(file));
-	}
-# endif /* UNIV_DO_FLUSH */
-
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_writes--;
+	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2995,13 +3082,13 @@ retry:
 
 	if (!os_has_said_disk_full) {
 
-		err = (ulint)GetLastError();
+		err = (ulint) GetLastError();
 
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Error: Write to file %s failed"
-			" at offset %lu %lu.\n"
+			" InnoDB: Error: Write to file %s failed"
+			" at offset %llu.\n"
 			"InnoDB: %lu bytes should have been written,"
 			" only %lu were written.\n"
 			"InnoDB: Operating system error number %lu.\n"
@@ -3009,13 +3096,13 @@ retry:
 			" support files of this size.\n"
 			"InnoDB: Check also that the disk is not full"
 			" or a disk quota exceeded.\n",
-			name, (ulong) offset_high, (ulong) offset,
-			(ulong) n, ret ? len : 0, (ulong) err);
+			name, offset, (ulong) n, /* len valid only if ret */
+			ret ? (ulong) len : 0UL, (ulong) err);
 
-		if (strerror((int)err) != NULL) {
+		if (strerror((int) err) != NULL) {
 			fprintf(stderr,
 				"InnoDB: Error number %lu means '%s'.\n",
-				(ulong) err, strerror((int)err));
+				(ulong) err, strerror((int) err));
 		}
 
 		fprintf(stderr,
@@ -3031,9 +3118,9 @@ retry:
 #else
 	ssize_t	ret;
 
-	ret = os_file_pwrite(file, buf, n, offset, offset_high);
+	ret = os_file_pwrite(file, buf, n, offset);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
@@ -3043,8 +3130,8 @@ retry:
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Error: Write to file %s failed"
-			" at offset %lu %lu.\n"
+			" InnoDB: Error: Write to file %s failed"
+			" at offset "UINT64PF".\n"
 			"InnoDB: %lu bytes should have been written,"
 			" only %ld were written.\n"
 			"InnoDB: Operating system error number %lu.\n"
@@ -3052,12 +3139,12 @@ retry:
 			" support files of this size.\n"
 			"InnoDB: Check also that the disk is not full"
 			" or a disk quota exceeded.\n",
-			name, offset_high, offset, n, (long int)ret,
-			(ulint)errno);
+			name, offset, n, (lint) ret,
+			(ulint) errno);
 		if (strerror(errno) != NULL) {
 			fprintf(stderr,
-				"InnoDB: Error number %lu means '%s'.\n",
-				(ulint)errno, strerror(errno));
+				"InnoDB: Error number %d means '%s'.\n",
+				errno, strerror(errno));
 		}
 
 		fprintf(stderr,
@@ -3080,15 +3167,15 @@ UNIV_INTERN
 ibool
 os_file_status(
 /*===========*/
-	const char*	path,	/*!< in:	pathname of the file */
+	const char*	path,	/*!< in: pathname of the file */
 	ibool*		exists,	/*!< out: TRUE if file exists */
 	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
 {
 #ifdef __WIN__
 	int		ret;
-	struct _stat	statinfo;
+	struct _stat64	statinfo;
 
-	ret = _stat(path, &statinfo);
+	ret = _stat64(path, &statinfo);
 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 		/* file does not exist */
 		*exists = FALSE;
@@ -3096,7 +3183,7 @@ os_file_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -3124,7 +3211,7 @@ os_file_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -3147,47 +3234,73 @@ os_file_status(
 
 /*******************************************************************//**
 This function returns information about the specified file
-@return	TRUE if stat information found */
+@return	DB_SUCCESS if all OK */
 UNIV_INTERN
-ibool
+dberr_t
 os_file_get_status(
 /*===============*/
 	const char*	path,		/*!< in:	pathname of the file */
-	os_file_stat_t* stat_info)	/*!< information of a file in a
+	os_file_stat_t* stat_info,	/*!< information of a file in a
 					directory */
+	bool		check_rw_perm)	/*!< in: for testing whether the
+					file can be opened in RW mode */
 {
-#ifdef __WIN__
 	int		ret;
-	struct _stat	statinfo;
 
-	ret = _stat(path, &statinfo);
+#ifdef __WIN__
+	struct _stat64	statinfo;
+
+	ret = _stat64(path, &statinfo);
+
 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 		/* file does not exist */
 
-		return(FALSE);
+		return(DB_NOT_FOUND);
+
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
-		return(FALSE);
-	}
-	if (_S_IFDIR & statinfo.st_mode) {
+		return(DB_FAIL);
+
+	} else if (_S_IFDIR & statinfo.st_mode) {
 		stat_info->type = OS_FILE_TYPE_DIR;
 	} else if (_S_IFREG & statinfo.st_mode) {
+
+		DWORD	access = GENERIC_READ;
+
+		if (!srv_read_only_mode) {
+			access |= GENERIC_WRITE;
+		}
+
 		stat_info->type = OS_FILE_TYPE_FILE;
+
+		/* Check if we can open it with the access mode chosen above. */
+
+		if (check_rw_perm) {
+			HANDLE	fh;
+
+			fh = CreateFile(
+				(LPCTSTR) path,		// File to open
+				access,
+				0,			// No sharing
+				NULL,			// Default security
+				OPEN_EXISTING,		// Existing file only
+				FILE_ATTRIBUTE_NORMAL,	// Normal file
+				NULL);			// No attr. template
+
+			if (fh == INVALID_HANDLE_VALUE) {
+				stat_info->rw_perm = false;
+			} else {
+				stat_info->rw_perm = true;
+				CloseHandle(fh);
+			}
+		}
 	} else {
 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 	}
-
-	stat_info->ctime = statinfo.st_ctime;
-	stat_info->atime = statinfo.st_atime;
-	stat_info->mtime = statinfo.st_mtime;
-	stat_info->size	 = statinfo.st_size;
-
-	return(TRUE);
 #else
-	int		ret;
 	struct stat	statinfo;
 
 	ret = stat(path, &statinfo);
@@ -3195,32 +3308,49 @@ os_file_get_status(
 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 		/* file does not exist */
 
-		return(FALSE);
+		return(DB_NOT_FOUND);
+
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
-		return(FALSE);
-	}
+		return(DB_FAIL);
 
-	if (S_ISDIR(statinfo.st_mode)) {
+	} else if (S_ISDIR(statinfo.st_mode)) {
 		stat_info->type = OS_FILE_TYPE_DIR;
 	} else if (S_ISLNK(statinfo.st_mode)) {
 		stat_info->type = OS_FILE_TYPE_LINK;
 	} else if (S_ISREG(statinfo.st_mode)) {
 		stat_info->type = OS_FILE_TYPE_FILE;
+
+		if (check_rw_perm) {
+			int	fh;
+			int	access;
+
+			access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
+
+			fh = ::open(path, access, os_innodb_umask);
+
+			if (fh == -1) {
+				stat_info->rw_perm = false;
+			} else {
+				stat_info->rw_perm = true;
+				close(fh);
+			}
+		}
 	} else {
 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 	}
 
+#endif /* __WIN__ */
+
 	stat_info->ctime = statinfo.st_ctime;
 	stat_info->atime = statinfo.st_atime;
 	stat_info->mtime = statinfo.st_mtime;
-	stat_info->size	 = statinfo.st_size;
+	stat_info->size  = statinfo.st_size;
 
-	return(TRUE);
-#endif
+	return(DB_SUCCESS);
 }
 
 /* path name separator character */
@@ -3231,18 +3361,165 @@ os_file_get_status(
 #endif
 
 /****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename.  The old_path is a full path
+name including the extension.  The tablename is in the normal
+form "databasename/tablename".  The new base name is found after
+the forward slash.  Both input strings are null terminated.
+
+This function allocates memory to be returned.  It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return	own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+	const char*	old_path,	/*!< in: pathname */
+	const char*	tablename)	/*!< in: contains new base name */
+{
+	ulint		dir_len;
+	char*		last_slash;
+	char*		base_name;
+	char*		new_path;
+	ulint		new_path_len;
+
+	/* Split the tablename into its database and table name components.
+	They are separated by a '/'. */
+	last_slash = strrchr((char*) tablename, '/');
+	base_name = last_slash ? last_slash + 1 : (char*) tablename;
+
+	/* Find the offset of the last slash. We will strip off the
+	old basename.ibd which starts after that slash. */
+	last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
+	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
+
+	/* allocate a new path and move the old directory path to it. */
+	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
+	new_path = static_cast<char*>(mem_alloc(new_path_len));
+	memcpy(new_path, old_path, dir_len);
+
+	ut_snprintf(new_path + dir_len,
+		    new_path_len - dir_len,
+		    "%c%s.ibd",
+		    OS_FILE_PATH_SEPARATOR,
+		    base_name);
+
+	return(new_path);
+}
+
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'.  It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided.  The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+This function allocates memory to be returned.  It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+	const char*	data_dir_path,	/*!< in: pathname */
+	const char*	tablename,	/*!< in: tablename */
+	const char*	extention)	/*!< in: file extension; ibd,cfg */
+{
+	ulint		data_dir_len;
+	char*		last_slash;
+	char*		new_path;
+	ulint		new_path_len;
+
+	ut_ad(extention && strlen(extention) == 3);
+
+	/* Find the offset of the last slash. We will strip off the
+	old basename or tablename which starts after that slash. */
+	last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+	data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
+
+	/* allocate a new path and move the old directory path to it. */
+	new_path_len = data_dir_len + strlen(tablename)
+		       + sizeof "/." + strlen(extention);
+	new_path = static_cast<char*>(mem_alloc(new_path_len));
+	memcpy(new_path, data_dir_path, data_dir_len);
+	ut_snprintf(new_path + data_dir_len,
+		    new_path_len - data_dir_len,
+		    "%c%s.%s",
+		    OS_FILE_PATH_SEPARATOR,
+		    tablename,
+		    extention);
+
+	srv_normalize_path_for_win(new_path);
+
+	return(new_path);
+}
+
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return.  The result is used
+to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
+{
+	char*	ptr;
+	char*	tablename;
+	ulint	tablename_len;
+
+	/* Replace the period before the extension with a null byte. */
+	ptr = strrchr((char*) data_dir_path, '.');
+	if (!ptr) {
+		return;
+	}
+	ptr[0] = '\0';
+
+	/* The tablename starts after the last slash. */
+	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+	if (!ptr) {
+		return;
+	}
+	ptr[0] = '\0';
+	tablename = ptr + 1;
+
+	/* The databasename starts after the next to last slash. */
+	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+	if (!ptr) {
+		return;
+	}
+	tablename_len = ut_strlen(tablename);
+
+	ut_memmove(++ptr, tablename, tablename_len);
+
+	ptr[tablename_len] = '\0';
+}
+
+/****************************************************************//**
 The function os_file_dirname returns a directory component of a
-null-terminated pathname string.  In the usual case, dirname returns
+null-terminated pathname string. In the usual case, dirname returns
 the string up to, but not including, the final '/', and basename
-is the component following the final '/'.  Trailing '/' charac�
-ters are not counted as part of the pathname.
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
 
 If path does not contain a slash, dirname returns the string ".".
 
 Concatenating the string returned by dirname, a "/", and the basename
 yields a complete pathname.
 
-The return value is  a copy of the directory component of the pathname.
+The return value is a copy of the directory component of the pathname.
 The copy is allocated from heap. It is the caller responsibility
 to free it after it is no longer needed.
 
@@ -3294,11 +3571,18 @@ os_file_create_subdirs_if_needed(
 /*=============================*/
 	const char*	path)	/*!< in: path name */
 {
-	char*		subdir;
-	ibool		success, subdir_exists;
-	os_file_type_t	type;
+	if (srv_read_only_mode) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"read only mode set. Can't create subdirectories '%s'",
+			path);
+
+		return(FALSE);
+
+	}
+
+	char*	subdir = os_file_dirname(path);
 
-	subdir = os_file_dirname(path);
 	if (strlen(subdir) == 1
 	    && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
 		/* subdir is root or cwd, nothing to do */
@@ -3308,15 +3592,21 @@ os_file_create_subdirs_if_needed(
 	}
 
 	/* Test if subdir exists */
-	success = os_file_status(subdir, &subdir_exists, &type);
+	os_file_type_t	type;
+	ibool	subdir_exists;
+	ibool	success = os_file_status(subdir, &subdir_exists, &type);
+
 	if (success && !subdir_exists) {
+
 		/* subdir does not exist, create it */
 		success = os_file_create_subdirs_if_needed(subdir);
+
 		if (!success) {
 			mem_free(subdir);
 
 			return(FALSE);
 		}
+
 		success = os_file_create_directory(subdir, FALSE);
 	}
 
@@ -3338,7 +3628,7 @@ os_aio_array_get_nth_slot(
 {
 	ut_a(index < array->n_slots);
 
-	return((array->slots) + index);
+	return(&array->slots[index]);
 }
 
 #if defined(LINUX_NATIVE_AIO)
@@ -3380,7 +3670,7 @@ retry:
 			/* First time around. */
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: Warning: io_setup() failed"
+				" InnoDB: Warning: io_setup() failed"
 				" with EAGAIN. Will make %d attempts"
 				" before giving up.\n",
 				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
@@ -3399,7 +3689,7 @@ retry:
 		/* Have tried enough. Better call it a day. */
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: io_setup() failed"
+			" InnoDB: Error: io_setup() failed"
 			" with EAGAIN after %d attempts.\n",
 			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 		break;
@@ -3407,7 +3697,7 @@ retry:
 	case -ENOSYS:
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: Linux Native AIO interface"
+			" InnoDB: Error: Linux Native AIO interface"
 			" is not supported on this platform. Please"
 			" check your OS documentation and install"
 			" appropriate binary of InnoDB.\n");
@@ -3417,7 +3707,7 @@ retry:
 	default:
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: Linux Native AIO setup"
+			" InnoDB: Error: Linux Native AIO setup"
 			" returned following error[%d]\n", -ret);
 		break;
 	}
@@ -3440,43 +3730,74 @@ os_aio_native_aio_supported(void)
 /*=============================*/
 {
 	int			fd;
-	byte*			buf;
-	byte*			ptr;
-	struct io_event		io_event;
 	io_context_t		io_ctx;
-	struct iocb		iocb;
-	struct iocb*		p_iocb;
-	int			err;
+	char			name[1000];
 
 	if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
 		/* The platform does not support native aio. */
 		return(FALSE);
-	}
+	} else if (!srv_read_only_mode) {
+		/* Now check if tmpdir supports native aio ops. */
+		fd = innobase_mysql_tmpfile();
 
-	/* Now check if tmpdir supports native aio ops. */
-	fd = innobase_mysql_tmpfile();
+		if (fd < 0) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Unable to create temp file to check "
+				"native AIO support.");
 
-	if (fd < 0) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Error: unable to create "
-			"temp file to check native AIO support.\n");
+			return(FALSE);
+		}
+	} else {
 
-		return(FALSE);
+		srv_normalize_path_for_win(srv_log_group_home_dir);
+
+		ulint	dirnamelen = strlen(srv_log_group_home_dir);
+		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
+		memcpy(name, srv_log_group_home_dir, dirnamelen);
+
+		/* Add a path separator if needed. */
+		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+			name[dirnamelen++] = SRV_PATH_SEPARATOR;
+		}
+
+		strcpy(name + dirnamelen, "ib_logfile0");
+
+		fd = ::open(name, O_RDONLY);
+
+		if (fd == -1) {
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Unable to open \"%s\" to check "
+				"native AIO read support.", name);
+
+			return(FALSE);
+		}
 	}
 
+	struct io_event	io_event;
+
 	memset(&io_event, 0x0, sizeof(io_event));
 
-	buf = (byte*) ut_malloc(UNIV_PAGE_SIZE * 2);
-	ptr = (byte*) ut_align(buf, UNIV_PAGE_SIZE);
+	byte*	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
+	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+	struct iocb	iocb;
 
 	/* Suppress valgrind warning. */
 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
-
 	memset(&iocb, 0x0, sizeof(iocb));
-	p_iocb = &iocb;
-	io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
 
-	err = io_submit(io_ctx, 1, &p_iocb);
+	struct iocb*	p_iocb = &iocb;
+
+	if (!srv_read_only_mode) {
+		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
+	} else {
+		ut_a(UNIV_PAGE_SIZE >= 512);
+		io_prep_pread(p_iocb, fd, ptr, 512, 0);
+	}
+
+	int	err = io_submit(io_ctx, 1, &p_iocb);
+
 	if (err >= 1) {
 		/* Now collect the submitted IO request. */
 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
@@ -3491,22 +3812,18 @@ os_aio_native_aio_supported(void)
 
 	case -EINVAL:
 	case -ENOSYS:
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: Linux Native AIO is not"
-			" supported on tmpdir.\n"
-			"InnoDB: You can either move tmpdir to a"
-			" file system that supports native AIO\n"
-			"InnoDB: or you can set"
-			" innodb_use_native_aio to FALSE to avoid"
-			" this message.\n");
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Linux Native AIO not supported. You can either "
+			"move %s to a file system that supports native "
+			"AIO or you can set innodb_use_native_aio to "
+			"FALSE to avoid this message.",
+			srv_read_only_mode ? name : "tmpdir");
 
 		/* fall through. */
 	default:
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: Linux Native AIO check"
-			" on tmpdir returned error[%d]\n", -err);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Linux Native AIO check on %s returned error[%d]",
+			srv_read_only_mode ? name : "tmpdir", -err);
 	}
 
 	return(FALSE);
@@ -3528,27 +3845,28 @@ os_aio_array_create(
 	ulint	n_segments)	/*!< in: number of segments in the aio array */
 {
 	os_aio_array_t*	array;
-	ulint		i;
-	os_aio_slot_t*	slot;
 #ifdef LINUX_NATIVE_AIO
 	struct io_event*	io_event = NULL;
 #endif
 	ut_a(n > 0);
 	ut_a(n_segments > 0);
 
-	array = ut_malloc(sizeof(os_aio_array_t));
+	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
+	memset(array, 0x0, sizeof(*array));
 
-	array->mutex		= os_mutex_create();
-	array->not_full		= os_event_create(NULL);
-	array->is_empty		= os_event_create(NULL);
+	array->mutex = os_mutex_create();
+	array->not_full = os_event_create();
+	array->is_empty = os_event_create();
 
 	os_event_set(array->is_empty);
 
-	array->n_slots		= n;
-	array->n_segments	= n_segments;
-	array->n_reserved	= 0;
-	array->cur_seg		= 0;
-	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
+	array->n_slots = n;
+	array->n_segments = n_segments;
+
+	array->slots = static_cast<os_aio_slot_t*>(
+		ut_malloc(n * sizeof(*array->slots)));
+	/* Zero the whole array: n slots of sizeof(*array->slots) bytes. */
+	memset(array->slots, 0x0, n * sizeof(*array->slots));
 
 #if defined(LINUX_NATIVE_AIO)
 	array->aio_ctx = NULL;
@@ -3563,11 +3881,12 @@ os_aio_array_create(
 	/* Initialize the io_context array. One io_context
 	per segment in the array. */
 
-	array->aio_ctx = ut_malloc(n_segments *
-				   sizeof(*array->aio_ctx));
-	for (i = 0; i < n_segments; ++i) {
+	array->aio_ctx = static_cast<io_context**>(
+		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
+	for (ulint i = 0; i < n_segments; ++i) {
 		if (!os_aio_linux_create_io_ctx(n/n_segments,
-					   &array->aio_ctx[i])) {
+						&array->aio_ctx[i])) {
 			/* If something bad happened during aio setup
 			we disable linux native aio.
                         The disadvantage will be a small memory leak
@@ -3590,22 +3909,25 @@ os_aio_array_create(
 	}
 
 	/* Initialize the event array. One event per slot. */
-	io_event = ut_malloc(n * sizeof(*io_event));
+	io_event = static_cast<struct io_event*>(
+		ut_malloc(n * sizeof(*io_event)));
+
 	memset(io_event, 0x0, sizeof(*io_event) * n);
 	array->aio_events = io_event;
 
 skip_native_aio:
 #endif /* LINUX_NATIVE_AIO */
-	for (i = 0; i < n; i++) {
+	for (ulint i = 0; i < n; i++) {
+		os_aio_slot_t*	slot;
+
 		slot = os_aio_array_get_nth_slot(array, i);
 		slot->pos = i;
 		slot->reserved = FALSE;
 #ifdef LINUX_NATIVE_AIO
-
 		memset(&slot->control, 0x0, sizeof(slot->control));
 		slot->n_bytes = 0;
 		slot->ret = 0;
-#endif
+#endif /* LINUX_NATIVE_AIO */
 	}
 
 	return(array);
@@ -3617,7 +3939,7 @@ static
 void
 os_aio_array_free(
 /*==============*/
-	os_aio_array_t*	array)	/*!< in, own: array to free */
+	os_aio_array_t*& array)	/*!< in, own: array to free */
 {
 	os_mutex_free(array->mutex);
 	os_event_free(array->not_full);
@@ -3632,6 +3954,8 @@ os_aio_array_free(
 
 	ut_free(array->slots);
 	ut_free(array);
+
+	array = 0;
 }
 
 /***********************************************************************
@@ -3652,86 +3976,97 @@ os_aio_init(
 	ulint	n_slots_sync)	/*<! in: number of slots in the sync aio
 				array */
 {
-	ulint	i;
-	ulint 	n_segments = 2 + n_read_segs + n_write_segs;
-
-	ut_ad(n_segments >= 4);
-
 	os_io_init_simple();
 
 #if defined(LINUX_NATIVE_AIO)
 	/* Check if native aio is supported on this system and tmpfs */
-	if (srv_use_native_aio
-	    && !os_aio_native_aio_supported()) {
+	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
+
+		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
 
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Warning: Linux Native AIO"
-			" disabled.\n");
 		srv_use_native_aio = FALSE;
 	}
 #endif /* LINUX_NATIVE_AIO */
 
-	for (i = 0; i < n_segments; i++) {
-		srv_set_io_thread_op_info(i, "not started yet");
-	}
-
+	srv_reset_io_thread_op_info();
 
-	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+	os_aio_read_array = os_aio_array_create(
+		n_read_segs * n_per_seg, n_read_segs);
 
-	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
-	if (os_aio_ibuf_array == NULL) {
-		goto err_exit;
+	if (os_aio_read_array == NULL) {
+		return(FALSE);
 	}
 
-	srv_io_thread_function[0] = "insert buffer thread";
+	ulint	start = (srv_read_only_mode) ? 0 : 2;
+	ulint	n_segs = n_read_segs + start;
 
-	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
-	if (os_aio_log_array == NULL) {
-		goto err_exit;
+	/* 0 is the insert buffer segment and 1 is the log segment. */
+	for (ulint i = start; i < n_segs; ++i) {
+		ut_a(i < SRV_MAX_N_IO_THREADS);
+		srv_io_thread_function[i] = "read thread";
 	}
 
-	srv_io_thread_function[1] = "log thread";
+	ulint	n_segments = n_read_segs;
 
-	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
-						n_read_segs);
-	if (os_aio_read_array == NULL) {
-		goto err_exit;
-	}
+	if (!srv_read_only_mode) {
 
-	for (i = 2; i < 2 + n_read_segs; i++) {
-		ut_a(i < SRV_MAX_N_IO_THREADS);
-		srv_io_thread_function[i] = "read thread";
-	}
+		os_aio_log_array = os_aio_array_create(n_per_seg, 1);
 
-	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
-						 n_write_segs);
-	if (os_aio_write_array == NULL) {
-		goto err_exit;
-	}
+		if (os_aio_log_array == NULL) {
+			return(FALSE);
+		}
 
-	for (i = 2 + n_read_segs; i < n_segments; i++) {
-		ut_a(i < SRV_MAX_N_IO_THREADS);
-		srv_io_thread_function[i] = "write thread";
+		++n_segments;
+
+		srv_io_thread_function[1] = "log thread";
+
+		os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+		if (os_aio_ibuf_array == NULL) {
+			return(FALSE);
+		}
+
+		++n_segments;
+
+		srv_io_thread_function[0] = "insert buffer thread";
+
+		os_aio_write_array = os_aio_array_create(
+			n_write_segs * n_per_seg, n_write_segs);
+
+		if (os_aio_write_array == NULL) {
+			return(FALSE);
+		}
+
+		n_segments += n_write_segs;
+
+		for (ulint i = start + n_read_segs; i < n_segments; ++i) {
+			ut_a(i < SRV_MAX_N_IO_THREADS);
+			srv_io_thread_function[i] = "write thread";
+		}
+
+		ut_ad(n_segments >= 4);
+	} else {
+		ut_ad(n_segments > 0);
 	}
 
 	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
 	if (os_aio_sync_array == NULL) {
-		goto err_exit;
+		return(FALSE);
 	}
 
-
 	os_aio_n_segments = n_segments;
 
 	os_aio_validate();
 
-	os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
+	os_aio_segment_wait_events = static_cast<os_event_t*>(
+		ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
 
-	for (i = 0; i < n_segments; i++) {
-		os_aio_segment_wait_events[i] = os_event_create(NULL);
+	for (ulint i = 0; i < n_segments; ++i) {
+		os_aio_segment_wait_events[i] = os_event_create();
 	}
 
-	os_last_printout = time(NULL);
+	os_last_printout = ut_time();
 
 #ifdef _WIN32
 	ut_a(completion_port == 0 && read_completion_port == 0);
@@ -3742,8 +4077,6 @@ os_aio_init(
 
 	return(TRUE);
 
-err_exit:
-	return(FALSE);
 }
 
 /***********************************************************************
@@ -3753,20 +4086,25 @@ void
 os_aio_free(void)
 /*=============*/
 {
-	ulint	i;
+	if (os_aio_ibuf_array != 0) {
+		os_aio_array_free(os_aio_ibuf_array);
+	}
+
+	if (os_aio_log_array != 0) {
+		os_aio_array_free(os_aio_log_array);
+	}
+
+	if (os_aio_write_array != 0) {
+		os_aio_array_free(os_aio_write_array);
+	}
+
+	if (os_aio_sync_array != 0) {
+		os_aio_array_free(os_aio_sync_array);
+	}
 
-	os_aio_array_free(os_aio_ibuf_array);
-	os_aio_ibuf_array = NULL;
-	os_aio_array_free(os_aio_log_array);
-	os_aio_log_array = NULL;
 	os_aio_array_free(os_aio_read_array);
-	os_aio_read_array = NULL;
-	os_aio_array_free(os_aio_write_array);
-	os_aio_write_array = NULL;
-	os_aio_array_free(os_aio_sync_array);
-	os_aio_sync_array = NULL;
 
-	for (i = 0; i < os_aio_n_segments; i++) {
+	for (ulint i = 0; i < os_aio_n_segments; i++) {
 		os_event_free(os_aio_segment_wait_events[i]);
 	}
 
@@ -3805,14 +4143,20 @@ void
 os_aio_wake_all_threads_at_shutdown(void)
 /*=====================================*/
 {
-	ulint	i;
-
 #ifdef WIN_ASYNC_IO
 	/* This code wakes up all ai/o threads in Windows native aio */
 	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
-	os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
-	os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
-	os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+	if (os_aio_write_array != 0) {
+		os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
+	}
+
+	if (os_aio_ibuf_array != 0) {
+		os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
+	}
+
+	if (os_aio_log_array != 0) {
+		os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+	}
 
 #elif defined(LINUX_NATIVE_AIO)
 
@@ -3824,12 +4168,14 @@ os_aio_wake_all_threads_at_shutdown(void)
 	if (srv_use_native_aio) {
 		return;
 	}
+
 	/* Fall through to simulated AIO handler wakeup if we are
 	not using native AIO. */
-#endif
+#endif /* !WIN_ASYNC_IO */
+
 	/* This loop wakes up all simulated ai/o threads */
 
-	for (i = 0; i < os_aio_n_segments; i++) {
+	for (ulint i = 0; i < os_aio_n_segments; i++) {
 
 		os_event_set(os_aio_segment_wait_events[i]);
 	}
@@ -3843,6 +4189,7 @@ void
 os_aio_wait_until_no_pending_writes(void)
 /*=====================================*/
 {
+	ut_ad(!srv_read_only_mode);
 	os_event_wait(os_aio_write_array->is_empty);
 }
 
@@ -3861,18 +4208,24 @@ os_aio_get_segment_no_from_slot(
 	ulint	seg_len;
 
 	if (array == os_aio_ibuf_array) {
-		segment = 0;
+		ut_ad(!srv_read_only_mode);
+
+		segment = IO_IBUF_SEGMENT;
 
 	} else if (array == os_aio_log_array) {
-		segment = 1;
+		ut_ad(!srv_read_only_mode);
+
+		segment = IO_LOG_SEGMENT;
 
 	} else if (array == os_aio_read_array) {
 		seg_len = os_aio_read_array->n_slots
 			/ os_aio_read_array->n_segments;
 
-		segment = 2 + slot->pos / seg_len;
+		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
 	} else {
+		ut_ad(!srv_read_only_mode);
 		ut_a(array == os_aio_write_array);
+
 		seg_len = os_aio_write_array->n_slots
 			/ os_aio_write_array->n_segments;
 
@@ -3893,15 +4246,19 @@ os_aio_get_array_and_local_segment(
 	os_aio_array_t** array,		/*!< out: aio wait array */
 	ulint		 global_segment)/*!< in: global segment number */
 {
-	ulint	segment;
+	ulint		segment;
 
 	ut_a(global_segment < os_aio_n_segments);
 
-	if (global_segment == 0) {
+	if (srv_read_only_mode) {
+		*array = os_aio_read_array;
+
+		return(global_segment);
+	} else if (global_segment == IO_IBUF_SEGMENT) {
 		*array = os_aio_ibuf_array;
 		segment = 0;
 
-	} else if (global_segment == 1) {
+	} else if (global_segment == IO_LOG_SEGMENT) {
 		*array = os_aio_log_array;
 		segment = 0;
 
@@ -3937,10 +4294,7 @@ os_aio_array_reserve_slot(
 				null-terminated string */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset */
 	ulint		len,	/*!< in: length of the block to read or write */
 	ulint		space_id)
 {
@@ -3953,7 +4307,7 @@ os_aio_array_reserve_slot(
 	struct iocb*	iocb;
 	off_t		aio_offset;
 
-#endif
+#endif /* WIN_ASYNC_IO */
 	ulint		i;
 	ulint		counter;
 	ulint		slots_per_seg;
@@ -3961,7 +4315,7 @@ os_aio_array_reserve_slot(
 
 #ifdef WIN_ASYNC_IO
 	ut_a((len & 0xFFFFFFFFUL) == len);
-#endif
+#endif /* WIN_ASYNC_IO */
 
 	/* No need of a mutex. Only reading constant fields */
 	slots_per_seg = array->n_slots / array->n_segments;
@@ -3970,7 +4324,7 @@ os_aio_array_reserve_slot(
 	segment. This can help in merging IO requests when we are
 	doing simulated AIO */
 	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
-		    % array->n_segments;
+		% array->n_segments;
 
 loop:
 	os_mutex_enter(array->mutex);
@@ -3994,9 +4348,11 @@ loop:
 	local segment and do a full scan of the array. We are
 	guaranteed to find a slot in full scan. */
 	for (i = local_seg * slots_per_seg, counter = 0;
-	     counter < array->n_slots; i++, counter++) {
+	     counter < array->n_slots;
+	     i++, counter++) {
 
 		i %= array->n_slots;
+
 		slot = os_aio_array_get_nth_slot(array, i);
 
 		if (slot->reserved == FALSE) {
@@ -4020,23 +4376,22 @@ found:
 	}
 
 	slot->reserved = TRUE;
-	slot->reservation_time = time(NULL);
+	slot->reservation_time = ut_time();
 	slot->message1 = message1;
 	slot->message2 = message2;
 	slot->file     = file;
 	slot->name     = name;
 	slot->len      = len;
 	slot->type     = type;
-	slot->buf      = buf;
+	slot->buf      = static_cast<byte*>(buf);
 	slot->offset   = offset;
-	slot->offset_high = offset_high;
 	slot->io_already_done = FALSE;
 	slot->space_id = space_id;
 
 #ifdef WIN_ASYNC_IO
-	control = &(slot->control);
-	control->Offset = (DWORD)offset;
-	control->OffsetHigh = (DWORD)offset_high;
+	control = &slot->control;
+	control->Offset = (DWORD) offset & 0xFFFFFFFF;
+	control->OffsetHigh = (DWORD) (offset >> 32);
 	control->hEvent = 0;
 	slot->arr = array;
 
@@ -4049,14 +4404,10 @@ found:
 
 	/* Check if we are dealing with 64 bit arch.
 	If not then make sure that offset fits in 32 bits. */
-	if (sizeof(aio_offset) == 8) {
-		aio_offset = offset_high;
-		aio_offset <<= 32;
-		aio_offset += offset;
-	} else {
-		ut_a(offset_high == 0);
-		aio_offset = offset;
-	}
+	aio_offset = (off_t) offset;
+
+	ut_a(sizeof(aio_offset) >= sizeof(offset)
+	     || ((os_offset_t) aio_offset) == offset);
 
 	iocb = &slot->control;
 
@@ -4067,11 +4418,9 @@ found:
 		io_prep_pwrite(iocb, file, buf, len, aio_offset);
 	}
 
-	iocb->data = (void*)slot;
+	iocb->data = (void*) slot;
 	slot->n_bytes = 0;
 	slot->ret = 0;
-	/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
-	
 
 skip_native_aio:
 #endif /* LINUX_NATIVE_AIO */
@@ -4089,9 +4438,6 @@ os_aio_array_free_slot(
 	os_aio_array_t*	array,	/*!< in: aio array */
 	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
 {
-	ut_ad(array);
-	ut_ad(slot);
-
 	os_mutex_enter(array->mutex);
 
 	ut_ad(slot->reserved);
@@ -4136,36 +4482,42 @@ os_aio_simulated_wake_handler_thread(
 				arrays */
 {
 	os_aio_array_t*	array;
-	os_aio_slot_t*	slot;
-	ulint		segment __attribute__ ((unused));
-	ulint		n;
-	ulint		i;
+	ulint		segment;
 
 	ut_ad(!srv_use_native_aio);
 
 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 
-	n = array->n_slots / array->n_segments;
+	ulint	n = array->n_slots / array->n_segments;
+
+	segment *= n;
 
 	/* Look through n slots after the segment * n'th slot */
 
 	os_mutex_enter(array->mutex);
 
-	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
+	for (ulint i = 0; i < n; ++i) {
+		const os_aio_slot_t*	slot;
+
+		slot = os_aio_array_get_nth_slot(array, segment + i);
 
 		if (slot->reserved) {
+
 			/* Found an i/o request */
 
-			break;
+			os_mutex_exit(array->mutex);
+
+			os_event_t	event;
+
+			event = os_aio_segment_wait_events[global_segment];
+
+			os_event_set(event);
+
+			return;
 		}
 	}
 
 	os_mutex_exit(array->mutex);
-
-	if (i < n) {
-		os_event_set(os_aio_segment_wait_events[global_segment]);
-	}
 }
 
 /**********************************************************************//**
@@ -4175,8 +4527,6 @@ void
 os_aio_simulated_wake_handler_threads(void)
 /*=======================================*/
 {
-	ulint	i;
-
 	if (srv_use_native_aio) {
 		/* We do not use simulated aio: do nothing */
 
@@ -4185,7 +4535,7 @@ os_aio_simulated_wake_handler_threads(void)
 
 	os_aio_recommend_sleep_for_read_threads	= FALSE;
 
-	for (i = 0; i < os_aio_n_segments; i++) {
+	for (ulint i = 0; i < os_aio_n_segments; i++) {
 		os_aio_simulated_wake_handler_thread(i);
 	}
 }
@@ -4207,7 +4557,6 @@ background threads too eagerly to allow for coalescing during
 readahead requests. */
 #ifdef __WIN__
 	os_aio_array_t*	array;
-	ulint		g;
 
 	if (srv_use_native_aio) {
 		/* We do not use simulated aio: do nothing */
@@ -4217,12 +4566,12 @@ readahead requests. */
 
 	os_aio_recommend_sleep_for_read_threads	= TRUE;
 
-	for (g = 0; g < os_aio_n_segments; g++) {
-		os_aio_get_array_and_local_segment(&array, g);
+	for (ulint i = 0; i < os_aio_n_segments; i++) {
+		os_aio_get_array_and_local_segment(&array, i);
 
 		if (array == os_aio_read_array) {
 
-			os_event_reset(os_aio_segment_wait_events[g]);
+			os_event_reset(os_aio_segment_wait_events[i]);
 		}
 	}
 #endif /* __WIN__ */
@@ -4261,7 +4610,7 @@ os_aio_linux_dispatch(
 	fprintf(stderr,
 		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
 		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
-		array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
+		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
 #endif
 
 	/* io_submit returns number of successfully
@@ -4303,10 +4652,7 @@ os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -4325,7 +4671,6 @@ os_aio_func(
 	DWORD		len		= (DWORD) n;
 	BOOL	ret;
 #endif
-	ibool		retry;
 	ulint		wake_later;
 
 	ut_ad(file);
@@ -4348,13 +4693,13 @@ os_aio_func(
 		no need to use an i/o-handler thread */
 
 		if (type == OS_FILE_READ) {
-			ret = os_file_read_func(file, buf, offset,
-							offset_high, n, trx);
+			ret = os_file_read_func(file, buf, offset, n, trx);
 		}
 		else {
+			ut_ad(!srv_read_only_mode);
 			ut_a(type == OS_FILE_WRITE);
 
-			ret = os_file_write(name, file, buf, offset, offset_high, n);
+			ret = os_file_write(name, file, buf, offset, n);
 		}
 		ut_a(ret);
 		return ret;
@@ -4363,9 +4708,12 @@ os_aio_func(
 try_again:
 	switch (mode) {
 	case OS_AIO_NORMAL:
-		array = (type == OS_FILE_READ)
-			? os_aio_read_array
-			: os_aio_write_array;
+		if (type == OS_FILE_READ) {
+			array = os_aio_read_array;
+		} else {
+			ut_ad(!srv_read_only_mode);
+			array = os_aio_write_array;
+		}
 		break;
 	case OS_AIO_IBUF:
 		ut_ad(type == OS_FILE_READ);
@@ -4374,14 +4722,21 @@ try_again:
 
 		wake_later = FALSE;
 
-		array = os_aio_ibuf_array;
+		if (srv_read_only_mode) {
+			array = os_aio_read_array;
+		} else {
+			array = os_aio_ibuf_array;
+		}
 		break;
 	case OS_AIO_LOG:
-		array = os_aio_log_array;
+		if (srv_read_only_mode) {
+			array = os_aio_read_array;
+		} else {
+			array = os_aio_log_array;
+		}
 		break;
 	case OS_AIO_SYNC:
 		array = os_aio_sync_array;
-
 #if defined(LINUX_NATIVE_AIO)
 		/* In Linux native AIO we don't use sync IO array. */
 		ut_a(!srv_use_native_aio);
@@ -4398,13 +4753,13 @@ try_again:
 		trx->io_read += n;
 	}
 	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
-					 name, buf, offset, offset_high, n, space_id);
+					 name, buf, offset, n, space_id);
 	if (type == OS_FILE_READ) {
 		if (srv_use_native_aio) {
 			os_n_file_reads++;
 			os_bytes_read_since_printout += n;
 #ifdef WIN_ASYNC_IO
-			ret = ReadFile(file, buf, (DWORD)n, &len,
+			ret = ReadFile(file, buf, (DWORD) n, &len,
 				       &(slot->control));
 			if(!ret && GetLastError() != ERROR_IO_PENDING)
 				goto err_exit;
@@ -4413,7 +4768,7 @@ try_again:
 			if (!os_aio_linux_dispatch(array, slot)) {
 				goto err_exit;
 			}
-#endif
+#endif /* WIN_ASYNC_IO */
 		} else {
 			if (!wake_later) {
 				os_aio_simulated_wake_handler_thread(
@@ -4422,10 +4777,11 @@ try_again:
 			}
 		}
 	} else if (type == OS_FILE_WRITE) {
+		ut_ad(!srv_read_only_mode);
 		if (srv_use_native_aio) {
 			os_n_file_writes++;
 #ifdef WIN_ASYNC_IO
-			ret = WriteFile(file, buf, (DWORD)n, &len,
+			ret = WriteFile(file, buf, (DWORD) n, &len,
 					&(slot->control));
 
 			if(!ret && GetLastError() != ERROR_IO_PENDING)
@@ -4434,7 +4790,7 @@ try_again:
 			if (!os_aio_linux_dispatch(array, slot)) {
 				goto err_exit;
 			}
-#endif
+#endif /* WIN_ASYNC_IO */
 		} else {
 			if (!wake_later) {
 				os_aio_simulated_wake_handler_thread(
@@ -4454,10 +4810,8 @@ err_exit:
 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
 	os_aio_array_free_slot(array, slot);
 
-	retry = os_file_handle_error(name,
-				     type == OS_FILE_READ
-				     ? "aio read" : "aio write");
-	if (retry) {
+	if (os_file_handle_error(
+		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
 
 		goto try_again;
 	}
@@ -4555,16 +4909,8 @@ os_aio_windows_handle(
 	*space_id = slot->space_id;
 
 	if (ret && len == slot->len) {
-		ret_val = TRUE;
 
-#ifdef UNIV_DO_FLUSH
-		if (slot->type == OS_FILE_WRITE
-		    && !os_do_not_call_flush_at_each_write) {
-			if (!os_file_flush(slot->file, TRUE)) {
-				ut_error;
-			}
-		}
-#endif /* UNIV_DO_FLUSH */
+		ret_val = TRUE;
 	} else if (os_file_handle_error(slot->name, "Windows aio")) {
 
 		retry = TRUE;
@@ -4689,10 +5035,10 @@ retry:
 			os_aio_slot_t*	slot;
 			struct iocb*	control;
 
-			control = (struct iocb *)events[i].obj;
+			control = (struct iocb*) events[i].obj;
 			ut_a(control != NULL);
 
-			slot = (os_aio_slot_t *) control->data;
+			slot = (os_aio_slot_t*) control->data;
 
 			/* Some sanity checks. */
 			ut_a(slot != NULL);
@@ -4747,7 +5093,7 @@ retry:
 	/* All other errors should cause a trap for now. */
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
-		"  InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
 		ret);
 	ut_error;
 }
@@ -4851,16 +5197,9 @@ found:
 	*type = slot->type;
 	*space_id = slot->space_id;
 
-	if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
-		ret = TRUE;
+	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
 
-#ifdef UNIV_DO_FLUSH
-		if (slot->type == OS_FILE_WRITE
-		    && !os_do_not_call_flush_at_each_write)
-		    && !os_file_flush(slot->file, TRUE) {
-			ut_error;
-		}
-#endif /* UNIV_DO_FLUSH */
+		ret = TRUE;
 	} else {
 		errno = -slot->ret;
 
@@ -4907,14 +5246,12 @@ os_aio_simulated_handle(
 	ulint*	space_id)
 {
 	os_aio_array_t*	array;
-	ulint		segment __attribute__ ((unused));
-	os_aio_slot_t*	slot;
-	os_aio_slot_t*	slot2;
+	ulint		segment;
 	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
 	ulint		n_consecutive;
 	ulint		total_len;
 	ulint		offs;
-	ulint		lowest_offset;
+	os_offset_t	lowest_offset;
 	ulint		biggest_age;
 	ulint		age;
 	byte*		combined_buf;
@@ -4922,7 +5259,7 @@ os_aio_simulated_handle(
 	ibool		ret;
 	ibool		any_reserved;
 	ulint		n;
-	ulint		i;
+	os_aio_slot_t*	aio_slot;
 
 	/* Fix compiler warning */
 	*consecutive_ios = NULL;
@@ -4960,7 +5297,9 @@ restart:
 
 	os_mutex_enter(array->mutex);
 
-	for (i = 0; i < n; i++) {
+	for (ulint i = 0; i < n; i++) {
+		os_aio_slot_t*	slot;
+
 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 
 		if (!slot->reserved) {
@@ -4974,8 +5313,8 @@ restart:
 					(ulong) i);
 			}
 
+			aio_slot = slot;
 			ret = TRUE;
-
 			goto slot_io_done;
 		} else {
 			any_reserved = TRUE;
@@ -4985,9 +5324,7 @@ restart:
 	/* There is no completed request.
 	If there is no pending request at all,
 	and the system is being shut down, exit. */
-	if (UNIV_UNLIKELY
-	    (!any_reserved
-	     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 		os_mutex_exit(array->mutex);
 		*message1 = NULL;
 		*message2 = NULL;
@@ -5001,14 +5338,17 @@ restart:
 	then pick the one at the lowest offset. */
 
 	biggest_age = 0;
-	lowest_offset = ULINT_MAX;
+	lowest_offset = IB_UINT64_MAX;
+
+	for (ulint i = 0; i < n; i++) {
+		os_aio_slot_t*	slot;
 
-	for (i = 0; i < n; i++) {
 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 
 		if (slot->reserved) {
-			age = (ulint)difftime(time(NULL),
-					      slot->reservation_time);
+
+			age = (ulint) difftime(
+				ut_time(), slot->reservation_time);
 
 			if ((age >= 2 && age > biggest_age)
 			    || (age >= 2 && age == biggest_age
@@ -5030,11 +5370,13 @@ restart:
 		lowest offset in the array (we ignore the high 32 bits of the
 		offset in these heuristics) */
 
-		lowest_offset = ULINT_MAX;
+		lowest_offset = IB_UINT64_MAX;
+
+		for (ulint i = 0; i < n; i++) {
+			os_aio_slot_t*	slot;
 
-		for (i = 0; i < n; i++) {
-			slot = os_aio_array_get_nth_slot(array,
-							 i + segment * n);
+			slot = os_aio_array_get_nth_slot(
+				array, i + segment * n);
 
 			if (slot->reserved && slot->offset < lowest_offset) {
 
@@ -5060,28 +5402,28 @@ restart:
 	ut_ad(n_consecutive != 0);
 	ut_ad(consecutive_ios[0] != NULL);
 
-	slot = consecutive_ios[0];
+	aio_slot = consecutive_ios[0];
 
 	/* Check if there are several consecutive blocks to read or write */
 
 consecutive_loop:
-	for (i = 0; i < n; i++) {
-		slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
+	for (ulint i = 0; i < n; i++) {
+		os_aio_slot_t*	slot;
 
-		if (slot2->reserved && slot2 != slot
-		    && slot2->offset == slot->offset + slot->len
-		    /* check that sum does not wrap over */
-		    && slot->offset + slot->len > slot->offset
-		    && slot2->offset_high == slot->offset_high
-		    && slot2->type == slot->type
-		    && slot2->file == slot->file) {
+		slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+		if (slot->reserved
+		    && slot != aio_slot
+		    && slot->offset == aio_slot->offset + aio_slot->len
+		    && slot->type == aio_slot->type
+		    && slot->file == aio_slot->file) {
 
 			/* Found a consecutive i/o request */
 
-			consecutive_ios[n_consecutive] = slot2;
+			consecutive_ios[n_consecutive] = slot;
 			n_consecutive++;
 
-			slot = slot2;
+			aio_slot = slot;
 
 			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
 
@@ -5099,22 +5441,24 @@ consecutive_loop:
 	i/o */
 
 	total_len = 0;
-	slot = consecutive_ios[0];
+	aio_slot = consecutive_ios[0];
 
-	for (i = 0; i < n_consecutive; i++) {
+	for (ulint i = 0; i < n_consecutive; i++) {
 		total_len += consecutive_ios[i]->len;
 	}
 
 	if (n_consecutive == 1) {
 		/* We can use the buffer of the i/o request */
-		combined_buf = slot->buf;
+		combined_buf = aio_slot->buf;
 		combined_buf2 = NULL;
 	} else {
-		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
+		combined_buf2 = static_cast<byte*>(
+			ut_malloc(total_len + UNIV_PAGE_SIZE));
 
 		ut_a(combined_buf2);
 
-		combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
+		combined_buf = static_cast<byte*>(
+			ut_align(combined_buf2, UNIV_PAGE_SIZE));
 	}
 
 	/* We release the array mutex for the time of the i/o: NOTE that
@@ -5123,52 +5467,41 @@ consecutive_loop:
 
 	os_mutex_exit(array->mutex);
 
-	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
 		/* Copy the buffers to the combined buffer */
 		offs = 0;
 
-		for (i = 0; i < n_consecutive; i++) {
+		for (ulint i = 0; i < n_consecutive; i++) {
 
 			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
 				  consecutive_ios[i]->len);
+
 			offs += consecutive_ios[i]->len;
 		}
 	}
 
 	srv_set_io_thread_op_info(global_segment, "doing file i/o");
 
-	if (os_aio_print_debug) {
-		fprintf(stderr,
-			"InnoDB: doing i/o of type %lu at offset %lu %lu,"
-			" length %lu\n",
-			(ulong) slot->type, (ulong) slot->offset_high,
-			(ulong) slot->offset, (ulong) total_len);
-	}
-
 	/* Do the i/o with ordinary, synchronous i/o functions: */
-	if (slot->type == OS_FILE_WRITE) {
-		ret = os_file_write(slot->name, slot->file, combined_buf,
-				    slot->offset, slot->offset_high,
-				    total_len);
+	if (aio_slot->type == OS_FILE_WRITE) {
+		ut_ad(!srv_read_only_mode);
+		ret = os_file_write(
+			aio_slot->name, aio_slot->file, combined_buf,
+			aio_slot->offset, total_len);
 	} else {
-		ret = os_file_read(slot->file, combined_buf,
-				   slot->offset, slot->offset_high, total_len);
+		ret = os_file_read(
+			aio_slot->file, combined_buf,
+			aio_slot->offset, total_len);
 	}
 
 	ut_a(ret);
 	srv_set_io_thread_op_info(global_segment, "file i/o done");
 
-#if 0
-	fprintf(stderr,
-		"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
-		n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
-#endif
-
-	if (slot->type == OS_FILE_READ && n_consecutive > 1) {
+	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
 		/* Copy the combined buffer to individual buffers */
 		offs = 0;
 
-		for (i = 0; i < n_consecutive; i++) {
+		for (ulint i = 0; i < n_consecutive; i++) {
 
 			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
 				  consecutive_ios[i]->len);
@@ -5184,7 +5517,7 @@ consecutive_loop:
 
 	/* Mark the i/os done in slots */
 
-	for (i = 0; i < n_consecutive; i++) {
+	for (ulint i = 0; i < n_consecutive; i++) {
 		consecutive_ios[i]->io_already_done = TRUE;
 	}
 
@@ -5194,17 +5527,17 @@ consecutive_loop:
 
 slot_io_done:
 
-	ut_a(slot->reserved);
+	ut_a(aio_slot->reserved);
 
-	*message1 = slot->message1;
-	*message2 = slot->message2;
+	*message1 = aio_slot->message1;
+	*message2 = aio_slot->message2;
 
-	*type = slot->type;
-	*space_id = slot->space_id;
+	*type = aio_slot->type;
+	*space_id = aio_slot->space_id;
 
 	os_mutex_exit(array->mutex);
 
-	os_aio_array_free_slot(array, slot);
+	os_aio_array_free_slot(array, aio_slot);
 
 	return(ret);
 
@@ -5223,30 +5556,20 @@ recommended_sleep:
 
 	os_event_wait(os_aio_segment_wait_events[global_segment]);
 
-	if (os_aio_print_debug) {
-		fprintf(stderr,
-			"InnoDB: i/o handler thread for i/o"
-			" segment %lu wakes up\n",
-			(ulong) global_segment);
-	}
-
 	goto restart;
 }
 
 /**********************************************************************//**
 Validates the consistency of an aio array.
-@return	TRUE if ok */
+@return	true if ok */
 static
-ibool
+bool
 os_aio_array_validate(
 /*==================*/
 	os_aio_array_t*	array)	/*!< in: aio wait array */
 {
-	os_aio_slot_t*	slot;
-	ulint		n_reserved	= 0;
 	ulint		i;
-
-	ut_a(array);
+	ulint		n_reserved	= 0;
 
 	os_mutex_enter(array->mutex);
 
@@ -5254,6 +5577,8 @@ os_aio_array_validate(
 	ut_a(array->n_segments > 0);
 
 	for (i = 0; i < array->n_slots; i++) {
+		os_aio_slot_t*	slot;
+
 		slot = os_aio_array_get_nth_slot(array, i);
 
 		if (slot->reserved) {
@@ -5266,7 +5591,7 @@ os_aio_array_validate(
 
 	os_mutex_exit(array->mutex);
 
-	return(TRUE);
+	return(true);
 }
 
 /**********************************************************************//**
@@ -5278,10 +5603,22 @@ os_aio_validate(void)
 /*=================*/
 {
 	os_aio_array_validate(os_aio_read_array);
-	os_aio_array_validate(os_aio_write_array);
-	os_aio_array_validate(os_aio_ibuf_array);
-	os_aio_array_validate(os_aio_log_array);
-	os_aio_array_validate(os_aio_sync_array);
+
+	if (os_aio_write_array != 0) {
+		os_aio_array_validate(os_aio_write_array);
+	}
+
+	if (os_aio_ibuf_array != 0) {
+		os_aio_array_validate(os_aio_ibuf_array);
+	}
+
+	if (os_aio_log_array != 0) {
+		os_aio_array_validate(os_aio_log_array);
+	}
+
+	if (os_aio_sync_array != 0) {
+		os_aio_array_validate(os_aio_sync_array);
+	}
 
 	return(TRUE);
 }
@@ -5321,65 +5658,36 @@ os_aio_print_segment_info(
 }
 
 /**********************************************************************//**
-Prints info of the aio arrays. */
+Prints info about the aio array. */
 UNIV_INTERN
 void
-os_aio_print(
-/*=========*/
-	FILE*	file)	/*!< in: file where to print */
+os_aio_print_array(
+/*==============*/
+	FILE*		file,	/*!< in: file where to print */
+	os_aio_array_t*	array)	/*!< in: aio array to print */
 {
-	os_aio_array_t*	array;
-	os_aio_slot_t*	slot;
-	ulint		n_reserved;
-	ulint		n_res_seg[SRV_MAX_N_IO_THREADS];
-	time_t		current_time;
-	double		time_elapsed;
-	double		avg_bytes_read;
-	ulint		i;
-
-	for (i = 0; i < srv_n_file_io_threads; i++) {
-		fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
-			srv_io_thread_op_info[i],
-			srv_io_thread_function[i]);
-
-#ifndef __WIN__
-		if (os_aio_segment_wait_events[i]->is_set) {
-			fprintf(file, " ev set");
-		}
-#endif
-
-		fprintf(file, "\n");
-	}
-
-	fputs("Pending normal aio reads:", file);
-
-	array = os_aio_read_array;
-loop:
-	ut_a(array);
+	ulint			n_reserved = 0;
+	ulint			n_res_seg[SRV_MAX_N_IO_THREADS];
 
 	os_mutex_enter(array->mutex);
 
 	ut_a(array->n_slots > 0);
 	ut_a(array->n_segments > 0);
 
-	n_reserved = 0;
-
 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
 
-	for (i = 0; i < array->n_slots; i++) {
-		ulint	seg_no;
+	for (ulint i = 0; i < array->n_slots; ++i) {
+		os_aio_slot_t*	slot;
+		ulint		seg_no;
 
 		slot = os_aio_array_get_nth_slot(array, i);
 
 		seg_no = (i * array->n_segments) / array->n_slots;
+
 		if (slot->reserved) {
-			n_reserved++;
-			n_res_seg[seg_no]++;
-#if 0
-			fprintf(stderr, "Reserved slot, messages %p %p\n",
-				(void*) slot->message1,
-				(void*) slot->message2);
-#endif
+			++n_reserved;
+			++n_res_seg[seg_no];
+
 			ut_a(slot->len > 0);
 		}
 	}
@@ -5391,38 +5699,61 @@ loop:
 	os_aio_print_segment_info(file, n_res_seg, array);
 
 	os_mutex_exit(array->mutex);
+}
 
-	if (array == os_aio_read_array) {
-		fputs(", aio writes:", file);
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	time_t		current_time;
+	double		time_elapsed;
+	double		avg_bytes_read;
 
-		array = os_aio_write_array;
+	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
+		fprintf(file, "I/O thread %lu state: %s (%s)",
+			(ulong) i,
+			srv_io_thread_op_info[i],
+			srv_io_thread_function[i]);
 
-		goto loop;
+#ifndef __WIN__
+		if (os_aio_segment_wait_events[i]->is_set) {
+			fprintf(file, " ev set");
+		}
+#endif /* __WIN__ */
+
+		fprintf(file, "\n");
 	}
 
-	if (array == os_aio_write_array) {
-		fputs(",\n ibuf aio reads:", file);
-		array = os_aio_ibuf_array;
+	fputs("Pending normal aio reads:", file);
 
-		goto loop;
+	os_aio_print_array(file, os_aio_read_array);
+
+	if (os_aio_write_array != 0) {
+		fputs(", aio writes:", file);
+		os_aio_print_array(file, os_aio_write_array);
 	}
 
-	if (array == os_aio_ibuf_array) {
-		fputs(", log i/o's:", file);
-		array = os_aio_log_array;
+	if (os_aio_ibuf_array != 0) {
+		fputs(",\n ibuf aio reads:", file);
+		os_aio_print_array(file, os_aio_ibuf_array);
+	}
 
-		goto loop;
+	if (os_aio_log_array != 0) {
+		fputs(", log i/o's:", file);
+		os_aio_print_array(file, os_aio_log_array);
 	}
 
-	if (array == os_aio_log_array) {
+	if (os_aio_sync_array != 0) {
 		fputs(", sync i/o's:", file);
-		array = os_aio_sync_array;
-
-		goto loop;
+		os_aio_print_array(file, os_aio_sync_array);
 	}
 
 	putc('\n', file);
-	current_time = time(NULL);
+	current_time = ut_time();
 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
 
 	fprintf(file,
@@ -5430,7 +5761,8 @@ loop:
 		"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
 		(ulong) fil_n_pending_log_flushes,
 		(ulong) fil_n_pending_tablespace_flushes,
-		(ulong) os_n_file_reads, (ulong) os_n_file_writes,
+		(ulong) os_n_file_reads,
+		(ulong) os_n_file_writes,
 		(ulong) os_n_fsyncs);
 
 	if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
@@ -5452,7 +5784,7 @@ loop:
 		" %.2f writes/s, %.2f fsyncs/s\n",
 		(os_n_file_reads - os_n_file_reads_old)
 		/ time_elapsed,
-		(ulong)avg_bytes_read,
+		(ulong) avg_bytes_read,
 		(os_n_file_writes - os_n_file_writes_old)
 		/ time_elapsed,
 		(os_n_fsyncs - os_n_fsyncs_old)
@@ -5502,21 +5834,29 @@ os_aio_all_slots_free(void)
 
 	os_mutex_exit(array->mutex);
 
-	array = os_aio_write_array;
+	if (!srv_read_only_mode) {
+		ut_a(os_aio_write_array != 0);
 
-	os_mutex_enter(array->mutex);
+		array = os_aio_write_array;
 
-	n_res += array->n_reserved;
+		os_mutex_enter(array->mutex);
 
-	os_mutex_exit(array->mutex);
+		n_res += array->n_reserved;
 
-	array = os_aio_ibuf_array;
+		os_mutex_exit(array->mutex);
 
-	os_mutex_enter(array->mutex);
+		ut_a(os_aio_ibuf_array != 0);
 
-	n_res += array->n_reserved;
+		array = os_aio_ibuf_array;
 
-	os_mutex_exit(array->mutex);
+		os_mutex_enter(array->mutex);
+
+		n_res += array->n_reserved;
+
+		os_mutex_exit(array->mutex);
+	}
+
+	ut_a(os_aio_log_array != 0);
 
 	array = os_aio_log_array;
 
diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.cc
index 7b52dd2a28f..ec629430baf 100644
--- a/storage/xtradb/os/os0proc.c
+++ b/storage/xtradb/os/os0proc.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0proc.c
+@file os/os0proc.cc
 The interface to the operating system
 process control primitives
 
@@ -71,7 +71,7 @@ os_proc_get_number(void)
 #ifdef __WIN__
 	return((ulint)GetCurrentProcessId());
 #else
-	return((ulint)getpid());
+	return((ulint) getpid());
 #endif
 }
 
@@ -118,14 +118,14 @@ os_mem_alloc_large(
 	size = ut_2pow_round(*n + (os_large_page_size - 1),
 			     os_large_page_size);
 
-	shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W);
+	shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W);
 	if (shmid < 0) {
 		fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
 			" %lu bytes. errno %d\n", size, errno);
 		ptr = NULL;
 	} else {
 		ptr = shmat(shmid, NULL, 0);
-		if (ptr == (void *)-1) {
+		if (ptr == (void*)-1) {
 			fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
 				" attach shared memory segment, errno %d\n",
 				errno);
@@ -265,7 +265,11 @@ os_mem_free_large(
 #elif !defined OS_MAP_ANON
 	ut_free(ptr);
 #else
+# if defined(UNIV_SOLARIS)
+	if (munmap(static_cast<caddr_t>(ptr), size)) {
+# else
 	if (munmap(ptr, size)) {
+# endif /* UNIV_SOLARIS */
 		fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;"
 			" errno %lu\n",
 			ptr, (ulong) size, (ulong) errno);
diff --git a/storage/xtradb/os/os0stacktrace.c b/storage/xtradb/os/os0stacktrace.cc
index 4d52e625057..4d52e625057 100644
--- a/storage/xtradb/os/os0stacktrace.c
+++ b/storage/xtradb/os/os0stacktrace.cc
diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.cc
index 3a182692da3..392dbe0d7a7 100644
--- a/storage/xtradb/os/os0sync.c
+++ b/storage/xtradb/os/os0sync.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0sync.c
+@file os/os0sync.cc
 The interface to the operating system
 synchronization primitives.
 
@@ -38,8 +38,8 @@ Created 9/6/1995 Heikki Tuuri
 #include "srv0srv.h"
 
 /* Type definition for an operating system mutex struct */
-struct os_mutex_struct{
-	os_event_t	event;	/*!< Used by sync0arr.c for queing threads */
+struct os_mutex_t{
+	os_event_t	event;	/*!< Used by sync0arr.cc for queuing threads */
 	void*		handle;	/*!< OS handle to mutex */
 	ulint		count;	/*!< we use this counter to check
 				that the same thread does not
@@ -47,12 +47,12 @@ struct os_mutex_struct{
 				do not assume that the OS mutex
 				supports recursive locking, though
 				NT seems to do that */
-	UT_LIST_NODE_T(os_mutex_str_t) os_mutex_list;
+	UT_LIST_NODE_T(os_mutex_t) os_mutex_list;
 				/* list of all 'slow' OS mutexes created */
 };
 
 /** Mutex protecting counts and the lists of OS mutexes and events */
-UNIV_INTERN os_mutex_t	os_sync_mutex;
+UNIV_INTERN os_ib_mutex_t	os_sync_mutex;
 /** TRUE if os_sync_mutex has been initialized */
 static ibool		os_sync_mutex_inited	= FALSE;
 /** TRUE when os_sync_free() is being executed */
@@ -63,10 +63,10 @@ os_thread_exit */
 UNIV_INTERN ulint	os_thread_count		= 0;
 
 /** The list of all events created */
-static UT_LIST_BASE_NODE_T(os_event_struct_t)	os_event_list;
+static UT_LIST_BASE_NODE_T(os_event)		os_event_list;
 
 /** The list of all OS 'slow' mutexes */
-static UT_LIST_BASE_NODE_T(os_mutex_str_t)	os_mutex_list;
+static UT_LIST_BASE_NODE_T(os_mutex_t)		os_mutex_list;
 
 UNIV_INTERN ulint	os_event_count		= 0;
 UNIV_INTERN ulint	os_mutex_count		= 0;
@@ -75,6 +75,11 @@ UNIV_INTERN ulint	os_fast_mutex_count	= 0;
 /* The number of microsecnds in a second. */
 static const ulint MICROSECS_IN_A_SECOND = 1000000;
 
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	event_os_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	os_mutex_key;
+#endif
+
 /* Because a mutex is embedded inside an event and there is an
 event embedded inside a mutex, on free, this generates a recursive call.
 This version of the free event function doesn't acquire the global lock */
@@ -132,7 +137,7 @@ ibool
 os_cond_wait_timed(
 /*===============*/
 	os_cond_t*		cond,		/*!< in: condition variable. */
-	os_fast_mutex_t*	mutex,		/*!< in: fast mutex */
+	os_fast_mutex_t*	fast_mutex,	/*!< in: fast mutex */
 #ifndef __WIN__
 	const struct timespec*	abstime		/*!< in: timeout */
 #else
@@ -141,6 +146,7 @@ os_cond_wait_timed(
 #endif /* !__WIN__ */
 )
 {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 #ifdef __WIN__
 	BOOL	ret;
 	DWORD	err;
@@ -195,8 +201,9 @@ void
 os_cond_wait(
 /*=========*/
 	os_cond_t*		cond,	/*!< in: condition variable. */
-	os_fast_mutex_t*	mutex)	/*!< in: fast mutex */
+	os_fast_mutex_t*	fast_mutex)/*!< in: fast mutex */
 {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 	ut_a(cond);
 	ut_a(mutex);
 
@@ -322,7 +329,7 @@ os_sync_free(void)
 /*==============*/
 {
 	os_event_t	event;
-	os_mutex_t	mutex;
+	os_ib_mutex_t	mutex;
 
 	os_sync_free_called = TRUE;
 	event = UT_LIST_GET_FIRST(os_event_list);
@@ -358,22 +365,17 @@ must be reset explicitly by calling sync_os_reset_event.
 @return	the event handle */
 UNIV_INTERN
 os_event_t
-os_event_create(
-/*============*/
-	const char*	name)	/*!< in: the name of the event, if NULL
-				the event is created without a name */
+os_event_create(void)
+/*==================*/
 {
 	os_event_t	event;
 
 #ifdef __WIN__
 	if(!srv_use_native_conditions) {
 
-		event = ut_malloc(sizeof(struct os_event_struct));
+		event = static_cast<os_event_t>(ut_malloc(sizeof(*event)));
 
-		event->handle = CreateEvent(NULL,
-					    TRUE,
-					    FALSE,
-					    (LPCTSTR) name);
+		event->handle = CreateEvent(NULL, TRUE, FALSE, NULL);
 		if (!event->handle) {
 			fprintf(stderr,
 				"InnoDB: Could not create a Windows event"
@@ -382,13 +384,14 @@ os_event_create(
 		}
 	} else /* Windows with condition variables */
 #endif
-
 	{
-		UT_NOT_USED(name);
+		event = static_cast<os_event_t>(ut_malloc(sizeof *event));
 
-		event = ut_malloc(sizeof(struct os_event_struct));
-
-		os_fast_mutex_init(&(event->os_mutex));
+#ifndef PFS_SKIP_EVENT_MUTEX
+		os_fast_mutex_init(event_os_mutex_key, &event->os_mutex);
+#else
+		os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &event->os_mutex);
+#endif
 
 		os_cond_init(&(event->cond_var));
 
@@ -440,8 +443,6 @@ os_event_set(
 	}
 #endif
 
-	ut_a(event);
-
 	os_fast_mutex_lock(&(event->os_mutex));
 
 	if (event->is_set) {
@@ -631,7 +632,6 @@ os_event_wait_time_low(
 	ib_int64_t	reset_sig_count)	/*!< in: zero or the value
 						returned by previous call of
 						os_event_reset(). */
-
 {
 	ibool		timed_out = FALSE;
 
@@ -731,24 +731,26 @@ os_event_wait_time_low(
 
 /*********************************************************//**
 Creates an operating system mutex semaphore. Because these are slow, the
-mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible.
 @return	the mutex handle */
 UNIV_INTERN
-os_mutex_t
+os_ib_mutex_t
 os_mutex_create(void)
 /*=================*/
 {
 	os_fast_mutex_t*	mutex;
-	os_mutex_t		mutex_str;
+	os_ib_mutex_t		mutex_str;
 
-	mutex = ut_malloc(sizeof(os_fast_mutex_t));
+	mutex = static_cast<os_fast_mutex_t*>(
+		ut_malloc(sizeof(os_fast_mutex_t)));
 
-	os_fast_mutex_init(mutex);
-	mutex_str = ut_malloc(sizeof(os_mutex_str_t));
+	os_fast_mutex_init(os_mutex_key, mutex);
+
+	mutex_str = static_cast<os_ib_mutex_t>(ut_malloc(sizeof *mutex_str));
 
 	mutex_str->handle = mutex;
 	mutex_str->count = 0;
-	mutex_str->event = os_event_create(NULL);
+	mutex_str->event = os_event_create();
 
 	if (UNIV_LIKELY(os_sync_mutex_inited)) {
 		/* When creating os_sync_mutex itself we cannot reserve it */
@@ -772,9 +774,9 @@ UNIV_INTERN
 void
 os_mutex_enter(
 /*===========*/
-	os_mutex_t	mutex)	/*!< in: mutex to acquire */
+	os_ib_mutex_t	mutex)	/*!< in: mutex to acquire */
 {
-	os_fast_mutex_lock(mutex->handle);
+	os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle));
 
 	(mutex->count)++;
 
@@ -787,14 +789,14 @@ UNIV_INTERN
 void
 os_mutex_exit(
 /*==========*/
-	os_mutex_t	mutex)	/*!< in: mutex to release */
+	os_ib_mutex_t	mutex)	/*!< in: mutex to release */
 {
 	ut_a(mutex);
 
 	ut_a(mutex->count == 1);
 
 	(mutex->count)--;
-	os_fast_mutex_unlock(mutex->handle);
+	os_fast_mutex_unlock(static_cast<os_fast_mutex_t*>(mutex->handle));
 }
 
 /**********************************************************//**
@@ -803,7 +805,7 @@ UNIV_INTERN
 void
 os_mutex_free(
 /*==========*/
-	os_mutex_t	mutex)	/*!< in: mutex to free */
+	os_ib_mutex_t	mutex)	/*!< in: mutex to free */
 {
 	ut_a(mutex);
 
@@ -823,7 +825,7 @@ os_mutex_free(
 		os_mutex_exit(os_sync_mutex);
 	}
 
-	os_fast_mutex_free(mutex->handle);
+	os_fast_mutex_free(static_cast<os_fast_mutex_t*>(mutex->handle));
 	ut_free(mutex->handle);
 	ut_free(mutex);
 }
@@ -832,9 +834,9 @@ os_mutex_free(
 Initializes an operating system fast mutex semaphore. */
 UNIV_INTERN
 void
-os_fast_mutex_init(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: fast mutex */
+os_fast_mutex_init_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: fast mutex */
 {
 #ifdef __WIN__
 	ut_a(fast_mutex);
@@ -861,9 +863,9 @@ os_fast_mutex_init(
 Acquires ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_lock(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
+os_fast_mutex_lock_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to acquire */
 {
 #ifdef __WIN__
 	EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex);
@@ -876,9 +878,9 @@ os_fast_mutex_lock(
 Releases ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_unlock(
-/*=================*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to release */
+os_fast_mutex_unlock_func(
+/*======================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to release */
 {
 #ifdef __WIN__
 	LeaveCriticalSection(fast_mutex);
@@ -891,9 +893,9 @@ os_fast_mutex_unlock(
 Frees a mutex object. */
 UNIV_INTERN
 void
-os_fast_mutex_free(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to free */
+os_fast_mutex_free_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to free */
 {
 #ifdef __WIN__
 	ut_a(fast_mutex);
@@ -908,7 +910,7 @@ os_fast_mutex_free(
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: error: return value %lu when calling\n"
-			"InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
+			"InnoDB: pthread_mutex_destroy().\n", (ulint) ret);
 		fprintf(stderr,
 			"InnoDB: Byte contents of the pthread mutex at %p:\n",
 			(void*) fast_mutex);
diff --git a/storage/xtradb/os/os0thread.c b/storage/xtradb/os/os0thread.cc
index 68a8c973558..685e89d48be 100644
--- a/storage/xtradb/os/os0thread.c
+++ b/storage/xtradb/os/os0thread.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0thread.c
+@file os/os0thread.cc
 The interface to the operating system thread control primitives
 
 Created 9/8/1995 Heikki Tuuri
@@ -30,6 +30,12 @@ Created 9/8/1995 Heikki Tuuri
 
 #ifdef __WIN__
 #include <windows.h>
+#elif UNIV_LINUX
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
 #endif
 
 #ifndef UNIV_HOTBACKUP
@@ -77,7 +83,7 @@ os_thread_pf(
 
 	return((ulint)(a.field1));
 #else
-	return((ulint)a);
+	return((ulint) a);
 #endif
 }
 
@@ -98,6 +104,24 @@ os_thread_get_curr_id(void)
 #endif
 }
 
+/*****************************************************************//**
+Returns the system-specific thread identifier of the current thread.  With
+UNIV_LINUX defined this is the gettid system call result (the tid); other
+systems currently get os_thread_get_curr_id() instead.
+@return	current thread identifier */
+UNIV_INTERN
+os_tid_t
+os_thread_get_tid(void)
+/*===================*/
+{
+#ifdef UNIV_LINUX
+	return((os_tid_t)syscall(SYS_gettid));
+#else
+	return(os_thread_get_curr_id());
+#endif
+}
+
+
 /****************************************************************//**
 Creates a new thread of execution. The execution starts from
 the function given. The start function takes a void* parameter
@@ -105,14 +129,10 @@ and returns an ulint.
 @return	handle to the thread */
 UNIV_INTERN
 os_thread_t
-os_thread_create(
-/*=============*/
-#ifndef __WIN__
-	os_posix_f_t		start_f,
-#else
-	ulint (*start_f)(void*),		/*!< in: pointer to function
+os_thread_create_func(
+/*==================*/
+	os_thread_func_t	func,		/*!< in: pointer to function
 						from which to start */
-#endif
 	void*			arg,		/*!< in: argument to start
 						function */
 	os_thread_id_t*		thread_id)	/*!< out: id of the created
@@ -128,7 +148,7 @@ os_thread_create(
 
 	thread = CreateThread(NULL,	/* no security attributes */
 			      0,	/* default size stack */
-			      (LPTHREAD_START_ROUTINE)start_f,
+			      func,
 			      arg,
 			      0,	/* thread runs immediately */
 			      &win_thread_id);
@@ -170,9 +190,9 @@ os_thread_create(
 	os_mutex_exit(os_sync_mutex);
 
 #ifdef UNIV_HPUX10
-	ret = pthread_create(&pthread, pthread_attr_default, start_f, arg);
+	ret = pthread_create(&pthread, pthread_attr_default, func, arg);
 #else
-	ret = pthread_create(&pthread, &attr, start_f, arg);
+	ret = pthread_create(&pthread, &attr, func, arg);
 #endif
 	if (ret) {
 		fprintf(stderr,
@@ -214,7 +234,7 @@ os_thread_exit(
 	os_mutex_exit(os_sync_mutex);
 
 #ifdef __WIN__
-	ExitThread((DWORD)exit_value);
+	ExitThread((DWORD) exit_value);
 #else
 	pthread_detach(pthread_self());
 	pthread_exit(exit_value);
@@ -261,3 +281,31 @@ os_thread_sleep(
 	select(0, NULL, NULL, NULL, &t);
 #endif
 }
+
+/*****************************************************************//**
+Set relative scheduling priority for a given thread.  On Linux the nice value
+is set to 19 - relative_priority; other systems return the argument as is.
+@return the thread priority in effect after the update */
+UNIV_INTERN
+ulint
+os_thread_set_priority(
+/*===================*/
+	os_tid_t	thread_id,		/*!< in: thread id */
+	ulint		relative_priority)	/*!< in: system-specific
+						priority value */
+{
+#ifdef UNIV_LINUX
+	lint	thread_nice = 19 - relative_priority;
+	if (setpriority(PRIO_PROCESS, thread_id, thread_nice) == -1) {
+		const int	err = errno; /* getpriority() may change it */
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Setting thread %lu nice to %ld failed, "
+			"current nice %d, errno %d",
+			os_thread_pf(thread_id), thread_nice,
+			getpriority(PRIO_PROCESS, thread_id), err);
+	}
+	return(19 - getpriority(PRIO_PROCESS, thread_id));
+#else
+	return(relative_priority);
+#endif
+}
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.cc
index a722f5b188d..efce1f10cae 100644
--- a/storage/xtradb/page/page0cur.c
+++ b/storage/xtradb/page/page0cur.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /********************************************************************//**
-@file page/page0cur.c
+@file page/page0cur.cc
 The page cursor
 
 Created 10/4/1994 Heikki Tuuri
@@ -29,6 +30,7 @@ Created 10/4/1994 Heikki Tuuri
 #endif
 
 #include "page0zip.h"
+#include "btr0btr.h"
 #include "mtr0log.h"
 #include "log0recv.h"
 #include "ut0ut.h"
@@ -772,7 +774,7 @@ page_cur_parse_insert_rec(
 	byte*	buf;
 	byte*	ptr2			= ptr;
 	ulint	info_and_status_bits = 0; /* remove warning */
-	page_cur_t cursor;
+	page_cur_t	cursor;
 	mem_heap_t*	heap		= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets		= offsets_;
@@ -879,7 +881,8 @@ page_cur_parse_insert_rec(
 	if (mismatch_index + end_seg_len < sizeof buf1) {
 		buf = buf1;
 	} else {
-		buf = mem_alloc(mismatch_index + end_seg_len);
+		buf = static_cast<byte*>(
+			mem_alloc(mismatch_index + end_seg_len));
 	}
 
 	/* Build the inserted record to buf */
@@ -972,6 +975,9 @@ page_cur_insert_rec_low(
 	page = page_align(current_rec);
 	ut_ad(dict_table_is_comp(index->table)
 	      == (ibool) !!page_is_comp(page));
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+	      == index->id || recv_recovery_is_on() || mtr->inside_ibuf);
 
 	ut_ad(!page_rec_is_supremum(current_rec));
 
@@ -1006,8 +1012,8 @@ page_cur_insert_rec_low(
 
 		rec_offs_init(foffsets_);
 
-		foffsets = rec_get_offsets(free_rec, index, foffsets,
-					ULINT_UNDEFINED, &heap);
+		foffsets = rec_get_offsets(
+			free_rec, index, foffsets, ULINT_UNDEFINED, &heap);
 		if (rec_offs_size(foffsets) < rec_size) {
 			if (UNIV_LIKELY_NULL(heap)) {
 				mem_heap_free(heap);
@@ -1155,71 +1161,22 @@ use_heap:
 }
 
 /***********************************************************//**
-Compresses or reorganizes a page after an optimistic insert.
-@return	rec if succeed, NULL otherwise */
-static
-rec_t*
-page_cur_insert_rec_zip_reorg(
-/*==========================*/
-	rec_t**		current_rec,/*!< in/out: pointer to current record after
-				which the new record is inserted */
-	buf_block_t*	block,	/*!< in: buffer block */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	rec_t*		rec,	/*!< in: inserted record */
-	page_t*		page,	/*!< in: uncompressed page */
-	page_zip_des_t*	page_zip,/*!< in: compressed page */
-	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
-{
-	ulint		pos;
-
-	/* Recompress or reorganize and recompress the page. */
-	if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) {
-		return(rec);
-	}
-
-	/* Before trying to reorganize the page,
-	store the number of preceding records on the page. */
-	pos = page_rec_get_n_recs_before(rec);
-	ut_ad(pos > 0);
-
-	if (page_zip_reorganize(block, index, mtr)) {
-		/* The page was reorganized: Find rec by seeking to pos,
-		and update *current_rec. */
-		if (pos > 1) {
-			rec = page_rec_get_nth(page, pos - 1);
-		} else {
-			rec = page + PAGE_NEW_INFIMUM;
-		}
-
-		*current_rec = rec;
-		rec = page + rec_get_next_offs(rec, TRUE);
-
-		return(rec);
-	}
-
-	/* Out of space: restore the page */
-	btr_blob_dbg_remove(page, index, "insert_zip_fail");
-	if (!page_zip_decompress(page_zip, page, FALSE)) {
-		ut_error; /* Memory corrupted? */
-	}
-	ut_ad(page_validate(page, index));
-	btr_blob_dbg_add(page, index, "insert_zip_fail");
-	return(NULL);
-}
-
-/***********************************************************//**
 Inserts a record next to page cursor on a compressed and uncompressed
 page. Returns pointer to inserted record if succeed, i.e.,
 enough space available, NULL otherwise.
 The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	pointer to record if succeed, NULL otherwise */
 UNIV_INTERN
 rec_t*
 page_cur_insert_rec_zip(
 /*====================*/
-	rec_t**		current_rec,/*!< in/out: pointer to current record after
-				which the new record is inserted */
-	buf_block_t*	block,	/*!< in: buffer block of *current_rec */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
 	dict_index_t*	index,	/*!< in: record descriptor */
 	const rec_t*	rec,	/*!< in: pointer to a physical record */
 	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
@@ -1237,16 +1194,19 @@ page_cur_insert_rec_zip(
 					record */
 	page_zip_des_t*	page_zip;
 
-	page_zip = buf_block_get_page_zip(block);
+	page_zip = page_cur_get_page_zip(cursor);
 	ut_ad(page_zip);
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	page = page_align(*current_rec);
+	page = page_cur_get_page(cursor);
 	ut_ad(dict_table_is_comp(index->table));
 	ut_ad(page_is_comp(page));
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+	      == index->id || mtr->inside_ibuf || recv_recovery_is_on());
 
-	ut_ad(!page_rec_is_supremum(*current_rec));
+	ut_ad(!page_cur_is_after_last(cursor));
 #ifdef UNIV_ZIP_DEBUG
 	ut_a(page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
@@ -1271,25 +1231,168 @@ page_cur_insert_rec_zip(
 	}
 #endif /* UNIV_DEBUG_VALGRIND */
 
+	const bool reorg_before_insert = page_has_garbage(page)
+		&& rec_size > page_get_max_insert_size(page, 1)
+		&& rec_size <= page_get_max_insert_size_after_reorganize(
+			page, 1);
+
 	/* 2. Try to find suitable space from page memory management */
 	if (!page_zip_available(page_zip, dict_index_is_clust(index),
-				rec_size, 1)) {
+				rec_size, 1)
+	    || reorg_before_insert) {
+		/* The values can change dynamically. */
+		bool	log_compressed	= page_zip_log_pages;
+		ulint	level		= page_zip_level;
+#ifdef UNIV_DEBUG
+		rec_t*	cursor_rec	= page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
+
+		/* If we are not writing compressed page images, we
+		must reorganize the page before attempting the
+		insert. */
+		if (recv_recovery_is_on()) {
+			/* Insert into the uncompressed page only.
+			The page reorganization or creation that we
+			would attempt outside crash recovery would
+			have been covered by a previous redo log record. */
+		} else if (page_is_empty(page)) {
+			ut_ad(page_cur_is_before_first(cursor));
+
+			/* This is an empty page. Recreate it to
+			get rid of the modification log. */
+			page_create_zip(page_cur_get_block(cursor), index,
+					page_header_get_field(page, PAGE_LEVEL),
+					0, mtr);
+			ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+			if (page_zip_available(
+				    page_zip, dict_index_is_clust(index),
+				    rec_size, 1)) {
+				goto use_heap;
+			}
+
+			/* The cursor should remain on the page infimum. */
+			return(NULL);
+		} else if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+			/* The page has been freshly compressed, so
+			reorganizing it will not help. */
+		} else if (log_compressed && !reorg_before_insert) {
+			/* Insert into uncompressed page only, and
+			try page_zip_reorganize() afterwards. */
+		} else if (btr_page_reorganize_low(
+				   recv_recovery_is_on(), level,
+				   cursor, index, mtr)) {
+			ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+			if (page_zip_available(
+				    page_zip, dict_index_is_clust(index),
+				    rec_size, 1)) {
+				/* After reorganizing, there is space
+				available. */
+				goto use_heap;
+			}
+		} else {
+			ut_ad(cursor->rec == cursor_rec);
+			return(NULL);
+		}
 
 		/* Try compressing the whole page afterwards. */
-		insert_rec = page_cur_insert_rec_low(*current_rec,
-						     index, rec, offsets,
-						     NULL);
-
-		if (UNIV_LIKELY(insert_rec != NULL)) {
-			insert_rec = page_cur_insert_rec_zip_reorg(
-				current_rec, block, index, insert_rec,
-				page, page_zip, mtr);
-#ifdef UNIV_DEBUG
-			if (insert_rec) {
-				rec_offs_make_valid(
-					insert_rec, index, offsets);
+		insert_rec = page_cur_insert_rec_low(
+			cursor->rec, index, rec, offsets, NULL);
+
+		/* If recovery is on, this implies that the compression
+		of the page was successful during runtime. Had that not
+		been the case or had the redo logging of compressed
+		pages been enabled during runtime then we'd have seen
+		a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we
+		know that we don't need to reorganize the page. We,
+		however, do need to recompress the page. That will
+		happen when the next redo record is read which must
+		be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must
+		contain a valid compression level value.
+		This implies that during recovery from this point till
+		the next redo is applied the uncompressed and
+		compressed versions are not identical and
+		page_zip_validate will fail but that is OK because
+		we call page_zip_validate only after processing
+		all changes to a page under a single mtr during
+		recovery. */
+		if (insert_rec == NULL) {
+			/* Out of space.
+			This should never occur during crash recovery,
+			because the MLOG_COMP_REC_INSERT should only
+			be logged after a successful operation. */
+			ut_ad(!recv_recovery_is_on());
+		} else if (recv_recovery_is_on()) {
+			/* This should be followed by
+			MLOG_ZIP_PAGE_COMPRESS_NO_DATA,
+			which should succeed. */
+			rec_offs_make_valid(insert_rec, index, offsets);
+		} else {
+			ulint	pos = page_rec_get_n_recs_before(insert_rec);
+			ut_ad(pos > 0);
+
+			if (!log_compressed) {
+				if (page_zip_compress(
+					    page_zip, page, index,
+					    level, NULL)) {
+					page_cur_insert_rec_write_log(
+						insert_rec, rec_size,
+						cursor->rec, index, mtr);
+					page_zip_compress_write_log_no_data(
+						level, page, index, mtr);
+
+					rec_offs_make_valid(
+						insert_rec, index, offsets);
+					return(insert_rec);
+				}
+
+				ut_ad(cursor->rec
+				      == (pos > 1
+					  ? page_rec_get_nth(
+						  page, pos - 1)
+					  : page + PAGE_NEW_INFIMUM));
+			} else {
+				/* We are writing entire page images
+				to the log. Reduce the redo log volume
+				by reorganizing the page at the same time. */
+				if (page_zip_reorganize(
+					    cursor->block, index, mtr)) {
+					/* The page was reorganized:
+					Seek to pos. */
+					if (pos > 1) {
+						cursor->rec = page_rec_get_nth(
+							page, pos - 1);
+					} else {
+						cursor->rec = page
+							+ PAGE_NEW_INFIMUM;
+					}
+
+					insert_rec = page + rec_get_next_offs(
+						cursor->rec, TRUE);
+					rec_offs_make_valid(
+						insert_rec, index, offsets);
+					return(insert_rec);
+				}
+
+				/* Theoretically, we could try one
+				last resort of btr_page_reorganize_low()
+				followed by page_zip_available(), but
+				that would be very unlikely to
+				succeed. (If the full reorganized page
+				failed to compress, why would it
+				succeed to compress the page, plus log
+				the insert of this record?) */
 			}
-#endif /* UNIV_DEBUG */
+
+			/* Out of space: restore the page */
+			btr_blob_dbg_remove(page, index, "insert_zip_fail");
+			if (!page_zip_decompress(page_zip, page, FALSE)) {
+				ut_error; /* Memory corrupted? */
+			}
+			ut_ad(page_validate(page, index));
+			btr_blob_dbg_add(page, index, "insert_zip_fail");
+			insert_rec = NULL;
 		}
 
 		return(insert_rec);
@@ -1306,7 +1409,7 @@ page_cur_insert_rec_zip(
 		rec_offs_init(foffsets_);
 
 		foffsets = rec_get_offsets(free_rec, index, foffsets,
-					ULINT_UNDEFINED, &heap);
+					   ULINT_UNDEFINED, &heap);
 		if (rec_offs_size(foffsets) < rec_size) {
 too_small:
 			if (UNIV_LIKELY_NULL(heap)) {
@@ -1414,18 +1517,19 @@ use_heap:
 	rec_offs_make_valid(insert_rec, index, offsets);
 
 	/* 4. Insert the record in the linked list of records */
-	ut_ad(*current_rec != insert_rec);
+	ut_ad(cursor->rec != insert_rec);
 
 	{
 		/* next record after current before the insertion */
-		rec_t*	next_rec = page_rec_get_next(*current_rec);
-		ut_ad(rec_get_status(*current_rec)
+		const rec_t*	next_rec = page_rec_get_next_low(
+			cursor->rec, TRUE);
+		ut_ad(rec_get_status(cursor->rec)
 		      <= REC_STATUS_INFIMUM);
 		ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
 		ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
 
 		page_rec_set_next(insert_rec, next_rec);
-		page_rec_set_next(*current_rec, insert_rec);
+		page_rec_set_next(cursor->rec, insert_rec);
 	}
 
 	page_header_set_field(page, page_zip, PAGE_N_RECS,
@@ -1439,7 +1543,7 @@ use_heap:
 	UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
 			   rec_offs_size(offsets));
 
-	page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec);
+	page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec);
 
 	/* 6. Update the last insertion info in page header */
 
@@ -1453,7 +1557,7 @@ use_heap:
 							PAGE_NO_DIRECTION);
 		page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
 
-	} else if ((last_insert == *current_rec)
+	} else if ((last_insert == cursor->rec)
 		   && (page_header_get_field(page, PAGE_DIRECTION)
 		       != PAGE_LEFT)) {
 
@@ -1506,7 +1610,7 @@ use_heap:
 	/* 9. Write log record of the insert */
 	if (UNIV_LIKELY(mtr != NULL)) {
 		page_cur_insert_rec_write_log(insert_rec, rec_size,
-					      *current_rec, index, mtr);
+					      cursor->rec, index, mtr);
 	}
 
 	return(insert_rec);
@@ -1600,7 +1704,12 @@ page_parse_copy_rec_list_to_created_page(
 #ifndef UNIV_HOTBACKUP
 /*************************************************************//**
 Copies records from page to a newly created page, from a given record onward,
-including that record. Infimum and supremum records are not copied. */
+including that record. Infimum and supremum records are not copied.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
 UNIV_INTERN
 void
 page_copy_rec_list_end_to_created_page(
@@ -1780,9 +1889,9 @@ UNIV_INLINE
 void
 page_cur_delete_rec_write_log(
 /*==========================*/
-	rec_t*		rec,	/*!< in: record to be deleted */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	rec_t*			rec,	/*!< in: record to be deleted */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
 {
 	byte*	log_ptr;
 
@@ -1864,10 +1973,11 @@ UNIV_INTERN
 void
 page_cur_delete_rec(
 /*================*/
-	page_cur_t*	cursor,	/*!< in/out: a page cursor */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	const ulint*	offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	page_cur_t*		cursor,	/*!< in/out: a page cursor */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const ulint*		offsets,/*!< in: rec_get_offsets(
+					cursor->rec, index) */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
 {
 	page_dir_slot_t* cur_dir_slot;
 	page_dir_slot_t* prev_slot;
@@ -1880,8 +1990,6 @@ page_cur_delete_rec(
 	ulint		cur_n_owned;
 	rec_t*		rec;
 
-	ut_ad(cursor && mtr);
-
 	page = page_cur_get_page(cursor);
 	page_zip = page_cur_get_page_zip(cursor);
 
@@ -1896,10 +2004,31 @@ page_cur_delete_rec(
 	current_rec = cursor->rec;
 	ut_ad(rec_offs_validate(current_rec, index, offsets));
 	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+	      == index->id || mtr->inside_ibuf || recv_recovery_is_on());
 
 	/* The record must not be the supremum or infimum record. */
 	ut_ad(page_rec_is_user_rec(current_rec));
 
+	if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) {
+		/* Empty the page, unless we are applying the redo log
+		during crash recovery. During normal operation, the
+		page_create_empty() gets logged as one of MLOG_PAGE_CREATE,
+		MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */
+		ut_ad(page_is_leaf(page));
+		/* Usually, this should be the root page,
+		and the whole index tree should become empty.
+		However, this could also be a call in
+		btr_cur_pessimistic_update() to delete the only
+		record in the page and to insert another one. */
+		page_cur_move_to_next(cursor);
+		ut_ad(page_cur_is_after_last(cursor));
+		page_create_empty(page_cur_get_block(cursor),
+				  const_cast<dict_index_t*>(index), mtr);
+		return;
+	}
+
 	/* Save to local variables some data associated with current_rec */
 	cur_slot_no = page_dir_find_owner_slot(current_rec);
 	ut_ad(cur_slot_no > 0);
@@ -1907,7 +2036,9 @@ page_cur_delete_rec(
 	cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
 
 	/* 0. Write the log record */
-	page_cur_delete_rec_write_log(current_rec, index, mtr);
+	if (mtr != 0) {
+		page_cur_delete_rec_write_log(current_rec, index, mtr);
+	}
 
 	/* 1. Reset the last insert info in the page header and increment
 	the modify clock for the frame */
@@ -1915,9 +2046,13 @@ page_cur_delete_rec(
 	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
 
 	/* The page gets invalid for optimistic searches: increment the
-	frame modify clock */
+	frame modify clock only if there is a mini-transaction covering
+	the change. During IMPORT we allocate local blocks that are not
+	part of the buffer pool. */
 
-	buf_block_modify_clock_inc(page_cur_get_block(cursor));
+	if (mtr != 0) {
+		buf_block_modify_clock_inc(page_cur_get_block(cursor));
+	}
 
 	/* 2. Find the next and the previous record. Note that the cursor is
 	left at the next record. */
@@ -1961,14 +2096,15 @@ page_cur_delete_rec(
 	page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
 
 	/* 6. Free the memory occupied by the record */
-	btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete");
+	btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index),
+				offsets, "delete");
 	page_mem_free(page, page_zip, current_rec, index, offsets);
 
 	/* 7. Now we have decremented the number of owned records of the slot.
 	If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
 	slots. */
 
-	if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) {
+	if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
 		page_dir_balance_slot(page, page_zip, cur_slot_no);
 	}
 
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.cc
index f2ce6c9fe16..2faf804279c 100644
--- a/storage/xtradb/page/page0page.c
+++ b/storage/xtradb/page/page0page.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file page/page0page.c
+@file page/page0page.cc
 Index page routines
 
 Created 2/2/1994 Heikki Tuuri
@@ -222,7 +223,7 @@ page_set_max_trx_id(
 	during a database recovery we assume that the max trx id of every
 	page is the maximum trx id assigned before the crash. */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
@@ -499,7 +500,8 @@ page_create_zip(
 					page is created */
 	dict_index_t*	index,		/*!< in: the index of the page */
 	ulint		level,		/*!< in: the B-tree level of the page */
-	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+	trx_id_t	max_trx_id,	/*!< in: PAGE_MAX_TRX_ID */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	page_t*		page;
 	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
@@ -510,9 +512,11 @@ page_create_zip(
 	ut_ad(dict_table_is_comp(index->table));
 
 	page = page_create_low(block, TRUE);
-	mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level);
+	mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level);
+	mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id);
 
-	if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+	if (!page_zip_compress(page_zip, page, index,
+			       page_zip_level, mtr)) {
 		/* The compression of a newly created page
 		should always succeed. */
 		ut_error;
@@ -521,9 +525,49 @@ page_create_zip(
 	return(page);
 }
 
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+UNIV_INTERN
+void
+page_create_empty(
+/*==============*/
+	buf_block_t*	block,	/*!< in/out: B-tree block */
+	dict_index_t*	index,	/*!< in: the index of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	trx_id_t	max_trx_id = 0;
+	const page_t*	page	= buf_block_get_frame(block);
+	page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
+
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	/* Remember PAGE_MAX_TRX_ID so that it can be written back below. */
+	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+		max_trx_id = page_get_max_trx_id(page);
+		ut_ad(max_trx_id);
+	}
+	/* Re-create the page; page_create_zip() also writes max_trx_id. */
+	if (page_zip) {
+		page_create_zip(block, index,
+				page_header_get_field(page, PAGE_LEVEL),
+				max_trx_id, mtr);
+	} else {
+		page_create(block, mtr, page_is_comp(page));
+		/* page_create() is not passed max_trx_id; set it here. */
+		if (max_trx_id) {
+			page_update_max_trx_id(
+				block, page_zip, max_trx_id, mtr);
+		}
+	}
+}
+
 /*************************************************************//**
 Differs from page_copy_rec_list_end, because this function does not
-touch the lock table and max trx id on page or compress the page. */
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
 UNIV_INTERN
 void
 page_copy_rec_list_end_no_locks(
@@ -598,6 +642,12 @@ page_copy_rec_list_end_no_locks(
 Copies records from page to new_page, from a given record onward,
 including that record. Infimum and supremum records are not copied.
 The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return pointer to the original successor of the infimum record on
 new_page, or NULL on zip overflow (new_block will be decompressed) */
 UNIV_INTERN
@@ -635,7 +685,7 @@ page_copy_rec_list_end(
 	/* Here, "ret" may be pointing to a user record or the
 	predefined supremum record. */
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 	}
 
@@ -655,11 +705,11 @@ page_copy_rec_list_end(
 				       page_get_max_trx_id(page), mtr);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
-		if (UNIV_UNLIKELY
-		    (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+		if (!page_zip_compress(new_page_zip, new_page,
+				       index, page_zip_level, mtr)) {
 			/* Before trying to reorganize the page,
 			store the number of preceding records on the page. */
 			ulint	ret_pos
@@ -671,14 +721,12 @@ page_copy_rec_list_end(
 			that is smaller than "ret"). */
 			ut_a(ret_pos > 0);
 
-			if (UNIV_UNLIKELY
-			    (!page_zip_reorganize(new_block, index, mtr))) {
+			if (!page_zip_reorganize(new_block, index, mtr)) {
 
 				btr_blob_dbg_remove(new_page, index,
 						    "copy_end_reorg_fail");
-				if (UNIV_UNLIKELY
-				    (!page_zip_decompress(new_page_zip,
-							  new_page, FALSE))) {
+				if (!page_zip_decompress(new_page_zip,
+							 new_page, FALSE)) {
 					ut_error;
 				}
 				ut_ad(page_validate(new_page, index));
@@ -710,6 +758,12 @@ page_copy_rec_list_end(
 Copies records from page to new_page, up to the given record,
 NOT including that record. Infimum and supremum records are not copied.
 The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return pointer to the original predecessor of the supremum record on
 new_page, or NULL on zip overflow (new_block will be decompressed) */
 UNIV_INTERN
@@ -742,7 +796,7 @@ page_copy_rec_list_start(
 		return(ret);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 	}
 
@@ -778,14 +832,15 @@ page_copy_rec_list_start(
 				       mtr);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
 		DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
 				goto zip_reorganize;);
 
-		if (UNIV_UNLIKELY
-		    (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+		if (!page_zip_compress(new_page_zip, new_page, index,
+				       page_zip_level, mtr)) {
+
 			ulint	ret_pos;
 #ifndef DBUG_OFF
 zip_reorganize:
@@ -949,13 +1004,38 @@ page_delete_rec_list_end(
 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (page_rec_is_infimum(rec)) {
-		rec = page_rec_get_next(rec);
-	}
-
 	if (page_rec_is_supremum(rec)) {
+		ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+		/* Nothing to do, there are no records bigger than the
+		page supremum. */
+		return;
+	}
 
+	if (recv_recovery_is_on()) {
+		/* If we are replaying a redo log record, we must
+		replay it exactly. Since MySQL 5.6.11, we should be
+		generating a redo log record for page creation if
+		the page would become empty. Thus, this branch should
+		only be executed when applying redo log that was
+		generated by an older version of MySQL. */
+	} else if (page_rec_is_infimum(rec)
+		   || n_recs == page_get_n_recs(page)) {
+delete_all:
+		/* We are deleting all records. */
+		page_create_empty(block, index, mtr);
 		return;
+	} else if (page_is_comp(page)) {
+		if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) {
+			/* We are deleting everything from the first
+			user record onwards. */
+			goto delete_all;
+		}
+	} else {
+		if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) {
+			/* We are deleting everything from the first
+			user record onwards. */
+			goto delete_all;
+		}
 	}
 
 	/* Reset the last insert info in the page header and increment
@@ -972,7 +1052,7 @@ page_delete_rec_list_end(
 				       ? MLOG_COMP_LIST_END_DELETE
 				       : MLOG_LIST_END_DELETE, mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ulint		log_mode;
 
 		ut_a(page_is_comp(page));
@@ -1134,7 +1214,12 @@ page_delete_rec_list_start(
 #endif /* UNIV_ZIP_DEBUG */
 
 	if (page_rec_is_infimum(rec)) {
+		return;
+	}
 
+	if (page_rec_is_supremum(rec)) {
+		/* We are deleting all records. */
+		page_create_empty(block, index, mtr);
 		return;
 	}
 
@@ -1172,6 +1257,12 @@ page_delete_rec_list_start(
 /*************************************************************//**
 Moves record list end to another page. Moved records include
 split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return TRUE on success; FALSE on compression failure (new_block will
 be decompressed) */
 UNIV_INTERN
@@ -1227,6 +1318,12 @@ page_move_rec_list_end(
 /*************************************************************//**
 Moves record list start to another page. Moved records do not include
 split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
 @return	TRUE on success; FALSE on compression failure */
 UNIV_INTERN
 ibool
@@ -1572,7 +1669,7 @@ page_rec_get_n_recs_before(
 	n--;
 
 	ut_ad(n >= 0);
-	ut_ad((ulint)n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
+	ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
 
 	return((ulint) n);
 }
@@ -2322,12 +2419,26 @@ page_validate(
 		}
 	}
 
+	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)
+	    && !page_is_empty(page)) {
+		trx_id_t	max_trx_id	= page_get_max_trx_id(page);
+		trx_id_t	sys_max_trx_id	= trx_sys_get_max_trx_id();
+
+		if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"PAGE_MAX_TRX_ID out of bounds: "
+				TRX_ID_FMT ", " TRX_ID_FMT,
+				max_trx_id, sys_max_trx_id);
+			goto func_exit2;
+		}
+	}
+
 	heap = mem_heap_create(UNIV_PAGE_SIZE + 200);
 
 	/* The following buffer is used to check that the
 	records in the page record heap do not overlap */
 
-	buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE));
 
 	/* Check first that the record heap and the directory do not
 	overlap. */
@@ -2337,7 +2448,7 @@ page_validate(
 	if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
 			    <= page_dir_get_nth_slot(page, n_slots - 1)))) {
 
-		fprintf(stderr, 
+		fprintf(stderr,
 			"InnoDB: Record heap and dir overlap"
 			" on space %lu page %lu index %s, %p, %p\n",
 			(ulong) page_get_space_id(page),
@@ -2380,7 +2491,7 @@ page_validate(
 			if (UNIV_UNLIKELY
 			    (1 != cmp_rec_rec(rec, old_rec,
 					      offsets, old_offsets, index))) {
-				fprintf(stderr, 
+				fprintf(stderr,
 					"InnoDB: Records in wrong order"
 					" on space %lu page %lu index %s\n",
 					(ulong) page_get_space_id(page),
@@ -2551,7 +2662,7 @@ func_exit:
 
 	if (UNIV_UNLIKELY(ret == FALSE)) {
 func_exit2:
-		fprintf(stderr, 
+		fprintf(stderr,
 			"InnoDB: Apparent corruption"
 			" in space %lu page %lu index %s\n",
 			(ulong) page_get_space_id(page),
@@ -2611,3 +2722,60 @@ page_find_rec_with_heap_no(
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************//**
+Removes the record from a leaf page unless the page would become too
+empty. Does not log any changes. Used by the IMPORT tablespace functions.
+The cursor is moved to the next record after the deleted one.
+@return	true if the record was deleted, false if it was left in place */
+UNIV_INTERN
+bool
+page_delete_rec(
+/*============*/
+	const dict_index_t*	index,	/*!< in: The index that the record
+					belongs to */
+	page_cur_t*		pcur,	/*!< in/out: page cursor on record
+					to delete */
+	page_zip_des_t*		page_zip,/*!< in: compressed page descriptor */
+	const ulint*		offsets)/*!< in: offsets for record */
+{
+	bool		no_compress_needed;
+	buf_block_t*	block = pcur->block;
+	page_t*		page = buf_block_get_frame(block);
+
+	ut_ad(page_is_leaf(page));
+	/* Records with externally stored columns are always deleted. */
+	if (!rec_offs_any_extern(offsets)
+	    && ((page_get_data_size(page) - rec_offs_size(offsets)
+		< BTR_CUR_PAGE_COMPRESS_LIMIT)
+		|| (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL
+		    && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)
+		|| (page_get_n_recs(page) < 2))) {
+
+		ulint	root_page_no = dict_index_get_page(index);
+
+		/* The page fillfactor will drop below a predefined
+		minimum value, OR the level in the B-tree contains just
+		one page, OR the page will become empty: we recommend
+		compression if this is not the root page. */
+
+		no_compress_needed = page_get_page_no(page) == root_page_no;
+	} else {
+		no_compress_needed = true;
+	}
+
+	if (no_compress_needed) {
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+		/* Last argument 0: presumably a NULL mtr (no redo logging). */
+		page_cur_delete_rec(pcur, index, offsets, 0);
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+	}
+
+	return(no_compress_needed);
+}
+
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.cc
index 40d794770ff..81c9e0ab45a 100644
--- a/storage/xtradb/page/page0zip.c
+++ b/storage/xtradb/page/page0zip.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,18 +12,21 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file page/page0zip.c
+@file page/page0zip.cc
 Compressed page interface
 
 Created June 2005 by Marko Makela
 *******************************************************/
 
+#include <map>
+using namespace std;
+
 #define THIS_MODULE
 #include "page0zip.h"
 #ifdef UNIV_NONINL
@@ -38,20 +42,39 @@ Created June 2005 by Marko Makela
 #include "log0recv.h"
 #include "zlib.h"
 #ifndef UNIV_HOTBACKUP
+# include "buf0buf.h"
 # include "buf0lru.h"
 # include "btr0sea.h"
 # include "dict0boot.h"
 # include "lock0lock.h"
+# include "srv0mon.h"
+# include "srv0srv.h"
+# include "ut0crc32.h"
 #else /* !UNIV_HOTBACKUP */
+# include "buf0checksum.h"
 # define lock_move_reorganize_page(block, temp_block)	((void) 0)
 # define buf_LRU_stat_inc_unzip()			((void) 0)
 #endif /* !UNIV_HOTBACKUP */
 
 #ifndef UNIV_HOTBACKUP
 /** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
-UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+UNIV_INTERN page_zip_stat_t		page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+UNIV_INTERN page_zip_stat_per_index_t	page_zip_stat_per_index;
+/** Mutex protecting page_zip_stat_per_index */
+UNIV_INTERN ib_mutex_t			page_zip_stat_per_index_mutex;
+#ifdef HAVE_PSI_INTERFACE
+UNIV_INTERN mysql_pfs_key_t		page_zip_stat_per_index_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
 #endif /* !UNIV_HOTBACKUP */
 
+/* Compression level to be used by zlib. Settable by user. */
+UNIV_INTERN uint	page_zip_level = DEFAULT_COMPRESSION_LEVEL;
+
+/* Whether or not to log compressed page images to avoid depending on
+possible compression algorithm changes in zlib. */
+UNIV_INTERN my_bool	page_zip_log_pages = true;
+
 /* Please refer to ../include/page0zip.ic for a description of the
 compressed page format. */
 
@@ -381,7 +404,7 @@ page_zip_get_n_prev_extern(
 					compressed page */
 	const rec_t*		rec,	/*!< in: compact physical record
 					on a B-tree leaf page */
-	dict_index_t*		index)	/*!< in: record descriptor */
+	const dict_index_t*	index)	/*!< in: record descriptor */
 {
 	const page_t*	page	= page_align(rec);
 	ulint		n_ext	= 0;
@@ -632,15 +655,15 @@ page_zip_dir_encode(
 #if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
 # error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
 #endif
-#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1
-# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1"
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1
+# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1"
 #endif
 		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
 			offs |= PAGE_ZIP_DIR_SLOT_OWNED;
 		}
 
 		info_bits = rec_get_info_bits(rec, TRUE);
-		if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) {
+		if (info_bits & REC_INFO_DELETED_FLAG) {
 			info_bits &= ~REC_INFO_DELETED_FLAG;
 			offs |= PAGE_ZIP_DIR_SLOT_DEL;
 		}
@@ -691,6 +714,8 @@ page_zip_dir_encode(
 	ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
 }
 
+extern "C" {
+
 /**********************************************************************//**
 Allocate memory for zlib. */
 static
@@ -701,7 +726,7 @@ page_zip_zalloc(
 	uInt	items,	/*!< in: number of items to allocate */
 	uInt	size)	/*!< in: size of an item in bytes */
 {
-	return(mem_heap_zalloc(opaque, items * size));
+	return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
 }
 
 /**********************************************************************//**
@@ -715,6 +740,8 @@ page_zip_free(
 {
 }
 
+} /* extern "C" */
+
 /**********************************************************************//**
 Configure the zlib allocator to use the given memory heap. */
 UNIV_INTERN
@@ -724,7 +751,7 @@ page_zip_set_alloc(
 	void*		stream,		/*!< in/out: zlib stream */
 	mem_heap_t*	heap)		/*!< in: memory heap to use */
 {
-	z_stream*	strm = stream;
+	z_stream*	strm = static_cast<z_stream*>(stream);
 
 	strm->zalloc = page_zip_zalloc;
 	strm->zfree = page_zip_free;
@@ -1089,7 +1116,7 @@ page_zip_compress_clust(
 		/* Check if there are any externally stored columns.
 		For each externally stored column, store the
 		BTR_EXTERN_FIELD_REF separately. */
-		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+		if (rec_offs_any_extern(offsets)) {
 			ut_ad(dict_index_is_clust(index));
 
 			err = page_zip_compress_clust_ext(
@@ -1173,6 +1200,7 @@ page_zip_compress(
 				m_start, m_end, m_nonempty */
 	const page_t*	page,	/*!< in: uncompressed page */
 	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
 	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
 {
 	z_stream	c_stream;
@@ -1186,7 +1214,6 @@ page_zip_compress(
 	const rec_t**	recs;	/*!< dense page directory, sorted by address */
 	mem_heap_t*	heap;
 	ulint		trx_id_col;
-	ulint*		offsets	= NULL;
 	ulint		n_blobs	= 0;
 	byte*		storage;/* storage of uncompressed columns */
 #ifndef UNIV_HOTBACKUP
@@ -1195,6 +1222,10 @@ page_zip_compress(
 #ifdef PAGE_ZIP_COMPRESS_DBG
 	FILE*		logfile = NULL;
 #endif
+	/* A local copy of srv_cmp_per_index_enabled to avoid reading that
+	variable multiple times in this function since it can be changed at
+	any time. */
+	my_bool		cmp_per_index_enabled = srv_cmp_per_index_enabled;
 
 	if (!page) {
 		return(FALSE);
@@ -1220,7 +1251,7 @@ page_zip_compress(
 	ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
 		     supremum_extra_data, sizeof supremum_extra_data));
 
-	if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+	if (page_is_empty(page)) {
 		ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
 		     == PAGE_NEW_SUPREMUM);
 	}
@@ -1237,7 +1268,7 @@ page_zip_compress(
 	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
 		fprintf(stderr, "compress %p %p %lu %lu %lu\n",
 			(void*) page_zip, (void*) page,
-			page_is_leaf(page),
+			(ibool) page_is_leaf(page),
 			n_fields, n_dense);
 	}
 	if (UNIV_UNLIKELY(page_zip_compress_log)) {
@@ -1261,6 +1292,11 @@ page_zip_compress(
 #endif /* PAGE_ZIP_COMPRESS_DBG */
 #ifndef UNIV_HOTBACKUP
 	page_zip_stat[page_zip->ssize - 1].compressed++;
+	if (cmp_per_index_enabled) {
+		mutex_enter(&page_zip_stat_per_index_mutex);
+		page_zip_stat_per_index[index->id].compressed++;
+		mutex_exit(&page_zip_stat_per_index_mutex);
+	}
 #endif /* !UNIV_HOTBACKUP */
 
 	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
@@ -1269,24 +1305,30 @@ page_zip_compress(
 		goto err_exit;
 	}
 
+	MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
 	heap = mem_heap_create(page_zip_get_size(page_zip)
-			       + n_fields * (2 + sizeof *offsets)
+			       + n_fields * (2 + sizeof(ulint))
+			       + REC_OFFS_HEADER_SIZE
 			       + n_dense * ((sizeof *recs)
 					    - PAGE_ZIP_DIR_SLOT_SIZE)
 			       + UNIV_PAGE_SIZE * 4
 			       + (512 << MAX_MEM_LEVEL));
 
-	recs = mem_heap_zalloc(heap, n_dense * sizeof *recs);
+	recs = static_cast<const rec_t**>(
+		mem_heap_zalloc(heap, n_dense * sizeof *recs));
 
-	fields = mem_heap_alloc(heap, (n_fields + 1) * 2);
+	fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+	buf = static_cast<byte*>(
+		mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
 
-	buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA);
 	buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
 
 	/* Compress the data payload. */
 	page_zip_set_alloc(&c_stream, heap);
 
-	err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+	err = deflateInit2(&c_stream, level,
 			   Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT,
 			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
 	ut_a(err == Z_OK);
@@ -1399,8 +1441,19 @@ err_exit:
 		}
 #endif /* PAGE_ZIP_COMPRESS_DBG */
 #ifndef UNIV_HOTBACKUP
+		if (page_is_leaf(page)) {
+			dict_index_zip_failure(index);
+		}
+
+		ullint	time_diff = ut_time_us(NULL) - usec;
 		page_zip_stat[page_zip->ssize - 1].compressed_usec
-			+= ut_time_us(NULL) - usec;
+			+= time_diff;
+		if (cmp_per_index_enabled) {
+			mutex_enter(&page_zip_stat_per_index_mutex);
+			page_zip_stat_per_index[index->id].compressed_usec
+				+= time_diff;
+			mutex_exit(&page_zip_stat_per_index_mutex);
+		}
 #endif /* !UNIV_HOTBACKUP */
 		return(FALSE);
 	}
@@ -1460,11 +1513,18 @@ err_exit:
 	}
 #endif /* PAGE_ZIP_COMPRESS_DBG */
 #ifndef UNIV_HOTBACKUP
-	{
-		page_zip_stat_t*	zip_stat
-			= &page_zip_stat[page_zip->ssize - 1];
-		zip_stat->compressed_ok++;
-		zip_stat->compressed_usec += ut_time_us(NULL) - usec;
+	ullint	time_diff = ut_time_us(NULL) - usec;
+	page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+	page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+	if (cmp_per_index_enabled) {
+		mutex_enter(&page_zip_stat_per_index_mutex);
+		page_zip_stat_per_index[index->id].compressed_ok++;
+		page_zip_stat_per_index[index->id].compressed_usec += time_diff;
+		mutex_exit(&page_zip_stat_per_index_mutex);
+	}
+
+	if (page_is_leaf(page)) {
+		dict_index_zip_success(index);
 	}
 #endif /* !UNIV_HOTBACKUP */
 
@@ -1509,6 +1569,7 @@ page_zip_fields_free(
 {
 	if (index) {
 		dict_table_t*	table = index->table;
+		os_fast_mutex_free(&index->zip_pad.mutex);
 		mem_heap_free(index->heap);
 		mutex_free(&(table->autoinc_mutex));
 		ut_free(table->name);
@@ -1560,7 +1621,7 @@ page_zip_fields_decode(
 	}
 
 	table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
-				      DICT_TF_COMPACT);
+				      DICT_TF_COMPACT, 0);
 	index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
 				      DICT_HDR_SPACE, 0, n);
 	index->table = table;
@@ -1752,7 +1813,7 @@ page_zip_set_extra_bytes(
 	for (i = 0; i < n; i++) {
 		offs = page_zip_dir_get(page_zip, i);
 
-		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) {
+		if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
 			info_bits |= REC_INFO_DELETED_FLAG;
 		}
 		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
@@ -2117,6 +2178,32 @@ page_zip_apply_log(
 }
 
 /**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream. Nothing is modified on failure.
+@return	TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t*		rec,		/*!< in/out: record */
+	ulint&		heap_status)	/*!< in/out: heap_no and status bits */
+{
+	if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+		/* n_dense has grown since the page was last compressed. */
+		return(FALSE);
+	}
+
+	/* Skip the REC_N_NEW_EXTRA_BYTES. */
+	d_stream->next_out = rec;
+
+	/* Set heap_no and the status bits, then advance heap_no by one. */
+	mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+	heap_status += 1 << REC_HEAP_NO_SHIFT;
+	return(TRUE);
+}
+
+/**********************************************************************//**
 Decompress the records of a node pointer page.
 @return	TRUE on success, FALSE on failure */
 static
@@ -2152,19 +2239,8 @@ page_zip_decompress_node_ptrs(
 		      - PAGE_ZIP_START - PAGE_DIR);
 		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
 		case Z_STREAM_END:
-			if (d_stream->next_out
-			    != rec - REC_N_NEW_EXTRA_BYTES) {
-				/* n_dense has grown since the page
-				was last compressed. */
-			} else {
-				/* Skip the REC_N_NEW_EXTRA_BYTES. */
-				d_stream->next_out = rec;
-
-				/* Set heap_no and the status bits. */
-				mach_write_to_2(rec - REC_NEW_HEAP_NO,
-						heap_status);
-				heap_status += 1 << REC_HEAP_NO_SHIFT;
-			}
+			page_zip_decompress_heap_no(
+				d_stream, rec, heap_status);
 			goto zlib_done;
 		case Z_OK:
 		case Z_BUF_ERROR:
@@ -2179,12 +2255,10 @@ page_zip_decompress_node_ptrs(
 			goto zlib_error;
 		}
 
-		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
-		/* Prepare to decompress the data bytes. */
-		d_stream->next_out = rec;
-		/* Set heap_no and the status bits. */
-		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
-		heap_status += 1 << REC_HEAP_NO_SHIFT;
+		if (!page_zip_decompress_heap_no(
+			    d_stream, rec, heap_status)) {
+			ut_ad(0);
+		}
 
 		/* Read the offsets. The status bits are needed here. */
 		offsets = rec_get_offsets(rec, index, offsets,
@@ -2352,19 +2426,8 @@ page_zip_decompress_sec(
 		if (UNIV_LIKELY(d_stream->avail_out)) {
 			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
 			case Z_STREAM_END:
-				if (d_stream->next_out
-				    != rec - REC_N_NEW_EXTRA_BYTES) {
-					/* n_dense has grown since the page
-					was last compressed. */
-				} else {
-					/* Skip the REC_N_NEW_EXTRA_BYTES. */
-					d_stream->next_out = rec;
-
-					/* Set heap_no and the status bits. */
-					mach_write_to_2(rec - REC_NEW_HEAP_NO,
-							heap_status);
-					heap_status += 1 << REC_HEAP_NO_SHIFT;
-				}
+				page_zip_decompress_heap_no(
+					d_stream, rec, heap_status);
 				goto zlib_done;
 			case Z_OK:
 			case Z_BUF_ERROR:
@@ -2380,15 +2443,10 @@ page_zip_decompress_sec(
 			}
 		}
 
-		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
-
-		/* Skip the REC_N_NEW_EXTRA_BYTES. */
-
-		d_stream->next_out = rec;
-
-		/* Set heap_no and the status bits. */
-		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
-		heap_status += 1 << REC_HEAP_NO_SHIFT;
+		if (!page_zip_decompress_heap_no(
+			    d_stream, rec, heap_status)) {
+			ut_ad(0);
+		}
 	}
 
 	/* Decompress the data of the last record and any trailing garbage,
@@ -2622,19 +2680,8 @@ page_zip_decompress_clust(
 		err = inflate(d_stream, Z_SYNC_FLUSH);
 		switch (err) {
 		case Z_STREAM_END:
-			if (d_stream->next_out
-			    != rec - REC_N_NEW_EXTRA_BYTES) {
-				/* n_dense has grown since the page
-				was last compressed. */
-			} else {
-				/* Skip the REC_N_NEW_EXTRA_BYTES. */
-				d_stream->next_out = rec;
-
-				/* Set heap_no and the status bits. */
-				mach_write_to_2(rec - REC_NEW_HEAP_NO,
-						heap_status);
-				heap_status += 1 << REC_HEAP_NO_SHIFT;
-			}
+			page_zip_decompress_heap_no(
+				d_stream, rec, heap_status);
 			goto zlib_done;
 		case Z_OK:
 		case Z_BUF_ERROR:
@@ -2649,12 +2696,10 @@ page_zip_decompress_clust(
 			goto zlib_error;
 		}
 
-		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
-		/* Prepare to decompress the data bytes. */
-		d_stream->next_out = rec;
-		/* Set heap_no and the status bits. */
-		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
-		heap_status += 1 << REC_HEAP_NO_SHIFT;
+		if (!page_zip_decompress_heap_no(
+			    d_stream, rec, heap_status)) {
+			ut_ad(0);
+		}
 
 		/* Read the offsets. The status bits are needed here. */
 		offsets = rec_get_offsets(rec, index, offsets,
@@ -2666,7 +2711,7 @@ page_zip_decompress_clust(
 		For each externally stored column, restore the
 		BTR_EXTERN_FIELD_REF separately. */
 
-		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+		if (rec_offs_any_extern(offsets)) {
 			if (UNIV_UNLIKELY
 			    (!page_zip_decompress_clust_ext(
 				    d_stream, rec, offsets, trx_id_col))) {
@@ -2931,7 +2976,9 @@ page_zip_decompress(
 	}
 
 	heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
-	recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs));
+
+	recs = static_cast<rec_t**>(
+		mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)));
 
 	if (all) {
 		/* Copy the page header. */
@@ -2975,7 +3022,7 @@ zlib_error:
 	/* Copy the infimum and supremum records. */
 	memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
 	       infimum_extra, sizeof infimum_extra);
-	if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+	if (page_is_empty(page)) {
 		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
 				      PAGE_NEW_SUPREMUM);
 	} else {
@@ -3033,7 +3080,10 @@ zlib_error:
 		/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
 		ulint	n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
 			+ dict_index_get_n_fields(index);
-		offsets = mem_heap_alloc(heap, n * sizeof(ulint));
+
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(heap, n * sizeof(ulint)));
+
 		*offsets = n;
 	}
 
@@ -3093,17 +3143,25 @@ err_exit:
 	page_zip_fields_free(index);
 	mem_heap_free(heap);
 #ifndef UNIV_HOTBACKUP
-	{
-		page_zip_stat_t*	zip_stat
-			= &page_zip_stat[page_zip->ssize - 1];
-		zip_stat->decompressed++;
-		zip_stat->decompressed_usec += ut_time_us(NULL) - usec;
+	ullint	time_diff = ut_time_us(NULL) - usec;
+	page_zip_stat[page_zip->ssize - 1].decompressed++;
+	page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+	index_id_t	index_id = btr_page_get_index_id(page);
+
+	if (srv_cmp_per_index_enabled) {
+		mutex_enter(&page_zip_stat_per_index_mutex);
+		page_zip_stat_per_index[index_id].decompressed++;
+		page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+		mutex_exit(&page_zip_stat_per_index_mutex);
 	}
 #endif /* !UNIV_HOTBACKUP */
 
 	/* Update the stat counter for LRU policy. */
 	buf_LRU_stat_inc_unzip();
 
+	MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
 	return(TRUE);
 }
 
@@ -3118,7 +3176,7 @@ page_zip_hexdump_func(
 	const void*	buf,	/*!< in: data */
 	ulint		size)	/*!< in: length of the data, in bytes */
 {
-	const byte*	s	= buf;
+	const byte*	s	= static_cast<const byte*>(buf);
 	ulint		addr;
 	const ulint	width	= 32; /* bytes per line */
 
@@ -3185,15 +3243,15 @@ page_zip_validate_low(
 
 	/* page_zip_decompress() expects the uncompressed page to be
 	UNIV_PAGE_SIZE aligned. */
-	temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
-	temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE);
+	temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+	temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE));
 
 #ifdef UNIV_DEBUG_VALGRIND
 	/* Get detailed information on the valid bits in case the
 	UNIV_MEM_ASSERT_RW() checks fail.  The v-bits of page[],
 	page_zip->data[] or page_zip could be viewed at temp_page[] or
 	temp_page_zip in a debugger when running valgrind --db-attach. */
-	VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
+	(void) VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
 	UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
 # if UNIV_WORD_SIZE == 4
 	VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip);
@@ -3202,8 +3260,8 @@ page_zip_validate_low(
 	pad bytes. */
 	UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip);
 # endif
-	VALGRIND_GET_VBITS(page_zip->data, temp_page,
-			   page_zip_get_size(page_zip));
+	(void) VALGRIND_GET_VBITS(page_zip->data, temp_page,
+				  page_zip_get_size(page_zip));
 	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
 #endif /* UNIV_DEBUG_VALGRIND */
 
@@ -4005,6 +4063,7 @@ page_zip_write_trx_id_and_roll_ptr(
 	ulint	len;
 
 	ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+
 	ut_ad(page_simple_validate_new(page));
 	ut_ad(page_zip_simple_validate(page_zip));
 	ut_ad(page_zip_get_size(page_zip)
@@ -4057,10 +4116,10 @@ static
 void
 page_zip_clear_rec(
 /*===============*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	byte*		rec,	/*!< in: record to clear */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	byte*		rec,		/*!< in: record to clear */
+	const dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets)	/*!< in: rec_get_offsets(rec, index) */
 {
 	ulint	heap_no;
 	page_t*	page	= page_align(rec);
@@ -4271,11 +4330,12 @@ UNIV_INTERN
 void
 page_zip_dir_delete(
 /*================*/
-	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
-	byte*		rec,	/*!< in: record to delete */
-	dict_index_t*	index,	/*!< in: index of rec */
-	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
-	const byte*	free)	/*!< in: previous start of the free list */
+	page_zip_des_t*		page_zip,	/*!< in/out: compressed page */
+	byte*			rec,		/*!< in: deleted record */
+	const dict_index_t*	index,		/*!< in: index of rec */
+	const ulint*		offsets,	/*!< in: rec_get_offsets(rec) */
+	const byte*		free)		/*!< in: previous start of
+						the free list */
 {
 	byte*	slot_rec;
 	byte*	slot_free;
@@ -4389,7 +4449,7 @@ page_zip_dir_add_slot(
 	if (!page_is_leaf(page_zip->data)) {
 		ut_ad(!page_zip->n_blobs);
 		stored = dir - n_dense * REC_NODE_PTR_SIZE;
-	} else if (UNIV_UNLIKELY(is_clustered)) {
+	} else if (is_clustered) {
 		/* Move the BLOB pointer array backwards to make space for the
 		roll_ptr and trx_id columns and the dense directory slot. */
 		byte*	externs;
@@ -4591,7 +4651,7 @@ page_zip_reorganize(
 	/* Restore logging. */
 	mtr_set_log_mode(mtr, log_mode);
 
-	if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+	if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
 
 #ifndef UNIV_HOTBACKUP
 		buf_block_free(temp_block);
@@ -4771,21 +4831,113 @@ ulint
 page_zip_calc_checksum(
 /*===================*/
 	const void*	data,	/*!< in: compressed page */
-	ulint		size)	/*!< in: size of compressed page */
+	ulint		size,	/*!< in: size of compressed page */
+	srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
 {
+	uLong		adler;
+	ib_uint32_t	crc32;
+	const Bytef*	s = static_cast<const byte*>(data);
+
 	/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
 	and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
 
-	const Bytef*	s	= data;
-	uLong		adler;
+	switch (algo) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		crc32 = ut_crc32(s + FIL_PAGE_OFFSET,
+				 FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+			^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+			^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				   size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		return((ulint) crc32);
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		adler = adler32(0L, s + FIL_PAGE_OFFSET,
+				FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+		adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+		adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		return((ulint) adler);
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return(BUF_NO_CHECKSUM_MAGIC);
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
+
+	ut_error;
+	return(0);
+}
+
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return	TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+	const void*	data,	/*!< in: compressed page */
+	ulint		size)	/*!< in: size of compressed page */
+{
+	ib_uint32_t	stored;
+	ib_uint32_t	calc;
+	ib_uint32_t	crc32 = 0 /* silence bogus warning */;
+	ib_uint32_t	innodb = 0 /* silence bogus warning */;
 
-	ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	stored = mach_read_from_4(
+		(const unsigned char*) data + FIL_PAGE_SPACE_OR_CHKSUM);
 
-	adler = adler32(0L, s + FIL_PAGE_OFFSET,
-			FIL_PAGE_LSN - FIL_PAGE_OFFSET);
-	adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
-	adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-			size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	/* declare empty pages non-corrupted */
+	if (stored == 0) {
+		/* make sure that the page is really empty */
+		ut_d(ulint i; for (i = 0; i < size; i++) {
+		     ut_a(*((const char*) data + i) == 0); });
+
+		return(TRUE);
+	}
+
+	calc = page_zip_calc_checksum(
+		data, size, static_cast<srv_checksum_algorithm_t>(
+			srv_checksum_algorithm));
+
+	if (stored == calc) {
+		return(TRUE);
+	}
+
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return(stored == calc);
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return(TRUE);
+		}
+		crc32 = calc;
+		innodb = page_zip_calc_checksum(
+			data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return(TRUE);
+		}
+		crc32 = page_zip_calc_checksum(
+			data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+		innodb = calc;
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		return(TRUE);
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
 
-	return((ulint) adler);
+	return(stored == crc32 || stored == innodb);
 }
diff --git a/storage/xtradb/pars/lexyy.c b/storage/xtradb/pars/lexyy.cc
index b4a7289a4cb..16458dda637 100644
--- a/storage/xtradb/pars/lexyy.c
+++ b/storage/xtradb/pars/lexyy.cc
@@ -1,25 +1,7 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
 #include "univ.i"
-#line 2 "lexyy.c"
+#line 2 "lexyy.cc"
 
-#line 4 "lexyy.c"
+#line 4 "lexyy.cc"
 
 #define  YY_INT_ALIGNED short int
 
@@ -28,7 +10,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #define FLEX_SCANNER
 #define YY_FLEX_MAJOR_VERSION 2
 #define YY_FLEX_MINOR_VERSION 5
-#define YY_FLEX_SUBMINOR_VERSION 31
+#define YY_FLEX_SUBMINOR_VERSION 35
 #if YY_FLEX_SUBMINOR_VERSION > 0
 #define FLEX_BETA
 #endif
@@ -50,7 +32,15 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 /* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
 
-#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
 #include <inttypes.h>
 typedef int8_t flex_int8_t;
 typedef uint8_t flex_uint8_t;
@@ -62,10 +52,9 @@ typedef uint32_t flex_uint32_t;
 typedef signed char flex_int8_t;
 typedef short int flex_int16_t;
 typedef int flex_int32_t;
-typedef unsigned char flex_uint8_t; 
+typedef unsigned char flex_uint8_t;
 typedef unsigned short int flex_uint16_t;
 typedef unsigned int flex_uint32_t;
-#endif /* ! C99 */
 
 /* Limits of integral types. */
 #ifndef INT8_MIN
@@ -96,6 +85,8 @@ typedef unsigned int flex_uint32_t;
 #define UINT32_MAX             (4294967295U)
 #endif
 
+#endif /* ! C99 */
+
 #endif /* ! FLEXINT_H */
 
 #ifdef __cplusplus
@@ -105,11 +96,12 @@ typedef unsigned int flex_uint32_t;
 
 #else	/* ! __cplusplus */
 
-#if __STDC__
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
 
 #define YY_USE_CONST
 
-#endif	/* __STDC__ */
+#endif	/* defined (__STDC__) */
 #endif	/* ! __cplusplus */
 
 #ifdef YY_USE_CONST
@@ -151,24 +143,41 @@ typedef unsigned int flex_uint32_t;
 
 /* Size of default input buffer. */
 #ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
 #define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
 #endif
 
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
 #ifndef YY_TYPEDEF_YY_BUFFER_STATE
 #define YY_TYPEDEF_YY_BUFFER_STATE
 typedef struct yy_buffer_state *YY_BUFFER_STATE;
 #endif
 
-static int yyleng;
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
 
-static FILE *yyin, *yyout;
+extern yy_size_t yyleng;
+
+extern FILE *yyin, *yyout;
 
 #define EOB_ACT_CONTINUE_SCAN 0
 #define EOB_ACT_END_OF_FILE 1
 #define EOB_ACT_LAST_MATCH 2
 
     #define YY_LESS_LINENO(n)
-    
+
 /* Return all but the first "n" matched characters back to the input stream. */
 #define yyless(n) \
 	do \
@@ -185,16 +194,6 @@ static FILE *yyin, *yyout;
 
 #define unput(c) yyunput( c, (yytext_ptr)  )
 
-/* The following is because we cannot portably get our hands on size_t
- * (without autoconf's help, which isn't available because we want
- * flex-generated scanners to compile on their own).
- */
-
-#ifndef YY_TYPEDEF_YY_SIZE_T
-#define YY_TYPEDEF_YY_SIZE_T
-typedef unsigned int yy_size_t;
-#endif
-
 #ifndef YY_STRUCT_YY_BUFFER_STATE
 #define YY_STRUCT_YY_BUFFER_STATE
 struct yy_buffer_state
@@ -212,7 +211,7 @@ struct yy_buffer_state
 	/* Number of characters read into yy_ch_buf, not including EOB
 	 * characters.
 	 */
-	int yy_n_chars;
+	yy_size_t yy_n_chars;
 
 	/* Whether we "own" the buffer - i.e., we know we created it,
 	 * and can realloc() it to grow it, and should free() it to
@@ -235,7 +234,7 @@ struct yy_buffer_state
 
     int yy_bs_lineno; /**< The line count. */
     int yy_bs_column; /**< The column count. */
-    
+
 	/* Whether to try to fill the input buffer when we reach the
 	 * end of it.
 	 */
@@ -282,12 +281,12 @@ static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */
 
 /* yy_hold_char holds the character lost when yytext is formed. */
 static char yy_hold_char;
-static int yy_n_chars;		/* number of characters read into yy_ch_buf */
-static int yyleng;
+static yy_size_t yy_n_chars;		/* number of characters read into yy_ch_buf */
+yy_size_t yyleng;
 
 /* Points to current character in buffer. */
 static char *yy_c_buf_p = (char *) 0;
-static int yy_init = 1;		/* whether we need to initialize */
+static int yy_init = 0;		/* whether we need to initialize */
 static int yy_start = 0;	/* start state number */
 
 /* Flag which is used to allow yywrap()'s to do buffer switches
@@ -295,13 +294,13 @@ static int yy_start = 0;	/* start state number */
  */
 static int yy_did_buffer_switch_on_eof;
 
-static void yyrestart (FILE *input_file  );
+void yyrestart (FILE *input_file  );
 __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer  );
 static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size  );
-static void yy_delete_buffer (YY_BUFFER_STATE b  );
-static void yy_flush_buffer (YY_BUFFER_STATE b  );
-__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer  );
-__attribute__((unused)) static void yypop_buffer_state (void );
+void yy_delete_buffer (YY_BUFFER_STATE b  );
+void yy_flush_buffer (YY_BUFFER_STATE b  );
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer  );
+void yypop_buffer_state (void );
 
 static void yyensure_buffer_stack (void );
 static void yy_load_buffer_state (void );
@@ -311,11 +310,11 @@ static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file  );
 
 YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size  );
 YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str  );
-YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len  );
+YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len  );
 
-static void *yyalloc (yy_size_t  );
-static void *yyrealloc (void *,yy_size_t  );
-static void yyfree (void *  );
+void *yyalloc (yy_size_t  );
+void *yyrealloc (void *,yy_size_t  );
+void yyfree (void *  );
 
 #define yy_new_buffer yy_create_buffer
 
@@ -348,15 +347,15 @@ static void yyfree (void *  );
 
 typedef unsigned char YY_CHAR;
 
-static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
+FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
 
 typedef int yy_state_type;
 
-static int yylineno;
+extern int yylineno;
 
-static int yylineno = 1;
+int yylineno = 1;
 
-static char *yytext;
+extern char *yytext;
 #define yytext_ptr yytext
 
 static yy_state_type yy_get_previous_state (void );
@@ -374,8 +373,8 @@ static void yy_fatal_error (yyconst char msg[]  );
 	*yy_cp = '\0'; \
 	(yy_c_buf_p) = yy_cp;
 
-#define YY_NUM_RULES 119
-#define YY_END_OF_BUFFER 120
+#define YY_NUM_RULES 124
+#define YY_END_OF_BUFFER 125
 /* This struct is not used in this scanner,
    but its presence is necessary. */
 struct yy_trans_info
@@ -383,52 +382,55 @@ struct yy_trans_info
 	flex_int32_t yy_verify;
 	flex_int32_t yy_nxt;
 	};
-static yyconst flex_int16_t yy_accept[399] =
+static yyconst flex_int16_t yy_accept[425] =
     {   0,
-        0,    0,  114,  114,    0,    0,    0,    0,  120,  118,
-      117,  117,    8,  118,  109,    5,   98,  104,  107,  105,
-      102,  106,  118,  108,    1,  118,  103,  101,   99,  100,
-      112,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-      110,  111,  114,  115,    6,    7,    9,   10,  117,    4,
-       93,  113,    2,    1,    3,   94,   95,   97,   96,   92,
-       92,   92,   92,   92,   92,   44,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   28,   17,   25,   92,   92,   92,   92,   92,
-
-       54,   61,   92,   14,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,  114,  115,  115,  116,    6,    7,    9,   10,
-        2,   13,   45,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   27,   92,   92,   92,   41,   92,   92,   92,   92,
-       21,   92,   92,   92,   92,   15,   92,   92,   92,   18,
-       92,   92,   92,   92,   92,   80,   92,   92,   92,   51,
-       92,   12,   92,   36,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   20,   24,
-
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       46,   92,   92,   30,   92,   87,   92,   92,   39,   92,
-       92,   92,   92,   92,   48,   92,   89,   32,   91,   92,
-       11,   64,   92,   92,   92,   42,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   29,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   85,   92,   26,   92,
-       66,   92,   92,   92,   37,   92,   92,   92,   92,   92,
-       92,   92,   31,   65,   23,   92,   57,   92,   75,   92,
-       92,   92,   43,   92,   92,   92,   92,   92,   92,   92,
-       92,   90,   92,   92,   56,   92,   92,   92,   92,   92,
-
-       92,   92,   40,   33,   79,   19,   92,   83,   74,   55,
-       92,   63,   92,   52,   92,   92,   92,   47,   92,   76,
-       92,   78,   92,   92,   34,   92,   92,   92,   35,   72,
-       92,   92,   92,   92,   58,   92,   50,   49,   92,   92,
-       53,   62,   92,   92,   92,   22,   92,   92,   73,   81,
-       92,   92,   77,   92,   68,   92,   92,   92,   92,   38,
-       92,   88,   67,   92,   84,   92,   92,   92,   86,   92,
-       59,   92,   16,   92,   70,   69,   92,   92,   82,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   71,
-       92,   92,   92,   92,   92,   92,   60,    0
-
+        0,    0,  119,  119,    0,    0,    0,    0,  125,  123,
+      122,  122,    8,  123,  114,    5,  103,  109,  112,  110,
+      107,  111,  123,  113,    1,  123,  108,  106,  104,  105,
+      117,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+      115,  116,  119,  120,    6,    7,    9,   10,  122,    4,
+       98,  118,    2,    1,    3,   99,  100,  102,  101,    0,
+       96,    0,   96,   96,   96,   96,   96,   44,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   28,   17,   25,   96,   96,   96,
+
+       96,   96,   96,   54,   63,   96,   14,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,  119,  120,  120,  121,    6,
+        7,    9,   10,    2,    0,   97,   13,   45,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   27,   96,   96,
+       96,   41,   96,   96,   96,   96,   21,   96,   96,   96,
+       96,   96,   15,   96,   96,   96,   18,   96,   96,   96,
+       96,   96,   82,   96,   96,   96,   51,   96,   12,   96,
+       36,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+
+       96,   96,    0,   97,   96,   96,   96,   96,   20,   96,
+       24,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   46,   96,   96,   30,   96,   89,   96,   96,
+       39,   96,   96,   96,   96,   96,   48,   96,   94,   91,
+       32,   93,   96,   11,   66,   96,   96,   96,   42,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   29,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   87,
+        0,   96,   26,   96,   96,   96,   68,   96,   96,   96,
+       96,   37,   96,   96,   96,   96,   96,   96,   96,   31,
+       67,   23,   96,   59,   96,   77,   96,   96,   96,   43,
+
+       96,   96,   96,   96,   96,   96,   96,   96,   92,   96,
+       96,   56,   96,   96,   96,   96,   96,   96,   96,   40,
+       33,    0,   81,   95,   19,   96,   96,   85,   96,   76,
+       55,   96,   65,   96,   52,   96,   96,   96,   47,   96,
+       78,   96,   80,   96,   96,   34,   96,   96,   96,   35,
+       74,   96,   96,   96,   96,   60,   96,   50,   49,   96,
+       96,   96,   57,   53,   64,   96,   96,   96,   22,   96,
+       96,   75,   83,   96,   96,   79,   96,   70,   96,   96,
+       96,   96,   96,   38,   96,   90,   69,   96,   86,   96,
+       96,   96,   88,   96,   96,   61,   96,   16,   96,   72,
+
+       71,   96,   58,   96,   84,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   73,   96,   96,   96,   96,
+       96,   96,   62,    0
     } ;
 
 static yyconst flex_int32_t yy_ec[256] =
@@ -436,17 +438,17 @@ static yyconst flex_int32_t yy_ec[256] =
         1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-        1,    2,    1,    4,    1,    5,    6,    1,    7,    8,
-        9,   10,   11,   12,   13,   14,   15,   16,   16,   16,
-       16,   16,   16,   16,   16,   16,   16,   17,   18,   19,
-       20,   21,   22,    1,   23,   24,   25,   26,   27,   28,
-       29,   30,   31,   32,   33,   34,   35,   36,   37,   38,
-       39,   40,   41,   42,   43,   44,   45,   46,   47,   32,
-        1,    1,    1,    1,   48,    1,   32,   32,   32,   32,
-
-       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
-       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
-       32,   32,   49,    1,   50,    1,    1,    1,    1,    1,
+        1,    2,    1,    4,    5,    6,    7,    1,    8,    9,
+       10,   11,   12,   13,   14,   15,   16,   17,   17,   17,
+       17,   17,   17,   17,   17,   17,   17,   18,   19,   20,
+       21,   22,   23,   24,   25,   26,   27,   28,   29,   30,
+       31,   32,   33,   34,   35,   36,   37,   38,   39,   40,
+       41,   42,   43,   44,   45,   46,   47,   48,   49,   50,
+        1,    1,    1,    1,   51,    1,   34,   34,   34,   34,
+
+       34,   34,   34,   34,   34,   34,   34,   52,   34,   34,
+       34,   34,   53,   34,   54,   34,   34,   34,   34,   34,
+       34,   34,   55,    1,   56,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
@@ -463,236 +465,445 @@ static yyconst flex_int32_t yy_ec[256] =
         1,    1,    1,    1,    1
     } ;
 
-static yyconst flex_int32_t yy_meta[51] =
+static yyconst flex_int32_t yy_meta[57] =
     {   0,
-        1,    1,    1,    2,    1,    1,    3,    1,    1,    4,
-        1,    1,    1,    1,    1,    5,    1,    1,    1,    6,
-        1,    1,    5,    5,    5,    5,    5,    5,    5,    5,
-        5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
-        5,    5,    5,    5,    5,    5,    5,    5,    1,    1
+        1,    1,    1,    2,    3,    1,    1,    4,    1,    1,
+        5,    1,    1,    1,    1,    6,    7,    1,    1,    1,
+        8,    1,    1,    6,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    9,    1,    1
     } ;
 
-static yyconst flex_int16_t yy_base[409] =
+static yyconst flex_int16_t yy_base[438] =
     {   0,
-        0,    0,  437,  436,  438,  437,  439,  438,  441,  448,
-       49,   51,  448,    0,  448,  448,  448,  448,  448,  448,
-      448,  448,  426,  429,   41,  418,  448,   38,  448,  417,
-      448,   20,   33,   32,   46,   40,   44,    0,   54,   52,
-      399,   48,   60,  395,   65,   67,   81,   27,  411,   75,
-      448,  448,    0,   98,    0,  426,    0,  428,  113,    0,
-      448,  448,  415,   54,  410,  448,  448,  448,  448,    0,
-      403,   68,  399,  391,  389,    0,  402,   80,   84,  397,
-      383,   96,  381,  394,  379,  393,  387,  375,  379,  375,
-      377,  377,    0,   98,    0,  376,   97,  385,  368,  375,
-
-        0,    0,  381,  381,  364,   94,  103,  379,   98,   65,
-      381,  369,  109,  361,  377,  373,  351,   97,  372,  363,
-      115,  356,    0,  137,  138,  448,    0,  388,    0,  390,
-      377,    0,    0,  365,  360,  367,  365,  348,  346,  345,
-      350,  359,  347,  359,   95,  347,  353,  354,  336,  336,
-      123,    0,  334,  350,  351,    0,  338,  347,  344,  122,
-      124,  341,  336,  330,  340,  338,  331,  328,  336,    0,
-      326,  336,  334,  325,  315,  309,  322,  307,  327,    0,
-      313,    0,  311,    0,  325,  316,  313,  131,  309,  316,
-      323,  302,  304,  309,  309,  301,  304,  299,    0,    0,
-
-      311,  295,  305,  312,  292,  291,  305,  294,  307,  287,
-        0,  297,  279,    0,  298,    0,  295,  282,    0,  281,
-      276,  281,  280,  290,    0,  276,    0,    0,    0,  280,
-        0,    0,  276,  273,  287,    0,  272,  272,  270,  286,
-      271,  283,  280,  264,  282,  277,    0,  272,  272,  258,
-      257,  270,  256,  270,  269,  268,    0,  252,    0,  246,
-        0,  265,  249,  248,    0,  262,  252,  247,  246,  258,
-      248,  247,    0,    0,    0,  251,    0,  239,    0,  253,
-      249,  235,    0,  249,  250,  233,  238,  231,  249,  231,
-      228,    0,  229,  226,    0,  231,  243,  230,  237,  227,
-
-      235,  220,    0,    0,    0,  212,  219,    0,    0,    0,
-      216,    0,  230,    0,  231,  218,  217,    0,  213,    0,
-      216,    0,  208,  210,    0,  209,  223,  216,    0,    0,
-      219,  222,  204,  219,    0,  215,    0,    0,  199,  213,
-        0,    0,  197,  196,  201,    0,  210,  195,    0,    0,
-      201,  197,    0,  192,    0,  204,  204,  192,  202,    0,
-      179,    0,    0,  199,    0,  183,  177,  183,    0,  174,
-        0,  193,    0,  192,    0,    0,  183,  187,    0,  174,
-      174,  180,  166,  189,  181,  180,  166,  151,  118,    0,
-      130,  136,  127,  123,  119,  111,    0,  448,  167,  173,
-
-      179,  152,  181,  124,  187,  193,  199,  205
+        0,    0,  293,  287,  284,  281,  272,  256,  254, 1357,
+       55,   57, 1357,    0, 1357, 1357, 1357, 1357, 1357, 1357,
+     1357, 1357,  238,  227,   46,  205, 1357,   43, 1357,  203,
+     1357,   46,   50,   56,   52,   66,   64,   51,   81,   92,
+       91,   94,   96,  111,  113,  116,  130,  134,   53,  143,
+     1357, 1357,    0,  106,    0,  212,    0,  210,  141,    0,
+     1357, 1357,  192,   56,  173, 1357, 1357, 1357, 1357,  168,
+      140,  150,  152,  154,  155,  161,  167,  171,  177,  172,
+      184,  174,  188,  189,  191,  194,  203,  212,  215,  217,
+      219,  221,  226,  228,  231,  240,  233,  235,  246,  251,
+
+      258,  253,  255,  256,  269,  271,  278,  272,  285,  283,
+      287,  289,  296,  305,  298,  315,  319,  321,  322,  326,
+      332,  333,  342,  339,  343,    0,  112,  173, 1357,    0,
+      155,    0,  156,  132,   93,    0,  355,  357,  358,  360,
+      364,  367,  374,  370,  379,  380,  389,  383,  390,  392,
+      395,  408,  411,  409,  415,  418,  425,  427,  429,  436,
+      431,  441,  446,  448,  450,  452,  453,  462,  471,  464,
+      473,  474,  478,  485,  488,  490,  491,  494,  500,  501,
+      504,  506,  507,  517,  518,  519,  520,  521,  522,  523,
+      533,  536,  538,  543,  549,  554,  555,  561,  556,  566,
+
+      567,  576,   60,    0,  573,  578,  580,  582,  583,  593,
+      589,  596,  598,  603,  605,  607,  610,  617,  619,  621,
+      622,  628,  633,  634,  635,  639,  640,  649,  650,  652,
+      653,  655,  659,  664,  668,  669,  665,  671,  674,  678,
+      681,  685,  687,  688,  692,  697,  698,  701,  703,  704,
+      707,  708,  717,  713,  728,  730,  724,  740,  734,  745,
+      746,  750,  751,  756,  757,  760,  761,  762,  771,  773,
+       42,  778,  782,  783,  787,  789,  792,  794,  793,  804,
+      805,  808,  809,  810,  819,  823,  826,  828,  829,  830,
+      835,  840,  844,  846,  847,  856,  857,  858,  859,  860,
+
+      863,  872,  873,  878,  879,  882,  885,  889,  894,  895,
+      896,  898,  905,  910,  908,  912,  914,  915,  926,  930,
+      931,   73,  932,  933,  935,  937,  942,  944,  946,  947,
+      948,  949,  951,  958,  961,  965,  967,  972,  978,  979,
+      981,  984,  983,  985,  994,  988,  999, 1000, 1001, 1004,
+     1013, 1015, 1022, 1016, 1019, 1026, 1032, 1033, 1035, 1036,
+     1038, 1039, 1048, 1049, 1050, 1051, 1053, 1054, 1060, 1063,
+     1065, 1066, 1069, 1070, 1072, 1082, 1084, 1085, 1087, 1096,
+     1097, 1098, 1099, 1101, 1113, 1114, 1115, 1116, 1117, 1118,
+     1119, 1128, 1130, 1131, 1134, 1133, 1135, 1137, 1150, 1151,
+
+     1153, 1155, 1157, 1162, 1160, 1167, 1172, 1173, 1174, 1176,
+     1185, 1190, 1183, 1187, 1189, 1199, 1204, 1206, 1208, 1210,
+     1215, 1220, 1222, 1357, 1269, 1278, 1287, 1290, 1293, 1297,
+     1306, 1315, 1324, 1333, 1340, 1344, 1347
     } ;
 
-static yyconst flex_int16_t yy_def[409] =
+static yyconst flex_int16_t yy_def[438] =
     {   0,
-      398,    1,  399,  399,  400,  400,  401,  401,  398,  398,
-      398,  398,  398,  402,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  403,  398,  398,  398,  398,
-      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      398,  398,  405,  406,  407,  398,  408,  398,  398,  402,
-      398,  398,  398,  398,  403,  398,  398,  398,  398,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  405,  406,  406,  398,  407,  398,  408,  398,
-      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,    0,  398,  398,
-
-      398,  398,  398,  398,  398,  398,  398,  398
+      424,    1,  425,  425,  426,  426,  427,  427,  424,  424,
+      424,  424,  424,  428,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  429,  424,  424,  424,  424,
+      424,  430,  430,  430,  430,  430,   34,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      424,  424,  431,  432,  433,  424,  434,  424,  424,  428,
+      424,  424,  424,  424,  429,  424,  424,  424,  424,  435,
+      430,  436,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  431,  432,  432,  424,  433,
+      424,  434,  424,  424,  424,  437,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+
+      430,  430,  424,  437,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      424,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  424,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,  430,  430,  430,  430,  430,  430,  430,
+      430,  430,  430,    0,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424
     } ;
 
-static yyconst flex_int16_t yy_nxt[499] =
+static yyconst flex_int16_t yy_nxt[1414] =
     {   0,
-       10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
-       20,   21,   22,   23,   24,   25,   26,   27,   28,   29,
-       30,   31,   32,   33,   34,   35,   36,   37,   38,   38,
-       39,   38,   38,   40,   41,   42,   43,   44,   38,   45,
-       46,   47,   48,   49,   50,   38,   38,   38,   51,   52,
-       59,   59,   59,   59,   63,   71,   64,   67,   68,   73,
-       72,   77,  118,   74,  119,   78,   75,   63,   79,   64,
-       88,   80,   82,   85,   81,   86,   83,   89,   96,   76,
-       90,   93,   84,   91,   99,   87,   92,  101,   97,   94,
-      100,  107,  133,  110,   95,  102,  111,  103,  179,  104,
-
-      108,  109,  105,  115,  121,  112,  180,  125,  134,  113,
-      116,  122,  126,  114,   59,   59,  139,  117,  141,  142,
-      146,  163,  140,  159,  171,  173,  143,  189,   70,  147,
-      172,  177,  183,  164,  207,  208,  148,  190,  160,  161,
-      174,  193,  178,  184,  175,  194,  398,  125,  222,  214,
-      224,  398,  126,  215,  248,  249,   60,  397,  396,  395,
-      225,  394,  393,  223,  392,  391,  250,   53,   53,   53,
-       53,   53,   53,   55,   55,   55,   55,   55,   55,   57,
-       57,   57,   57,   57,   57,   65,   65,  123,  123,  123,
-      390,  123,  123,  124,  124,  124,  124,  124,  124,  127,
-
-      127,  389,  127,  127,  127,  129,  388,  129,  129,  129,
-      129,  387,  386,  385,  384,  383,  382,  381,  380,  379,
-      378,  377,  376,  375,  374,  373,  372,  371,  370,  369,
-      368,  367,  366,  365,  364,  363,  362,  361,  360,  359,
-      358,  357,  356,  355,  354,  353,  352,  351,  350,  349,
-      348,  347,  346,  345,  344,  343,  342,  341,  340,  339,
-      338,  337,  336,  335,  334,  333,  332,  331,  330,  329,
-      328,  327,  326,  325,  324,  323,  322,  321,  320,  319,
-      318,  317,  316,  315,  314,  313,  312,  311,  310,  309,
-      308,  307,  306,  305,  304,  303,  302,  301,  300,  299,
-
-      298,  297,  296,  295,  294,  293,  292,  291,  290,  289,
-      288,  287,  286,  285,  284,  283,  282,  281,  280,  279,
-      278,  277,  276,  275,  274,  273,  272,  271,  270,  269,
-      268,  267,  266,  265,  264,  263,  262,  261,  260,  259,
-      258,  257,  256,  255,  254,  253,  252,  251,  247,  246,
-      245,  244,  243,  242,  241,  240,  239,  238,  237,  236,
-      235,  234,  233,  232,  231,  230,  229,  228,  227,  226,
-      221,  220,  219,  218,  217,  216,  213,  212,  211,  210,
-      209,  206,  205,  204,  203,  202,  201,  200,  199,  198,
-      197,  196,  131,  130,  128,  195,  192,  191,  188,  187,
-
-      186,  185,  182,  181,  176,  170,  169,  168,  167,  166,
-      165,  162,  158,  157,  156,  155,  154,  153,  152,  151,
-      150,  149,  145,  144,  138,  137,  136,  135,  132,  398,
-      131,  130,  128,  120,  106,   98,   69,   66,   62,   61,
-      398,   58,   58,   56,   56,   54,   54,    9,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398
-
+       10,   11,   12,   13,   10,   14,   15,   16,   17,   18,
+       19,   20,   21,   22,   23,   24,   25,   26,   27,   28,
+       29,   30,   31,   10,   32,   33,   34,   35,   36,   37,
+       38,   38,   39,   38,   38,   40,   41,   42,   43,   44,
+       38,   45,   46,   47,   48,   49,   50,   38,   38,   38,
+       38,   38,   38,   38,   51,   52,   59,   59,   59,   59,
+       63,   70,   64,   67,   68,   70,   70,   70,   70,   72,
+       63,   70,   64,   72,   72,   72,   72,  123,   75,   72,
+       84,   70,   76,   73,   85,   77,  136,   79,   74,   72,
+       86,   80,   90,  322,   81,   71,   70,   82,   78,   91,
+
+       83,   87,   92,   88,   72,   93,   70,   70,   94,   70,
+       95,   70,  271,   89,   72,   72,  128,   72,   96,   72,
+       98,  129,  424,   97,   99,  104,   70,  424,   70,  101,
+      100,   70,  102,  105,   72,  106,   72,  107,  103,   72,
+      108,  110,   59,   59,  113,   70,  203,  114,  134,   70,
+      111,  112,  109,   72,  118,   70,  115,   72,   70,  133,
+      116,  119,  131,   72,  117,   70,   72,   70,  120,   70,
+       70,  121,  135,  122,  124,   72,   70,   72,   72,  137,
+      138,  125,   70,  128,   72,  140,   70,   70,  129,   70,
+       72,  141,   70,  424,   72,   72,  139,   72,  142,   70,
+
+       72,  144,  150,   70,   70,  143,   70,   72,  134,   70,
+      145,   72,   72,  133,   72,  152,  146,   72,   70,  131,
+      147,  148,  156,   69,  153,   66,   72,   70,  149,  151,
+       70,  154,   70,  155,   70,   72,   70,   62,   72,  158,
+       72,   70,   72,   70,   72,  157,   70,  159,   70,   72,
+       70,   72,   61,  424,   72,   70,   72,  161,   72,   58,
+      160,   70,  162,   72,  163,  164,   70,  165,   70,   72,
+       70,   70,  168,   70,   72,   58,   72,  170,   72,   72,
+      169,   72,  166,  167,   70,  172,   70,   70,   56,  171,
+      174,   56,   72,   70,   72,   72,  173,   54,   70,  175,
+
+       70,   72,   70,   54,   70,  176,   72,  180,   72,  424,
+       72,   70,   72,   70,  183,  177,  424,  178,  424,   72,
+       70,   72,  181,  179,  184,  424,  182,  424,   72,  188,
+       70,  186,  424,  189,   70,  185,   70,   70,   72,  187,
+      190,   70,   72,  424,   72,   72,  193,   70,   70,   72,
+      194,  191,  424,  424,   70,   72,   72,   70,   70,  424,
+      198,  192,   72,  424,  196,   72,   72,  200,  424,  424,
+       70,  201,   70,   70,  197,   70,  195,  199,   72,   70,
+       72,   72,   70,   72,  202,   70,  205,   72,  424,   70,
+       72,  208,  206,   72,   70,   70,  207,   72,   70,  209,
+
+      210,  424,   72,   72,   70,   70,   72,   70,  424,  216,
+       70,  211,   72,   72,  424,   72,  218,  424,   72,  424,
+      424,  212,  213,   70,   70,  214,   70,  217,  215,  424,
+       70,   72,   72,   70,   72,  223,  219,  220,   72,  222,
+       70,   72,   70,  221,   70,  424,   70,  424,   72,  424,
+       72,   70,   72,  226,   72,  230,   70,  227,  224,   72,
+      225,   70,  229,   70,   72,   70,  424,   70,   70,   72,
+      424,   72,  228,   72,  232,   72,   72,   70,  233,   70,
+      234,  236,  231,  424,  424,   72,   70,   72,   70,   70,
+      424,  237,  238,   70,   72,  235,   72,   72,  240,  239,
+
+       70,   72,  242,   70,  424,   70,   70,  243,   72,   70,
+      424,   72,  241,   72,   72,   70,   70,   72,  246,   70,
+      244,   70,   70,   72,   72,  245,  248,   72,  249,   72,
+       72,  247,   70,   70,   70,   70,   70,   70,   70,  250,
+       72,   72,   72,   72,   72,   72,   72,  255,   70,  424,
+      251,   70,  253,   70,  424,  424,   72,  252,   70,   72,
+      424,   72,  256,  258,   70,  257,   72,  424,  254,   70,
+       70,   70,   72,  259,  261,  262,   70,   72,   72,   72,
+      260,   70,   70,  424,   72,  266,  263,  265,   70,   72,
+       72,   70,  424,   70,  264,   70,   72,   70,   70,   72,
+
+      267,   72,  269,   72,   70,   72,   72,  268,   70,  424,
+      270,   70,   72,   70,  272,  273,   72,  274,   70,   72,
+       70,   72,   70,  275,  277,   70,   72,  276,   72,  280,
+       72,  281,   70,   72,   70,  279,   70,   70,  424,  424,
+       72,  278,   72,   70,   72,   72,  286,  284,   70,   70,
+       70,   72,  424,  282,   70,   70,   72,   72,   72,  285,
+      283,  424,   72,   72,   70,   70,  288,   70,   70,  290,
+       70,  287,   72,   72,   70,   72,   72,  424,   72,   70,
+       70,  291,   72,   70,   70,  289,   70,   72,   72,   70,
+      424,   72,   72,   70,   72,  292,   70,   72,  293,  297,
+
+       70,   72,   70,   70,   72,  295,  294,   70,   72,  296,
+       72,   72,   70,   70,  298,   72,   70,  424,   70,   70,
+       72,   72,   70,   70,   72,  299,   72,   72,   70,  302,
+       72,   72,   70,  424,  424,  424,   72,  424,  300,   70,
+       72,  301,  306,   70,  424,   70,  303,   72,  304,   70,
+      305,   72,  307,   72,  308,   70,  424,   72,  309,  424,
+       70,   70,  312,   72,  311,   70,   70,  310,   72,   72,
+      424,   70,   70,   72,   72,   70,   70,   70,  313,   72,
+       72,  314,  424,   72,   72,   72,   70,  317,   70,  319,
+      320,  424,  424,   70,   72,  315,   72,   70,   70,  321,
+
+      316,   72,   70,  318,   70,   72,   72,   70,   70,   70,
+       72,  424,   72,  424,  424,   72,   72,   72,  424,   70,
+       70,  323,  327,   70,   70,   70,  324,   72,   72,  424,
+      329,   72,   72,   72,   70,  325,  328,  331,   70,  326,
+      424,   70,   72,   70,   70,   70,   72,  332,  330,   72,
+       70,   72,   72,   72,  335,   70,  424,  424,   72,   70,
+      333,   70,   70,   72,  334,  336,  337,   72,  424,   72,
+       72,   70,   70,   70,   70,   70,  338,  424,   70,   72,
+       72,   72,   72,   72,  424,  340,   72,   70,   70,  341,
+      339,  424,  343,   70,   70,   72,   72,   70,  424,  344,
+
+       70,   72,   72,  342,   70,   72,  348,  424,   72,   70,
+       70,   70,   72,   70,  424,  346,  345,   72,   72,   72,
+       70,   72,  347,   70,  424,   70,  349,   70,   72,   70,
+       70,   72,  350,   72,  354,   72,  351,   72,   72,  352,
+      356,   70,  353,  358,  355,   70,   70,   70,   70,   72,
+       70,  357,   70,   72,   72,   72,   72,   70,   72,   70,
+       72,   70,   70,   70,   70,   72,   70,   72,  359,   72,
+       72,   72,   72,   70,   72,  424,   70,  424,  424,  361,
+       70,   72,   70,  362,   72,  360,  365,   70,   72,  363,
+       72,  366,  364,   70,   70,   72,   70,  424,   70,   70,
+
+       70,   72,   72,   70,   72,  367,   72,   72,   72,   70,
+      368,   72,  424,  424,   70,   70,   70,   72,  424,   70,
+      369,  370,   72,   72,   72,  424,  374,   72,   70,  371,
+       70,   70,  424,  375,   70,  372,   72,   70,   72,   72,
+      373,   70,   72,  376,  379,   72,  377,   70,   70,   72,
+       70,   70,  424,   70,   70,   72,   72,  378,   72,   72,
+      380,   72,   72,   70,   70,   70,   70,  383,   70,   70,
+      382,   72,   72,   72,   72,   70,   72,   72,   70,  381,
+       70,   70,  424,   72,   70,   70,   72,   70,   72,   72,
+      387,  386,   72,   72,  384,   72,  385,   70,  424,   70,
+
+       70,  424,   70,  424,  389,   72,  388,   72,   72,  390,
+       72,   70,   70,   70,   70,  392,   70,  424,  424,   72,
+       72,   72,   72,  393,   72,  391,  396,  424,   70,   70,
+       70,   70,   70,   70,   70,  394,   72,   72,   72,   72,
+       72,   72,   72,   70,  398,   70,   70,  395,   70,   70,
+       70,   72,   70,   72,   72,  424,   72,   72,   72,  424,
+       72,  399,  403,  397,  404,   70,   70,  400,   70,  401,
+       70,  424,   70,   72,   72,   70,   72,   70,   72,  405,
+       72,  402,   70,   72,  424,   72,  424,   70,   70,   70,
+       72,   70,  406,  424,  407,   72,   72,   72,   70,   72,
+
+       70,  412,   70,  424,   70,   70,   72,  424,   72,  410,
+       72,  408,   72,   72,   70,  409,  424,  413,  414,   70,
+      415,   70,   72,   70,  411,   70,  424,   72,  416,   72,
+       70,   72,  424,   72,  419,   70,  424,   70,   72,  417,
+      418,  424,  424,   72,  420,   72,  424,  424,  421,  424,
+      424,  424,  424,  424,  424,  424,  422,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  423,   53,
+       53,   53,   53,   53,   53,   53,   53,   53,   55,   55,
+       55,   55,   55,   55,   55,   55,   55,   57,   57,   57,
+       57,   57,   57,   57,   57,   57,   60,  424,   60,   65,
+
+       65,   65,   71,   71,  424,   71,  126,  126,  126,  126,
+      424,  126,  126,  126,  126,  127,  127,  127,  127,  127,
+      127,  127,  127,  127,  130,  130,  130,  424,  130,  130,
+      130,  130,  130,  132,  424,  132,  132,  132,  132,  132,
+      132,  132,  136,  424,  424,  424,  424,  424,  136,   72,
+       72,  424,   72,  204,  424,  204,    9,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424
     } ;
 
-static yyconst flex_int16_t yy_chk[499] =
+static yyconst flex_int16_t yy_chk[1414] =
     {   0,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-       11,   11,   12,   12,   25,   32,   25,   28,   28,   33,
-       32,   34,   48,   33,   48,   34,   33,   64,   34,   64,
-       37,   34,   35,   36,   34,   36,   35,   37,   40,   33,
-       37,   39,   35,   37,   42,   36,   37,   43,   40,   39,
-       42,   45,   72,   46,   39,   43,   46,   43,  110,   43,
-
-       45,   45,   43,   47,   50,   46,  110,   54,   72,   46,
-       47,   50,   54,   46,   59,   59,   78,   47,   79,   79,
-       82,   97,   78,   94,  106,  107,   79,  118,  404,   82,
-      106,  109,  113,   97,  145,  145,   82,  118,   94,   94,
-      107,  121,  109,  113,  107,  121,  124,  125,  160,  151,
-      161,  124,  125,  151,  188,  188,  402,  396,  395,  394,
-      161,  393,  392,  160,  391,  389,  188,  399,  399,  399,
-      399,  399,  399,  400,  400,  400,  400,  400,  400,  401,
-      401,  401,  401,  401,  401,  403,  403,  405,  405,  405,
-      388,  405,  405,  406,  406,  406,  406,  406,  406,  407,
-
-      407,  387,  407,  407,  407,  408,  386,  408,  408,  408,
-      408,  385,  384,  383,  382,  381,  380,  378,  377,  374,
-      372,  370,  368,  367,  366,  364,  361,  359,  358,  357,
-      356,  354,  352,  351,  348,  347,  345,  344,  343,  340,
-      339,  336,  334,  333,  332,  331,  328,  327,  326,  324,
-      323,  321,  319,  317,  316,  315,  313,  311,  307,  306,
-      302,  301,  300,  299,  298,  297,  296,  294,  293,  291,
-      290,  289,  288,  287,  286,  285,  284,  282,  281,  280,
-      278,  276,  272,  271,  270,  269,  268,  267,  266,  264,
-      263,  262,  260,  258,  256,  255,  254,  253,  252,  251,
-
-      250,  249,  248,  246,  245,  244,  243,  242,  241,  240,
-      239,  238,  237,  235,  234,  233,  230,  226,  224,  223,
-      222,  221,  220,  218,  217,  215,  213,  212,  210,  209,
-      208,  207,  206,  205,  204,  203,  202,  201,  198,  197,
-      196,  195,  194,  193,  192,  191,  190,  189,  187,  186,
-      185,  183,  181,  179,  178,  177,  176,  175,  174,  173,
-      172,  171,  169,  168,  167,  166,  165,  164,  163,  162,
-      159,  158,  157,  155,  154,  153,  150,  149,  148,  147,
-      146,  144,  143,  142,  141,  140,  139,  138,  137,  136,
-      135,  134,  131,  130,  128,  122,  120,  119,  117,  116,
-
-      115,  114,  112,  111,  108,  105,  104,  103,  100,   99,
-       98,   96,   92,   91,   90,   89,   88,   87,   86,   85,
-       84,   83,   81,   80,   77,   75,   74,   73,   71,   65,
-       63,   58,   56,   49,   44,   41,   30,   26,   24,   23,
-        9,    8,    7,    6,    5,    4,    3,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398
-
+        1,    1,    1,    1,    1,    1,   11,   11,   12,   12,
+       25,   32,   25,   28,   28,   33,   38,   35,   49,   32,
+       64,   34,   64,   33,   38,   35,   49,   49,   33,   34,
+       35,   36,   33,   32,   35,   33,  322,   34,   32,   36,
+       35,   34,   37,  271,   34,   37,   39,   34,   33,   37,
+
+       34,   36,   37,   36,   39,   37,   41,   40,   37,   42,
+       39,   43,  203,   36,   41,   40,   54,   42,   39,   43,
+       40,   54,  127,   39,   40,   43,   44,  127,   45,   41,
+       40,   46,   42,   43,   44,   43,   45,   43,   42,   46,
+       43,   45,   59,   59,   46,   47,  135,   46,  134,   48,
+       45,   45,   44,   47,   47,   71,   46,   48,   50,  133,
+       46,   47,  131,   71,   46,   72,   50,   73,   47,   74,
+       75,   48,   70,   48,   50,   73,   76,   74,   75,   73,
+       74,   50,   77,  128,   76,   75,   78,   80,  128,   82,
+       77,   76,   79,   65,   78,   80,   74,   82,   76,   81,
+
+       79,   79,   82,   83,   84,   77,   85,   81,   63,   86,
+       80,   83,   84,   58,   85,   84,   80,   86,   87,   56,
+       81,   81,   86,   30,   84,   26,   87,   88,   81,   83,
+       89,   84,   90,   85,   91,   88,   92,   24,   89,   88,
+       90,   93,   91,   94,   92,   87,   95,   89,   97,   93,
+       98,   94,   23,    9,   95,   96,   97,   91,   98,    8,
+       90,   99,   92,   96,   93,   94,  100,   96,  102,   99,
+      103,  104,   98,  101,  100,    7,  102,  100,  103,  104,
+       99,  101,   96,   96,  105,  101,  106,  108,    6,  100,
+      103,    5,  105,  107,  106,  108,  102,    4,  110,  106,
+
+      109,  107,  111,    3,  112,  107,  110,  110,  109,    0,
+      111,  113,  112,  115,  111,  108,    0,  109,    0,  113,
+      114,  115,  110,  109,  112,    0,  110,    0,  114,  114,
+      116,  113,    0,  115,  117,  112,  118,  119,  116,  113,
+      116,  120,  117,    0,  118,  119,  118,  121,  122,  120,
+      119,  116,    0,    0,  124,  121,  122,  123,  125,    0,
+      122,  117,  124,    0,  121,  123,  125,  124,    0,    0,
+      137,  124,  138,  139,  121,  140,  120,  123,  137,  141,
+      138,  139,  142,  140,  125,  144,  139,  141,    0,  143,
+      142,  142,  140,  144,  145,  146,  141,  143,  148,  143,
+
+      143,    0,  145,  146,  147,  149,  148,  150,    0,  148,
+      151,  144,  147,  149,    0,  150,  150,    0,  151,    0,
+        0,  145,  146,  152,  154,  147,  153,  149,  147,    0,
+      155,  152,  154,  156,  153,  154,  151,  151,  155,  153,
+      157,  156,  158,  152,  159,    0,  161,    0,  157,    0,
+      158,  160,  159,  157,  161,  161,  162,  157,  155,  160,
+      156,  163,  160,  164,  162,  165,    0,  166,  167,  163,
+        0,  164,  159,  165,  164,  166,  167,  168,  165,  170,
+      166,  167,  163,    0,    0,  168,  169,  170,  171,  172,
+        0,  167,  168,  173,  169,  166,  171,  172,  170,  169,
+
+      174,  173,  172,  175,    0,  176,  177,  173,  174,  178,
+        0,  175,  171,  176,  177,  179,  180,  178,  176,  181,
+      174,  182,  183,  179,  180,  175,  179,  181,  180,  182,
+      183,  178,  184,  185,  186,  187,  188,  189,  190,  181,
+      184,  185,  186,  187,  188,  189,  190,  186,  191,    0,
+      182,  192,  184,  193,    0,    0,  191,  183,  194,  192,
+        0,  193,  188,  192,  195,  190,  194,    0,  185,  196,
+      197,  199,  195,  193,  195,  195,  198,  196,  197,  199,
+      194,  200,  201,    0,  198,  198,  195,  197,  205,  200,
+      201,  202,    0,  206,  196,  207,  205,  208,  209,  202,
+
+      199,  206,  201,  207,  211,  208,  209,  200,  210,    0,
+      202,  212,  211,  213,  205,  206,  210,  207,  214,  212,
+      215,  213,  216,  208,  212,  217,  214,  210,  215,  215,
+      216,  216,  218,  217,  219,  214,  220,  221,    0,    0,
+      218,  213,  219,  222,  220,  221,  221,  219,  223,  224,
+      225,  222,    0,  217,  226,  227,  223,  224,  225,  220,
+      218,    0,  226,  227,  228,  229,  224,  230,  231,  227,
+      232,  222,  228,  229,  233,  230,  231,    0,  232,  234,
+      237,  229,  233,  235,  236,  225,  238,  234,  237,  239,
+        0,  235,  236,  240,  238,  230,  241,  239,  232,  236,
+
+      242,  240,  243,  244,  241,  234,  233,  245,  242,  235,
+      243,  244,  246,  247,  238,  245,  248,    0,  249,  250,
+      246,  247,  251,  252,  248,  243,  249,  250,  254,  248,
+      251,  252,  253,    0,    0,    0,  254,    0,  246,  257,
+      253,  247,  253,  255,    0,  256,  250,  257,  251,  259,
+      252,  255,  254,  256,  255,  258,    0,  259,  256,    0,
+      260,  261,  259,  258,  258,  262,  263,  257,  260,  261,
+        0,  264,  265,  262,  263,  266,  267,  268,  261,  264,
+      265,  262,    0,  266,  267,  268,  269,  265,  270,  267,
+      268,    0,    0,  272,  269,  263,  270,  273,  274,  269,
+
+      264,  272,  275,  266,  276,  273,  274,  277,  279,  278,
+      275,    0,  276,    0,    0,  277,  279,  278,    0,  280,
+      281,  272,  278,  282,  283,  284,  274,  280,  281,    0,
+      280,  282,  283,  284,  285,  275,  279,  283,  286,  276,
+        0,  287,  285,  288,  289,  290,  286,  284,  281,  287,
+      291,  288,  289,  290,  287,  292,    0,    0,  291,  293,
+      285,  294,  295,  292,  286,  288,  289,  293,    0,  294,
+      295,  296,  297,  298,  299,  300,  293,    0,  301,  296,
+      297,  298,  299,  300,    0,  297,  301,  302,  303,  298,
+      295,    0,  301,  304,  305,  302,  303,  306,    0,  302,
+
+      307,  304,  305,  299,  308,  306,  306,    0,  307,  309,
+      310,  311,  308,  312,    0,  304,  303,  309,  310,  311,
+      313,  312,  305,  315,    0,  314,  307,  316,  313,  317,
+      318,  315,  308,  314,  314,  316,  310,  317,  318,  311,
+      316,  319,  313,  318,  315,  320,  321,  323,  324,  319,
+      325,  317,  326,  320,  321,  323,  324,  327,  325,  328,
+      326,  329,  330,  331,  332,  327,  333,  328,  319,  329,
+      330,  331,  332,  334,  333,    0,  335,    0,    0,  326,
+      336,  334,  337,  327,  335,  325,  334,  338,  336,  329,
+      337,  336,  332,  339,  340,  338,  341,    0,  343,  342,
+
+      344,  339,  340,  346,  341,  337,  343,  342,  344,  345,
+      338,  346,    0,    0,  347,  348,  349,  345,    0,  350,
+      340,  342,  347,  348,  349,    0,  348,  350,  351,  344,
+      352,  354,    0,  349,  355,  345,  351,  353,  352,  354,
+      347,  356,  355,  352,  355,  353,  353,  357,  358,  356,
+      359,  360,    0,  361,  362,  357,  358,  354,  359,  360,
+      357,  361,  362,  363,  364,  365,  366,  362,  367,  368,
+      361,  363,  364,  365,  366,  369,  367,  368,  370,  360,
+      371,  372,    0,  369,  373,  374,  370,  375,  371,  372,
+      370,  368,  373,  374,  366,  375,  367,  376,    0,  377,
+
+      378,    0,  379,    0,  374,  376,  371,  377,  378,  375,
+      379,  380,  381,  382,  383,  379,  384,    0,    0,  380,
+      381,  382,  383,  380,  384,  377,  383,    0,  385,  386,
+      387,  388,  389,  390,  391,  381,  385,  386,  387,  388,
+      389,  390,  391,  392,  388,  393,  394,  382,  396,  395,
+      397,  392,  398,  393,  394,    0,  396,  395,  397,    0,
+      398,  390,  395,  385,  397,  399,  400,  391,  401,  392,
+      402,    0,  403,  399,  400,  405,  401,  404,  402,  399,
+      403,  394,  406,  405,    0,  404,    0,  407,  408,  409,
+      406,  410,  402,    0,  404,  407,  408,  409,  413,  410,
+
+      411,  410,  414,    0,  415,  412,  413,    0,  411,  408,
+      414,  406,  415,  412,  416,  407,    0,  411,  412,  417,
+      413,  418,  416,  419,  409,  420,    0,  417,  414,  418,
+      421,  419,    0,  420,  418,  422,    0,  423,  421,  415,
+      417,    0,    0,  422,  419,  423,    0,    0,  420,    0,
+        0,    0,    0,    0,    0,    0,  421,    0,    0,    0,
+        0,    0,    0,    0,    0,    0,    0,    0,  422,  425,
+      425,  425,  425,  425,  425,  425,  425,  425,  426,  426,
+      426,  426,  426,  426,  426,  426,  426,  427,  427,  427,
+      427,  427,  427,  427,  427,  427,  428,    0,  428,  429,
+
+      429,  429,  430,  430,    0,  430,  431,  431,  431,  431,
+        0,  431,  431,  431,  431,  432,  432,  432,  432,  432,
+      432,  432,  432,  432,  433,  433,  433,    0,  433,  433,
+      433,  433,  433,  434,    0,  434,  434,  434,  434,  434,
+      434,  434,  435,    0,    0,    0,    0,    0,  435,  436,
+      436,    0,  436,  437,    0,  437,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  424,
+      424,  424,  424
     } ;
 
 static yy_state_type yy_last_accepting_state;
 static char *yy_last_accepting_cpos;
 
-static int yy_flex_debug;
-static int yy_flex_debug = 0;
+extern int yy_flex_debug;
+int yy_flex_debug = 0;
 
 /* The intent behind this definition is that it'll catch
  * any uses of REJECT which flex missed.
@@ -701,15 +912,27 @@ static int yy_flex_debug = 0;
 #define yymore() yymore_used_but_not_detected
 #define YY_MORE_ADJ 0
 #define YY_RESTORE_YY_MORE_OFFSET
-static char *yytext;
+char *yytext;
 #line 1 "pars0lex.l"
-/**************************************************//**
-SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+/*****************************************************************************
 
-(c) 1997 Innobase Oy
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
-Created 12/14/1997 Heikki Tuuri
-Published under the GPL version 2
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
 
 The InnoDB parser is frozen because MySQL takes care of SQL parsing.
 Therefore we normally keep the InnoDB parser C files as they are, and do
@@ -723,10 +946,12 @@ How to make the InnoDB parser and lexer C files:
 
 These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
 Linux.
+
+Created 12/14/1997 Heikki Tuuri
 *******************************************************/
 #define YY_NO_INPUT 1
 #define YY_NO_UNISTD_H 1
-#line 38 "pars0lex.l"
+#line 53 "pars0lex.l"
 #define YYSTYPE que_node_t*
 
 #include "univ.i"
@@ -741,7 +966,9 @@ Linux.
 #define realloc(P, A)	ut_realloc(P, A)
 #define exit(A) 	ut_error
 
-#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+/* Note: result is assigned the value returned by pars_get_lex_chars() */
+#define YY_INPUT(buf, result, max_size) \
+	(result = pars_get_lex_chars(buf, max_size))
 
 /* String buffer for removing quotes */
 static ulint	stringbuf_len_alloc = 0; /* Allocated length */
@@ -756,7 +983,7 @@ string_append(
 	ulint		len)	/*!< in: length of the string */
 {
 	if (stringbuf == NULL) {
-		stringbuf = malloc(1);
+		stringbuf = static_cast<char*>(malloc(1));
 		stringbuf_len_alloc = 1;
 	}
 
@@ -764,7 +991,9 @@ string_append(
 		while (stringbuf_len + len > stringbuf_len_alloc) {
 			stringbuf_len_alloc <<= 1;
 		}
-		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+
+		stringbuf = static_cast<char*>(
+			realloc(stringbuf, stringbuf_len_alloc));
 	}
 
 	memcpy(stringbuf + stringbuf_len, str, len);
@@ -774,7 +1003,7 @@ string_append(
 
 
 
-#line 759 "lexyy.c"
+#line 1006 "lexyy.cc"
 
 #define INITIAL 0
 #define comment 1
@@ -793,6 +1022,37 @@ string_append(
 #define YY_EXTRA_TYPE void *
 #endif
 
+static int yy_init_globals (void );
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+__attribute__((unused)) static int yylex_destroy (void );
+
+int yyget_debug (void );
+
+void yyset_debug (int debug_flag  );
+
+YY_EXTRA_TYPE yyget_extra (void );
+
+void yyset_extra (YY_EXTRA_TYPE user_defined  );
+
+FILE *yyget_in (void );
+
+void yyset_in  (FILE * in_str  );
+
+FILE *yyget_out (void );
+
+void yyset_out  (FILE * out_str  );
+
+yy_size_t yyget_leng (void );
+
+char *yyget_text (void );
+
+int yyget_lineno (void );
+
+void yyset_lineno (int line_number  );
+
 /* Macros after this point can all be overridden by user definitions in
  * section 1.
  */
@@ -825,7 +1085,12 @@ static int input (void );
 
 /* Amount of stuff to slurp up with each read. */
 #ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
 #define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
 #endif
 
 /* Copy whatever the last rule matched to the standard output. */
@@ -833,7 +1098,7 @@ static int input (void );
 /* This used to be an fputs(), but since the string might contain NUL's,
  * we now use fwrite().
  */
-#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
 #endif
 
 /* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
@@ -898,9 +1163,9 @@ static int input (void );
 #ifndef YY_DECL
 #define YY_DECL_IS_OURS 1
 
-UNIV_INTERN int yylex (void);
+extern int yylex (void);
 
-#define YY_DECL UNIV_INTERN int yylex (void)
+#define YY_DECL int yylex (void)
 #endif /* !YY_DECL */
 
 /* Code executed at the beginning of each rule, after yytext and yyleng
@@ -925,15 +1190,15 @@ YY_DECL
 	register yy_state_type yy_current_state;
 	register char *yy_cp, *yy_bp;
 	register int yy_act;
-    
-#line 92 "pars0lex.l"
+
+#line 112 "pars0lex.l"
 
 
-#line 914 "lexyy.c"
+#line 1197 "lexyy.cc"
 
-	if ( (yy_init) )
+	if ( !(yy_init) )
 		{
-		(yy_init) = 0;
+		(yy_init) = 1;
 
 #ifdef YY_USER_INIT
 		YY_USER_INIT;
@@ -982,13 +1247,13 @@ yy_match:
 			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 				{
 				yy_current_state = (int) yy_def[yy_current_state];
-				if ( yy_current_state >= 399 )
+				if ( yy_current_state >= 425 )
 					yy_c = yy_meta[(unsigned int) yy_c];
 				}
 			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
 			++yy_cp;
 			}
-		while ( yy_current_state != 398 );
+		while ( yy_current_state != 424 );
 		yy_cp = (yy_last_accepting_cpos);
 		yy_current_state = (yy_last_accepting_state);
 
@@ -1010,7 +1275,7 @@ do_action:	/* This label is used only to access EOF actions. */
 
 case 1:
 YY_RULE_SETUP
-#line 94 "pars0lex.l"
+#line 114 "pars0lex.l"
 {
 			yylval = sym_tab_add_int_lit(pars_sym_tab_global,
 								atoi(yytext));
@@ -1019,7 +1284,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 2:
 YY_RULE_SETUP
-#line 100 "pars0lex.l"
+#line 120 "pars0lex.l"
 {
 			ut_error;	/* not implemented */
 
@@ -1028,7 +1293,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 3:
 YY_RULE_SETUP
-#line 106 "pars0lex.l"
+#line 126 "pars0lex.l"
 {
 			ulint	type;
 
@@ -1040,7 +1305,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 4:
 YY_RULE_SETUP
-#line 115 "pars0lex.l"
+#line 135 "pars0lex.l"
 {
 			yylval = sym_tab_add_bound_id(pars_sym_tab_global,
 				yytext + 1);
@@ -1050,7 +1315,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 5:
 YY_RULE_SETUP
-#line 122 "pars0lex.l"
+#line 142 "pars0lex.l"
 {
 /* Quoted character string literals are handled in an explicit
 start state 'quoted'.  This state is entered and the buffer for
@@ -1064,7 +1329,7 @@ In the state 'quoted', only two actions are possible (defined below). */
 case 6:
 /* rule 6 can match eol */
 YY_RULE_SETUP
-#line 131 "pars0lex.l"
+#line 151 "pars0lex.l"
 {
 			/* Got a sequence of characters other than "'":
 			append to string buffer */
@@ -1073,7 +1338,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 7:
 YY_RULE_SETUP
-#line 136 "pars0lex.l"
+#line 156 "pars0lex.l"
 {
 			/* Got a sequence of "'" characters:
 			append half of them to string buffer,
@@ -1100,7 +1365,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 8:
 YY_RULE_SETUP
-#line 160 "pars0lex.l"
+#line 180 "pars0lex.l"
 {
 /* Quoted identifiers are handled in an explicit start state 'id'.
 This state is entered and the buffer for the scanned string is emptied
@@ -1114,7 +1379,7 @@ In the state 'id', only two actions are possible (defined below). */
 case 9:
 /* rule 9 can match eol */
 YY_RULE_SETUP
-#line 169 "pars0lex.l"
+#line 189 "pars0lex.l"
 {
 			/* Got a sequence of characters other than '"':
 			append to string buffer */
@@ -1123,7 +1388,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 10:
 YY_RULE_SETUP
-#line 174 "pars0lex.l"
+#line 194 "pars0lex.l"
 {
 			/* Got a sequence of '"' characters:
 			append half of them to string buffer,
@@ -1151,7 +1416,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 11:
 YY_RULE_SETUP
-#line 199 "pars0lex.l"
+#line 219 "pars0lex.l"
 {
 			yylval = sym_tab_add_null_lit(pars_sym_tab_global);
 
@@ -1160,7 +1425,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 12:
 YY_RULE_SETUP
-#line 205 "pars0lex.l"
+#line 225 "pars0lex.l"
 {
 			/* Implicit cursor name */
 			yylval = sym_tab_add_str_lit(pars_sym_tab_global,
@@ -1170,645 +1435,643 @@ YY_RULE_SETUP
 	YY_BREAK
 case 13:
 YY_RULE_SETUP
-#line 212 "pars0lex.l"
+#line 232 "pars0lex.l"
 {
 			return(PARS_AND_TOKEN);
 }
 	YY_BREAK
 case 14:
 YY_RULE_SETUP
-#line 216 "pars0lex.l"
+#line 236 "pars0lex.l"
 {
 			return(PARS_OR_TOKEN);
 }
 	YY_BREAK
 case 15:
 YY_RULE_SETUP
-#line 220 "pars0lex.l"
+#line 240 "pars0lex.l"
 {
 			return(PARS_NOT_TOKEN);
 }
 	YY_BREAK
 case 16:
 YY_RULE_SETUP
-#line 224 "pars0lex.l"
+#line 244 "pars0lex.l"
 {
 			return(PARS_PROCEDURE_TOKEN);
 }
 	YY_BREAK
 case 17:
 YY_RULE_SETUP
-#line 228 "pars0lex.l"
+#line 248 "pars0lex.l"
 {
 			return(PARS_IN_TOKEN);
 }
 	YY_BREAK
 case 18:
 YY_RULE_SETUP
-#line 232 "pars0lex.l"
+#line 252 "pars0lex.l"
 {
 			return(PARS_OUT_TOKEN);
 }
 	YY_BREAK
 case 19:
 YY_RULE_SETUP
-#line 236 "pars0lex.l"
+#line 256 "pars0lex.l"
 {
-	 		return(PARS_BINARY_TOKEN);
+			return(PARS_BINARY_TOKEN);
 }
 	YY_BREAK
 case 20:
 YY_RULE_SETUP
-#line 240 "pars0lex.l"
+#line 260 "pars0lex.l"
 {
-	 		return(PARS_BLOB_TOKEN);
+			return(PARS_BLOB_TOKEN);
 }
 	YY_BREAK
 case 21:
 YY_RULE_SETUP
-#line 244 "pars0lex.l"
+#line 264 "pars0lex.l"
 {
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 	YY_BREAK
 case 22:
 YY_RULE_SETUP
-#line 248 "pars0lex.l"
+#line 268 "pars0lex.l"
 {
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 	YY_BREAK
 case 23:
 YY_RULE_SETUP
-#line 252 "pars0lex.l"
+#line 272 "pars0lex.l"
 {
-	 		return(PARS_FLOAT_TOKEN);
+			return(PARS_FLOAT_TOKEN);
 }
 	YY_BREAK
 case 24:
 YY_RULE_SETUP
-#line 256 "pars0lex.l"
+#line 276 "pars0lex.l"
 {
-	 		return(PARS_CHAR_TOKEN);
+			return(PARS_CHAR_TOKEN);
 }
 	YY_BREAK
 case 25:
 YY_RULE_SETUP
-#line 260 "pars0lex.l"
+#line 280 "pars0lex.l"
 {
 			return(PARS_IS_TOKEN);
 }
 	YY_BREAK
 case 26:
 YY_RULE_SETUP
-#line 264 "pars0lex.l"
+#line 284 "pars0lex.l"
 {
 			return(PARS_BEGIN_TOKEN);
 }
 	YY_BREAK
 case 27:
 YY_RULE_SETUP
-#line 268 "pars0lex.l"
+#line 288 "pars0lex.l"
 {
 			return(PARS_END_TOKEN);
 }
 	YY_BREAK
 case 28:
 YY_RULE_SETUP
-#line 272 "pars0lex.l"
+#line 292 "pars0lex.l"
 {
 			return(PARS_IF_TOKEN);
 }
 	YY_BREAK
 case 29:
 YY_RULE_SETUP
-#line 276 "pars0lex.l"
+#line 296 "pars0lex.l"
 {
 			return(PARS_THEN_TOKEN);
 }
 	YY_BREAK
 case 30:
 YY_RULE_SETUP
-#line 280 "pars0lex.l"
+#line 300 "pars0lex.l"
 {
 			return(PARS_ELSE_TOKEN);
 }
 	YY_BREAK
 case 31:
 YY_RULE_SETUP
-#line 284 "pars0lex.l"
+#line 304 "pars0lex.l"
 {
 			return(PARS_ELSIF_TOKEN);
 }
 	YY_BREAK
 case 32:
 YY_RULE_SETUP
-#line 288 "pars0lex.l"
+#line 308 "pars0lex.l"
 {
 			return(PARS_LOOP_TOKEN);
 }
 	YY_BREAK
 case 33:
 YY_RULE_SETUP
-#line 292 "pars0lex.l"
+#line 312 "pars0lex.l"
 {
 			return(PARS_WHILE_TOKEN);
 }
 	YY_BREAK
 case 34:
 YY_RULE_SETUP
-#line 296 "pars0lex.l"
+#line 316 "pars0lex.l"
 {
 			return(PARS_RETURN_TOKEN);
 }
 	YY_BREAK
 case 35:
 YY_RULE_SETUP
-#line 300 "pars0lex.l"
+#line 320 "pars0lex.l"
 {
 			return(PARS_SELECT_TOKEN);
 }
 	YY_BREAK
 case 36:
 YY_RULE_SETUP
-#line 304 "pars0lex.l"
+#line 324 "pars0lex.l"
 {
 			return(PARS_SUM_TOKEN);
 }
 	YY_BREAK
 case 37:
 YY_RULE_SETUP
-#line 308 "pars0lex.l"
+#line 328 "pars0lex.l"
 {
 			return(PARS_COUNT_TOKEN);
 }
 	YY_BREAK
 case 38:
 YY_RULE_SETUP
-#line 312 "pars0lex.l"
+#line 332 "pars0lex.l"
 {
 			return(PARS_DISTINCT_TOKEN);
 }
 	YY_BREAK
 case 39:
 YY_RULE_SETUP
-#line 316 "pars0lex.l"
+#line 336 "pars0lex.l"
 {
 			return(PARS_FROM_TOKEN);
 }
 	YY_BREAK
 case 40:
 YY_RULE_SETUP
-#line 320 "pars0lex.l"
+#line 340 "pars0lex.l"
 {
 			return(PARS_WHERE_TOKEN);
 }
 	YY_BREAK
 case 41:
 YY_RULE_SETUP
-#line 324 "pars0lex.l"
+#line 344 "pars0lex.l"
 {
 			return(PARS_FOR_TOKEN);
 }
 	YY_BREAK
 case 42:
 YY_RULE_SETUP
-#line 328 "pars0lex.l"
+#line 348 "pars0lex.l"
 {
 			return(PARS_READ_TOKEN);
 }
 	YY_BREAK
 case 43:
 YY_RULE_SETUP
-#line 332 "pars0lex.l"
+#line 352 "pars0lex.l"
 {
 			return(PARS_ORDER_TOKEN);
 }
 	YY_BREAK
 case 44:
 YY_RULE_SETUP
-#line 336 "pars0lex.l"
+#line 356 "pars0lex.l"
 {
 			return(PARS_BY_TOKEN);
 }
 	YY_BREAK
 case 45:
 YY_RULE_SETUP
-#line 340 "pars0lex.l"
+#line 360 "pars0lex.l"
 {
 			return(PARS_ASC_TOKEN);
 }
 	YY_BREAK
 case 46:
 YY_RULE_SETUP
-#line 344 "pars0lex.l"
+#line 364 "pars0lex.l"
 {
 			return(PARS_DESC_TOKEN);
 }
 	YY_BREAK
 case 47:
 YY_RULE_SETUP
-#line 348 "pars0lex.l"
+#line 368 "pars0lex.l"
 {
 			return(PARS_INSERT_TOKEN);
 }
 	YY_BREAK
 case 48:
 YY_RULE_SETUP
-#line 352 "pars0lex.l"
+#line 372 "pars0lex.l"
 {
 			return(PARS_INTO_TOKEN);
 }
 	YY_BREAK
 case 49:
 YY_RULE_SETUP
-#line 356 "pars0lex.l"
+#line 376 "pars0lex.l"
 {
 			return(PARS_VALUES_TOKEN);
 }
 	YY_BREAK
 case 50:
 YY_RULE_SETUP
-#line 360 "pars0lex.l"
+#line 380 "pars0lex.l"
 {
 			return(PARS_UPDATE_TOKEN);
 }
 	YY_BREAK
 case 51:
 YY_RULE_SETUP
-#line 364 "pars0lex.l"
+#line 384 "pars0lex.l"
 {
 			return(PARS_SET_TOKEN);
 }
 	YY_BREAK
 case 52:
 YY_RULE_SETUP
-#line 368 "pars0lex.l"
+#line 388 "pars0lex.l"
 {
 			return(PARS_DELETE_TOKEN);
 }
 	YY_BREAK
 case 53:
 YY_RULE_SETUP
-#line 372 "pars0lex.l"
+#line 392 "pars0lex.l"
 {
 			return(PARS_CURRENT_TOKEN);
 }
 	YY_BREAK
 case 54:
 YY_RULE_SETUP
-#line 376 "pars0lex.l"
+#line 396 "pars0lex.l"
 {
 			return(PARS_OF_TOKEN);
 }
 	YY_BREAK
 case 55:
 YY_RULE_SETUP
-#line 380 "pars0lex.l"
+#line 400 "pars0lex.l"
 {
 			return(PARS_CREATE_TOKEN);
 }
 	YY_BREAK
 case 56:
 YY_RULE_SETUP
-#line 384 "pars0lex.l"
+#line 404 "pars0lex.l"
 {
 			return(PARS_TABLE_TOKEN);
 }
 	YY_BREAK
 case 57:
 YY_RULE_SETUP
-#line 388 "pars0lex.l"
+#line 408 "pars0lex.l"
 {
-	 		return(PARS_INDEX_TOKEN);
+			return(PARS_COMPACT_TOKEN);
 }
 	YY_BREAK
 case 58:
 YY_RULE_SETUP
-#line 392 "pars0lex.l"
+#line 412 "pars0lex.l"
 {
-	 		return(PARS_UNIQUE_TOKEN);
+			return(PARS_BLOCK_SIZE_TOKEN);
 }
 	YY_BREAK
 case 59:
 YY_RULE_SETUP
-#line 396 "pars0lex.l"
+#line 416 "pars0lex.l"
 {
-	 		return(PARS_CLUSTERED_TOKEN);
+			return(PARS_INDEX_TOKEN);
 }
 	YY_BREAK
 case 60:
 YY_RULE_SETUP
-#line 400 "pars0lex.l"
+#line 420 "pars0lex.l"
 {
-			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+			return(PARS_UNIQUE_TOKEN);
 }
 	YY_BREAK
 case 61:
 YY_RULE_SETUP
-#line 404 "pars0lex.l"
+#line 424 "pars0lex.l"
 {
-	 		return(PARS_ON_TOKEN);
+			return(PARS_CLUSTERED_TOKEN);
 }
 	YY_BREAK
 case 62:
 YY_RULE_SETUP
-#line 408 "pars0lex.l"
+#line 428 "pars0lex.l"
 {
-			return(PARS_DECLARE_TOKEN);
+			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
 }
 	YY_BREAK
 case 63:
 YY_RULE_SETUP
-#line 412 "pars0lex.l"
+#line 432 "pars0lex.l"
 {
-			return(PARS_CURSOR_TOKEN);
+			return(PARS_ON_TOKEN);
 }
 	YY_BREAK
 case 64:
 YY_RULE_SETUP
-#line 416 "pars0lex.l"
+#line 436 "pars0lex.l"
 {
-			return(PARS_OPEN_TOKEN);
+			return(PARS_DECLARE_TOKEN);
 }
 	YY_BREAK
 case 65:
 YY_RULE_SETUP
-#line 420 "pars0lex.l"
+#line 440 "pars0lex.l"
 {
-			return(PARS_FETCH_TOKEN);
+			return(PARS_CURSOR_TOKEN);
 }
 	YY_BREAK
 case 66:
 YY_RULE_SETUP
-#line 424 "pars0lex.l"
+#line 444 "pars0lex.l"
 {
-			return(PARS_CLOSE_TOKEN);
+			return(PARS_OPEN_TOKEN);
 }
 	YY_BREAK
 case 67:
 YY_RULE_SETUP
-#line 428 "pars0lex.l"
+#line 448 "pars0lex.l"
 {
-			return(PARS_NOTFOUND_TOKEN);
+			return(PARS_FETCH_TOKEN);
 }
 	YY_BREAK
 case 68:
 YY_RULE_SETUP
-#line 432 "pars0lex.l"
+#line 452 "pars0lex.l"
 {
-			return(PARS_TO_CHAR_TOKEN);
+			return(PARS_CLOSE_TOKEN);
 }
 	YY_BREAK
 case 69:
 YY_RULE_SETUP
-#line 436 "pars0lex.l"
+#line 456 "pars0lex.l"
 {
-			return(PARS_TO_NUMBER_TOKEN);
+			return(PARS_NOTFOUND_TOKEN);
 }
 	YY_BREAK
 case 70:
 YY_RULE_SETUP
-#line 440 "pars0lex.l"
+#line 460 "pars0lex.l"
 {
-			return(PARS_TO_BINARY_TOKEN);
+			return(PARS_TO_CHAR_TOKEN);
 }
 	YY_BREAK
 case 71:
 YY_RULE_SETUP
-#line 444 "pars0lex.l"
+#line 464 "pars0lex.l"
 {
-			return(PARS_BINARY_TO_NUMBER_TOKEN);
+			return(PARS_TO_NUMBER_TOKEN);
 }
 	YY_BREAK
 case 72:
 YY_RULE_SETUP
-#line 448 "pars0lex.l"
+#line 468 "pars0lex.l"
 {
-			return(PARS_SUBSTR_TOKEN);
+			return(PARS_TO_BINARY_TOKEN);
 }
 	YY_BREAK
 case 73:
 YY_RULE_SETUP
-#line 452 "pars0lex.l"
+#line 472 "pars0lex.l"
 {
-			return(PARS_REPLSTR_TOKEN);
+			return(PARS_BINARY_TO_NUMBER_TOKEN);
 }
 	YY_BREAK
 case 74:
 YY_RULE_SETUP
-#line 456 "pars0lex.l"
+#line 476 "pars0lex.l"
 {
-			return(PARS_CONCAT_TOKEN);
+			return(PARS_SUBSTR_TOKEN);
 }
 	YY_BREAK
 case 75:
 YY_RULE_SETUP
-#line 460 "pars0lex.l"
+#line 480 "pars0lex.l"
 {
-			return(PARS_INSTR_TOKEN);
+			return(PARS_REPLSTR_TOKEN);
 }
 	YY_BREAK
 case 76:
 YY_RULE_SETUP
-#line 464 "pars0lex.l"
+#line 484 "pars0lex.l"
 {
-			return(PARS_LENGTH_TOKEN);
+			return(PARS_CONCAT_TOKEN);
 }
 	YY_BREAK
 case 77:
 YY_RULE_SETUP
-#line 468 "pars0lex.l"
+#line 488 "pars0lex.l"
 {
-			return(PARS_SYSDATE_TOKEN);
+			return(PARS_INSTR_TOKEN);
 }
 	YY_BREAK
 case 78:
 YY_RULE_SETUP
-#line 472 "pars0lex.l"
+#line 492 "pars0lex.l"
 {
-			return(PARS_PRINTF_TOKEN);
+			return(PARS_LENGTH_TOKEN);
 }
 	YY_BREAK
 case 79:
 YY_RULE_SETUP
-#line 476 "pars0lex.l"
+#line 496 "pars0lex.l"
 {
-			return(PARS_ASSERT_TOKEN);
+			return(PARS_SYSDATE_TOKEN);
 }
 	YY_BREAK
 case 80:
 YY_RULE_SETUP
-#line 480 "pars0lex.l"
+#line 500 "pars0lex.l"
 {
-			return(PARS_RND_TOKEN);
+			return(PARS_PRINTF_TOKEN);
 }
 	YY_BREAK
 case 81:
 YY_RULE_SETUP
-#line 484 "pars0lex.l"
+#line 504 "pars0lex.l"
 {
-			return(PARS_RND_STR_TOKEN);
+			return(PARS_ASSERT_TOKEN);
 }
 	YY_BREAK
 case 82:
 YY_RULE_SETUP
-#line 488 "pars0lex.l"
+#line 508 "pars0lex.l"
 {
-			return(PARS_ROW_PRINTF_TOKEN);
+			return(PARS_RND_TOKEN);
 }
 	YY_BREAK
 case 83:
 YY_RULE_SETUP
-#line 492 "pars0lex.l"
+#line 512 "pars0lex.l"
 {
-			return(PARS_COMMIT_TOKEN);
+			return(PARS_RND_STR_TOKEN);
 }
 	YY_BREAK
 case 84:
 YY_RULE_SETUP
-#line 496 "pars0lex.l"
+#line 516 "pars0lex.l"
 {
-			return(PARS_ROLLBACK_TOKEN);
+			return(PARS_ROW_PRINTF_TOKEN);
 }
 	YY_BREAK
 case 85:
 YY_RULE_SETUP
-#line 500 "pars0lex.l"
+#line 520 "pars0lex.l"
 {
-			return(PARS_WORK_TOKEN);
+			return(PARS_COMMIT_TOKEN);
 }
 	YY_BREAK
 case 86:
 YY_RULE_SETUP
-#line 504 "pars0lex.l"
+#line 524 "pars0lex.l"
 {
-			return(PARS_UNSIGNED_TOKEN);
+			return(PARS_ROLLBACK_TOKEN);
 }
 	YY_BREAK
 case 87:
 YY_RULE_SETUP
-#line 508 "pars0lex.l"
+#line 528 "pars0lex.l"
 {
-			return(PARS_EXIT_TOKEN);
+			return(PARS_WORK_TOKEN);
 }
 	YY_BREAK
 case 88:
 YY_RULE_SETUP
-#line 512 "pars0lex.l"
+#line 532 "pars0lex.l"
 {
-			return(PARS_FUNCTION_TOKEN);
+			return(PARS_UNSIGNED_TOKEN);
 }
 	YY_BREAK
 case 89:
 YY_RULE_SETUP
-#line 516 "pars0lex.l"
+#line 536 "pars0lex.l"
 {
-			return(PARS_LOCK_TOKEN);
+			return(PARS_EXIT_TOKEN);
 }
 	YY_BREAK
 case 90:
 YY_RULE_SETUP
-#line 520 "pars0lex.l"
+#line 540 "pars0lex.l"
 {
-			return(PARS_SHARE_TOKEN);
+			return(PARS_FUNCTION_TOKEN);
 }
 	YY_BREAK
 case 91:
 YY_RULE_SETUP
-#line 524 "pars0lex.l"
+#line 544 "pars0lex.l"
 {
-			return(PARS_MODE_TOKEN);
+			return(PARS_LOCK_TOKEN);
 }
 	YY_BREAK
 case 92:
 YY_RULE_SETUP
-#line 528 "pars0lex.l"
+#line 548 "pars0lex.l"
 {
-			yylval = sym_tab_add_id(pars_sym_tab_global,
-							(byte*)yytext,
-							ut_strlen(yytext));
-			return(PARS_ID_TOKEN);
+			return(PARS_SHARE_TOKEN);
 }
 	YY_BREAK
 case 93:
 YY_RULE_SETUP
-#line 535 "pars0lex.l"
+#line 552 "pars0lex.l"
 {
-			return(PARS_DDOT_TOKEN);
+			return(PARS_MODE_TOKEN);
 }
 	YY_BREAK
 case 94:
 YY_RULE_SETUP
-#line 539 "pars0lex.l"
+#line 556 "pars0lex.l"
 {
-			return(PARS_ASSIGN_TOKEN);
+                        return(PARS_LIKE_TOKEN);
 }
 	YY_BREAK
 case 95:
 YY_RULE_SETUP
-#line 543 "pars0lex.l"
+#line 560 "pars0lex.l"
 {
-			return(PARS_LE_TOKEN);
+			return(PARS_BIGINT_TOKEN);
 }
 	YY_BREAK
 case 96:
 YY_RULE_SETUP
-#line 547 "pars0lex.l"
+#line 564 "pars0lex.l"
 {
-			return(PARS_GE_TOKEN);
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_ID_TOKEN);
 }
 	YY_BREAK
 case 97:
 YY_RULE_SETUP
-#line 551 "pars0lex.l"
+#line 571 "pars0lex.l"
 {
-			return(PARS_NE_TOKEN);
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_TABLE_NAME_TOKEN);
 }
 	YY_BREAK
 case 98:
 YY_RULE_SETUP
-#line 555 "pars0lex.l"
+#line 578 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_DDOT_TOKEN);
 }
 	YY_BREAK
 case 99:
 YY_RULE_SETUP
-#line 560 "pars0lex.l"
+#line 582 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_ASSIGN_TOKEN);
 }
 	YY_BREAK
 case 100:
 YY_RULE_SETUP
-#line 565 "pars0lex.l"
+#line 586 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_LE_TOKEN);
 }
 	YY_BREAK
 case 101:
 YY_RULE_SETUP
-#line 570 "pars0lex.l"
+#line 590 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_GE_TOKEN);
 }
 	YY_BREAK
 case 102:
 YY_RULE_SETUP
-#line 575 "pars0lex.l"
+#line 594 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_NE_TOKEN);
 }
 	YY_BREAK
 case 103:
 YY_RULE_SETUP
-#line 580 "pars0lex.l"
+#line 598 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1816,7 +2079,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 104:
 YY_RULE_SETUP
-#line 585 "pars0lex.l"
+#line 603 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1824,7 +2087,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 105:
 YY_RULE_SETUP
-#line 590 "pars0lex.l"
+#line 608 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1832,7 +2095,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 106:
 YY_RULE_SETUP
-#line 595 "pars0lex.l"
+#line 613 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1840,7 +2103,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 107:
 YY_RULE_SETUP
-#line 600 "pars0lex.l"
+#line 618 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1848,7 +2111,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 108:
 YY_RULE_SETUP
-#line 605 "pars0lex.l"
+#line 623 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1856,7 +2119,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 109:
 YY_RULE_SETUP
-#line 610 "pars0lex.l"
+#line 628 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1864,7 +2127,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 110:
 YY_RULE_SETUP
-#line 615 "pars0lex.l"
+#line 633 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1872,7 +2135,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 111:
 YY_RULE_SETUP
-#line 620 "pars0lex.l"
+#line 638 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1880,7 +2143,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 112:
 YY_RULE_SETUP
-#line 625 "pars0lex.l"
+#line 643 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1888,35 +2151,75 @@ YY_RULE_SETUP
 	YY_BREAK
 case 113:
 YY_RULE_SETUP
-#line 630 "pars0lex.l"
-BEGIN(comment); /* eat up comment */
+#line 648 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 114:
-/* rule 114 can match eol */
 YY_RULE_SETUP
-#line 632 "pars0lex.l"
+#line 653 "pars0lex.l"
+{
 
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 115:
-/* rule 115 can match eol */
 YY_RULE_SETUP
-#line 633 "pars0lex.l"
+#line 658 "pars0lex.l"
+{
 
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 116:
 YY_RULE_SETUP
-#line 634 "pars0lex.l"
-BEGIN(INITIAL);
+#line 663 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 117:
-/* rule 117 can match eol */
 YY_RULE_SETUP
-#line 636 "pars0lex.l"
-/* eat up whitespace */
+#line 668 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 118:
 YY_RULE_SETUP
-#line 639 "pars0lex.l"
+#line 673 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+	YY_BREAK
+case 119:
+/* rule 119 can match eol */
+YY_RULE_SETUP
+#line 675 "pars0lex.l"
+
+	YY_BREAK
+case 120:
+/* rule 120 can match eol */
+YY_RULE_SETUP
+#line 676 "pars0lex.l"
+
+	YY_BREAK
+case 121:
+YY_RULE_SETUP
+#line 677 "pars0lex.l"
+BEGIN(INITIAL);
+	YY_BREAK
+case 122:
+/* rule 122 can match eol */
+YY_RULE_SETUP
+#line 679 "pars0lex.l"
+/* eat up whitespace */
+	YY_BREAK
+case 123:
+YY_RULE_SETUP
+#line 682 "pars0lex.l"
 {
 			fprintf(stderr,"Unrecognized character: %02x\n",
 				*yytext);
@@ -1926,12 +2229,12 @@ YY_RULE_SETUP
 			return(0);
 }
 	YY_BREAK
-case 119:
+case 124:
 YY_RULE_SETUP
-#line 648 "pars0lex.l"
+#line 691 "pars0lex.l"
 YY_FATAL_ERROR( "flex scanner jammed" );
 	YY_BREAK
-#line 1916 "lexyy.c"
+#line 2237 "lexyy.cc"
 case YY_STATE_EOF(INITIAL):
 case YY_STATE_EOF(comment):
 case YY_STATE_EOF(quoted):
@@ -2121,7 +2424,7 @@ static int yy_get_next_buffer (void)
 
 	else
 		{
-			size_t num_to_read =
+			int num_to_read =
 			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
 
 		while ( num_to_read <= 0 )
@@ -2166,7 +2469,7 @@ static int yy_get_next_buffer (void)
 
 		/* Read in more data. */
 		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
-			(yy_n_chars), num_to_read );
+			(yy_n_chars), (size_t) num_to_read );
 
 		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
 		}
@@ -2190,6 +2493,14 @@ static int yy_get_next_buffer (void)
 	else
 		ret_val = EOB_ACT_CONTINUE_SCAN;
 
+	if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need. */
+		yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size  );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
 	(yy_n_chars) += number_to_move;
 	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
 	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
@@ -2201,11 +2512,11 @@ static int yy_get_next_buffer (void)
 
 /* yy_get_previous_state - get the state just before the EOB char was reached */
 
-    static yy_state_type yy_get_previous_state (void)
+     yy_state_type yy_get_previous_state (void)
 {
 	register yy_state_type yy_current_state;
 	register char *yy_cp;
-    
+
 	yy_current_state = (yy_start);
 
 	for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
@@ -2219,7 +2530,7 @@ static int yy_get_next_buffer (void)
 		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 			{
 			yy_current_state = (int) yy_def[yy_current_state];
-			if ( yy_current_state >= 399 )
+			if ( yy_current_state >= 425 )
 				yy_c = yy_meta[(unsigned int) yy_c];
 			}
 		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
@@ -2233,7 +2544,7 @@ static int yy_get_next_buffer (void)
  * synopsis
  *	next_state = yy_try_NUL_trans( current_state );
  */
-    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state )
+     static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state )
 {
 	register int yy_is_jam;
     	register char *yy_cp = (yy_c_buf_p);
@@ -2247,11 +2558,11 @@ static int yy_get_next_buffer (void)
 	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 		{
 		yy_current_state = (int) yy_def[yy_current_state];
-		if ( yy_current_state >= 399 )
+		if ( yy_current_state >= 425 )
 			yy_c = yy_meta[(unsigned int) yy_c];
 		}
 	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
-	yy_is_jam = (yy_current_state == 398);
+	yy_is_jam = (yy_current_state == 424);
 
 	return yy_is_jam ? 0 : yy_current_state;
 }
@@ -2265,7 +2576,7 @@ static int yy_get_next_buffer (void)
 
 {
 	int c;
-    
+
 	*(yy_c_buf_p) = (yy_hold_char);
 
 	if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
@@ -2335,9 +2646,9 @@ static int yy_get_next_buffer (void)
  * 
  * @note This function does not reset the start condition to @c INITIAL .
  */
-    static void yyrestart  (FILE * input_file )
+    void yyrestart  (FILE * input_file )
 {
-    
+
 	if ( ! YY_CURRENT_BUFFER ){
         yyensure_buffer_stack ();
 		YY_CURRENT_BUFFER_LVALUE =
@@ -2354,7 +2665,7 @@ static int yy_get_next_buffer (void)
  */
     __attribute__((unused)) static void yy_switch_to_buffer  (YY_BUFFER_STATE  new_buffer )
 {
-    
+
 	/* TODO. We should be able to replace this entire function body
 	 * with
 	 *		yypop_buffer_state();
@@ -2400,7 +2711,7 @@ static void yy_load_buffer_state  (void)
     static YY_BUFFER_STATE yy_create_buffer  (FILE * file, int  size )
 {
 	YY_BUFFER_STATE b;
-    
+
 	b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state )  );
 	if ( ! b )
 		YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
@@ -2425,9 +2736,9 @@ static void yy_load_buffer_state  (void)
  * @param b a buffer created with yy_create_buffer()
  * 
  */
-    static void yy_delete_buffer (YY_BUFFER_STATE  b )
+     void yy_delete_buffer (YY_BUFFER_STATE  b )
 {
-    
+
 	if ( ! b )
 		return;
 
@@ -2444,11 +2755,11 @@ static void yy_load_buffer_state  (void)
  * This function is sometimes called more than once on the same buffer,
  * such as during a yyrestart() or at EOF.
  */
-    static void yy_init_buffer  (YY_BUFFER_STATE  b, FILE * file )
+     static void yy_init_buffer  (YY_BUFFER_STATE  b, FILE * file )
 
 {
 	int oerrno = errno;
-    
+ 
 	yy_flush_buffer(b );
 
 	b->yy_input_file = file;
@@ -2472,7 +2783,7 @@ static void yy_load_buffer_state  (void)
  * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
  * 
  */
-    static void yy_flush_buffer (YY_BUFFER_STATE  b )
+    void yy_flush_buffer (YY_BUFFER_STATE  b )
 {
     	if ( ! b )
 		return;
@@ -2501,7 +2812,7 @@ static void yy_load_buffer_state  (void)
  *  @param new_buffer The new state.
  *  
  */
-__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
 {
     	if (new_buffer == NULL)
 		return;
@@ -2531,7 +2842,7 @@ __attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buf
  *  The next element becomes the new top.
  *  
  */
-__attribute__((unused)) static void yypop_buffer_state (void)
+void yypop_buffer_state (void)
 {
     	if (!YY_CURRENT_BUFFER)
 		return;
@@ -2564,9 +2875,11 @@ static void yyensure_buffer_stack (void)
 		(yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
 								(num_to_alloc * sizeof(struct yy_buffer_state*)
 								);
-		
+		if ( ! (yy_buffer_stack) )
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
 		memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
-				
+
 		(yy_buffer_stack_max) = num_to_alloc;
 		(yy_buffer_stack_top) = 0;
 		return;
@@ -2582,6 +2895,8 @@ static void yyensure_buffer_stack (void)
 								((yy_buffer_stack),
 								num_to_alloc * sizeof(struct yy_buffer_state*)
 								);
+		if ( ! (yy_buffer_stack) )
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
 
 		/* zero only the new slots.*/
 		memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
@@ -2621,16 +2936,16 @@ static void yy_fatal_error (yyconst char* msg )
 /** Get the current line number.
  * 
  */
-__attribute__((unused)) static int yyget_lineno  (void)
+int yyget_lineno  (void)
 {
-        
+
     return yylineno;
 }
 
 /** Get the input stream.
  * 
  */
-__attribute__((unused)) static FILE *yyget_in  (void)
+FILE *yyget_in  (void)
 {
         return yyin;
 }
@@ -2638,7 +2953,7 @@ __attribute__((unused)) static FILE *yyget_in  (void)
 /** Get the output stream.
  * 
  */
-__attribute__((unused)) static FILE *yyget_out  (void)
+FILE *yyget_out  (void)
 {
         return yyout;
 }
@@ -2646,7 +2961,7 @@ __attribute__((unused)) static FILE *yyget_out  (void)
 /** Get the length of the current token.
  * 
  */
-__attribute__((unused)) static int yyget_leng  (void)
+yy_size_t yyget_leng  (void)
 {
         return yyleng;
 }
@@ -2655,7 +2970,7 @@ __attribute__((unused)) static int yyget_leng  (void)
  * 
  */
 
-__attribute__((unused)) static char *yyget_text  (void)
+char *yyget_text  (void)
 {
         return yytext;
 }
@@ -2664,9 +2979,9 @@ __attribute__((unused)) static char *yyget_text  (void)
  * @param line_number
  * 
  */
-__attribute__((unused)) static void yyset_lineno (int  line_number )
+void yyset_lineno (int  line_number )
 {
-    
+ 
     yylineno = line_number;
 }
 
@@ -2676,30 +2991,58 @@ __attribute__((unused)) static void yyset_lineno (int  line_number )
  * 
  * @see yy_switch_to_buffer
  */
-__attribute__((unused)) static void yyset_in (FILE *  in_str )
+void yyset_in (FILE *  in_str )
 {
         yyin = in_str ;
 }
 
-__attribute__((unused)) static void yyset_out (FILE *  out_str )
+void yyset_out (FILE *  out_str )
 {
         yyout = out_str ;
 }
 
-__attribute__((unused)) static int yyget_debug  (void)
+int yyget_debug  (void)
 {
         return yy_flex_debug;
 }
 
-__attribute__((unused)) static void yyset_debug (int  bdebug )
+void yyset_debug (int  bdebug )
 {
         yy_flex_debug = bdebug ;
 }
 
+static int yy_init_globals (void)
+{
+        /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from yylex_destroy(), so don't allocate here.
+     */
+
+    (yy_buffer_stack) = 0;
+    (yy_buffer_stack_top) = 0;
+    (yy_buffer_stack_max) = 0;
+    (yy_c_buf_p) = (char *) 0;
+    (yy_init) = 0;
+    (yy_start) = 0;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE *) 0;
+    yyout = (FILE *) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * yylex_init()
+     */
+    return 0;
+}
+
 /* yylex_destroy is for both reentrant and non-reentrant scanners. */
 __attribute__((unused)) static int yylex_destroy  (void)
 {
-    
+ 
     /* Pop the buffer stack, destroying each element. */
 	while(YY_CURRENT_BUFFER){
 		yy_delete_buffer(YY_CURRENT_BUFFER  );
@@ -2711,6 +3054,10 @@ __attribute__((unused)) static int yylex_destroy  (void)
 	yyfree((yy_buffer_stack) );
 	(yy_buffer_stack) = NULL;
 
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * yylex() is called, initialization will occur. */
+    yy_init_globals( );
+
     return 0;
 }
 
@@ -2722,7 +3069,7 @@ __attribute__((unused)) static int yylex_destroy  (void)
 static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
 {
 	register int i;
-    	for ( i = 0; i < n; ++i )
+	for ( i = 0; i < n; ++i )
 		s1[i] = s2[i];
 }
 #endif
@@ -2731,19 +3078,19 @@ static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
 static int yy_flex_strlen (yyconst char * s )
 {
 	register int n;
-    	for ( n = 0; s[n]; ++n )
+	for ( n = 0; s[n]; ++n )
 		;
 
 	return n;
 }
 #endif
 
-static void *yyalloc (yy_size_t  size )
+void *yyalloc (yy_size_t  size )
 {
 	return (void *) malloc( size );
 }
 
-static void *yyrealloc  (void * ptr, yy_size_t  size )
+void *yyrealloc  (void * ptr, yy_size_t  size )
 {
 	/* The cast to (char *) in the following accommodates both
 	 * implementations that use char* generic pointers, and those
@@ -2755,27 +3102,14 @@ static void *yyrealloc  (void * ptr, yy_size_t  size )
 	return (void *) realloc( (char *) ptr, size );
 }
 
-static void yyfree (void * ptr )
+void yyfree (void * ptr )
 {
-	free( (char *) ptr );	/* see yyrealloc() for (char *) cast */
+	free( (char*) ptr );	/* see yyrealloc() for (char *) cast */
 }
 
 #define YYTABLES_NAME "yytables"
 
-#undef YY_NEW_FILE
-#undef YY_FLUSH_BUFFER
-#undef yy_set_bol
-#undef yy_new_buffer
-#undef yy_set_interactive
-#undef yytext_ptr
-#undef YY_DO_BEFORE_ACTION
-
-#ifdef YY_DECL_IS_OURS
-#undef YY_DECL_IS_OURS
-#undef YY_DECL
-#endif
-#line 648 "pars0lex.l"
-
+#line 691 "pars0lex.l"
 
 
 
@@ -2793,3 +3127,4 @@ pars_lexer_close(void)
 	stringbuf = NULL;
 	stringbuf_len_alloc = stringbuf_len = 0;
 }
+
diff --git a/storage/xtradb/pars/make_bison.sh b/storage/xtradb/pars/make_bison.sh
index 6587b6b9f1a..2618be102bc 100644..100755
--- a/storage/xtradb/pars/make_bison.sh
+++ b/storage/xtradb/pars/make_bison.sh
@@ -1,24 +1,24 @@
 #!/bin/bash
 #
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
-# 
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 
-# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 #
 # generate parser files from bison input files.
 
 set -eu
 TMPFILE=pars0grm.tab.c
-OUTFILE=pars0grm.c
+OUTFILE=pars0grm.cc
 
 bison -d pars0grm.y
 mv pars0grm.tab.h ../include/pars0grm.h
diff --git a/storage/xtradb/pars/make_flex.sh b/storage/xtradb/pars/make_flex.sh
index a8d35f8cca4..581fc2342aa 100644..100755
--- a/storage/xtradb/pars/make_flex.sh
+++ b/storage/xtradb/pars/make_flex.sh
@@ -1,25 +1,25 @@
 #!/bin/bash
 #
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
-# 
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 
-# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 #
 # generate lexer files from flex input files.
 
 set -eu
 
-TMPFILE=_flex_tmp.c
-OUTFILE=lexyy.c
+TMPFILE=_flex_tmp.cc
+OUTFILE=lexyy.cc
 
 flex -o $TMPFILE pars0lex.l
 
diff --git a/storage/xtradb/pars/pars0grm.c b/storage/xtradb/pars/pars0grm.c
deleted file mode 100644
index da19ccd1136..00000000000
--- a/storage/xtradb/pars/pars0grm.c
+++ /dev/null
@@ -1,2601 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
-Foundation, Inc.
-
-As a special exception, when this file is copied by Bison into a
-Bison output file, you may use that output file without restriction.
-This special exception was added by the Free Software Foundation
-in version 1.24 of Bison.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/* A Bison parser, made by GNU Bison 2.0.  */
-
-/* Written by Richard Stallman by simplifying the original so called
-   ``semantic'' parser.  */
-
-/* All symbols defined below should begin with yy or YY, to avoid
-   infringing on user name space.  This should be done even for local
-   variables, as they might otherwise be expanded by user macros.
-   There are some unavoidable exceptions within include files to
-   define necessary library symbols; they are noted "INFRINGES ON
-   USER NAME SPACE" below.  */
-
-/* Identify Bison output.  */
-#define YYBISON 1
-
-/* Skeleton name.  */
-#define YYSKELETON_NAME "yacc.c"
-
-/* Pure parsers.  */
-#define YYPURE 0
-
-/* Using locations.  */
-#define YYLSP_NEEDED 0
-
-
-
-/* Tokens.  */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
-   /* Put the tokens into the symbol table, so that GDB and other debuggers
-      know about them.  */
-   enum yytokentype {
-     PARS_INT_LIT = 258,
-     PARS_FLOAT_LIT = 259,
-     PARS_STR_LIT = 260,
-     PARS_FIXBINARY_LIT = 261,
-     PARS_BLOB_LIT = 262,
-     PARS_NULL_LIT = 263,
-     PARS_ID_TOKEN = 264,
-     PARS_AND_TOKEN = 265,
-     PARS_OR_TOKEN = 266,
-     PARS_NOT_TOKEN = 267,
-     PARS_GE_TOKEN = 268,
-     PARS_LE_TOKEN = 269,
-     PARS_NE_TOKEN = 270,
-     PARS_PROCEDURE_TOKEN = 271,
-     PARS_IN_TOKEN = 272,
-     PARS_OUT_TOKEN = 273,
-     PARS_BINARY_TOKEN = 274,
-     PARS_BLOB_TOKEN = 275,
-     PARS_INT_TOKEN = 276,
-     PARS_INTEGER_TOKEN = 277,
-     PARS_FLOAT_TOKEN = 278,
-     PARS_CHAR_TOKEN = 279,
-     PARS_IS_TOKEN = 280,
-     PARS_BEGIN_TOKEN = 281,
-     PARS_END_TOKEN = 282,
-     PARS_IF_TOKEN = 283,
-     PARS_THEN_TOKEN = 284,
-     PARS_ELSE_TOKEN = 285,
-     PARS_ELSIF_TOKEN = 286,
-     PARS_LOOP_TOKEN = 287,
-     PARS_WHILE_TOKEN = 288,
-     PARS_RETURN_TOKEN = 289,
-     PARS_SELECT_TOKEN = 290,
-     PARS_SUM_TOKEN = 291,
-     PARS_COUNT_TOKEN = 292,
-     PARS_DISTINCT_TOKEN = 293,
-     PARS_FROM_TOKEN = 294,
-     PARS_WHERE_TOKEN = 295,
-     PARS_FOR_TOKEN = 296,
-     PARS_DDOT_TOKEN = 297,
-     PARS_READ_TOKEN = 298,
-     PARS_ORDER_TOKEN = 299,
-     PARS_BY_TOKEN = 300,
-     PARS_ASC_TOKEN = 301,
-     PARS_DESC_TOKEN = 302,
-     PARS_INSERT_TOKEN = 303,
-     PARS_INTO_TOKEN = 304,
-     PARS_VALUES_TOKEN = 305,
-     PARS_UPDATE_TOKEN = 306,
-     PARS_SET_TOKEN = 307,
-     PARS_DELETE_TOKEN = 308,
-     PARS_CURRENT_TOKEN = 309,
-     PARS_OF_TOKEN = 310,
-     PARS_CREATE_TOKEN = 311,
-     PARS_TABLE_TOKEN = 312,
-     PARS_INDEX_TOKEN = 313,
-     PARS_UNIQUE_TOKEN = 314,
-     PARS_CLUSTERED_TOKEN = 315,
-     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
-     PARS_ON_TOKEN = 317,
-     PARS_ASSIGN_TOKEN = 318,
-     PARS_DECLARE_TOKEN = 319,
-     PARS_CURSOR_TOKEN = 320,
-     PARS_SQL_TOKEN = 321,
-     PARS_OPEN_TOKEN = 322,
-     PARS_FETCH_TOKEN = 323,
-     PARS_CLOSE_TOKEN = 324,
-     PARS_NOTFOUND_TOKEN = 325,
-     PARS_TO_CHAR_TOKEN = 326,
-     PARS_TO_NUMBER_TOKEN = 327,
-     PARS_TO_BINARY_TOKEN = 328,
-     PARS_BINARY_TO_NUMBER_TOKEN = 329,
-     PARS_SUBSTR_TOKEN = 330,
-     PARS_REPLSTR_TOKEN = 331,
-     PARS_CONCAT_TOKEN = 332,
-     PARS_INSTR_TOKEN = 333,
-     PARS_LENGTH_TOKEN = 334,
-     PARS_SYSDATE_TOKEN = 335,
-     PARS_PRINTF_TOKEN = 336,
-     PARS_ASSERT_TOKEN = 337,
-     PARS_RND_TOKEN = 338,
-     PARS_RND_STR_TOKEN = 339,
-     PARS_ROW_PRINTF_TOKEN = 340,
-     PARS_COMMIT_TOKEN = 341,
-     PARS_ROLLBACK_TOKEN = 342,
-     PARS_WORK_TOKEN = 343,
-     PARS_UNSIGNED_TOKEN = 344,
-     PARS_EXIT_TOKEN = 345,
-     PARS_FUNCTION_TOKEN = 346,
-     PARS_LOCK_TOKEN = 347,
-     PARS_SHARE_TOKEN = 348,
-     PARS_MODE_TOKEN = 349,
-     NEG = 350
-   };
-#endif
-#define PARS_INT_LIT 258
-#define PARS_FLOAT_LIT 259
-#define PARS_STR_LIT 260
-#define PARS_FIXBINARY_LIT 261
-#define PARS_BLOB_LIT 262
-#define PARS_NULL_LIT 263
-#define PARS_ID_TOKEN 264
-#define PARS_AND_TOKEN 265
-#define PARS_OR_TOKEN 266
-#define PARS_NOT_TOKEN 267
-#define PARS_GE_TOKEN 268
-#define PARS_LE_TOKEN 269
-#define PARS_NE_TOKEN 270
-#define PARS_PROCEDURE_TOKEN 271
-#define PARS_IN_TOKEN 272
-#define PARS_OUT_TOKEN 273
-#define PARS_BINARY_TOKEN 274
-#define PARS_BLOB_TOKEN 275
-#define PARS_INT_TOKEN 276
-#define PARS_INTEGER_TOKEN 277
-#define PARS_FLOAT_TOKEN 278
-#define PARS_CHAR_TOKEN 279
-#define PARS_IS_TOKEN 280
-#define PARS_BEGIN_TOKEN 281
-#define PARS_END_TOKEN 282
-#define PARS_IF_TOKEN 283
-#define PARS_THEN_TOKEN 284
-#define PARS_ELSE_TOKEN 285
-#define PARS_ELSIF_TOKEN 286
-#define PARS_LOOP_TOKEN 287
-#define PARS_WHILE_TOKEN 288
-#define PARS_RETURN_TOKEN 289
-#define PARS_SELECT_TOKEN 290
-#define PARS_SUM_TOKEN 291
-#define PARS_COUNT_TOKEN 292
-#define PARS_DISTINCT_TOKEN 293
-#define PARS_FROM_TOKEN 294
-#define PARS_WHERE_TOKEN 295
-#define PARS_FOR_TOKEN 296
-#define PARS_DDOT_TOKEN 297
-#define PARS_READ_TOKEN 298
-#define PARS_ORDER_TOKEN 299
-#define PARS_BY_TOKEN 300
-#define PARS_ASC_TOKEN 301
-#define PARS_DESC_TOKEN 302
-#define PARS_INSERT_TOKEN 303
-#define PARS_INTO_TOKEN 304
-#define PARS_VALUES_TOKEN 305
-#define PARS_UPDATE_TOKEN 306
-#define PARS_SET_TOKEN 307
-#define PARS_DELETE_TOKEN 308
-#define PARS_CURRENT_TOKEN 309
-#define PARS_OF_TOKEN 310
-#define PARS_CREATE_TOKEN 311
-#define PARS_TABLE_TOKEN 312
-#define PARS_INDEX_TOKEN 313
-#define PARS_UNIQUE_TOKEN 314
-#define PARS_CLUSTERED_TOKEN 315
-#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
-#define PARS_ON_TOKEN 317
-#define PARS_ASSIGN_TOKEN 318
-#define PARS_DECLARE_TOKEN 319
-#define PARS_CURSOR_TOKEN 320
-#define PARS_SQL_TOKEN 321
-#define PARS_OPEN_TOKEN 322
-#define PARS_FETCH_TOKEN 323
-#define PARS_CLOSE_TOKEN 324
-#define PARS_NOTFOUND_TOKEN 325
-#define PARS_TO_CHAR_TOKEN 326
-#define PARS_TO_NUMBER_TOKEN 327
-#define PARS_TO_BINARY_TOKEN 328
-#define PARS_BINARY_TO_NUMBER_TOKEN 329
-#define PARS_SUBSTR_TOKEN 330
-#define PARS_REPLSTR_TOKEN 331
-#define PARS_CONCAT_TOKEN 332
-#define PARS_INSTR_TOKEN 333
-#define PARS_LENGTH_TOKEN 334
-#define PARS_SYSDATE_TOKEN 335
-#define PARS_PRINTF_TOKEN 336
-#define PARS_ASSERT_TOKEN 337
-#define PARS_RND_TOKEN 338
-#define PARS_RND_STR_TOKEN 339
-#define PARS_ROW_PRINTF_TOKEN 340
-#define PARS_COMMIT_TOKEN 341
-#define PARS_ROLLBACK_TOKEN 342
-#define PARS_WORK_TOKEN 343
-#define PARS_UNSIGNED_TOKEN 344
-#define PARS_EXIT_TOKEN 345
-#define PARS_FUNCTION_TOKEN 346
-#define PARS_LOCK_TOKEN 347
-#define PARS_SHARE_TOKEN 348
-#define PARS_MODE_TOKEN 349
-#define NEG 350
-
-
-
-
-/* Copy the first part of user declarations.  */
-#line 13 "pars0grm.y"
-
-/* The value of the semantic attribute is a pointer to a query tree node
-que_node_t */
-
-#include "univ.i"
-#include <math.h>				/* Can't be before univ.i */
-#include "pars0pars.h"
-#include "mem0mem.h"
-#include "que0types.h"
-#include "que0que.h"
-#include "row0sel.h"
-
-#define YYSTYPE que_node_t*
-
-/* #define __STDC__ */
-
-int
-yylex(void);
-
-
-/* Enabling traces.  */
-#ifndef YYDEBUG
-# define YYDEBUG 0
-#endif
-
-/* Enabling verbose error messages.  */
-#ifdef YYERROR_VERBOSE
-# undef YYERROR_VERBOSE
-# define YYERROR_VERBOSE 1
-#else
-# define YYERROR_VERBOSE 0
-#endif
-
-#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
-typedef int YYSTYPE;
-# define yystype YYSTYPE /* obsolescent; will be withdrawn */
-# define YYSTYPE_IS_DECLARED 1
-# define YYSTYPE_IS_TRIVIAL 1
-#endif
-
-
-
-/* Copy the second part of user declarations.  */
-
-
-/* Line 213 of yacc.c.  */
-#line 297 "pars0grm.c"
-
-#if ! defined (yyoverflow) || YYERROR_VERBOSE
-
-# ifndef YYFREE
-#  define YYFREE free
-# endif
-# ifndef YYMALLOC
-#  define YYMALLOC malloc
-# endif
-
-/* The parser invokes alloca or malloc; define the necessary symbols.  */
-
-# ifdef YYSTACK_USE_ALLOCA
-#  if YYSTACK_USE_ALLOCA
-#   ifdef __GNUC__
-#    define YYSTACK_ALLOC __builtin_alloca
-#   else
-#    define YYSTACK_ALLOC alloca
-#   endif
-#  endif
-# endif
-
-# ifdef YYSTACK_ALLOC
-   /* Pacify GCC's `empty if-body' warning. */
-#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
-# else
-#  if defined (__STDC__) || defined (__cplusplus)
-#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
-#   define YYSIZE_T size_t
-#  endif
-#  define YYSTACK_ALLOC YYMALLOC
-#  define YYSTACK_FREE YYFREE
-# endif
-#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */
-
-
-#if (! defined (yyoverflow) \
-     && (! defined (__cplusplus) \
-	 || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL)))
-
-/* A type that is properly aligned for any stack member.  */
-union yyalloc
-{
-  short int yyss;
-  YYSTYPE yyvs;
-  };
-
-/* The size of the maximum gap between one aligned stack and the next.  */
-# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
-
-/* The size of an array large to enough to hold all stacks, each with
-   N elements.  */
-# define YYSTACK_BYTES(N) \
-     ((N) * (sizeof (short int) + sizeof (YYSTYPE))			\
-      + YYSTACK_GAP_MAXIMUM)
-
-/* Copy COUNT objects from FROM to TO.  The source and destination do
-   not overlap.  */
-# ifndef YYCOPY
-#  if defined (__GNUC__) && 1 < __GNUC__
-#   define YYCOPY(To, From, Count) \
-      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
-#  else
-#   define YYCOPY(To, From, Count)		\
-      do					\
-	{					\
-	  register YYSIZE_T yyi;		\
-	  for (yyi = 0; yyi < (Count); yyi++)	\
-	    (To)[yyi] = (From)[yyi];		\
-	}					\
-      while (0)
-#  endif
-# endif
-
-/* Relocate STACK from its old location to the new one.  The
-   local variables YYSIZE and YYSTACKSIZE give the old and new number of
-   elements in the stack, and YYPTR gives the new location of the
-   stack.  Advance YYPTR to a properly aligned location for the next
-   stack.  */
-# define YYSTACK_RELOCATE(Stack)					\
-    do									\
-      {									\
-	YYSIZE_T yynewbytes;						\
-	YYCOPY (&yyptr->Stack, Stack, yysize);				\
-	Stack = &yyptr->Stack;						\
-	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
-	yyptr += yynewbytes / sizeof (*yyptr);				\
-      }									\
-    while (0)
-
-#endif
-
-#if defined (__STDC__) || defined (__cplusplus)
-   typedef signed char yysigned_char;
-#else
-   typedef short int yysigned_char;
-#endif
-
-/* YYFINAL -- State number of the termination state. */
-#define YYFINAL  5
-/* YYLAST -- Last index in YYTABLE.  */
-#define YYLAST   752
-
-/* YYNTOKENS -- Number of terminals. */
-#define YYNTOKENS  111
-/* YYNNTS -- Number of nonterminals. */
-#define YYNNTS  70
-/* YYNRULES -- Number of rules. */
-#define YYNRULES  175
-/* YYNRULES -- Number of states. */
-#define YYNSTATES  339
-
-/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
-#define YYUNDEFTOK  2
-#define YYMAXUTOK   350
-
-#define YYTRANSLATE(YYX) 						\
-  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
-
-/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
-static const unsigned char yytranslate[] =
-{
-       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,   103,     2,     2,
-     105,   106,   100,    99,   108,    98,     2,   101,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,   104,
-      96,    95,    97,   107,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,   109,     2,   110,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
-       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
-      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
-      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
-      35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
-      45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
-      55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
-      65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
-      75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
-      85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
-     102
-};
-
-#if YYDEBUG
-/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
-   YYRHS.  */
-static const unsigned short int yyprhs[] =
-{
-       0,     0,     3,     6,     8,    11,    14,    17,    20,    23,
-      26,    29,    32,    35,    38,    41,    44,    47,    50,    53,
-      56,    59,    62,    65,    68,    71,    73,    76,    78,    83,
-      85,    87,    89,    91,    93,    95,    97,   101,   105,   109,
-     113,   116,   120,   124,   128,   132,   136,   140,   144,   148,
-     152,   155,   159,   163,   165,   167,   169,   171,   173,   175,
-     177,   179,   181,   183,   185,   186,   188,   192,   199,   204,
-     206,   208,   210,   214,   216,   220,   221,   223,   227,   228,
-     230,   234,   236,   241,   247,   252,   253,   255,   259,   261,
-     265,   267,   268,   271,   272,   275,   276,   281,   282,   284,
-     286,   287,   292,   301,   305,   311,   314,   318,   320,   324,
-     329,   334,   337,   340,   344,   347,   350,   353,   357,   362,
-     364,   367,   368,   371,   373,   381,   388,   399,   401,   403,
-     406,   409,   414,   419,   425,   427,   431,   432,   436,   437,
-     439,   440,   443,   444,   446,   454,   456,   460,   461,   463,
-     464,   466,   477,   480,   483,   485,   487,   489,   491,   493,
-     497,   501,   502,   504,   508,   512,   513,   515,   518,   525,
-     530,   532,   534,   535,   537,   540
-};
-
-/* YYRHS -- A `-1'-separated list of the rules' RHS. */
-static const short int yyrhs[] =
-{
-     112,     0,    -1,   180,   104,    -1,   118,    -1,   119,   104,
-      -1,   151,   104,    -1,   152,   104,    -1,   153,   104,    -1,
-     150,   104,    -1,   154,   104,    -1,   146,   104,    -1,   133,
-     104,    -1,   135,   104,    -1,   145,   104,    -1,   143,   104,
-      -1,   144,   104,    -1,   140,   104,    -1,   141,   104,    -1,
-     155,   104,    -1,   157,   104,    -1,   156,   104,    -1,   169,
-     104,    -1,   170,   104,    -1,   164,   104,    -1,   168,   104,
-      -1,   113,    -1,   114,   113,    -1,     9,    -1,   116,   105,
-     124,   106,    -1,     3,    -1,     4,    -1,     5,    -1,     6,
-      -1,     7,    -1,     8,    -1,    66,    -1,   115,    99,   115,
-      -1,   115,    98,   115,    -1,   115,   100,   115,    -1,   115,
-     101,   115,    -1,    98,   115,    -1,   105,   115,   106,    -1,
-     115,    95,   115,    -1,   115,    96,   115,    -1,   115,    97,
-     115,    -1,   115,    13,   115,    -1,   115,    14,   115,    -1,
-     115,    15,   115,    -1,   115,    10,   115,    -1,   115,    11,
-     115,    -1,    12,   115,    -1,     9,   103,    70,    -1,    66,
-     103,    70,    -1,    71,    -1,    72,    -1,    73,    -1,    74,
-      -1,    75,    -1,    77,    -1,    78,    -1,    79,    -1,    80,
-      -1,    83,    -1,    84,    -1,    -1,   107,    -1,   117,   108,
-     107,    -1,   109,     9,   105,   117,   106,   110,    -1,   120,
-     105,   124,   106,    -1,    76,    -1,    81,    -1,    82,    -1,
-       9,   105,   106,    -1,     9,    -1,   122,   108,     9,    -1,
-      -1,     9,    -1,   123,   108,     9,    -1,    -1,   115,    -1,
-     124,   108,   115,    -1,   115,    -1,    37,   105,   100,   106,
-      -1,    37,   105,    38,     9,   106,    -1,    36,   105,   115,
-     106,    -1,    -1,   125,    -1,   126,   108,   125,    -1,   100,
-      -1,   126,    49,   123,    -1,   126,    -1,    -1,    40,   115,
-      -1,    -1,    41,    51,    -1,    -1,    92,    17,    93,    94,
-      -1,    -1,    46,    -1,    47,    -1,    -1,    44,    45,     9,
-     131,    -1,    35,   127,    39,   122,   128,   129,   130,   132,
-      -1,    48,    49,     9,    -1,   134,    50,   105,   124,   106,
-      -1,   134,   133,    -1,     9,    95,   115,    -1,   136,    -1,
-     137,   108,   136,    -1,    40,    54,    55,     9,    -1,    51,
-       9,    52,   137,    -1,   139,   128,    -1,   139,   138,    -1,
-      53,    39,     9,    -1,   142,   128,    -1,   142,   138,    -1,
-      85,   133,    -1,     9,    63,   115,    -1,    31,   115,    29,
-     114,    -1,   147,    -1,   148,   147,    -1,    -1,    30,   114,
-      -1,   148,    -1,    28,   115,    29,   114,   149,    27,    28,
-      -1,    33,   115,    32,   114,    27,    32,    -1,    41,     9,
-      17,   115,    42,   115,    32,   114,    27,    32,    -1,    90,
-      -1,    34,    -1,    67,     9,    -1,    69,     9,    -1,    68,
-       9,    49,   123,    -1,    68,     9,    49,   121,    -1,     9,
-     171,   160,   161,   162,    -1,   158,    -1,   159,   108,   158,
-      -1,    -1,   105,     3,   106,    -1,    -1,    89,    -1,    -1,
-      12,     8,    -1,    -1,    61,    -1,    56,    57,     9,   105,
-     159,   106,   163,    -1,     9,    -1,   165,   108,     9,    -1,
-      -1,    59,    -1,    -1,    60,    -1,    56,   166,   167,    58,
-       9,    62,     9,   105,   165,   106,    -1,    86,    88,    -1,
-      87,    88,    -1,    21,    -1,    22,    -1,    24,    -1,    19,
-      -1,    20,    -1,     9,    17,   171,    -1,     9,    18,   171,
-      -1,    -1,   172,    -1,   173,   108,   172,    -1,     9,   171,
-     104,    -1,    -1,   174,    -1,   175,   174,    -1,    64,    65,
-       9,    25,   133,   104,    -1,    64,    91,     9,   104,    -1,
-     176,    -1,   177,    -1,    -1,   178,    -1,   179,   178,    -1,
-      16,     9,   105,   173,   106,    25,   175,   179,    26,   114,
-      27,    -1
-};
-
-/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
-static const unsigned short int yyrline[] =
-{
-       0,   138,   138,   141,   142,   143,   144,   145,   146,   147,
-     148,   149,   150,   151,   152,   153,   154,   155,   156,   157,
-     158,   159,   160,   161,   162,   166,   167,   172,   173,   175,
-     176,   177,   178,   179,   180,   181,   182,   183,   184,   185,
-     186,   187,   188,   189,   190,   191,   192,   193,   194,   195,
-     196,   197,   199,   204,   205,   206,   207,   209,   210,   211,
-     212,   213,   214,   215,   218,   220,   221,   225,   230,   235,
-     236,   237,   241,   245,   246,   251,   252,   253,   258,   259,
-     260,   264,   265,   270,   276,   283,   284,   285,   290,   292,
-     294,   298,   299,   303,   304,   309,   310,   315,   316,   317,
-     321,   322,   327,   337,   342,   344,   349,   353,   354,   359,
-     365,   372,   377,   382,   388,   393,   398,   403,   408,   414,
-     415,   420,   421,   423,   427,   434,   440,   448,   452,   456,
-     462,   468,   470,   475,   480,   481,   486,   487,   492,   493,
-     499,   500,   506,   507,   513,   519,   520,   525,   526,   530,
-     531,   535,   543,   548,   553,   554,   555,   556,   557,   561,
-     564,   570,   571,   572,   577,   581,   583,   584,   588,   594,
-     599,   600,   603,   605,   606,   610
-};
-#endif
-
-#if YYDEBUG || YYERROR_VERBOSE
-/* YYTNME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
-   First, the terminals, then, starting at YYNTOKENS, nonterminals. */
-static const char *const yytname[] =
-{
-  "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
-  "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
-  "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
-  "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
-  "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
-  "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
-  "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
-  "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
-  "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
-  "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
-  "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
-  "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
-  "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
-  "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
-  "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
-  "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
-  "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
-  "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
-  "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
-  "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
-  "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
-  "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
-  "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
-  "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
-  "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
-  "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
-  "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
-  "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
-  "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
-  "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", "'='", "'<'",
-  "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'",
-  "'?'", "','", "'{'", "'}'", "$accept", "top_statement", "statement",
-  "statement_list", "exp", "function_name", "question_mark_list",
-  "stored_procedure_call", "predefined_procedure_call",
-  "predefined_procedure_name", "user_function_call", "table_list",
-  "variable_list", "exp_list", "select_item", "select_item_list",
-  "select_list", "search_condition", "for_update_clause",
-  "lock_shared_clause", "order_direction", "order_by_clause",
-  "select_statement", "insert_statement_start", "insert_statement",
-  "column_assignment", "column_assignment_list", "cursor_positioned",
-  "update_statement_start", "update_statement_searched",
-  "update_statement_positioned", "delete_statement_start",
-  "delete_statement_searched", "delete_statement_positioned",
-  "row_printf_statement", "assignment_statement", "elsif_element",
-  "elsif_list", "else_part", "if_statement", "while_statement",
-  "for_statement", "exit_statement", "return_statement",
-  "open_cursor_statement", "close_cursor_statement", "fetch_statement",
-  "column_def", "column_def_list", "opt_column_len", "opt_unsigned",
-  "opt_not_null", "not_fit_in_memory", "create_table", "column_list",
-  "unique_def", "clustered_def", "create_index", "commit_statement",
-  "rollback_statement", "type_name", "parameter_declaration",
-  "parameter_declaration_list", "variable_declaration",
-  "variable_declaration_list", "cursor_declaration",
-  "function_declaration", "declaration", "declaration_list",
-  "procedure_definition", 0
-};
-#endif
-
-# ifdef YYPRINT
-/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
-   token YYLEX-NUM.  */
-static const unsigned short int yytoknum[] =
-{
-       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
-     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
-     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
-     285,   286,   287,   288,   289,   290,   291,   292,   293,   294,
-     295,   296,   297,   298,   299,   300,   301,   302,   303,   304,
-     305,   306,   307,   308,   309,   310,   311,   312,   313,   314,
-     315,   316,   317,   318,   319,   320,   321,   322,   323,   324,
-     325,   326,   327,   328,   329,   330,   331,   332,   333,   334,
-     335,   336,   337,   338,   339,   340,   341,   342,   343,   344,
-     345,   346,   347,   348,   349,    61,    60,    62,    45,    43,
-      42,    47,   350,    37,    59,    40,    41,    63,    44,   123,
-     125
-};
-# endif
-
-/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
-static const unsigned char yyr1[] =
-{
-       0,   111,   112,   113,   113,   113,   113,   113,   113,   113,
-     113,   113,   113,   113,   113,   113,   113,   113,   113,   113,
-     113,   113,   113,   113,   113,   114,   114,   115,   115,   115,
-     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
-     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
-     115,   115,   115,   116,   116,   116,   116,   116,   116,   116,
-     116,   116,   116,   116,   117,   117,   117,   118,   119,   120,
-     120,   120,   121,   122,   122,   123,   123,   123,   124,   124,
-     124,   125,   125,   125,   125,   126,   126,   126,   127,   127,
-     127,   128,   128,   129,   129,   130,   130,   131,   131,   131,
-     132,   132,   133,   134,   135,   135,   136,   137,   137,   138,
-     139,   140,   141,   142,   143,   144,   145,   146,   147,   148,
-     148,   149,   149,   149,   150,   151,   152,   153,   154,   155,
-     156,   157,   157,   158,   159,   159,   160,   160,   161,   161,
-     162,   162,   163,   163,   164,   165,   165,   166,   166,   167,
-     167,   168,   169,   170,   171,   171,   171,   171,   171,   172,
-     172,   173,   173,   173,   174,   175,   175,   175,   176,   177,
-     178,   178,   179,   179,   179,   180
-};
-
-/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
-static const unsigned char yyr2[] =
-{
-       0,     2,     2,     1,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     1,     2,     1,     4,     1,
-       1,     1,     1,     1,     1,     1,     3,     3,     3,     3,
-       2,     3,     3,     3,     3,     3,     3,     3,     3,     3,
-       2,     3,     3,     1,     1,     1,     1,     1,     1,     1,
-       1,     1,     1,     1,     0,     1,     3,     6,     4,     1,
-       1,     1,     3,     1,     3,     0,     1,     3,     0,     1,
-       3,     1,     4,     5,     4,     0,     1,     3,     1,     3,
-       1,     0,     2,     0,     2,     0,     4,     0,     1,     1,
-       0,     4,     8,     3,     5,     2,     3,     1,     3,     4,
-       4,     2,     2,     3,     2,     2,     2,     3,     4,     1,
-       2,     0,     2,     1,     7,     6,    10,     1,     1,     2,
-       2,     4,     4,     5,     1,     3,     0,     3,     0,     1,
-       0,     2,     0,     1,     7,     1,     3,     0,     1,     0,
-       1,    10,     2,     2,     1,     1,     1,     1,     1,     3,
-       3,     0,     1,     3,     3,     0,     1,     2,     6,     4,
-       1,     1,     0,     1,     2,    11
-};
-
-/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
-   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
-   means the default is an error.  */
-static const unsigned char yydefact[] =
-{
-       0,     0,     0,     0,     0,     1,     2,   161,     0,   162,
-       0,     0,     0,     0,     0,   157,   158,   154,   155,   156,
-     159,   160,   165,   163,     0,   166,   172,     0,     0,   167,
-     170,   171,   173,     0,   164,     0,     0,     0,   174,     0,
-       0,     0,     0,     0,   128,    85,     0,     0,     0,     0,
-     147,     0,     0,     0,    69,    70,    71,     0,     0,     0,
-     127,     0,    25,     0,     3,     0,     0,     0,     0,     0,
-      91,     0,     0,    91,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,   169,     0,    29,    30,    31,    32,    33,    34,    27,
-       0,    35,    53,    54,    55,    56,    57,    58,    59,    60,
-      61,    62,    63,     0,     0,     0,     0,     0,     0,     0,
-      88,    81,    86,    90,     0,     0,     0,     0,     0,     0,
-     148,   149,   129,     0,   130,   116,   152,   153,     0,   175,
-      26,     4,    78,    11,     0,   105,    12,     0,   111,   112,
-      16,    17,   114,   115,    14,    15,    13,    10,     8,     5,
-       6,     7,     9,    18,    20,    19,    23,    24,    21,    22,
-       0,   117,     0,    50,     0,    40,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-      78,     0,     0,     0,    75,     0,     0,     0,   103,     0,
-     113,     0,   150,     0,    75,    64,    79,     0,    78,     0,
-      92,   168,    51,    52,    41,    48,    49,    45,    46,    47,
-     121,    42,    43,    44,    37,    36,    38,    39,     0,     0,
-       0,     0,     0,    76,    89,    87,    73,    91,     0,     0,
-     107,   110,     0,     0,    76,   132,   131,    65,     0,    68,
-       0,     0,     0,     0,     0,   119,   123,     0,    28,     0,
-      84,     0,    82,     0,     0,     0,    93,     0,     0,     0,
-       0,   134,     0,     0,     0,     0,     0,    80,   104,   109,
-     122,     0,   120,     0,   125,    83,    77,    74,     0,    95,
-       0,   106,   108,   136,   142,     0,     0,    72,    67,    66,
-       0,   124,    94,     0,   100,     0,     0,   138,   143,   144,
-     135,     0,   118,     0,     0,   102,     0,     0,   139,   140,
-       0,     0,     0,     0,   137,     0,   133,   145,     0,    96,
-      97,   126,   141,   151,     0,    98,    99,   101,   146
-};
-
-/* YYDEFGOTO[NTERM-NUM]. */
-static const short int yydefgoto[] =
-{
-      -1,     2,    62,    63,   206,   116,   248,    64,    65,    66,
-     245,   237,   234,   207,   122,   123,   124,   148,   289,   304,
-     337,   315,    67,    68,    69,   240,   241,   149,    70,    71,
-      72,    73,    74,    75,    76,    77,   255,   256,   257,    78,
-      79,    80,    81,    82,    83,    84,    85,   271,   272,   307,
-     319,   326,   309,    86,   328,   131,   203,    87,    88,    89,
-      20,     9,    10,    25,    26,    30,    31,    32,    33,     3
-};
-
-/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
-   STATE-NUM.  */
-#define YYPACT_NINF -177
-static const short int yypact[] =
-{
-      28,    38,    54,   -46,   -29,  -177,  -177,    56,    50,  -177,
-     -75,     8,     8,    46,    56,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,    63,  -177,     8,  -177,     2,   -26,   -51,  -177,
-    -177,  -177,  -177,   -13,  -177,    71,    72,   587,  -177,    57,
-     -21,    26,   272,   272,  -177,    13,    91,    55,    96,    67,
-     -22,    99,   100,   103,  -177,  -177,  -177,    75,    29,    35,
-    -177,   116,  -177,   396,  -177,    22,    23,    27,    -9,    30,
-      87,    31,    32,    87,    47,    49,    52,    58,    59,    60,
-      61,    62,    65,    66,    74,    77,    78,    86,    89,   102,
-      75,  -177,   272,  -177,  -177,  -177,  -177,  -177,  -177,    39,
-     272,    51,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,  -177,   272,   272,   361,    25,   489,    45,    90,
-    -177,   651,  -177,   -39,    93,   142,   124,   108,   152,   170,
-    -177,   131,  -177,   143,  -177,  -177,  -177,  -177,    98,  -177,
-    -177,  -177,   272,  -177,   110,  -177,  -177,   256,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-     112,   651,   137,   101,   147,   204,    88,   272,   272,   272,
-     272,   272,   587,   272,   272,   272,   272,   272,   272,   272,
-     272,   587,   272,   -30,   211,   168,   212,   272,  -177,   213,
-    -177,   118,  -177,   167,   217,   122,   651,   -63,   272,   175,
-     651,  -177,  -177,  -177,  -177,   101,   101,    21,    21,   651,
-     332,    21,    21,    21,    -6,    -6,   204,   204,   -60,   460,
-     198,   222,   126,  -177,   125,  -177,  -177,   -33,   584,   140,
-    -177,   128,   228,   229,   139,  -177,   125,  -177,   -53,  -177,
-     272,   -49,   240,   587,   272,  -177,   224,   226,  -177,   225,
-    -177,   150,  -177,   258,   272,   260,   230,   272,   272,   213,
-       8,  -177,   -45,   208,   166,   164,   176,   651,  -177,  -177,
-     587,   631,  -177,   254,  -177,  -177,  -177,  -177,   234,   194,
-     638,   651,  -177,   182,   227,   228,   280,  -177,  -177,  -177,
-     587,  -177,  -177,   273,   247,   587,   289,   214,  -177,  -177,
-    -177,   195,   587,   209,   261,  -177,   524,   199,  -177,   295,
-     292,   215,   299,   279,  -177,   304,  -177,  -177,   -44,  -177,
-      -8,  -177,  -177,  -177,   305,  -177,  -177,  -177,  -177
-};
-
-/* YYPGOTO[NTERM-NUM].  */
-static const short int yypgoto[] =
-{
-    -177,  -177,   -62,  -176,   -40,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,   109,  -166,   120,  -177,  -177,   -69,  -177,  -177,
-    -177,  -177,   -34,  -177,  -177,    48,  -177,   243,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,    64,  -177,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,    24,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-     -12,   307,  -177,   297,  -177,  -177,  -177,   285,  -177,  -177
-};
-
-/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
-   positive, shift that token.  If negative, reduce the rule which
-   number is the opposite.  If zero, do what YYDEFACT says.
-   If YYTABLE_NINF, syntax error.  */
-#define YYTABLE_NINF -1
-static const unsigned short int yytable[] =
-{
-      21,   140,   115,   117,   152,   121,   220,   264,   231,   181,
-     194,    24,    27,    37,    35,   229,    93,    94,    95,    96,
-      97,    98,    99,   135,   228,   100,    45,    15,    16,    17,
-      18,    13,    19,    14,   145,   129,   181,   130,   335,   336,
-      36,   144,   251,   249,     1,   250,   258,     4,   250,   118,
-     119,    28,   171,   275,     5,   276,   170,   278,     6,   250,
-     173,   294,   333,   295,   334,     8,    28,    11,    12,   195,
-     232,    22,    24,   175,   176,   265,     7,   280,    34,   101,
-      39,    40,    90,    91,   102,   103,   104,   105,   106,    92,
-     107,   108,   109,   110,   188,   189,   111,   112,   177,   178,
-     125,   179,   180,   181,   126,   127,   128,   210,   132,   133,
-      45,   113,   134,   120,   179,   180,   181,   136,   114,   186,
-     187,   188,   189,   137,   312,   138,   141,   147,   142,   316,
-     190,   143,   196,   198,   146,   150,   151,   215,   216,   217,
-     218,   219,   172,   221,   222,   223,   224,   225,   226,   227,
-     192,   154,   230,   155,   174,   121,   156,   238,   140,   197,
-     199,   200,   157,   158,   159,   160,   161,   140,   266,   162,
-     163,    93,    94,    95,    96,    97,    98,    99,   164,   201,
-     100,   165,   166,   183,   184,   185,   186,   187,   188,   189,
-     167,   202,   204,   168,   214,   193,   183,   184,   185,   186,
-     187,   188,   189,   205,   118,   119,   169,   212,   177,   178,
-     277,   179,   180,   181,   281,   208,   211,   213,   140,   181,
-     233,   236,   239,   242,   210,   243,   244,   290,   291,   247,
-     252,   261,   262,   263,   101,   268,   269,   270,   273,   102,
-     103,   104,   105,   106,   274,   107,   108,   109,   110,   279,
-     140,   111,   112,   283,   140,   254,   285,   284,   293,    93,
-      94,    95,    96,    97,    98,    99,   113,   286,   100,   287,
-     296,   288,   297,   114,   298,    93,    94,    95,    96,    97,
-      98,    99,   301,   299,   100,   302,   303,   306,   308,   311,
-     313,   314,   317,   183,   184,   185,   186,   187,   188,   189,
-     320,   327,   321,   318,   260,   324,   322,   325,   330,   329,
-     209,   331,   332,   246,   338,   235,   153,   292,    38,   310,
-     282,    23,   101,    29,     0,     0,     0,   102,   103,   104,
-     105,   106,     0,   107,   108,   109,   110,     0,   101,   111,
-     112,    41,     0,   102,   103,   104,   105,   106,     0,   107,
-     108,   109,   110,     0,   113,   111,   112,     0,     0,     0,
-      42,   114,   253,   254,     0,    43,    44,    45,     0,     0,
-     113,   177,   178,    46,   179,   180,   181,   114,     0,     0,
-      47,     0,     0,    48,     0,    49,     0,     0,    50,     0,
-     182,     0,     0,     0,     0,     0,     0,     0,     0,    51,
-      52,    53,     0,     0,     0,    41,     0,     0,    54,     0,
-       0,     0,     0,    55,    56,     0,     0,    57,    58,    59,
-       0,     0,    60,   139,    42,     0,     0,     0,     0,    43,
-      44,    45,     0,     0,     0,     0,     0,    46,     0,     0,
-       0,    61,     0,     0,    47,     0,     0,    48,     0,    49,
-       0,     0,    50,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189,    51,    52,    53,     0,     0,     0,    41,
-       0,     0,    54,     0,     0,     0,     0,    55,    56,     0,
-       0,    57,    58,    59,     0,     0,    60,   259,    42,     0,
-       0,     0,     0,    43,    44,    45,     0,     0,     0,   177,
-     178,    46,   179,   180,   181,    61,     0,     0,    47,     0,
-       0,    48,     0,    49,     0,     0,    50,     0,     0,     0,
-       0,   191,     0,     0,     0,     0,     0,    51,    52,    53,
-       0,     0,     0,    41,     0,     0,    54,     0,     0,     0,
-       0,    55,    56,     0,     0,    57,    58,    59,     0,     0,
-      60,   323,    42,     0,     0,     0,     0,    43,    44,    45,
-       0,     0,     0,     0,     0,    46,     0,     0,     0,    61,
-       0,     0,    47,     0,     0,    48,     0,    49,     0,     0,
-      50,     0,     0,     0,   183,   184,   185,   186,   187,   188,
-     189,    51,    52,    53,   177,   178,    41,   179,   180,   181,
-      54,     0,     0,     0,     0,    55,    56,     0,     0,    57,
-      58,    59,     0,     0,    60,    42,     0,     0,     0,     0,
-      43,    44,    45,     0,     0,     0,   267,     0,    46,     0,
-       0,     0,     0,    61,     0,    47,     0,     0,    48,     0,
-      49,   177,   178,    50,   179,   180,   181,     0,   177,   178,
-       0,   179,   180,   181,    51,    52,    53,     0,     0,     0,
-     300,   177,   178,    54,   179,   180,   181,     0,    55,    56,
-     305,     0,    57,    58,    59,     0,     0,    60,     0,   183,
-     184,   185,   186,   187,   188,   189,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,    61,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189,   183,   184,   185,   186,   187,   188,   189,
-       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189
-};
-
-static const short int yycheck[] =
-{
-      12,    63,    42,    43,    73,    45,   182,    40,    38,    15,
-      49,     9,    24,    26,    65,   191,     3,     4,     5,     6,
-       7,     8,     9,    57,   190,    12,    35,    19,    20,    21,
-      22,   106,    24,   108,    68,    57,    15,    59,    46,    47,
-      91,    50,   208,   106,    16,   108,   106,     9,   108,    36,
-      37,    64,    92,   106,     0,   108,    90,   106,   104,   108,
-     100,   106,   106,   108,   108,     9,    64,    17,    18,   108,
-     100,    25,     9,   113,   114,   108,   105,   253,   104,    66,
-       9,     9,    25,   104,    71,    72,    73,    74,    75,    63,
-      77,    78,    79,    80,   100,   101,    83,    84,    10,    11,
-       9,    13,    14,    15,    49,     9,    39,   147,     9,     9,
-      35,    98,     9,   100,    13,    14,    15,    88,   105,    98,
-      99,   100,   101,    88,   300,     9,   104,    40,   105,   305,
-     105,   104,    39,     9,   104,   104,   104,   177,   178,   179,
-     180,   181,   103,   183,   184,   185,   186,   187,   188,   189,
-     105,   104,   192,   104,   103,   195,   104,   197,   220,    17,
-      52,     9,   104,   104,   104,   104,   104,   229,   237,   104,
-     104,     3,     4,     5,     6,     7,     8,     9,   104,     9,
-      12,   104,   104,    95,    96,    97,    98,    99,   100,   101,
-     104,    60,    49,   104,   106,   105,    95,    96,    97,    98,
-      99,   100,   101,   105,    36,    37,   104,    70,    10,    11,
-     250,    13,    14,    15,   254,   105,   104,    70,   280,    15,
-       9,     9,     9,   105,   264,    58,     9,   267,   268,   107,
-      55,     9,   106,   108,    66,    95,   108,     9,     9,    71,
-      72,    73,    74,    75,   105,    77,    78,    79,    80,     9,
-     312,    83,    84,    27,   316,    31,   106,    32,   270,     3,
-       4,     5,     6,     7,     8,     9,    98,     9,    12,     9,
-      62,    41,   106,   105,   110,     3,     4,     5,     6,     7,
-       8,     9,    28,   107,    12,    51,    92,   105,    61,     9,
-      17,    44,     3,    95,    96,    97,    98,    99,   100,   101,
-     105,     9,    93,    89,   106,   106,    45,    12,     9,    94,
-      54,    32,     8,   204,     9,   195,    73,   269,    33,   295,
-     256,    14,    66,    26,    -1,    -1,    -1,    71,    72,    73,
-      74,    75,    -1,    77,    78,    79,    80,    -1,    66,    83,
-      84,     9,    -1,    71,    72,    73,    74,    75,    -1,    77,
-      78,    79,    80,    -1,    98,    83,    84,    -1,    -1,    -1,
-      28,   105,    30,    31,    -1,    33,    34,    35,    -1,    -1,
-      98,    10,    11,    41,    13,    14,    15,   105,    -1,    -1,
-      48,    -1,    -1,    51,    -1,    53,    -1,    -1,    56,    -1,
-      29,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    67,
-      68,    69,    -1,    -1,    -1,     9,    -1,    -1,    76,    -1,
-      -1,    -1,    -1,    81,    82,    -1,    -1,    85,    86,    87,
-      -1,    -1,    90,    27,    28,    -1,    -1,    -1,    -1,    33,
-      34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,
-      -1,   109,    -1,    -1,    48,    -1,    -1,    51,    -1,    53,
-      -1,    -1,    56,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101,    67,    68,    69,    -1,    -1,    -1,     9,
-      -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,
-      -1,    85,    86,    87,    -1,    -1,    90,    27,    28,    -1,
-      -1,    -1,    -1,    33,    34,    35,    -1,    -1,    -1,    10,
-      11,    41,    13,    14,    15,   109,    -1,    -1,    48,    -1,
-      -1,    51,    -1,    53,    -1,    -1,    56,    -1,    -1,    -1,
-      -1,    32,    -1,    -1,    -1,    -1,    -1,    67,    68,    69,
-      -1,    -1,    -1,     9,    -1,    -1,    76,    -1,    -1,    -1,
-      -1,    81,    82,    -1,    -1,    85,    86,    87,    -1,    -1,
-      90,    27,    28,    -1,    -1,    -1,    -1,    33,    34,    35,
-      -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,    -1,   109,
-      -1,    -1,    48,    -1,    -1,    51,    -1,    53,    -1,    -1,
-      56,    -1,    -1,    -1,    95,    96,    97,    98,    99,   100,
-     101,    67,    68,    69,    10,    11,     9,    13,    14,    15,
-      76,    -1,    -1,    -1,    -1,    81,    82,    -1,    -1,    85,
-      86,    87,    -1,    -1,    90,    28,    -1,    -1,    -1,    -1,
-      33,    34,    35,    -1,    -1,    -1,    42,    -1,    41,    -1,
-      -1,    -1,    -1,   109,    -1,    48,    -1,    -1,    51,    -1,
-      53,    10,    11,    56,    13,    14,    15,    -1,    10,    11,
-      -1,    13,    14,    15,    67,    68,    69,    -1,    -1,    -1,
-      29,    10,    11,    76,    13,    14,    15,    -1,    81,    82,
-      32,    -1,    85,    86,    87,    -1,    -1,    90,    -1,    95,
-      96,    97,    98,    99,   100,   101,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,   109,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101,    95,    96,    97,    98,    99,   100,   101,
-      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101
-};
-
-/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
-   symbol of state STATE-NUM.  */
-static const unsigned char yystos[] =
-{
-       0,    16,   112,   180,     9,     0,   104,   105,     9,   172,
-     173,    17,    18,   106,   108,    19,    20,    21,    22,    24,
-     171,   171,    25,   172,     9,   174,   175,   171,    64,   174,
-     176,   177,   178,   179,   104,    65,    91,    26,   178,     9,
-       9,     9,    28,    33,    34,    35,    41,    48,    51,    53,
-      56,    67,    68,    69,    76,    81,    82,    85,    86,    87,
-      90,   109,   113,   114,   118,   119,   120,   133,   134,   135,
-     139,   140,   141,   142,   143,   144,   145,   146,   150,   151,
-     152,   153,   154,   155,   156,   157,   164,   168,   169,   170,
-      25,   104,    63,     3,     4,     5,     6,     7,     8,     9,
-      12,    66,    71,    72,    73,    74,    75,    77,    78,    79,
-      80,    83,    84,    98,   105,   115,   116,   115,    36,    37,
-     100,   115,   125,   126,   127,     9,    49,     9,    39,    57,
-      59,   166,     9,     9,     9,   133,    88,    88,     9,    27,
-     113,   104,   105,   104,    50,   133,   104,    40,   128,   138,
-     104,   104,   128,   138,   104,   104,   104,   104,   104,   104,
-     104,   104,   104,   104,   104,   104,   104,   104,   104,   104,
-     133,   115,   103,   115,   103,   115,   115,    10,    11,    13,
-      14,    15,    29,    95,    96,    97,    98,    99,   100,   101,
-     105,    32,   105,   105,    49,   108,    39,    17,     9,    52,
-       9,     9,    60,   167,    49,   105,   115,   124,   105,    54,
-     115,   104,    70,    70,   106,   115,   115,   115,   115,   115,
-     114,   115,   115,   115,   115,   115,   115,   115,   124,   114,
-     115,    38,   100,     9,   123,   125,     9,   122,   115,     9,
-     136,   137,   105,    58,     9,   121,   123,   107,   117,   106,
-     108,   124,    55,    30,    31,   147,   148,   149,   106,    27,
-     106,     9,   106,   108,    40,   108,   128,    42,    95,   108,
-       9,   158,   159,     9,   105,   106,   108,   115,   106,     9,
-     114,   115,   147,    27,    32,   106,     9,     9,    41,   129,
-     115,   115,   136,   171,   106,   108,    62,   106,   110,   107,
-      29,    28,    51,    92,   130,    32,   105,   160,    61,   163,
-     158,     9,   114,    17,    44,   132,   114,     3,    89,   161,
-     105,    93,    45,    27,   106,    12,   162,     9,   165,    94,
-       9,    32,     8,   106,   108,    46,    47,   131,     9
-};
-
-#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__)
-# define YYSIZE_T __SIZE_TYPE__
-#endif
-#if ! defined (YYSIZE_T) && defined (size_t)
-# define YYSIZE_T size_t
-#endif
-#if ! defined (YYSIZE_T)
-# if defined (__STDC__) || defined (__cplusplus)
-#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
-#  define YYSIZE_T size_t
-# endif
-#endif
-#if ! defined (YYSIZE_T)
-# define YYSIZE_T unsigned int
-#endif
-
-#define yyerrok		(yyerrstatus = 0)
-#define yyclearin	(yychar = YYEMPTY)
-#define YYEMPTY		(-2)
-#define YYEOF		0
-
-#define YYACCEPT	goto yyacceptlab
-#define YYABORT		goto yyabortlab
-#define YYERROR		goto yyerrorlab
-
-
-/* Like YYERROR except do call yyerror.  This remains here temporarily
-   to ease the transition to the new meaning of YYERROR, for GCC.
-   Once GCC version 2 has supplanted version 1, this can go.  */
-
-#define YYFAIL		goto yyerrlab
-
-#define YYRECOVERING()  (!!yyerrstatus)
-
-#define YYBACKUP(Token, Value)					\
-do								\
-  if (yychar == YYEMPTY && yylen == 1)				\
-    {								\
-      yychar = (Token);						\
-      yylval = (Value);						\
-      yytoken = YYTRANSLATE (yychar);				\
-      YYPOPSTACK;						\
-      goto yybackup;						\
-    }								\
-  else								\
-    { 								\
-      yyerror ("syntax error: cannot back up");\
-      YYERROR;							\
-    }								\
-while (0)
-
-
-#define YYTERROR	1
-#define YYERRCODE	256
-
-
-/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
-   If N is 0, then set CURRENT to the empty location which ends
-   the previous symbol: RHS[0] (always defined).  */
-
-#define YYRHSLOC(Rhs, K) ((Rhs)[K])
-#ifndef YYLLOC_DEFAULT
-# define YYLLOC_DEFAULT(Current, Rhs, N)				\
-    do									\
-      if (N)								\
-	{								\
-	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
-	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
-	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
-	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
-	}								\
-      else								\
-	{								\
-	  (Current).first_line   = (Current).last_line   =		\
-	    YYRHSLOC (Rhs, 0).last_line;				\
-	  (Current).first_column = (Current).last_column =		\
-	    YYRHSLOC (Rhs, 0).last_column;				\
-	}								\
-    while (0)
-#endif
-
-
-/* YY_LOCATION_PRINT -- Print the location on the stream.
-   This macro was not mandated originally: define only if we know
-   we won't break user code: when these are the locations we know.  */
-
-#ifndef YY_LOCATION_PRINT
-# if YYLTYPE_IS_TRIVIAL
-#  define YY_LOCATION_PRINT(File, Loc)			\
-     fprintf (File, "%d.%d-%d.%d",			\
-              (Loc).first_line, (Loc).first_column,	\
-              (Loc).last_line,  (Loc).last_column)
-# else
-#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
-# endif
-#endif
-
-
-/* YYLEX -- calling `yylex' with the right arguments.  */
-
-#ifdef YYLEX_PARAM
-# define YYLEX yylex (YYLEX_PARAM)
-#else
-# define YYLEX yylex ()
-#endif
-
-/* Enable debugging if requested.  */
-#if YYDEBUG
-
-# ifndef YYFPRINTF
-#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
-#  define YYFPRINTF fprintf
-# endif
-
-# define YYDPRINTF(Args)			\
-do {						\
-  if (yydebug)					\
-    YYFPRINTF Args;				\
-} while (0)
-
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)		\
-do {								\
-  if (yydebug)							\
-    {								\
-      YYFPRINTF (stderr, "%s ", Title);				\
-      yysymprint (stderr, 					\
-                  Type, Value);	\
-      YYFPRINTF (stderr, "\n");					\
-    }								\
-} while (0)
-
-/*------------------------------------------------------------------.
-| yy_stack_print -- Print the state stack from its BOTTOM up to its |
-| TOP (included).                                                   |
-`------------------------------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yy_stack_print (short int *bottom, short int *top)
-#else
-static void
-yy_stack_print (bottom, top)
-    short int *bottom;
-    short int *top;
-#endif
-{
-  YYFPRINTF (stderr, "Stack now");
-  for (/* Nothing. */; bottom <= top; ++bottom)
-    YYFPRINTF (stderr, " %d", *bottom);
-  YYFPRINTF (stderr, "\n");
-}
-
-# define YY_STACK_PRINT(Bottom, Top)				\
-do {								\
-  if (yydebug)							\
-    yy_stack_print ((Bottom), (Top));				\
-} while (0)
-
-
-/*------------------------------------------------.
-| Report that the YYRULE is going to be reduced.  |
-`------------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yy_reduce_print (int yyrule)
-#else
-static void
-yy_reduce_print (yyrule)
-    int yyrule;
-#endif
-{
-  int yyi;
-  unsigned int yylno = yyrline[yyrule];
-  YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ",
-             yyrule - 1, yylno);
-  /* Print the symbols being reduced, and their result.  */
-  for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++)
-    YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]);
-  YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]);
-}
-
-# define YY_REDUCE_PRINT(Rule)		\
-do {					\
-  if (yydebug)				\
-    yy_reduce_print (Rule);		\
-} while (0)
-
-/* Nonzero means print parse trace.  It is left uninitialized so that
-   multiple parsers can coexist.  */
-int yydebug;
-#else /* !YYDEBUG */
-# define YYDPRINTF(Args)
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
-# define YY_STACK_PRINT(Bottom, Top)
-# define YY_REDUCE_PRINT(Rule)
-#endif /* !YYDEBUG */
-
-
-/* YYINITDEPTH -- initial size of the parser's stacks.  */
-#ifndef	YYINITDEPTH
-# define YYINITDEPTH 200
-#endif
-
-/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
-   if the built-in stack extension method is used).
-
-   Do not make this value too large; the results are undefined if
-   SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH)
-   evaluated with infinite-precision integer arithmetic.  */
-
-#ifndef YYMAXDEPTH
-# define YYMAXDEPTH 10000
-#endif
-
-
-
-#if YYERROR_VERBOSE
-
-# ifndef yystrlen
-#  if defined (__GLIBC__) && defined (_STRING_H)
-#   define yystrlen strlen
-#  else
-/* Return the length of YYSTR.  */
-static YYSIZE_T
-#   if defined (__STDC__) || defined (__cplusplus)
-yystrlen (const char *yystr)
-#   else
-yystrlen (yystr)
-     const char *yystr;
-#   endif
-{
-  register const char *yys = yystr;
-
-  while (*yys++ != '\0')
-    continue;
-
-  return yys - yystr - 1;
-}
-#  endif
-# endif
-
-# ifndef yystpcpy
-#  if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE)
-#   define yystpcpy stpcpy
-#  else
-/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
-   YYDEST.  */
-static char *
-#   if defined (__STDC__) || defined (__cplusplus)
-yystpcpy (char *yydest, const char *yysrc)
-#   else
-yystpcpy (yydest, yysrc)
-     char *yydest;
-     const char *yysrc;
-#   endif
-{
-  register char *yyd = yydest;
-  register const char *yys = yysrc;
-
-  while ((*yyd++ = *yys++) != '\0')
-    continue;
-
-  return yyd - 1;
-}
-#  endif
-# endif
-
-#endif /* !YYERROR_VERBOSE */
-
-
-
-#if YYDEBUG
-/*--------------------------------.
-| Print this symbol on YYOUTPUT.  |
-`--------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep)
-#else
-static void
-yysymprint (yyoutput, yytype, yyvaluep)
-    FILE *yyoutput;
-    int yytype;
-    YYSTYPE *yyvaluep;
-#endif
-{
-  /* Pacify ``unused variable'' warnings.  */
-  (void) yyvaluep;
-
-  if (yytype < YYNTOKENS)
-    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
-  else
-    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
-
-
-# ifdef YYPRINT
-  if (yytype < YYNTOKENS)
-    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
-# endif
-  switch (yytype)
-    {
-      default:
-        break;
-    }
-  YYFPRINTF (yyoutput, ")");
-}
-
-#endif /* ! YYDEBUG */
-/*-----------------------------------------------.
-| Release the memory associated to this symbol.  |
-`-----------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
-#else
-static void
-yydestruct (yymsg, yytype, yyvaluep)
-    const char *yymsg;
-    int yytype;
-    YYSTYPE *yyvaluep;
-#endif
-{
-  /* Pacify ``unused variable'' warnings.  */
-  (void) yyvaluep;
-
-  if (!yymsg)
-    yymsg = "Deleting";
-  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
-
-  switch (yytype)
-    {
-
-      default:
-        break;
-    }
-}
-
-
-/* Prevent warnings from -Wmissing-prototypes.  */
-
-#ifdef YYPARSE_PARAM
-# if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void *YYPARSE_PARAM);
-# else
-UNIV_INTERN int yyparse ();
-# endif
-#else /* ! YYPARSE_PARAM */
-#if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void);
-#else
-UNIV_INTERN int yyparse ();
-#endif
-#endif /* ! YYPARSE_PARAM */
-
-
-
-/* The look-ahead symbol.  */
-static int yychar;
-
-/* The semantic value of the look-ahead symbol.  */
-UNIV_INTERN YYSTYPE yylval;
-
-/* Number of syntax errors so far.  */
-static int yynerrs;
-
-
-
-/*----------.
-| yyparse.  |
-`----------*/
-
-#ifdef YYPARSE_PARAM
-# if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void *YYPARSE_PARAM)
-# else
-UNIV_INTERN int yyparse (YYPARSE_PARAM)
-  void *YYPARSE_PARAM;
-# endif
-#else /* ! YYPARSE_PARAM */
-#if defined (__STDC__) || defined (__cplusplus)
-int
-yyparse (void)
-#else
-int
-yyparse ()
-
-#endif
-#endif
-{
-  
-  register int yystate;
-  register int yyn;
-  int yyresult;
-  /* Number of tokens to shift before error messages enabled.  */
-  int yyerrstatus;
-  /* Look-ahead token as an internal (translated) token number.  */
-  int yytoken = 0;
-
-  /* Three stacks and their tools:
-     `yyss': related to states,
-     `yyvs': related to semantic values,
-     `yyls': related to locations.
-
-     Refer to the stacks thru separate pointers, to allow yyoverflow
-     to reallocate them elsewhere.  */
-
-  /* The state stack.  */
-  short int yyssa[YYINITDEPTH];
-  short int *yyss = yyssa;
-  register short int *yyssp;
-
-  /* The semantic value stack.  */
-  YYSTYPE yyvsa[YYINITDEPTH];
-  YYSTYPE *yyvs = yyvsa;
-  register YYSTYPE *yyvsp;
-
-
-
-#define YYPOPSTACK   (yyvsp--, yyssp--)
-
-  YYSIZE_T yystacksize = YYINITDEPTH;
-
-  /* The variables used to return semantic value and location from the
-     action routines.  */
-  YYSTYPE yyval;
-
-
-  /* When reducing, the number of symbols on the RHS of the reduced
-     rule.  */
-  int yylen;
-
-  YYDPRINTF ((stderr, "Starting parse\n"));
-
-  yystate = 0;
-  yyerrstatus = 0;
-  yynerrs = 0;
-  yychar = YYEMPTY;		/* Cause a token to be read.  */
-
-  /* Initialize stack pointers.
-     Waste one element of value and location stack
-     so that they stay on the same level as the state stack.
-     The wasted elements are never initialized.  */
-
-  yyssp = yyss;
-  yyvsp = yyvs;
-
-
-  yyvsp[0] = yylval;
-
-  goto yysetstate;
-
-/*------------------------------------------------------------.
-| yynewstate -- Push a new state, which is found in yystate.  |
-`------------------------------------------------------------*/
- yynewstate:
-  /* In all cases, when you get here, the value and location stacks
-     have just been pushed. so pushing a state here evens the stacks.
-     */
-  yyssp++;
-
- yysetstate:
-  *yyssp = yystate;
-
-  if (yyss + yystacksize - 1 <= yyssp)
-    {
-      /* Get the current used size of the three stacks, in elements.  */
-      YYSIZE_T yysize = yyssp - yyss + 1;
-
-#ifdef yyoverflow
-      {
-	/* Give user a chance to reallocate the stack. Use copies of
-	   these so that the &'s don't force the real ones into
-	   memory.  */
-	YYSTYPE *yyvs1 = yyvs;
-	short int *yyss1 = yyss;
-
-
-	/* Each stack pointer address is followed by the size of the
-	   data in use in that stack, in bytes.  This used to be a
-	   conditional around just the two extra args, but that might
-	   be undefined if yyoverflow is a macro.  */
-	yyoverflow ("parser stack overflow",
-		    &yyss1, yysize * sizeof (*yyssp),
-		    &yyvs1, yysize * sizeof (*yyvsp),
-
-		    &yystacksize);
-
-	yyss = yyss1;
-	yyvs = yyvs1;
-      }
-#else /* no yyoverflow */
-# ifndef YYSTACK_RELOCATE
-      goto yyoverflowlab;
-# else
-      /* Extend the stack our own way.  */
-      if (YYMAXDEPTH <= yystacksize)
-	goto yyoverflowlab;
-      yystacksize *= 2;
-      if (YYMAXDEPTH < yystacksize)
-	yystacksize = YYMAXDEPTH;
-
-      {
-	short int *yyss1 = yyss;
-	union yyalloc *yyptr =
-	  (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
-	if (! yyptr)
-	  goto yyoverflowlab;
-	YYSTACK_RELOCATE (yyss);
-	YYSTACK_RELOCATE (yyvs);
-
-#  undef YYSTACK_RELOCATE
-	if (yyss1 != yyssa)
-	  YYSTACK_FREE (yyss1);
-      }
-# endif
-#endif /* no yyoverflow */
-
-      yyssp = yyss + yysize - 1;
-      yyvsp = yyvs + yysize - 1;
-
-
-      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
-		  (unsigned long int) yystacksize));
-
-      if (yyss + yystacksize - 1 <= yyssp)
-	YYABORT;
-    }
-
-  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
-
-  goto yybackup;
-
-/*-----------.
-| yybackup.  |
-`-----------*/
-yybackup:
-
-/* Do appropriate processing given the current state.  */
-/* Read a look-ahead token if we need one and don't already have one.  */
-/* yyresume: */
-
-  /* First try to decide what to do without reference to look-ahead token.  */
-
-  yyn = yypact[yystate];
-  if (yyn == YYPACT_NINF)
-    goto yydefault;
-
-  /* Not known => get a look-ahead token if don't already have one.  */
-
-  /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol.  */
-  if (yychar == YYEMPTY)
-    {
-      YYDPRINTF ((stderr, "Reading a token: "));
-      yychar = YYLEX;
-    }
-
-  if (yychar <= YYEOF)
-    {
-      yychar = yytoken = YYEOF;
-      YYDPRINTF ((stderr, "Now at end of input.\n"));
-    }
-  else
-    {
-      yytoken = YYTRANSLATE (yychar);
-      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
-    }
-
-  /* If the proper action on seeing token YYTOKEN is to reduce or to
-     detect an error, take that action.  */
-  yyn += yytoken;
-  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
-    goto yydefault;
-  yyn = yytable[yyn];
-  if (yyn <= 0)
-    {
-      if (yyn == 0 || yyn == YYTABLE_NINF)
-	goto yyerrlab;
-      yyn = -yyn;
-      goto yyreduce;
-    }
-
-  if (yyn == YYFINAL)
-    YYACCEPT;
-
-  /* Shift the look-ahead token.  */
-  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
-
-  /* Discard the token being shifted unless it is eof.  */
-  if (yychar != YYEOF)
-    yychar = YYEMPTY;
-
-  *++yyvsp = yylval;
-
-
-  /* Count tokens shifted since error; after three, turn off error
-     status.  */
-  if (yyerrstatus)
-    yyerrstatus--;
-
-  yystate = yyn;
-  goto yynewstate;
-
-
-/*-----------------------------------------------------------.
-| yydefault -- do the default action for the current state.  |
-`-----------------------------------------------------------*/
-yydefault:
-  yyn = yydefact[yystate];
-  if (yyn == 0)
-    goto yyerrlab;
-  goto yyreduce;
-
-
-/*-----------------------------.
-| yyreduce -- Do a reduction.  |
-`-----------------------------*/
-yyreduce:
-  /* yyn is the number of a rule to reduce with.  */
-  yylen = yyr2[yyn];
-
-  /* If YYLEN is nonzero, implement the default value of the action:
-     `$$ = $1'.
-
-     Otherwise, the following line sets YYVAL to garbage.
-     This behavior is undocumented and Bison
-     users should not rely upon it.  Assigning to YYVAL
-     unconditionally makes the parser a bit smaller, and it avoids a
-     GCC warning that YYVAL may be used uninitialized.  */
-  yyval = yyvsp[1-yylen];
-
-
-  YY_REDUCE_PRINT (yyn);
-  switch (yyn)
-    {
-        case 25:
-#line 166 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 26:
-#line 168 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 27:
-#line 172 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 28:
-#line 174 "pars0grm.y"
-    { (yyval) = pars_func((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 29:
-#line 175 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 30:
-#line 176 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 31:
-#line 177 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 32:
-#line 178 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 33:
-#line 179 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 34:
-#line 180 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 35:
-#line 181 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 36:
-#line 182 "pars0grm.y"
-    { (yyval) = pars_op('+', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 37:
-#line 183 "pars0grm.y"
-    { (yyval) = pars_op('-', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 38:
-#line 184 "pars0grm.y"
-    { (yyval) = pars_op('*', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 39:
-#line 185 "pars0grm.y"
-    { (yyval) = pars_op('/', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 40:
-#line 186 "pars0grm.y"
-    { (yyval) = pars_op('-', (yyvsp[0]), NULL); ;}
-    break;
-
-  case 41:
-#line 187 "pars0grm.y"
-    { (yyval) = (yyvsp[-1]); ;}
-    break;
-
-  case 42:
-#line 188 "pars0grm.y"
-    { (yyval) = pars_op('=', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 43:
-#line 189 "pars0grm.y"
-    { (yyval) = pars_op('<', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 44:
-#line 190 "pars0grm.y"
-    { (yyval) = pars_op('>', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 45:
-#line 191 "pars0grm.y"
-    { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 46:
-#line 192 "pars0grm.y"
-    { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 47:
-#line 193 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 48:
-#line 194 "pars0grm.y"
-    { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 49:
-#line 195 "pars0grm.y"
-    { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 50:
-#line 196 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[0]), NULL); ;}
-    break;
-
-  case 51:
-#line 198 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
-    break;
-
-  case 52:
-#line 200 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
-    break;
-
-  case 53:
-#line 204 "pars0grm.y"
-    { (yyval) = &pars_to_char_token; ;}
-    break;
-
-  case 54:
-#line 205 "pars0grm.y"
-    { (yyval) = &pars_to_number_token; ;}
-    break;
-
-  case 55:
-#line 206 "pars0grm.y"
-    { (yyval) = &pars_to_binary_token; ;}
-    break;
-
-  case 56:
-#line 208 "pars0grm.y"
-    { (yyval) = &pars_binary_to_number_token; ;}
-    break;
-
-  case 57:
-#line 209 "pars0grm.y"
-    { (yyval) = &pars_substr_token; ;}
-    break;
-
-  case 58:
-#line 210 "pars0grm.y"
-    { (yyval) = &pars_concat_token; ;}
-    break;
-
-  case 59:
-#line 211 "pars0grm.y"
-    { (yyval) = &pars_instr_token; ;}
-    break;
-
-  case 60:
-#line 212 "pars0grm.y"
-    { (yyval) = &pars_length_token; ;}
-    break;
-
-  case 61:
-#line 213 "pars0grm.y"
-    { (yyval) = &pars_sysdate_token; ;}
-    break;
-
-  case 62:
-#line 214 "pars0grm.y"
-    { (yyval) = &pars_rnd_token; ;}
-    break;
-
-  case 63:
-#line 215 "pars0grm.y"
-    { (yyval) = &pars_rnd_str_token; ;}
-    break;
-
-  case 67:
-#line 226 "pars0grm.y"
-    { (yyval) = pars_stored_procedure_call((yyvsp[-4])); ;}
-    break;
-
-  case 68:
-#line 231 "pars0grm.y"
-    { (yyval) = pars_procedure_call((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 69:
-#line 235 "pars0grm.y"
-    { (yyval) = &pars_replstr_token; ;}
-    break;
-
-  case 70:
-#line 236 "pars0grm.y"
-    { (yyval) = &pars_printf_token; ;}
-    break;
-
-  case 71:
-#line 237 "pars0grm.y"
-    { (yyval) = &pars_assert_token; ;}
-    break;
-
-  case 72:
-#line 241 "pars0grm.y"
-    { (yyval) = (yyvsp[-2]); ;}
-    break;
-
-  case 73:
-#line 245 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 74:
-#line 247 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 75:
-#line 251 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 76:
-#line 252 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 77:
-#line 254 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 78:
-#line 258 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 79:
-#line 259 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0]));;}
-    break;
-
-  case 80:
-#line 260 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 81:
-#line 264 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 82:
-#line 266 "pars0grm.y"
-    { (yyval) = pars_func(&pars_count_token,
-				          que_node_list_add_last(NULL,
-					    sym_tab_add_int_lit(
-						pars_sym_tab_global, 1))); ;}
-    break;
-
-  case 83:
-#line 271 "pars0grm.y"
-    { (yyval) = pars_func(&pars_count_token,
-					    que_node_list_add_last(NULL,
-						pars_func(&pars_distinct_token,
-						     que_node_list_add_last(
-								NULL, (yyvsp[-1]))))); ;}
-    break;
-
-  case 84:
-#line 277 "pars0grm.y"
-    { (yyval) = pars_func(&pars_sum_token,
-						que_node_list_add_last(NULL,
-									(yyvsp[-1]))); ;}
-    break;
-
-  case 85:
-#line 283 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 86:
-#line 284 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 87:
-#line 286 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 88:
-#line 290 "pars0grm.y"
-    { (yyval) = pars_select_list(&pars_star_denoter,
-								NULL); ;}
-    break;
-
-  case 89:
-#line 293 "pars0grm.y"
-    { (yyval) = pars_select_list((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 90:
-#line 294 "pars0grm.y"
-    { (yyval) = pars_select_list((yyvsp[0]), NULL); ;}
-    break;
-
-  case 91:
-#line 298 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 92:
-#line 299 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 93:
-#line 303 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 94:
-#line 305 "pars0grm.y"
-    { (yyval) = &pars_update_token; ;}
-    break;
-
-  case 95:
-#line 309 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 96:
-#line 311 "pars0grm.y"
-    { yyval = &pars_share_token; ;}
-    break;
-
-  case 97:
-#line 315 "pars0grm.y"
-    { (yyval) = &pars_asc_token; ;}
-    break;
-
-  case 98:
-#line 316 "pars0grm.y"
-    { (yyval) = &pars_asc_token; ;}
-    break;
-
-  case 99:
-#line 317 "pars0grm.y"
-    { (yyval) = &pars_desc_token; ;}
-    break;
-
-  case 100:
-#line 321 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 101:
-#line 323 "pars0grm.y"
-    { (yyval) = pars_order_by((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 102:
-#line 332 "pars0grm.y"
-    { (yyval) = pars_select_statement((yyvsp[-6]), (yyvsp[-4]), (yyvsp[-3]),
-								(yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 103:
-#line 338 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 104:
-#line 343 "pars0grm.y"
-    { (yyval) = pars_insert_statement((yyvsp[-4]), (yyvsp[-1]), NULL); ;}
-    break;
-
-  case 105:
-#line 345 "pars0grm.y"
-    { (yyval) = pars_insert_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 106:
-#line 349 "pars0grm.y"
-    { (yyval) = pars_column_assignment((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 107:
-#line 353 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 108:
-#line 355 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 109:
-#line 361 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 110:
-#line 367 "pars0grm.y"
-    { (yyval) = pars_update_statement_start(FALSE,
-								(yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 111:
-#line 373 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 112:
-#line 378 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 113:
-#line 383 "pars0grm.y"
-    { (yyval) = pars_update_statement_start(TRUE,
-								(yyvsp[0]), NULL); ;}
-    break;
-
-  case 114:
-#line 389 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 115:
-#line 394 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 116:
-#line 399 "pars0grm.y"
-    { (yyval) = pars_row_printf_statement((yyvsp[0])); ;}
-    break;
-
-  case 117:
-#line 404 "pars0grm.y"
-    { (yyval) = pars_assignment_statement((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 118:
-#line 410 "pars0grm.y"
-    { (yyval) = pars_elsif_element((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 119:
-#line 414 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 120:
-#line 416 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 121:
-#line 420 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 122:
-#line 422 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 123:
-#line 423 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 124:
-#line 430 "pars0grm.y"
-    { (yyval) = pars_if_statement((yyvsp[-5]), (yyvsp[-3]), (yyvsp[-2])); ;}
-    break;
-
-  case 125:
-#line 436 "pars0grm.y"
-    { (yyval) = pars_while_statement((yyvsp[-4]), (yyvsp[-2])); ;}
-    break;
-
-  case 126:
-#line 444 "pars0grm.y"
-    { (yyval) = pars_for_statement((yyvsp[-8]), (yyvsp[-6]), (yyvsp[-4]), (yyvsp[-2])); ;}
-    break;
-
-  case 127:
-#line 448 "pars0grm.y"
-    { (yyval) = pars_exit_statement(); ;}
-    break;
-
-  case 128:
-#line 452 "pars0grm.y"
-    { (yyval) = pars_return_statement(); ;}
-    break;
-
-  case 129:
-#line 457 "pars0grm.y"
-    { (yyval) = pars_open_statement(
-						ROW_SEL_OPEN_CURSOR, (yyvsp[0])); ;}
-    break;
-
-  case 130:
-#line 463 "pars0grm.y"
-    { (yyval) = pars_open_statement(
-						ROW_SEL_CLOSE_CURSOR, (yyvsp[0])); ;}
-    break;
-
-  case 131:
-#line 469 "pars0grm.y"
-    { (yyval) = pars_fetch_statement((yyvsp[-2]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 132:
-#line 471 "pars0grm.y"
-    { (yyval) = pars_fetch_statement((yyvsp[-2]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 133:
-#line 476 "pars0grm.y"
-    { (yyval) = pars_column_def((yyvsp[-4]), (yyvsp[-3]), (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 134:
-#line 480 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 135:
-#line 482 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 136:
-#line 486 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 137:
-#line 488 "pars0grm.y"
-    { (yyval) = (yyvsp[-1]); ;}
-    break;
-
-  case 138:
-#line 492 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 139:
-#line 494 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 140:
-#line 499 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 141:
-#line 501 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 142:
-#line 506 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 143:
-#line 508 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 144:
-#line 515 "pars0grm.y"
-    { (yyval) = pars_create_table((yyvsp[-4]), (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 145:
-#line 519 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 146:
-#line 521 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 147:
-#line 525 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 148:
-#line 526 "pars0grm.y"
-    { (yyval) = &pars_unique_token; ;}
-    break;
-
-  case 149:
-#line 530 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 150:
-#line 531 "pars0grm.y"
-    { (yyval) = &pars_clustered_token; ;}
-    break;
-
-  case 151:
-#line 539 "pars0grm.y"
-    { (yyval) = pars_create_index((yyvsp[-8]), (yyvsp[-7]), (yyvsp[-5]), (yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 152:
-#line 544 "pars0grm.y"
-    { (yyval) = pars_commit_statement(); ;}
-    break;
-
-  case 153:
-#line 549 "pars0grm.y"
-    { (yyval) = pars_rollback_statement(); ;}
-    break;
-
-  case 154:
-#line 553 "pars0grm.y"
-    { (yyval) = &pars_int_token; ;}
-    break;
-
-  case 155:
-#line 554 "pars0grm.y"
-    { (yyval) = &pars_int_token; ;}
-    break;
-
-  case 156:
-#line 555 "pars0grm.y"
-    { (yyval) = &pars_char_token; ;}
-    break;
-
-  case 157:
-#line 556 "pars0grm.y"
-    { (yyval) = &pars_binary_token; ;}
-    break;
-
-  case 158:
-#line 557 "pars0grm.y"
-    { (yyval) = &pars_blob_token; ;}
-    break;
-
-  case 159:
-#line 562 "pars0grm.y"
-    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
-							PARS_INPUT, (yyvsp[0])); ;}
-    break;
-
-  case 160:
-#line 565 "pars0grm.y"
-    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
-							PARS_OUTPUT, (yyvsp[0])); ;}
-    break;
-
-  case 161:
-#line 570 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 162:
-#line 571 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 163:
-#line 573 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 164:
-#line 578 "pars0grm.y"
-    { (yyval) = pars_variable_declaration((yyvsp[-2]), (yyvsp[-1])); ;}
-    break;
-
-  case 168:
-#line 590 "pars0grm.y"
-    { (yyval) = pars_cursor_declaration((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 169:
-#line 595 "pars0grm.y"
-    { (yyval) = pars_function_declaration((yyvsp[-1])); ;}
-    break;
-
-  case 175:
-#line 616 "pars0grm.y"
-    { (yyval) = pars_procedure_definition((yyvsp[-9]), (yyvsp[-7]),
-								(yyvsp[-1])); ;}
-    break;
-
-
-    }
-
-/* Line 1010 of yacc.c.  */
-#line 2345 "pars0grm.c"
-
-  yyvsp -= yylen;
-  yyssp -= yylen;
-
-
-  YY_STACK_PRINT (yyss, yyssp);
-
-  *++yyvsp = yyval;
-
-
-  /* Now `shift' the result of the reduction.  Determine what state
-     that goes to, based on the state we popped back to and the rule
-     number reduced by.  */
-
-  yyn = yyr1[yyn];
-
-  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
-  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
-    yystate = yytable[yystate];
-  else
-    yystate = yydefgoto[yyn - YYNTOKENS];
-
-  goto yynewstate;
-
-
-/*------------------------------------.
-| yyerrlab -- here on detecting error |
-`------------------------------------*/
-yyerrlab:
-  /* If not already recovering from an error, report this error.  */
-  if (!yyerrstatus)
-    {
-      ++yynerrs;
-#if YYERROR_VERBOSE
-      yyn = yypact[yystate];
-
-      if (YYPACT_NINF < yyn && yyn < YYLAST)
-	{
-	  YYSIZE_T yysize = 0;
-	  int yytype = YYTRANSLATE (yychar);
-	  const char* yyprefix;
-	  char *yymsg;
-	  int yyx;
-
-	  /* Start YYX at -YYN if negative to avoid negative indexes in
-	     YYCHECK.  */
-	  int yyxbegin = yyn < 0 ? -yyn : 0;
-
-	  /* Stay within bounds of both yycheck and yytname.  */
-	  int yychecklim = YYLAST - yyn;
-	  int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
-	  int yycount = 0;
-
-	  yyprefix = ", expecting ";
-	  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
-	    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
-	      {
-		yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]);
-		yycount += 1;
-		if (yycount == 5)
-		  {
-		    yysize = 0;
-		    break;
-		  }
-	      }
-	  yysize += (sizeof ("syntax error, unexpected ")
-		     + yystrlen (yytname[yytype]));
-	  yymsg = (char *) YYSTACK_ALLOC (yysize);
-	  if (yymsg != 0)
-	    {
-	      char *yyp = yystpcpy (yymsg, "syntax error, unexpected ");
-	      yyp = yystpcpy (yyp, yytname[yytype]);
-
-	      if (yycount < 5)
-		{
-		  yyprefix = ", expecting ";
-		  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
-		    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
-		      {
-			yyp = yystpcpy (yyp, yyprefix);
-			yyp = yystpcpy (yyp, yytname[yyx]);
-			yyprefix = " or ";
-		      }
-		}
-	      yyerror (yymsg);
-	      YYSTACK_FREE (yymsg);
-	    }
-	  else
-	    yyerror ("syntax error; also virtual memory exhausted");
-	}
-      else
-#endif /* YYERROR_VERBOSE */
-	yyerror ("syntax error");
-    }
-
-
-
-  if (yyerrstatus == 3)
-    {
-      /* If just tried and failed to reuse look-ahead token after an
-	 error, discard it.  */
-
-      if (yychar <= YYEOF)
-        {
-          /* If at end of input, pop the error token,
-	     then the rest of the stack, then return failure.  */
-	  if (yychar == YYEOF)
-	     for (;;)
-	       {
-
-		 YYPOPSTACK;
-		 if (yyssp == yyss)
-		   YYABORT;
-		 yydestruct ("Error: popping",
-                             yystos[*yyssp], yyvsp);
-	       }
-        }
-      else
-	{
-	  yydestruct ("Error: discarding", yytoken, &yylval);
-	  yychar = YYEMPTY;
-	}
-    }
-
-  /* Else will try to reuse look-ahead token after shifting the error
-     token.  */
-  goto yyerrlab1;
-
-
-/*---------------------------------------------------.
-| yyerrorlab -- error raised explicitly by YYERROR.  |
-`---------------------------------------------------*/
-yyerrorlab:
-
-#ifdef __GNUC__
-  /* Pacify GCC when the user code never invokes YYERROR and the label
-     yyerrorlab therefore never appears in user code.  */
-  if (0)
-     goto yyerrorlab;
-#endif
-
-yyvsp -= yylen;
-  yyssp -= yylen;
-  yystate = *yyssp;
-  goto yyerrlab1;
-
-
-/*-------------------------------------------------------------.
-| yyerrlab1 -- common code for both syntax error and YYERROR.  |
-`-------------------------------------------------------------*/
-yyerrlab1:
-  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
-
-  for (;;)
-    {
-      yyn = yypact[yystate];
-      if (yyn != YYPACT_NINF)
-	{
-	  yyn += YYTERROR;
-	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
-	    {
-	      yyn = yytable[yyn];
-	      if (0 < yyn)
-		break;
-	    }
-	}
-
-      /* Pop the current state because it cannot handle the error token.  */
-      if (yyssp == yyss)
-	YYABORT;
-
-
-      yydestruct ("Error: popping", yystos[yystate], yyvsp);
-      YYPOPSTACK;
-      yystate = *yyssp;
-      YY_STACK_PRINT (yyss, yyssp);
-    }
-
-  if (yyn == YYFINAL)
-    YYACCEPT;
-
-  *++yyvsp = yylval;
-
-
-  /* Shift the error token. */
-  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
-
-  yystate = yyn;
-  goto yynewstate;
-
-
-/*-------------------------------------.
-| yyacceptlab -- YYACCEPT comes here.  |
-`-------------------------------------*/
-yyacceptlab:
-  yyresult = 0;
-  goto yyreturn;
-
-/*-----------------------------------.
-| yyabortlab -- YYABORT comes here.  |
-`-----------------------------------*/
-yyabortlab:
-  yydestruct ("Error: discarding lookahead",
-              yytoken, &yylval);
-  yychar = YYEMPTY;
-  yyresult = 1;
-  goto yyreturn;
-
-#ifndef yyoverflow
-/*----------------------------------------------.
-| yyoverflowlab -- parser overflow comes here.  |
-`----------------------------------------------*/
-yyoverflowlab:
-  yyerror ("parser stack overflow");
-  yyresult = 2;
-  /* Fall through.  */
-#endif
-
-yyreturn:
-#ifndef yyoverflow
-  if (yyss != yyssa)
-    YYSTACK_FREE (yyss);
-#endif
-  return yyresult;
-}
-
-
-#line 620 "pars0grm.y"
-
-
diff --git a/storage/xtradb/pars/pars0grm.cc b/storage/xtradb/pars/pars0grm.cc
new file mode 100644
index 00000000000..b360f36e597
--- /dev/null
+++ b/storage/xtradb/pars/pars0grm.cc
@@ -0,0 +1,3034 @@
+/* A Bison parser, made by GNU Bison 2.3.  */
+
+/* Skeleton implementation for Bison's Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.3"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 0
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     PARS_INT_LIT = 258,
+     PARS_FLOAT_LIT = 259,
+     PARS_STR_LIT = 260,
+     PARS_FIXBINARY_LIT = 261,
+     PARS_BLOB_LIT = 262,
+     PARS_NULL_LIT = 263,
+     PARS_ID_TOKEN = 264,
+     PARS_AND_TOKEN = 265,
+     PARS_OR_TOKEN = 266,
+     PARS_NOT_TOKEN = 267,
+     PARS_GE_TOKEN = 268,
+     PARS_LE_TOKEN = 269,
+     PARS_NE_TOKEN = 270,
+     PARS_PROCEDURE_TOKEN = 271,
+     PARS_IN_TOKEN = 272,
+     PARS_OUT_TOKEN = 273,
+     PARS_BINARY_TOKEN = 274,
+     PARS_BLOB_TOKEN = 275,
+     PARS_INT_TOKEN = 276,
+     PARS_INTEGER_TOKEN = 277,
+     PARS_FLOAT_TOKEN = 278,
+     PARS_CHAR_TOKEN = 279,
+     PARS_IS_TOKEN = 280,
+     PARS_BEGIN_TOKEN = 281,
+     PARS_END_TOKEN = 282,
+     PARS_IF_TOKEN = 283,
+     PARS_THEN_TOKEN = 284,
+     PARS_ELSE_TOKEN = 285,
+     PARS_ELSIF_TOKEN = 286,
+     PARS_LOOP_TOKEN = 287,
+     PARS_WHILE_TOKEN = 288,
+     PARS_RETURN_TOKEN = 289,
+     PARS_SELECT_TOKEN = 290,
+     PARS_SUM_TOKEN = 291,
+     PARS_COUNT_TOKEN = 292,
+     PARS_DISTINCT_TOKEN = 293,
+     PARS_FROM_TOKEN = 294,
+     PARS_WHERE_TOKEN = 295,
+     PARS_FOR_TOKEN = 296,
+     PARS_DDOT_TOKEN = 297,
+     PARS_READ_TOKEN = 298,
+     PARS_ORDER_TOKEN = 299,
+     PARS_BY_TOKEN = 300,
+     PARS_ASC_TOKEN = 301,
+     PARS_DESC_TOKEN = 302,
+     PARS_INSERT_TOKEN = 303,
+     PARS_INTO_TOKEN = 304,
+     PARS_VALUES_TOKEN = 305,
+     PARS_UPDATE_TOKEN = 306,
+     PARS_SET_TOKEN = 307,
+     PARS_DELETE_TOKEN = 308,
+     PARS_CURRENT_TOKEN = 309,
+     PARS_OF_TOKEN = 310,
+     PARS_CREATE_TOKEN = 311,
+     PARS_TABLE_TOKEN = 312,
+     PARS_INDEX_TOKEN = 313,
+     PARS_UNIQUE_TOKEN = 314,
+     PARS_CLUSTERED_TOKEN = 315,
+     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+     PARS_ON_TOKEN = 317,
+     PARS_ASSIGN_TOKEN = 318,
+     PARS_DECLARE_TOKEN = 319,
+     PARS_CURSOR_TOKEN = 320,
+     PARS_SQL_TOKEN = 321,
+     PARS_OPEN_TOKEN = 322,
+     PARS_FETCH_TOKEN = 323,
+     PARS_CLOSE_TOKEN = 324,
+     PARS_NOTFOUND_TOKEN = 325,
+     PARS_TO_CHAR_TOKEN = 326,
+     PARS_TO_NUMBER_TOKEN = 327,
+     PARS_TO_BINARY_TOKEN = 328,
+     PARS_BINARY_TO_NUMBER_TOKEN = 329,
+     PARS_SUBSTR_TOKEN = 330,
+     PARS_REPLSTR_TOKEN = 331,
+     PARS_CONCAT_TOKEN = 332,
+     PARS_INSTR_TOKEN = 333,
+     PARS_LENGTH_TOKEN = 334,
+     PARS_SYSDATE_TOKEN = 335,
+     PARS_PRINTF_TOKEN = 336,
+     PARS_ASSERT_TOKEN = 337,
+     PARS_RND_TOKEN = 338,
+     PARS_RND_STR_TOKEN = 339,
+     PARS_ROW_PRINTF_TOKEN = 340,
+     PARS_COMMIT_TOKEN = 341,
+     PARS_ROLLBACK_TOKEN = 342,
+     PARS_WORK_TOKEN = 343,
+     PARS_UNSIGNED_TOKEN = 344,
+     PARS_EXIT_TOKEN = 345,
+     PARS_FUNCTION_TOKEN = 346,
+     PARS_LOCK_TOKEN = 347,
+     PARS_SHARE_TOKEN = 348,
+     PARS_MODE_TOKEN = 349,
+     PARS_LIKE_TOKEN = 350,
+     PARS_LIKE_TOKEN_EXACT = 351,
+     PARS_LIKE_TOKEN_PREFIX = 352,
+     PARS_LIKE_TOKEN_SUFFIX = 353,
+     PARS_LIKE_TOKEN_SUBSTR = 354,
+     PARS_TABLE_NAME_TOKEN = 355,
+     PARS_COMPACT_TOKEN = 356,
+     PARS_BLOCK_SIZE_TOKEN = 357,
+     PARS_BIGINT_TOKEN = 358,
+     NEG = 359
+   };
+#endif
+/* Tokens.  */
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
+
+
+
+
+/* Copy the first part of user declarations.  */
+#line 28 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h>				/* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 216 of yacc.c.  */
+#line 334 "pars0grm.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# else
+#  define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+#  if ENABLE_NLS
+#   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#  endif
+# endif
+# ifndef YY_
+#  define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E.  */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions such as the `while (YYID (0))' that closes the multi-statement macros in this file.  */
+#ifndef lint
+# define YYID(n) (n)
+#else /* lint is defined: provide YYID as a real function instead of a macro */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int i)
+#else /* none of the above compilers detected: old-style (K&R) parameter list */
+static int
+YYID (i)
+    int i;
+#endif
+{
+  return i;
+}
+#endif /* ! defined lint */
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   elif defined __BUILTIN_VA_ARG_INCR
+#    include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+#   elif defined _AIX
+#    define YYSTACK_ALLOC __alloca
+#   elif defined _MSC_VER
+#    include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+#    define alloca _alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#    if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#     include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#     ifndef _STDLIB_H
+#      define _STDLIB_H 1
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning.  */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+    /* The OS might guarantee only one guard page at the bottom of the stack,
+       and a page size can be as small as 4096 bytes.  So we cannot safely
+       invoke alloca (N) if N exceeds 4096.  Use a slightly smaller number
+       to allow for a few compiler-allocated temporary stack slots.  */
+#   define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+#  endif
+# else
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+#   define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+#  endif
+#  if (defined __cplusplus && ! defined _STDLIB_H \
+       && ! ((defined YYMALLOC || defined malloc) \
+	     && (defined YYFREE || defined free)))
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   ifndef _STDLIB_H
+#    define _STDLIB_H 1
+#   endif
+#  endif
+#  ifndef YYMALLOC
+#   define YYMALLOC malloc
+#   if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+#  ifndef YYFREE
+#   define YYFREE free
+#   if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void free (void*); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+     && (! defined __cplusplus \
+	 || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  YYSTACK_GAP_MAXIMUM and YYSTACK_RELOCATE measure in units of this union.  */
+union yyalloc
+{
+  yytype_int16 yyss; /* stands for one entry of the yyss (parser state) stack */
+  YYSTYPE yyvs; /* stands for one entry of the yyvs (semantic value) stack */
+  };
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+      + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined __GNUC__ && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  YYSIZE_T yyi;				\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (YYID (0))
+#  endif
+# endif
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack)					\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack, Stack, yysize);				\
+	Stack = &yyptr->Stack;						\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (YYID (0))
+
+#endif
+
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL  5
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   816
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS  120
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS  73
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES  183
+/* YYNSTATES -- Number of states.  */
+#define YYNSTATES  350
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   359
+
+#define YYTRANSLATE(YYX)						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const yytype_uint8 yytranslate[] = /* 360 entries, indexed by lexer token code 0..YYMAXUTOK (359) */
+{ /* codes that name no grammar symbol hold 2 (YYUNDEFTOK) */
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,   112,     2,     2,
+     114,   115,   109,   108,   117,   107,     2,   110,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,   113,
+     105,   104,   106,   116,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,   118,     2,   119,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4, /* codes 250..259: 256 -> 1 ("error"), 258 (PARS_INT_LIT) -> 3 */
+       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
+      35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
+      45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
+      55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
+      65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
+      75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
+      85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
+      95,    96,    97,    98,    99,   100,   101,   102,   103,   111 /* codes 350..359: 359 (NEG) -> 111 */
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const yytype_uint16 yyprhs[] =
+{
+       0,     0,     3,     6,     8,    11,    14,    17,    20,    23,
+      26,    29,    32,    35,    38,    41,    44,    47,    50,    53,
+      56,    59,    62,    65,    68,    71,    73,    76,    78,    83,
+      85,    87,    89,    91,    93,    95,    97,   101,   105,   109,
+     113,   116,   120,   124,   128,   132,   136,   140,   144,   148,
+     152,   156,   159,   163,   167,   169,   171,   173,   175,   177,
+     179,   181,   183,   185,   187,   189,   190,   192,   196,   203,
+     208,   210,   212,   214,   218,   220,   224,   225,   227,   231,
+     232,   234,   238,   240,   245,   251,   256,   257,   259,   263,
+     265,   269,   271,   272,   275,   276,   279,   280,   285,   286,
+     288,   290,   291,   296,   305,   309,   315,   318,   322,   324,
+     328,   333,   338,   341,   344,   348,   351,   354,   357,   361,
+     366,   368,   371,   372,   375,   377,   385,   392,   403,   405,
+     407,   410,   413,   418,   423,   429,   431,   435,   436,   440,
+     441,   443,   444,   447,   448,   450,   451,   453,   454,   458,
+     468,   470,   474,   475,   477,   478,   480,   491,   493,   495,
+     498,   501,   503,   505,   507,   509,   511,   513,   517,   521,
+     522,   524,   528,   532,   533,   535,   538,   545,   550,   552,
+     554,   555,   557,   560
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS.  */
+static const yytype_int16 yyrhs[] =
+{
+     121,     0,    -1,   192,   113,    -1,   127,    -1,   128,   113,
+      -1,   160,   113,    -1,   161,   113,    -1,   162,   113,    -1,
+     159,   113,    -1,   163,   113,    -1,   155,   113,    -1,   142,
+     113,    -1,   144,   113,    -1,   154,   113,    -1,   152,   113,
+      -1,   153,   113,    -1,   149,   113,    -1,   150,   113,    -1,
+     164,   113,    -1,   166,   113,    -1,   165,   113,    -1,   181,
+     113,    -1,   182,   113,    -1,   175,   113,    -1,   179,   113,
+      -1,   122,    -1,   123,   122,    -1,     9,    -1,   125,   114,
+     133,   115,    -1,     3,    -1,     4,    -1,     5,    -1,     6,
+      -1,     7,    -1,     8,    -1,    66,    -1,   124,   108,   124,
+      -1,   124,   107,   124,    -1,   124,   109,   124,    -1,   124,
+     110,   124,    -1,   107,   124,    -1,   114,   124,   115,    -1,
+     124,   104,   124,    -1,   124,    95,     5,    -1,   124,   105,
+     124,    -1,   124,   106,   124,    -1,   124,    13,   124,    -1,
+     124,    14,   124,    -1,   124,    15,   124,    -1,   124,    10,
+     124,    -1,   124,    11,   124,    -1,    12,   124,    -1,     9,
+     112,    70,    -1,    66,   112,    70,    -1,    71,    -1,    72,
+      -1,    73,    -1,    74,    -1,    75,    -1,    77,    -1,    78,
+      -1,    79,    -1,    80,    -1,    83,    -1,    84,    -1,    -1,
+     116,    -1,   126,   117,   116,    -1,   118,     9,   114,   126,
+     115,   119,    -1,   129,   114,   133,   115,    -1,    76,    -1,
+      81,    -1,    82,    -1,     9,   114,   115,    -1,   180,    -1,
+     131,   117,   180,    -1,    -1,     9,    -1,   132,   117,     9,
+      -1,    -1,   124,    -1,   133,   117,   124,    -1,   124,    -1,
+      37,   114,   109,   115,    -1,    37,   114,    38,     9,   115,
+      -1,    36,   114,   124,   115,    -1,    -1,   134,    -1,   135,
+     117,   134,    -1,   109,    -1,   135,    49,   132,    -1,   135,
+      -1,    -1,    40,   124,    -1,    -1,    41,    51,    -1,    -1,
+      92,    17,    93,    94,    -1,    -1,    46,    -1,    47,    -1,
+      -1,    44,    45,     9,   140,    -1,    35,   136,    39,   131,
+     137,   138,   139,   141,    -1,    48,    49,   180,    -1,   143,
+      50,   114,   133,   115,    -1,   143,   142,    -1,     9,   104,
+     124,    -1,   145,    -1,   146,   117,   145,    -1,    40,    54,
+      55,     9,    -1,    51,   180,    52,   146,    -1,   148,   137,
+      -1,   148,   147,    -1,    53,    39,   180,    -1,   151,   137,
+      -1,   151,   147,    -1,    85,   142,    -1,     9,    63,   124,
+      -1,    31,   124,    29,   123,    -1,   156,    -1,   157,   156,
+      -1,    -1,    30,   123,    -1,   157,    -1,    28,   124,    29,
+     123,   158,    27,    28,    -1,    33,   124,    32,   123,    27,
+      32,    -1,    41,     9,    17,   124,    42,   124,    32,   123,
+      27,    32,    -1,    90,    -1,    34,    -1,    67,     9,    -1,
+      69,     9,    -1,    68,     9,    49,   132,    -1,    68,     9,
+      49,   130,    -1,     9,   183,   169,   170,   171,    -1,   167,
+      -1,   168,   117,   167,    -1,    -1,   114,     3,   115,    -1,
+      -1,    89,    -1,    -1,    12,     8,    -1,    -1,    61,    -1,
+      -1,   101,    -1,    -1,   102,   104,     3,    -1,    56,    57,
+     180,   114,   168,   115,   172,   173,   174,    -1,     9,    -1,
+     176,   117,     9,    -1,    -1,    59,    -1,    -1,    60,    -1,
+      56,   177,   178,    58,     9,    62,   180,   114,   176,   115,
+      -1,     9,    -1,   100,    -1,    86,    88,    -1,    87,    88,
+      -1,    21,    -1,    22,    -1,   103,    -1,    24,    -1,    19,
+      -1,    20,    -1,     9,    17,   183,    -1,     9,    18,   183,
+      -1,    -1,   184,    -1,   185,   117,   184,    -1,     9,   183,
+     113,    -1,    -1,   186,    -1,   187,   186,    -1,    64,    65,
+       9,    25,   142,   113,    -1,    64,    91,     9,   113,    -1,
+     188,    -1,   189,    -1,    -1,   190,    -1,   191,   190,    -1,
+      16,     9,   114,   185,   115,    25,   187,   191,    26,   123,
+      27,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const yytype_uint16 yyrline[] =
+{
+       0,   162,   162,   165,   166,   167,   168,   169,   170,   171,
+     172,   173,   174,   175,   176,   177,   178,   179,   180,   181,
+     182,   183,   184,   185,   186,   190,   191,   196,   197,   199,
+     200,   201,   202,   203,   204,   205,   206,   207,   208,   209,
+     210,   211,   212,   213,   215,   216,   217,   218,   219,   220,
+     221,   222,   223,   225,   230,   231,   232,   233,   235,   236,
+     237,   238,   239,   240,   241,   244,   246,   247,   251,   257,
+     262,   263,   264,   268,   272,   273,   278,   279,   280,   285,
+     286,   287,   291,   292,   297,   303,   310,   311,   312,   317,
+     319,   322,   326,   327,   331,   332,   337,   338,   343,   344,
+     345,   349,   350,   357,   372,   377,   380,   388,   394,   395,
+     400,   406,   415,   423,   431,   438,   446,   454,   460,   467,
+     473,   474,   479,   480,   482,   486,   493,   499,   509,   513,
+     517,   524,   531,   535,   543,   552,   553,   558,   559,   564,
+     565,   571,   572,   578,   579,   585,   586,   591,   592,   597,
+     608,   609,   614,   615,   619,   620,   624,   638,   639,   643,
+     648,   653,   654,   655,   656,   657,   658,   662,   667,   675,
+     676,   677,   682,   688,   690,   691,   695,   703,   709,   710,
+     713,   715,   716,   720
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+  "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
+  "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
+  "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
+  "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
+  "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
+  "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
+  "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
+  "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
+  "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
+  "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
+  "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+  "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
+  "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
+  "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
+  "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
+  "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
+  "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
+  "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
+  "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+  "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+  "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+  "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
+  "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
+  "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
+  "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
+  "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
+  "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
+  "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+  "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN",
+  "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX",
+  "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR",
+  "PARS_TABLE_NAME_TOKEN", "PARS_COMPACT_TOKEN", "PARS_BLOCK_SIZE_TOKEN",
+  "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'", "'+'", "'*'", "'/'",
+  "NEG", "'%'", "';'", "'('", "')'", "'?'", "','", "'{'", "'}'", "$accept",
+  "top_statement", "statement", "statement_list", "exp", "function_name",
+  "question_mark_list", "stored_procedure_call",
+  "predefined_procedure_call", "predefined_procedure_name",
+  "user_function_call", "table_list", "variable_list", "exp_list",
+  "select_item", "select_item_list", "select_list", "search_condition",
+  "for_update_clause", "lock_shared_clause", "order_direction",
+  "order_by_clause", "select_statement", "insert_statement_start",
+  "insert_statement", "column_assignment", "column_assignment_list",
+  "cursor_positioned", "update_statement_start",
+  "update_statement_searched", "update_statement_positioned",
+  "delete_statement_start", "delete_statement_searched",
+  "delete_statement_positioned", "row_printf_statement",
+  "assignment_statement", "elsif_element", "elsif_list", "else_part",
+  "if_statement", "while_statement", "for_statement", "exit_statement",
+  "return_statement", "open_cursor_statement", "close_cursor_statement",
+  "fetch_statement", "column_def", "column_def_list", "opt_column_len",
+  "opt_unsigned", "opt_not_null", "not_fit_in_memory", "compact",
+  "block_size", "create_table", "column_list", "unique_def",
+  "clustered_def", "create_index", "table_name", "commit_statement",
+  "rollback_statement", "type_name", "parameter_declaration",
+  "parameter_declaration_list", "variable_declaration",
+  "variable_declaration_list", "cursor_declaration",
+  "function_declaration", "declaration", "declaration_list",
+  "procedure_definition", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const yytype_uint16 yytoknum[] = /* 120 (YYNTOKENS) entries, one per terminal; reverse mapping of yytranslate */
+{ /* index: Bison symbol number; value: token code used in yytokentype, or the character code for literal tokens */
+       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
+     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
+     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
+     285,   286,   287,   288,   289,   290,   291,   292,   293,   294,
+     295,   296,   297,   298,   299,   300,   301,   302,   303,   304,
+     305,   306,   307,   308,   309,   310,   311,   312,   313,   314,
+     315,   316,   317,   318,   319,   320,   321,   322,   323,   324,
+     325,   326,   327,   328,   329,   330,   331,   332,   333,   334,
+     335,   336,   337,   338,   339,   340,   341,   342,   343,   344,
+     345,   346,   347,   348,   349,   350,   351,   352,   353,   354,
+     355,   356,   357,   358,    61,    60,    62,    45,    43,    42,
+      47,   359,    37,    59,    40,    41,    63,    44,   123,   125
+};
+# endif /* YYPRINT */
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_uint8 yyr1[] = /* YYNRULES + 1 (184) entries; every value after slot 0 is a nonterminal number (>= YYNTOKENS, i.e. 120..192) */
+{ /* slot 0 is filler; rule 1 derives symbol 120 ("$accept") */
+       0,   120,   121,   122,   122,   122,   122,   122,   122,   122,
+     122,   122,   122,   122,   122,   122,   122,   122,   122,   122,
+     122,   122,   122,   122,   122,   123,   123,   124,   124,   124,
+     124,   124,   124,   124,   124,   124,   124,   124,   124,   124,
+     124,   124,   124,   124,   124,   124,   124,   124,   124,   124,
+     124,   124,   124,   124,   125,   125,   125,   125,   125,   125,
+     125,   125,   125,   125,   125,   126,   126,   126,   127,   128,
+     129,   129,   129,   130,   131,   131,   132,   132,   132,   133,
+     133,   133,   134,   134,   134,   134,   135,   135,   135,   136,
+     136,   136,   137,   137,   138,   138,   139,   139,   140,   140,
+     140,   141,   141,   142,   143,   144,   144,   145,   146,   146,
+     147,   148,   149,   150,   151,   152,   153,   154,   155,   156,
+     157,   157,   158,   158,   158,   159,   160,   161,   162,   163,
+     164,   165,   166,   166,   167,   168,   168,   169,   169,   170,
+     170,   171,   171,   172,   172,   173,   173,   174,   174,   175,
+     176,   176,   177,   177,   178,   178,   179,   180,   180,   181,
+     182,   183,   183,   183,   183,   183,   183,   184,   184,   185,
+     185,   185,   186,   187,   187,   187,   188,   189,   190,   190,
+     191,   191,   191,   192
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const yytype_uint8 yyr2[] =
+{
+       0,     2,     2,     1,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     1,     2,     1,     4,     1,
+       1,     1,     1,     1,     1,     1,     3,     3,     3,     3,
+       2,     3,     3,     3,     3,     3,     3,     3,     3,     3,
+       3,     2,     3,     3,     1,     1,     1,     1,     1,     1,
+       1,     1,     1,     1,     1,     0,     1,     3,     6,     4,
+       1,     1,     1,     3,     1,     3,     0,     1,     3,     0,
+       1,     3,     1,     4,     5,     4,     0,     1,     3,     1,
+       3,     1,     0,     2,     0,     2,     0,     4,     0,     1,
+       1,     0,     4,     8,     3,     5,     2,     3,     1,     3,
+       4,     4,     2,     2,     3,     2,     2,     2,     3,     4,
+       1,     2,     0,     2,     1,     7,     6,    10,     1,     1,
+       2,     2,     4,     4,     5,     1,     3,     0,     3,     0,
+       1,     0,     2,     0,     1,     0,     1,     0,     3,     9,
+       1,     3,     0,     1,     0,     1,    10,     1,     1,     2,
+       2,     1,     1,     1,     1,     1,     1,     3,     3,     0,
+       1,     3,     3,     0,     1,     2,     6,     4,     1,     1,
+       0,     1,     2,    11
+};
+
+/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const yytype_uint8 yydefact[] =
+{
+       0,     0,     0,     0,     0,     1,     2,   169,     0,   170,
+       0,     0,     0,     0,     0,   165,   166,   161,   162,   164,
+     163,   167,   168,   173,   171,     0,   174,   180,     0,     0,
+     175,   178,   179,   181,     0,   172,     0,     0,     0,   182,
+       0,     0,     0,     0,     0,   129,    86,     0,     0,     0,
+       0,   152,     0,     0,     0,    70,    71,    72,     0,     0,
+       0,   128,     0,    25,     0,     3,     0,     0,     0,     0,
+       0,    92,     0,     0,    92,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,   177,     0,    29,    30,    31,    32,    33,    34,
+      27,     0,    35,    54,    55,    56,    57,    58,    59,    60,
+      61,    62,    63,    64,     0,     0,     0,     0,     0,     0,
+       0,    89,    82,    87,    91,     0,     0,     0,   157,   158,
+       0,     0,     0,   153,   154,   130,     0,   131,   117,   159,
+     160,     0,   183,    26,     4,    79,    11,     0,   106,    12,
+       0,   112,   113,    16,    17,   115,   116,    14,    15,    13,
+      10,     8,     5,     6,     7,     9,    18,    20,    19,    23,
+      24,    21,    22,     0,   118,     0,    51,     0,    40,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,    79,     0,     0,     0,    76,     0,
+       0,     0,   104,     0,   114,     0,   155,     0,    76,    65,
+      80,     0,    79,     0,    93,   176,    52,    53,    41,    49,
+      50,    46,    47,    48,   122,    43,    42,    44,    45,    37,
+      36,    38,    39,     0,     0,     0,     0,     0,    77,    90,
+      88,    92,    74,     0,     0,   108,   111,     0,     0,    77,
+     133,   132,    66,     0,    69,     0,     0,     0,     0,     0,
+     120,   124,     0,    28,     0,    85,     0,    83,     0,     0,
+       0,    94,     0,     0,     0,     0,   135,     0,     0,     0,
+       0,     0,    81,   105,   110,   123,     0,   121,     0,   126,
+      84,    78,    75,     0,    96,     0,   107,   109,   137,   143,
+       0,     0,    73,    68,    67,     0,   125,    95,     0,   101,
+       0,     0,   139,   144,   145,   136,     0,   119,     0,     0,
+     103,     0,     0,   140,   141,   146,   147,     0,     0,     0,
+       0,   138,     0,   134,     0,   149,   150,     0,    97,    98,
+     127,   142,     0,   156,     0,    99,   100,   102,   148,   151
+};
+
+/* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int16 yydefgoto[] =
+{
+      -1,     2,    63,    64,   210,   117,   253,    65,    66,    67,
+     250,   241,   239,   211,   123,   124,   125,   151,   294,   309,
+     347,   320,    68,    69,    70,   245,   246,   152,    71,    72,
+      73,    74,    75,    76,    77,    78,   260,   261,   262,    79,
+      80,    81,    82,    83,    84,    85,    86,   276,   277,   312,
+     324,   333,   314,   326,   335,    87,   337,   134,   207,    88,
+     130,    89,    90,    21,     9,    10,    26,    27,    31,    32,
+      33,    34,     3
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -179
+static const yytype_int16 yypact[] =
+{
+      24,    36,    58,   -48,   -25,  -179,  -179,    57,    31,  -179,
+     -74,    14,    14,    50,    57,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,    72,  -179,    14,  -179,     3,   -26,   -28,
+    -179,  -179,  -179,  -179,     4,  -179,    91,    95,   589,  -179,
+      80,    -6,    43,   285,   285,  -179,    19,    99,    69,    -5,
+      81,   -13,   110,   112,   114,  -179,  -179,  -179,    89,    37,
+      41,  -179,   122,  -179,   406,  -179,    25,    40,    44,    -3,
+      46,   116,    49,    51,   116,    52,    53,    54,    55,    56,
+      59,    61,    62,    70,    73,    74,    75,    76,    77,    78,
+      79,    89,  -179,   285,  -179,  -179,  -179,  -179,  -179,  -179,
+      82,   285,    83,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,   285,   285,   577,    92,   618,    94,
+      97,  -179,   706,  -179,   -33,   124,   153,    -5,  -179,  -179,
+     141,    -5,    -5,  -179,   136,  -179,   148,  -179,  -179,  -179,
+    -179,    98,  -179,  -179,  -179,   285,  -179,   101,  -179,  -179,
+     195,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,   100,   706,   135,     6,   154,    -7,   206,
+     285,   285,   285,   285,   285,   589,   218,   285,   285,   285,
+     285,   285,   285,   285,   285,   589,   285,   -27,   216,   173,
+      -5,   285,  -179,   217,  -179,   113,  -179,   171,   221,   119,
+     706,   -56,   285,   185,   706,  -179,  -179,  -179,  -179,     6,
+       6,    27,    27,   706,   345,  -179,    27,    27,    27,    35,
+      35,    -7,    -7,   -53,   467,   223,   232,   127,  -179,   126,
+    -179,   -31,  -179,   638,   151,  -179,   142,   251,   253,   150,
+    -179,   126,  -179,   -46,  -179,   285,   -45,   256,   589,   285,
+    -179,   240,   249,  -179,   245,  -179,   166,  -179,   273,   285,
+      -5,   242,   285,   285,   217,    14,  -179,   -39,   222,   170,
+     167,   179,   706,  -179,  -179,   589,   679,  -179,   268,  -179,
+    -179,  -179,  -179,   247,   207,   686,   706,  -179,   186,   243,
+     251,    -5,  -179,  -179,  -179,   589,  -179,  -179,   286,   261,
+     589,   303,   219,  -179,   224,  -179,   193,   589,   226,   272,
+    -179,   528,   205,  -179,   310,  -179,   233,   314,   230,   317,
+     302,  -179,   328,  -179,   235,  -179,  -179,   -38,  -179,     7,
+    -179,  -179,   334,  -179,   331,  -179,  -179,  -179,  -179,  -179
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const yytype_int16 yypgoto[] =
+{
+    -179,  -179,   -63,  -178,   -41,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,   133,  -155,   143,  -179,  -179,   -68,  -179,  -179,
+    -179,  -179,   -40,  -179,  -179,    71,  -179,   269,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,    85,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,    47,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -117,  -179,  -179,   -12,   330,  -179,   321,  -179,  -179,  -179,
+     315,  -179,  -179
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule whose
+   number is the opposite.  If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint16 yytable[] =
+{
+      22,   143,   116,   118,   128,   122,   155,   224,   184,   269,
+     202,   236,    25,    28,   204,   205,   198,   234,   138,   182,
+     183,   184,    94,    95,    96,    97,    98,    99,   100,   148,
+      38,   101,    46,    15,    16,    17,    18,    36,    19,   233,
+       1,    13,   184,    14,   132,     4,   133,   147,    11,    12,
+     184,   173,   174,   345,   346,   119,   120,   256,     5,   254,
+     176,   255,   263,    37,   255,     6,     8,    29,    29,   280,
+     283,   281,   255,   178,   179,    23,   299,   343,   300,   344,
+     285,    25,   237,   242,   199,   102,   270,    35,   186,     7,
+     103,   104,   105,   106,   107,   129,   108,   109,   110,   111,
+      40,   186,   112,   113,    41,    91,    93,    92,   126,   214,
+     187,   188,   189,   190,   191,   192,   193,    20,   127,   135,
+     131,   136,   186,   137,    46,   139,   114,   317,   121,   140,
+     186,   141,   321,   115,   190,   191,   192,   193,   144,   219,
+     220,   221,   222,   223,   192,   193,   226,   227,   228,   229,
+     230,   231,   232,   292,   145,   235,   150,   146,   122,   149,
+     243,   143,   153,   200,   154,   157,   158,   159,   160,   161,
+     201,   143,   162,   271,   163,   164,    94,    95,    96,    97,
+      98,    99,   100,   165,   316,   101,   166,   167,   168,   169,
+     170,   171,   172,   203,   175,   177,   206,   208,    94,    95,
+      96,    97,    98,    99,   100,   216,   194,   101,   196,   119,
+     120,   197,   209,   215,   282,   212,   180,   181,   286,   182,
+     183,   184,   143,   225,   217,   238,   244,   247,   214,   248,
+     249,   295,   296,   180,   181,   252,   182,   183,   184,   102,
+     257,   266,   267,   268,   103,   104,   105,   106,   107,   213,
+     108,   109,   110,   111,   143,   273,   112,   113,   143,   274,
+     275,   102,   278,   298,   279,   284,   103,   104,   105,   106,
+     107,   259,   108,   109,   110,   111,   288,   289,   112,   113,
+     114,   290,   291,   293,   301,   302,   303,   115,    94,    95,
+      96,    97,    98,    99,   100,   304,   306,   101,   307,   308,
+     311,   186,   114,   318,   313,   319,   322,   327,   323,   115,
+     187,   188,   189,   190,   191,   192,   193,   329,   186,   328,
+     331,   218,   332,   336,   338,   325,   339,   187,   188,   189,
+     190,   191,   192,   193,   340,   334,   341,   348,   265,   342,
+     349,   251,   240,   156,    24,   297,   287,   315,    30,    39,
+       0,   102,     0,     0,    42,     0,   103,   104,   105,   106,
+     107,     0,   108,   109,   110,   111,     0,     0,   112,   113,
+       0,     0,     0,    43,     0,   258,   259,     0,    44,    45,
+      46,     0,     0,     0,     0,     0,    47,     0,     0,     0,
+       0,     0,   114,    48,     0,     0,    49,     0,    50,   115,
+       0,    51,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,    52,    53,    54,    42,     0,     0,     0,     0,
+       0,    55,     0,     0,     0,     0,    56,    57,     0,     0,
+      58,    59,    60,   142,    43,    61,     0,     0,     0,    44,
+      45,    46,     0,     0,     0,     0,     0,    47,     0,     0,
+       0,     0,     0,     0,    48,     0,     0,    49,     0,    50,
+       0,     0,    51,    62,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,    52,    53,    54,    42,     0,     0,     0,
+       0,     0,    55,     0,     0,     0,     0,    56,    57,     0,
+       0,    58,    59,    60,   264,    43,    61,     0,     0,     0,
+      44,    45,    46,     0,     0,     0,     0,     0,    47,     0,
+       0,     0,     0,     0,     0,    48,     0,     0,    49,     0,
+      50,     0,     0,    51,    62,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,    52,    53,    54,    42,     0,     0,
+       0,     0,     0,    55,     0,     0,     0,     0,    56,    57,
+       0,     0,    58,    59,    60,   330,    43,    61,     0,     0,
+       0,    44,    45,    46,     0,     0,     0,     0,     0,    47,
+       0,     0,     0,     0,     0,     0,    48,     0,     0,    49,
+       0,    50,     0,     0,    51,    62,     0,   180,   181,     0,
+     182,   183,   184,     0,     0,    52,    53,    54,    42,     0,
+       0,     0,     0,     0,    55,     0,   185,     0,     0,    56,
+      57,     0,     0,    58,    59,    60,     0,    43,    61,     0,
+       0,     0,    44,    45,    46,     0,     0,     0,   180,   181,
+      47,   182,   183,   184,     0,     0,     0,    48,     0,     0,
+      49,     0,    50,     0,     0,    51,    62,     0,   180,   181,
+     195,   182,   183,   184,     0,     0,    52,    53,    54,     0,
+       0,     0,     0,     0,     0,    55,     0,     0,     0,     0,
+      56,    57,   186,     0,    58,    59,    60,     0,     0,    61,
+     272,   187,   188,   189,   190,   191,   192,   193,     0,   180,
+     181,     0,   182,   183,   184,     0,   180,   181,     0,   182,
+     183,   184,     0,     0,     0,     0,     0,    62,   305,     0,
+       0,     0,     0,   186,     0,     0,   180,   181,   310,   182,
+     183,   184,   187,   188,   189,   190,   191,   192,   193,     0,
+       0,     0,     0,   186,     0,     0,     0,     0,     0,     0,
+       0,     0,   187,   188,   189,   190,   191,   192,   193,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,   186,     0,     0,     0,     0,     0,
+       0,   186,     0,   187,   188,   189,   190,   191,   192,   193,
+     187,   188,   189,   190,   191,   192,   193,     0,     0,     0,
+       0,   186,     0,     0,     0,     0,     0,     0,     0,     0,
+     187,   188,   189,   190,   191,   192,   193
+};
+
+static const yytype_int16 yycheck[] =
+{
+      12,    64,    43,    44,     9,    46,    74,   185,    15,    40,
+     127,    38,     9,    25,   131,   132,    49,   195,    58,    13,
+      14,    15,     3,     4,     5,     6,     7,     8,     9,    69,
+      26,    12,    35,    19,    20,    21,    22,    65,    24,   194,
+      16,   115,    15,   117,    57,     9,    59,    50,    17,    18,
+      15,    91,    93,    46,    47,    36,    37,   212,     0,   115,
+     101,   117,   115,    91,   117,   113,     9,    64,    64,   115,
+     115,   117,   117,   114,   115,    25,   115,   115,   117,   117,
+     258,     9,   109,   200,   117,    66,   117,   113,    95,   114,
+      71,    72,    73,    74,    75,   100,    77,    78,    79,    80,
+       9,    95,    83,    84,     9,    25,    63,   113,     9,   150,
+     104,   105,   106,   107,   108,   109,   110,   103,    49,     9,
+      39,     9,    95,     9,    35,    88,   107,   305,   109,    88,
+      95,     9,   310,   114,   107,   108,   109,   110,   113,   180,
+     181,   182,   183,   184,   109,   110,   187,   188,   189,   190,
+     191,   192,   193,   270,   114,   196,    40,   113,   199,   113,
+     201,   224,   113,    39,   113,   113,   113,   113,   113,   113,
+      17,   234,   113,   241,   113,   113,     3,     4,     5,     6,
+       7,     8,     9,   113,   301,    12,   113,   113,   113,   113,
+     113,   113,   113,    52,   112,   112,    60,    49,     3,     4,
+       5,     6,     7,     8,     9,    70,   114,    12,   114,    36,
+      37,   114,   114,   113,   255,   114,    10,    11,   259,    13,
+      14,    15,   285,     5,    70,     9,     9,   114,   269,    58,
+       9,   272,   273,    10,    11,   116,    13,    14,    15,    66,
+      55,     9,   115,   117,    71,    72,    73,    74,    75,    54,
+      77,    78,    79,    80,   317,   104,    83,    84,   321,   117,
+       9,    66,     9,   275,   114,     9,    71,    72,    73,    74,
+      75,    31,    77,    78,    79,    80,    27,    32,    83,    84,
+     107,   115,     9,    41,    62,   115,   119,   114,     3,     4,
+       5,     6,     7,     8,     9,   116,    28,    12,    51,    92,
+     114,    95,   107,    17,    61,    44,     3,   114,    89,   114,
+     104,   105,   106,   107,   108,   109,   110,    45,    95,    93,
+     115,   115,    12,     9,    94,   101,     9,   104,   105,   106,
+     107,   108,   109,   110,    32,   102,     8,     3,   115,   104,
+       9,   208,   199,    74,    14,   274,   261,   300,    27,    34,
+      -1,    66,    -1,    -1,     9,    -1,    71,    72,    73,    74,
+      75,    -1,    77,    78,    79,    80,    -1,    -1,    83,    84,
+      -1,    -1,    -1,    28,    -1,    30,    31,    -1,    33,    34,
+      35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,    -1,
+      -1,    -1,   107,    48,    -1,    -1,    51,    -1,    53,   114,
+      -1,    56,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    67,    68,    69,     9,    -1,    -1,    -1,    -1,
+      -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,    -1,
+      85,    86,    87,    27,    28,    90,    -1,    -1,    -1,    33,
+      34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,
+      -1,    -1,    -1,    -1,    48,    -1,    -1,    51,    -1,    53,
+      -1,    -1,    56,   118,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    67,    68,    69,     9,    -1,    -1,    -1,
+      -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,
+      -1,    85,    86,    87,    27,    28,    90,    -1,    -1,    -1,
+      33,    34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,
+      -1,    -1,    -1,    -1,    -1,    48,    -1,    -1,    51,    -1,
+      53,    -1,    -1,    56,   118,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    67,    68,    69,     9,    -1,    -1,
+      -1,    -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,
+      -1,    -1,    85,    86,    87,    27,    28,    90,    -1,    -1,
+      -1,    33,    34,    35,    -1,    -1,    -1,    -1,    -1,    41,
+      -1,    -1,    -1,    -1,    -1,    -1,    48,    -1,    -1,    51,
+      -1,    53,    -1,    -1,    56,   118,    -1,    10,    11,    -1,
+      13,    14,    15,    -1,    -1,    67,    68,    69,     9,    -1,
+      -1,    -1,    -1,    -1,    76,    -1,    29,    -1,    -1,    81,
+      82,    -1,    -1,    85,    86,    87,    -1,    28,    90,    -1,
+      -1,    -1,    33,    34,    35,    -1,    -1,    -1,    10,    11,
+      41,    13,    14,    15,    -1,    -1,    -1,    48,    -1,    -1,
+      51,    -1,    53,    -1,    -1,    56,   118,    -1,    10,    11,
+      32,    13,    14,    15,    -1,    -1,    67,    68,    69,    -1,
+      -1,    -1,    -1,    -1,    -1,    76,    -1,    -1,    -1,    -1,
+      81,    82,    95,    -1,    85,    86,    87,    -1,    -1,    90,
+      42,   104,   105,   106,   107,   108,   109,   110,    -1,    10,
+      11,    -1,    13,    14,    15,    -1,    10,    11,    -1,    13,
+      14,    15,    -1,    -1,    -1,    -1,    -1,   118,    29,    -1,
+      -1,    -1,    -1,    95,    -1,    -1,    10,    11,    32,    13,
+      14,    15,   104,   105,   106,   107,   108,   109,   110,    -1,
+      -1,    -1,    -1,    95,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,   104,   105,   106,   107,   108,   109,   110,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    95,    -1,    -1,    -1,    -1,    -1,
+      -1,    95,    -1,   104,   105,   106,   107,   108,   109,   110,
+     104,   105,   106,   107,   108,   109,   110,    -1,    -1,    -1,
+      -1,    95,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+     104,   105,   106,   107,   108,   109,   110
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const yytype_uint8 yystos[] =
+{
+       0,    16,   121,   192,     9,     0,   113,   114,     9,   184,
+     185,    17,    18,   115,   117,    19,    20,    21,    22,    24,
+     103,   183,   183,    25,   184,     9,   186,   187,   183,    64,
+     186,   188,   189,   190,   191,   113,    65,    91,    26,   190,
+       9,     9,     9,    28,    33,    34,    35,    41,    48,    51,
+      53,    56,    67,    68,    69,    76,    81,    82,    85,    86,
+      87,    90,   118,   122,   123,   127,   128,   129,   142,   143,
+     144,   148,   149,   150,   151,   152,   153,   154,   155,   159,
+     160,   161,   162,   163,   164,   165,   166,   175,   179,   181,
+     182,    25,   113,    63,     3,     4,     5,     6,     7,     8,
+       9,    12,    66,    71,    72,    73,    74,    75,    77,    78,
+      79,    80,    83,    84,   107,   114,   124,   125,   124,    36,
+      37,   109,   124,   134,   135,   136,     9,    49,     9,   100,
+     180,    39,    57,    59,   177,     9,     9,     9,   142,    88,
+      88,     9,    27,   122,   113,   114,   113,    50,   142,   113,
+      40,   137,   147,   113,   113,   137,   147,   113,   113,   113,
+     113,   113,   113,   113,   113,   113,   113,   113,   113,   113,
+     113,   113,   113,   142,   124,   112,   124,   112,   124,   124,
+      10,    11,    13,    14,    15,    29,    95,   104,   105,   106,
+     107,   108,   109,   110,   114,    32,   114,   114,    49,   117,
+      39,    17,   180,    52,   180,   180,    60,   178,    49,   114,
+     124,   133,   114,    54,   124,   113,    70,    70,   115,   124,
+     124,   124,   124,   124,   123,     5,   124,   124,   124,   124,
+     124,   124,   124,   133,   123,   124,    38,   109,     9,   132,
+     134,   131,   180,   124,     9,   145,   146,   114,    58,     9,
+     130,   132,   116,   126,   115,   117,   133,    55,    30,    31,
+     156,   157,   158,   115,    27,   115,     9,   115,   117,    40,
+     117,   137,    42,   104,   117,     9,   167,   168,     9,   114,
+     115,   117,   124,   115,     9,   123,   124,   156,    27,    32,
+     115,     9,   180,    41,   138,   124,   124,   145,   183,   115,
+     117,    62,   115,   119,   116,    29,    28,    51,    92,   139,
+      32,   114,   169,    61,   172,   167,   180,   123,    17,    44,
+     141,   123,     3,    89,   170,   101,   173,   114,    93,    45,
+      27,   115,    12,   171,   102,   174,     9,   176,    94,     9,
+      32,     8,   104,   115,   117,    46,    47,   140,     3,     9
+};
+
+/* Reset the error status counter yyerrstatus to zero.  */
+#define yyerrok		(yyerrstatus = 0)
+/* Store the YYEMPTY sentinel in yychar.  */
+#define yyclearin	(yychar = YYEMPTY)
+/* Sentinel assigned to yychar by yyclearin and tested by YYBACKUP.  */
+#define YYEMPTY		(-2)
+/* NOTE(review): presumably the token code for end of input -- confirm
+   against the lexer.  */
+#define YYEOF		0
+
+/* Each of these transfers control, via goto, to a label that must be
+   visible at the point of use (the labels are defined outside this
+   section).  */
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  */
+
+#define YYFAIL		goto yyerrlab
+
+/* Evaluates to 1 while yyerrstatus is nonzero and to 0 otherwise.  */
+#define YYRECOVERING()  (!!yyerrstatus)
+
+/* YYBACKUP (Token, Value) -- when yychar is YYEMPTY and yylen is 1,
+   install Token (with semantic value Value) in yychar/yylval, recompute
+   yytoken from it, pop one entry with YYPOPSTACK and jump to yybackup.
+   In every other situation report "syntax error: cannot back up"
+   through yyerror and take the YYERROR path.  The expansion references
+   yychar, yylen, yylval and yytoken by name, so those must be in scope
+   where the macro is used.  */
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      yytoken = YYTRANSLATE (yychar);				\
+      YYPOPSTACK (1);						\
+      goto yybackup;						\
+    }								\
+  else								\
+    {								\
+      yyerror (YY_("syntax error: cannot back up")); \
+      YYERROR;							\
+    }								\
+while (YYID (0))
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (YYID (N))                                                    \
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (YYID (0))
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define only if we know
+   we won't break user code: when these are the locations we know.  */
+
+#ifndef YY_LOCATION_PRINT
+# if defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL
+#  define YY_LOCATION_PRINT(File, Loc)			\
+     fprintf (File, "%d.%d-%d.%d",			\
+	      (Loc).first_line, (Loc).first_column,	\
+	      (Loc).last_line,  (Loc).last_column)
+# else
+#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (YYLEX_PARAM)
+#else
+# define YYLEX yylex ()
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+/* YYDPRINTF (Args) -- when yydebug is nonzero, pass Args to YYFPRINTF.
+   Args is pasted directly after YYFPRINTF, so it must be a complete
+   parenthesized argument list, e.g. YYDPRINTF ((stderr, "msg\n")).  */
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (YYID (0))
+
+/* YY_SYMBOL_PRINT (Title, Type, Value, Location) -- when yydebug is
+   nonzero, write "Title ", then the symbol via yy_symbol_print, then a
+   newline, all to stderr.  Location is accepted but not used by this
+   expansion.  */
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)			  \
+do {									  \
+  if (yydebug)								  \
+    {									  \
+      YYFPRINTF (stderr, "%s ", Title);					  \
+      yy_symbol_print (stderr,						  \
+		  Type, Value); \
+      YYFPRINTF (stderr, "\n");						  \
+    }									  \
+} while (YYID (0))
+
+
+/*----------------------------------------.
+| Print this symbol's value on YYOUTPUT.  |
+`----------------------------------------*/
+
+/* Emit the semantic value *YYVALUEP of symbol number YYTYPE on
+   YYOUTPUT.  A null YYVALUEP prints nothing.  When YYPRINT is defined,
+   symbols numbered below YYNTOKENS are handed to it together with
+   their yytoknum entry; otherwise YYOUTPUT is only marked as used.
+   The per-type switch contains nothing but an empty default case.  */
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (yyvaluep)
+    {
+# ifdef YYPRINT
+      if (yytype < YYNTOKENS)
+        YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+      YYUSE (yyoutput);
+# endif
+      switch (yytype)
+        {
+        default:
+          break;
+        }
+    }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+/* Write "token NAME (" or "nterm NAME (" -- the former for symbol
+   numbers below YYNTOKENS -- followed by the symbol's value as printed
+   by yy_symbol_value_print and a closing ")".  NAME comes from
+   yytname[YYTYPE].  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  const char *const yyname = yytname[yytype];
+
+  if (yytype >= YYNTOKENS)
+    {
+      YYFPRINTF (yyoutput, "nterm %s (", yyname);
+    }
+  else
+    {
+      YYFPRINTF (yyoutput, "token %s (", yyname);
+    }
+
+  yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+  YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+/* Writes "Stack now", then every entry from *BOTTOM through *TOP
+   inclusive, each preceded by a space, then a newline, to stderr.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *bottom, yytype_int16 *top)
+#else
+static void
+yy_stack_print (bottom, top)
+    yytype_int16 *bottom;
+    yytype_int16 *top;
+#endif
+{
+  yytype_int16 *yystate = bottom;
+
+  YYFPRINTF (stderr, "Stack now");
+  while (yystate <= top)
+    {
+      YYFPRINTF (stderr, " %d", *yystate);
+      ++yystate;
+    }
+  YYFPRINTF (stderr, "\n");
+}
+
+/* YY_STACK_PRINT (Bottom, Top) -- call yy_stack_print on the given
+   range, but only when yydebug is nonzero.  */
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+/* Trace the reduction by rule YYRULE on stderr: a header line giving
+   YYRULE - 1 and the line number recorded in yyrline[YYRULE], then one
+   "   $N = ..." line per right-hand-side symbol (yyr2[YYRULE] of
+   them), each printed by yy_symbol_print with the value found at
+   offset (N - yynrhs) from YYVSP.
+
+   All output goes through YYFPRINTF, like the rest of the trace code,
+   so a user-supplied YYFPRINTF is honoured and nothing here depends
+   on fprintf being declared.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+    YYSTYPE *yyvsp;
+    int yyrule;
+#endif
+{
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  unsigned long int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+             yyrule - 1, yylno);
+  /* The symbols being reduced.  */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      YYFPRINTF (stderr, "   $%d = ", yyi + 1);
+      yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+                       &(yyvsp[(yyi + 1) - (yynrhs)]));
+      YYFPRINTF (stderr, "\n");
+    }
+}
+
+/* YY_REDUCE_PRINT (Rule) -- call yy_reduce_print for Rule, but only
+   when yydebug is nonzero.  The expansion names yyvsp directly, so a
+   variable of that name must be in scope where the macro is used.  */
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined __GLIBC__ && defined _STRING_H
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+    const char *yystr;
+#endif
+{
+  YYSIZE_T yylen;
+  for (yylen = 0; yystr[yylen]; yylen++)
+    continue;
+  return yylen;
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+    char *yydest;
+    const char *yysrc;
+#endif
+{
+  char *yyd = yydest;
+  const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0')
+    continue;
+
+  return yyd - 1;
+}
+#  endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+   quotes and backslashes, so that it's suitable for yyerror.  The
+   heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or backslash (other than
+   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
+   null, do not copy; instead, return the length of what the result
+   would have been.  */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into YYRESULT an error message about the unexpected token
+   YYCHAR while in state YYSTATE.  Return the number of bytes copied,
+   including the terminating null byte.  If YYRESULT is null, do not
+   copy anything; just return the number of bytes that would be
+   copied.  As a special case, return 0 if an ordinary "syntax error"
+   message will do.  Return YYSIZE_MAXIMUM if overflow occurs during
+   size calculation.  */
+static YYSIZE_T
+yysyntax_error (char *yyresult, int yystate, int yychar)
+{
+  int yyn = yypact[yystate];
+
+  if (! (YYPACT_NINF < yyn && yyn <= YYLAST))
+    return 0;
+  else
+    {
+      int yytype = YYTRANSLATE (yychar);
+      YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]);
+      YYSIZE_T yysize = yysize0;
+      YYSIZE_T yysize1;
+      int yysize_overflow = 0;
+      enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+      char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+      int yyx;
+
+# if 0
+      /* This is so xgettext sees the translatable formats that are
+	 constructed on the fly.  */
+      YY_("syntax error, unexpected %s");
+      YY_("syntax error, unexpected %s, expecting %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s");
+# endif
+      char *yyfmt;
+      char const *yyf;
+      static char const yyunexpected[] = "syntax error, unexpected %s";
+      static char const yyexpecting[] = ", expecting %s";
+      static char const yyor[] = " or %s";
+      char yyformat[sizeof yyunexpected
+		    + sizeof yyexpecting - 1
+		    + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2)
+		       * (sizeof yyor - 1))];
+      char const *yyprefix = yyexpecting;
+
+      /* Start YYX at -YYN if negative to avoid negative indexes in
+	 YYCHECK.  */
+      int yyxbegin = yyn < 0 ? -yyn : 0;
+
+      /* Stay within bounds of both yycheck and yytname.  */
+      int yychecklim = YYLAST - yyn + 1;
+      int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+      int yycount = 1;
+
+      yyarg[0] = yytname[yytype];
+      yyfmt = yystpcpy (yyformat, yyunexpected);
+
+      for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+	if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+	  {
+	    if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+	      {
+		yycount = 1;
+		yysize = yysize0;
+		yyformat[sizeof yyunexpected - 1] = '\0';
+		break;
+	      }
+	    yyarg[yycount++] = yytname[yyx];
+	    yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+	    yysize_overflow |= (yysize1 < yysize);
+	    yysize = yysize1;
+	    yyfmt = yystpcpy (yyfmt, yyprefix);
+	    yyprefix = yyor;
+	  }
+
+      yyf = YY_(yyformat);
+      yysize1 = yysize + yystrlen (yyf);
+      yysize_overflow |= (yysize1 < yysize);
+      yysize = yysize1;
+
+      if (yysize_overflow)
+	return YYSIZE_MAXIMUM;
+
+      if (yyresult)
+	{
+	  /* Avoid sprintf, as that infringes on the user's name space.
+	     Don't have undefined behavior even if the translation
+	     produced a string with the wrong number of "%s"s.  */
+	  char *yyp = yyresult;
+	  int yyi = 0;
+	  while ((*yyp = *yyf) != '\0')
+	    {
+	      if (*yyp == '%' && yyf[1] == 's' && yyi < yycount)
+		{
+		  yyp += yytnamerr (yyp, yyarg[yyi++]);
+		  yyf += 2;
+		}
+	      else
+		{
+		  yyp++;
+		  yyf++;
+		}
+	    }
+	}
+      return yysize;
+    }
+}
+#endif /* YYERROR_VERBOSE */
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  YYUSE (yyvaluep);
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+/* The look-ahead symbol.  */
+int yychar;
+
+/* The semantic value of the look-ahead symbol.  */
+YYSTYPE yylval;
+
+/* Number of syntax errors so far.  */
+int yynerrs;
+
+
+
+/*----------.
+| yyparse.  |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+    void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+
+  int yystate;
+  int yyn;
+  int yyresult;
+  /* Number of tokens to shift before error messages enabled.  */
+  int yyerrstatus;
+  /* Look-ahead token as an internal (translated) token number.  */
+  int yytoken = 0;
+#if YYERROR_VERBOSE
+  /* Buffer for error messages, and its allocated size.  */
+  char yymsgbuf[128];
+  char *yymsg = yymsgbuf;
+  YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+  /* Three stacks and their tools:
+     `yyss': related to states,
+     `yyvs': related to semantic values,
+     `yyls': related to locations.
+
+     Refer to the stacks thru separate pointers, to allow yyoverflow
+     to reallocate them elsewhere.  */
+
+  /* The state stack.  */
+  yytype_int16 yyssa[YYINITDEPTH];
+  yytype_int16 *yyss = yyssa;
+  yytype_int16 *yyssp;
+
+  /* The semantic value stack.  */
+  YYSTYPE yyvsa[YYINITDEPTH];
+  YYSTYPE *yyvs = yyvsa;
+  YYSTYPE *yyvsp;
+
+
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N))
+
+  YYSIZE_T yystacksize = YYINITDEPTH;
+
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped.  */
+  int yylen = 0;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY;		/* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+
+  yyssp = yyss;
+  yyvsp = yyvs;
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed.  So pushing a state here evens the stacks.  */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack.  Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	yytype_int16 *yyss1 = yyss;
+
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow (YY_("memory exhausted"),
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+
+		    &yystacksize);
+
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyexhaustedlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	yytype_int16 *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc*) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyexhaustedlab;
+	YYSTACK_RELOCATE (yyss);
+	YYSTACK_RELOCATE (yyvs);
+
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+  /* Do appropriate processing given the current state.  Read a
+     look-ahead token if we need one and don't already have one.  */
+
+  /* First try to decide what to do without reference to look-ahead token.  */
+  yyn = yypact[yystate];
+  if (yyn == YYPACT_NINF)
+    goto yydefault;
+
+  /* Not known => get a look-ahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yyn == 0 || yyn == YYTABLE_NINF)
+	goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the look-ahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the shifted token unless it is eof.  */
+  if (yychar != YYEOF)
+    yychar = YYEMPTY;
+
+  yystate = yyn;
+  *++yyvsp = yylval;
+
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 25:
+#line 190 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 26:
+#line 192 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+    break;
+
+  case 27:
+#line 196 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 28:
+#line 198 "pars0grm.y"
+    { (yyval) = pars_func((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+    break;
+
+  case 29:
+#line 199 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 30:
+#line 200 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 31:
+#line 201 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 32:
+#line 202 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 33:
+#line 203 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 34:
+#line 204 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 35:
+#line 205 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 36:
+#line 206 "pars0grm.y"
+    { (yyval) = pars_op('+', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 37:
+#line 207 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 38:
+#line 208 "pars0grm.y"
+    { (yyval) = pars_op('*', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 39:
+#line 209 "pars0grm.y"
+    { (yyval) = pars_op('/', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 40:
+#line 210 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[(2) - (2)]), NULL); ;}
+    break;
+
+  case 41:
+#line 211 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (3)]); ;}
+    break;
+
+  case 42:
+#line 212 "pars0grm.y"
+    { (yyval) = pars_op('=', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 43:
+#line 214 "pars0grm.y"
+    { (yyval) = pars_op(PARS_LIKE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 44:
+#line 215 "pars0grm.y"
+    { (yyval) = pars_op('<', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 45:
+#line 216 "pars0grm.y"
+    { (yyval) = pars_op('>', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 46:
+#line 217 "pars0grm.y"
+    { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 47:
+#line 218 "pars0grm.y"
+    { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 48:
+#line 219 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 49:
+#line 220 "pars0grm.y"
+    { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 50:
+#line 221 "pars0grm.y"
+    { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 51:
+#line 222 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[(2) - (2)]), NULL); ;}
+    break;
+
+  case 52:
+#line 224 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+    break;
+
+  case 53:
+#line 226 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+    break;
+
+  case 54:
+#line 230 "pars0grm.y"
+    { (yyval) = &pars_to_char_token; ;}
+    break;
+
+  case 55:
+#line 231 "pars0grm.y"
+    { (yyval) = &pars_to_number_token; ;}
+    break;
+
+  case 56:
+#line 232 "pars0grm.y"
+    { (yyval) = &pars_to_binary_token; ;}
+    break;
+
+  case 57:
+#line 234 "pars0grm.y"
+    { (yyval) = &pars_binary_to_number_token; ;}
+    break;
+
+  case 58:
+#line 235 "pars0grm.y"
+    { (yyval) = &pars_substr_token; ;}
+    break;
+
+  case 59:
+#line 236 "pars0grm.y"
+    { (yyval) = &pars_concat_token; ;}
+    break;
+
+  case 60:
+#line 237 "pars0grm.y"
+    { (yyval) = &pars_instr_token; ;}
+    break;
+
+  case 61:
+#line 238 "pars0grm.y"
+    { (yyval) = &pars_length_token; ;}
+    break;
+
+  case 62:
+#line 239 "pars0grm.y"
+    { (yyval) = &pars_sysdate_token; ;}
+    break;
+
+  case 63:
+#line 240 "pars0grm.y"
+    { (yyval) = &pars_rnd_token; ;}
+    break;
+
+  case 64:
+#line 241 "pars0grm.y"
+    { (yyval) = &pars_rnd_str_token; ;}
+    break;
+
+  case 68:
+#line 252 "pars0grm.y"
+    { (yyval) = pars_stored_procedure_call(
+					static_cast<sym_node_t*>((yyvsp[(2) - (6)]))); ;}
+    break;
+
+  case 69:
+#line 258 "pars0grm.y"
+    { (yyval) = pars_procedure_call((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+    break;
+
+  case 70:
+#line 262 "pars0grm.y"
+    { (yyval) = &pars_replstr_token; ;}
+    break;
+
+  case 71:
+#line 263 "pars0grm.y"
+    { (yyval) = &pars_printf_token; ;}
+    break;
+
+  case 72:
+#line 264 "pars0grm.y"
+    { (yyval) = &pars_assert_token; ;}
+    break;
+
+  case 73:
+#line 268 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (3)]); ;}
+    break;
+
+  case 74:
+#line 272 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 75:
+#line 274 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 76:
+#line 278 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 77:
+#line 279 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 78:
+#line 281 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 79:
+#line 285 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 80:
+#line 286 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)]));;}
+    break;
+
+  case 81:
+#line 287 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 82:
+#line 291 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 83:
+#line 293 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+				          que_node_list_add_last(NULL,
+					    sym_tab_add_int_lit(
+						pars_sym_tab_global, 1))); ;}
+    break;
+
+  case 84:
+#line 298 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+					    que_node_list_add_last(NULL,
+						pars_func(&pars_distinct_token,
+						     que_node_list_add_last(
+								NULL, (yyvsp[(4) - (5)]))))); ;}
+    break;
+
+  case 85:
+#line 304 "pars0grm.y"
+    { (yyval) = pars_func(&pars_sum_token,
+						que_node_list_add_last(NULL,
+									(yyvsp[(3) - (4)]))); ;}
+    break;
+
+  case 86:
+#line 310 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 87:
+#line 311 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 88:
+#line 313 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 89:
+#line 317 "pars0grm.y"
+    { (yyval) = pars_select_list(&pars_star_denoter,
+								NULL); ;}
+    break;
+
+  case 90:
+#line 320 "pars0grm.y"
+    { (yyval) = pars_select_list(
+					(yyvsp[(1) - (3)]), static_cast<sym_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 91:
+#line 322 "pars0grm.y"
+    { (yyval) = pars_select_list((yyvsp[(1) - (1)]), NULL); ;}
+    break;
+
+  case 92:
+#line 326 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 93:
+#line 327 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (2)]); ;}
+    break;
+
+  case 94:
+#line 331 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 95:
+#line 333 "pars0grm.y"
+    { (yyval) = &pars_update_token; ;}
+    break;
+
+  case 96:
+#line 337 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 97:
+#line 339 "pars0grm.y"
+    { (yyval) = &pars_share_token; ;}
+    break;
+
+  case 98:
+#line 343 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 99:
+#line 344 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 100:
+#line 345 "pars0grm.y"
+    { (yyval) = &pars_desc_token; ;}
+    break;
+
+  case 101:
+#line 349 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 102:
+#line 351 "pars0grm.y"
+    { (yyval) = pars_order_by(
+					static_cast<sym_node_t*>((yyvsp[(3) - (4)])),
+					static_cast<pars_res_word_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 103:
+#line 362 "pars0grm.y"
+    { (yyval) = pars_select_statement(
+					static_cast<sel_node_t*>((yyvsp[(2) - (8)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (8)])),
+					static_cast<que_node_t*>((yyvsp[(5) - (8)])),
+					static_cast<pars_res_word_t*>((yyvsp[(6) - (8)])),
+					static_cast<pars_res_word_t*>((yyvsp[(7) - (8)])),
+					static_cast<order_node_t*>((yyvsp[(8) - (8)]))); ;}
+    break;
+
+  case 104:
+#line 373 "pars0grm.y"
+    { (yyval) = (yyvsp[(3) - (3)]); ;}
+    break;
+
+  case 105:
+#line 378 "pars0grm.y"
+    { (yyval) = pars_insert_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (5)])), (yyvsp[(4) - (5)]), NULL); ;}
+    break;
+
+  case 106:
+#line 381 "pars0grm.y"
+    { (yyval) = pars_insert_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 107:
+#line 388 "pars0grm.y"
+    { (yyval) = pars_column_assignment(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 108:
+#line 394 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 109:
+#line 396 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 110:
+#line 402 "pars0grm.y"
+    { (yyval) = (yyvsp[(4) - (4)]); ;}
+    break;
+
+  case 111:
+#line 408 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(
+					FALSE,
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					static_cast<col_assign_node_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 112:
+#line 416 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 113:
+#line 424 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+					NULL); ;}
+    break;
+
+  case 114:
+#line 432 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(
+					TRUE,
+					static_cast<sym_node_t*>((yyvsp[(3) - (3)])), NULL); ;}
+    break;
+
+  case 115:
+#line 439 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 116:
+#line 447 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+					NULL); ;}
+    break;
+
+  case 117:
+#line 455 "pars0grm.y"
+    { (yyval) = pars_row_printf_statement(
+					static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 118:
+#line 461 "pars0grm.y"
+    { (yyval) = pars_assignment_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 119:
+#line 469 "pars0grm.y"
+    { (yyval) = pars_elsif_element((yyvsp[(2) - (4)]), (yyvsp[(4) - (4)])); ;}
+    break;
+
+  case 120:
+#line 473 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 121:
+#line 475 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+    break;
+
+  case 122:
+#line 479 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 123:
+#line 481 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (2)]); ;}
+    break;
+
+  case 124:
+#line 482 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 125:
+#line 489 "pars0grm.y"
+    { (yyval) = pars_if_statement((yyvsp[(2) - (7)]), (yyvsp[(4) - (7)]), (yyvsp[(5) - (7)])); ;}
+    break;
+
+  case 126:
+#line 495 "pars0grm.y"
+    { (yyval) = pars_while_statement((yyvsp[(2) - (6)]), (yyvsp[(4) - (6)])); ;}
+    break;
+
+  case 127:
+#line 503 "pars0grm.y"
+    { (yyval) = pars_for_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (10)])),
+					(yyvsp[(4) - (10)]), (yyvsp[(6) - (10)]), (yyvsp[(8) - (10)])); ;}
+    break;
+
+  case 128:
+#line 509 "pars0grm.y"
+    { (yyval) = pars_exit_statement(); ;}
+    break;
+
+  case 129:
+#line 513 "pars0grm.y"
+    { (yyval) = pars_return_statement(); ;}
+    break;
+
+  case 130:
+#line 518 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_OPEN_CURSOR,
+						static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 131:
+#line 525 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_CLOSE_CURSOR,
+						static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 132:
+#line 532 "pars0grm.y"
+    { (yyval) = pars_fetch_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (4)])), NULL); ;}
+    break;
+
+  case 133:
+#line 536 "pars0grm.y"
+    { (yyval) = pars_fetch_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					NULL,
+					static_cast<sym_node_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 134:
+#line 544 "pars0grm.y"
+    { (yyval) = pars_column_def(
+					static_cast<sym_node_t*>((yyvsp[(1) - (5)])),
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (5)])),
+					static_cast<sym_node_t*>((yyvsp[(3) - (5)])),
+					(yyvsp[(4) - (5)]), (yyvsp[(5) - (5)])); ;}
+    break;
+
+  case 135:
+#line 552 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 136:
+#line 554 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 137:
+#line 558 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 138:
+#line 560 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (3)]); ;}
+    break;
+
+  case 139:
+#line 564 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 140:
+#line 566 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 141:
+#line 571 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 142:
+#line 573 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 143:
+#line 578 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 144:
+#line 580 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 145:
+#line 585 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 146:
+#line 586 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 147:
+#line 591 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 148:
+#line 593 "pars0grm.y"
+    { (yyval) = (yyvsp[(3) - (3)]); ;}
+    break;
+
+  case 149:
+#line 600 "pars0grm.y"
+    { (yyval) = pars_create_table(
+					static_cast<sym_node_t*>((yyvsp[(3) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(5) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(8) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(9) - (9)])), (yyvsp[(7) - (9)])); ;}
+    break;
+
+  case 150:
+#line 608 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 151:
+#line 610 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 152:
+#line 614 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 153:
+#line 615 "pars0grm.y"
+    { (yyval) = &pars_unique_token; ;}
+    break;
+
+  case 154:
+#line 619 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 155:
+#line 620 "pars0grm.y"
+    { (yyval) = &pars_clustered_token; ;}
+    break;
+
+  case 156:
+#line 629 "pars0grm.y"
+    { (yyval) = pars_create_index(
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (10)])),
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(5) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(7) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(9) - (10)]))); ;}
+    break;
+
+  case 157:
+#line 638 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 158:
+#line 639 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 159:
+#line 644 "pars0grm.y"
+    { (yyval) = pars_commit_statement(); ;}
+    break;
+
+  case 160:
+#line 649 "pars0grm.y"
+    { (yyval) = pars_rollback_statement(); ;}
+    break;
+
+  case 161:
+#line 653 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 162:
+#line 654 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 163:
+#line 655 "pars0grm.y"
+    { (yyval) = &pars_bigint_token; ;}
+    break;
+
+  case 164:
+#line 656 "pars0grm.y"
+    { (yyval) = &pars_char_token; ;}
+    break;
+
+  case 165:
+#line 657 "pars0grm.y"
+    { (yyval) = &pars_binary_token; ;}
+    break;
+
+  case 166:
+#line 658 "pars0grm.y"
+    { (yyval) = &pars_blob_token; ;}
+    break;
+
+  case 167:
+#line 663 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					PARS_INPUT,
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 168:
+#line 668 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					PARS_OUTPUT,
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 169:
+#line 675 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 170:
+#line 676 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 171:
+#line 678 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 172:
+#line 683 "pars0grm.y"
+    { (yyval) = pars_variable_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (3)]))); ;}
+    break;
+
+  case 176:
+#line 697 "pars0grm.y"
+    { (yyval) = pars_cursor_declaration(
+					static_cast<sym_node_t*>((yyvsp[(3) - (6)])),
+					static_cast<sel_node_t*>((yyvsp[(5) - (6)]))); ;}
+    break;
+
+  case 177:
+#line 704 "pars0grm.y"
+    { (yyval) = pars_function_declaration(
+					static_cast<sym_node_t*>((yyvsp[(3) - (4)]))); ;}
+    break;
+
+  case 183:
+#line 726 "pars0grm.y"
+    { (yyval) = pars_procedure_definition(
+					static_cast<sym_node_t*>((yyvsp[(2) - (11)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (11)])),
+					(yyvsp[(10) - (11)])); ;}
+    break;
+
+
+/* Line 1267 of yacc.c.  */
+#line 2826 "pars0grm.cc"
+      default: break;
+    }
+  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if ! YYERROR_VERBOSE
+      yyerror (YY_("syntax error"));
+#else
+      {
+	YYSIZE_T yysize = yysyntax_error (0, yystate, yychar);
+	if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM)
+	  {
+	    YYSIZE_T yyalloc = 2 * yysize;
+	    if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM))
+	      yyalloc = YYSTACK_ALLOC_MAXIMUM;
+	    if (yymsg != yymsgbuf)
+	      YYSTACK_FREE (yymsg);
+	    yymsg = (char*) YYSTACK_ALLOC (yyalloc);
+	    if (yymsg)
+	      yymsg_alloc = yyalloc;
+	    else
+	      {
+		yymsg = yymsgbuf;
+		yymsg_alloc = sizeof yymsgbuf;
+	      }
+	  }
+
+	if (0 < yysize && yysize <= yymsg_alloc)
+	  {
+	    (void) yysyntax_error (yymsg, yystate, yychar);
+	    yyerror (yymsg);
+	  }
+	else
+	  {
+	    yyerror (YY_("syntax error"));
+	    if (yysize != 0)
+	      goto yyexhaustedlab;
+	  }
+      }
+#endif
+    }
+
+
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse look-ahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+	{
+	  /* Return failure if at end of input.  */
+	  if (yychar == YYEOF)
+	    YYABORT;
+	}
+      else
+	{
+	  yydestruct ("Error: discarding",
+		      yytoken, &yylval);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse look-ahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+  /* Pacify compilers like GCC when the user code never invokes
+     YYERROR and the label yyerrorlab therefore never appears in user
+     code.  */
+  if (/*CONSTCOND*/ 0)
+     goto yyerrorlab;
+
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYERROR.  */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (yyn != YYPACT_NINF)
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+
+      yydestruct ("Error: popping",
+		  yystos[yystate], yyvsp);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  *++yyvsp = yylval;
+
+
+  /* Shift the error token.  */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#ifndef yyoverflow
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEOF && yychar != YYEMPTY)
+     yydestruct ("Cleanup: discarding lookahead",
+		 yytoken, &yylval);
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+		  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+#line 732 "pars0grm.y"
+
+
diff --git a/storage/xtradb/pars/pars0grm.y b/storage/xtradb/pars/pars0grm.y
index 5b4549d6d37..60913287cc4 100644
--- a/storage/xtradb/pars/pars0grm.y
+++ b/storage/xtradb/pars/pars0grm.y
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -137,6 +137,15 @@ yylex(void);
 %token PARS_LOCK_TOKEN
 %token PARS_SHARE_TOKEN
 %token PARS_MODE_TOKEN
+%token PARS_LIKE_TOKEN
+%token PARS_LIKE_TOKEN_EXACT
+%token PARS_LIKE_TOKEN_PREFIX
+%token PARS_LIKE_TOKEN_SUFFIX
+%token PARS_LIKE_TOKEN_SUBSTR
+%token PARS_TABLE_NAME_TOKEN
+%token PARS_COMPACT_TOKEN
+%token PARS_BLOCK_SIZE_TOKEN
+%token PARS_BIGINT_TOKEN
 
 %left PARS_AND_TOKEN PARS_OR_TOKEN
 %left PARS_NOT_TOKEN
@@ -201,8 +210,10 @@ exp:
 	| '-' exp %prec NEG 	{ $$ = pars_op('-', $2, NULL); }
 	| '(' exp ')'        	{ $$ = $2; }
 	| exp '=' exp		{ $$ = pars_op('=', $1, $3); }
-	| exp '<' exp		{ $$ = pars_op('<', $1, $3); }
-	| exp '>' exp		{ $$ = pars_op('>', $1, $3); }
+	| exp PARS_LIKE_TOKEN PARS_STR_LIT
+				{ $$ = pars_op(PARS_LIKE_TOKEN, $1, $3); }
+	| exp '<' exp           { $$ = pars_op('<', $1, $3); }
+	| exp '>' exp           { $$ = pars_op('>', $1, $3); }
 	| exp PARS_GE_TOKEN exp	{ $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
 	| exp PARS_LE_TOKEN exp	{ $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
 	| exp PARS_NE_TOKEN exp	{ $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
@@ -238,7 +249,8 @@ question_mark_list:
 
 stored_procedure_call:
 	'{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
-				{ $$ = pars_stored_procedure_call($2); }
+				{ $$ = pars_stored_procedure_call(
+					static_cast<sym_node_t*>($2)); }
 ;
 
 predefined_procedure_call:
@@ -257,8 +269,8 @@ user_function_call:
 ;
 
 table_list:
-	PARS_ID_TOKEN		{ $$ = que_node_list_add_last(NULL, $1); }
-	| table_list ',' PARS_ID_TOKEN
+	table_name		{ $$ = que_node_list_add_last(NULL, $1); }
+	| table_list ',' table_name
 				{ $$ = que_node_list_add_last($1, $3); }
 ;
 
@@ -305,7 +317,8 @@ select_list:
 	'*'			{ $$ = pars_select_list(&pars_star_denoter,
 								NULL); }
 	| select_item_list PARS_INTO_TOKEN variable_list
-				{ $$ = pars_select_list($1, $3); }
+				{ $$ = pars_select_list(
+					$1, static_cast<sym_node_t*>($3)); }
 	| select_item_list	{ $$ = pars_select_list($1, NULL); }
 ;
 
@@ -335,7 +348,9 @@ order_direction:
 order_by_clause:
 	/* Nothing */		{ $$ = NULL; }
 	| PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
-				{ $$ = pars_order_by($3, $4); }
+				{ $$ = pars_order_by(
+					static_cast<sym_node_t*>($3),
+					static_cast<pars_res_word_t*>($4)); }
 ;
 
 select_statement:
@@ -344,24 +359,35 @@ select_statement:
 	search_condition
 	for_update_clause
 	lock_shared_clause
-	order_by_clause		{ $$ = pars_select_statement($2, $4, $5,
-								$6, $7, $8); }
+	order_by_clause		{ $$ = pars_select_statement(
+					static_cast<sel_node_t*>($2),
+					static_cast<sym_node_t*>($4),
+					static_cast<que_node_t*>($5),
+					static_cast<pars_res_word_t*>($6),
+					static_cast<pars_res_word_t*>($7),
+					static_cast<order_node_t*>($8)); }
 ;
 
 insert_statement_start:
 	PARS_INSERT_TOKEN PARS_INTO_TOKEN
-	PARS_ID_TOKEN		{ $$ = $3; }
+	table_name		{ $$ = $3; }
 ;
 
 insert_statement:
 	insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
-				{ $$ = pars_insert_statement($1, $4, NULL); }
+				{ $$ = pars_insert_statement(
+					static_cast<sym_node_t*>($1), $4, NULL); }
 	| insert_statement_start select_statement
-				{ $$ = pars_insert_statement($1, NULL, $2); }
+				{ $$ = pars_insert_statement(
+					static_cast<sym_node_t*>($1),
+					NULL,
+					static_cast<sel_node_t*>($2)); }
 ;
 
 column_assignment:
-	PARS_ID_TOKEN '=' exp	{ $$ = pars_column_assignment($1, $3); }
+	PARS_ID_TOKEN '=' exp	{ $$ = pars_column_assignment(
+					static_cast<sym_node_t*>($1),
+					static_cast<que_node_t*>($3)); }
 ;
 
 column_assignment_list:
@@ -377,46 +403,64 @@ cursor_positioned:
 ;
 
 update_statement_start:
-	PARS_UPDATE_TOKEN PARS_ID_TOKEN
+	PARS_UPDATE_TOKEN table_name
 	PARS_SET_TOKEN
-	column_assignment_list	{ $$ = pars_update_statement_start(FALSE,
-								$2, $4); }
+	column_assignment_list	{ $$ = pars_update_statement_start(
+					FALSE,
+					static_cast<sym_node_t*>($2),
+					static_cast<col_assign_node_t*>($4)); }
 ;
 
 update_statement_searched:
 	update_statement_start
-	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+	search_condition	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					NULL,
+					static_cast<que_node_t*>($2)); }
 ;
 
 update_statement_positioned:
 	update_statement_start
-	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+	cursor_positioned	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					static_cast<sym_node_t*>($2),
+					NULL); }
 ;
 
 delete_statement_start:
 	PARS_DELETE_TOKEN PARS_FROM_TOKEN
-	PARS_ID_TOKEN		{ $$ = pars_update_statement_start(TRUE,
-								$3, NULL); }
+	table_name		{ $$ = pars_update_statement_start(
+					TRUE,
+					static_cast<sym_node_t*>($3), NULL); }
 ;
 
 delete_statement_searched:
 	delete_statement_start
-	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+	search_condition	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					NULL,
+					static_cast<que_node_t*>($2)); }
 ;
 
 delete_statement_positioned:
 	delete_statement_start
-	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+	cursor_positioned	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					static_cast<sym_node_t*>($2),
+					NULL); }
 ;
 
 row_printf_statement:
 	PARS_ROW_PRINTF_TOKEN select_statement
-				{ $$ = pars_row_printf_statement($2); }
+				{ $$ = pars_row_printf_statement(
+					static_cast<sel_node_t*>($2)); }
 ;
 
 assignment_statement:
 	PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
-				{ $$ = pars_assignment_statement($1, $3); }
+				{ $$ = pars_assignment_statement(
+					static_cast<sym_node_t*>($1),
+					static_cast<que_node_t*>($3)); }
 ;
 
 elsif_element:
@@ -456,7 +500,9 @@ for_statement:
 	exp PARS_DDOT_TOKEN exp
 	PARS_LOOP_TOKEN statement_list
 	PARS_END_TOKEN PARS_LOOP_TOKEN
-				{ $$ = pars_for_statement($2, $4, $6, $8); }
+				{ $$ = pars_for_statement(
+					static_cast<sym_node_t*>($2),
+					$4, $6, $8); }
 ;
 
 exit_statement:
@@ -470,25 +516,36 @@ return_statement:
 open_cursor_statement:
 	PARS_OPEN_TOKEN PARS_ID_TOKEN
 				{ $$ = pars_open_statement(
-						ROW_SEL_OPEN_CURSOR, $2); }
+						ROW_SEL_OPEN_CURSOR,
+						static_cast<sym_node_t*>($2)); }
 ;
 
 close_cursor_statement:
 	PARS_CLOSE_TOKEN PARS_ID_TOKEN
 				{ $$ = pars_open_statement(
-						ROW_SEL_CLOSE_CURSOR, $2); }
+						ROW_SEL_CLOSE_CURSOR,
+						static_cast<sym_node_t*>($2)); }
 ;
 
 fetch_statement:
 	PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
-				{ $$ = pars_fetch_statement($2, $4, NULL); }
+				{ $$ = pars_fetch_statement(
+					static_cast<sym_node_t*>($2),
+					static_cast<sym_node_t*>($4), NULL); }
 	| PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
-				{ $$ = pars_fetch_statement($2, NULL, $4); }
+				{ $$ = pars_fetch_statement(
+					static_cast<sym_node_t*>($2),
+					NULL,
+					static_cast<sym_node_t*>($4)); }
 ;
 
 column_def:
 	PARS_ID_TOKEN type_name	opt_column_len opt_unsigned opt_not_null
-				{ $$ = pars_column_def($1, $2, $3, $4, $5); }
+				{ $$ = pars_column_def(
+					static_cast<sym_node_t*>($1),
+					static_cast<pars_res_word_t*>($2),
+					static_cast<sym_node_t*>($3),
+					$4, $5); }
 ;
 
 column_def_list:
@@ -524,10 +581,27 @@ not_fit_in_memory:
 					/* pass any non-NULL pointer */ }
 ;
 
+compact:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_COMPACT_TOKEN	{ $$ = &pars_int_token;
+					/* pass any non-NULL pointer */ }
+;
+
+block_size:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_BLOCK_SIZE_TOKEN	'=' PARS_INT_LIT
+			{ $$ = $3; }
+;
+
 create_table:
 	PARS_CREATE_TOKEN PARS_TABLE_TOKEN
-	PARS_ID_TOKEN '(' column_def_list ')'
-	not_fit_in_memory	{ $$ = pars_create_table($3, $5, $7); }
+	table_name '(' column_def_list ')'
+	not_fit_in_memory compact block_size
+				{ $$ = pars_create_table(
+					static_cast<sym_node_t*>($3),
+					static_cast<sym_node_t*>($5),
+					static_cast<sym_node_t*>($8),
+					static_cast<sym_node_t*>($9), $7); }
 ;
 
 column_list:
@@ -550,8 +624,19 @@ create_index:
 	PARS_CREATE_TOKEN unique_def
 	clustered_def
 	PARS_INDEX_TOKEN
-	PARS_ID_TOKEN PARS_ON_TOKEN PARS_ID_TOKEN
-	'(' column_list ')'	{ $$ = pars_create_index($2, $3, $5, $7, $9); }
+	PARS_ID_TOKEN PARS_ON_TOKEN
+	table_name
+	'(' column_list ')'	{ $$ = pars_create_index(
+					static_cast<pars_res_word_t*>($2),
+					static_cast<pars_res_word_t*>($3),
+					static_cast<sym_node_t*>($5),
+					static_cast<sym_node_t*>($7),
+					static_cast<sym_node_t*>($9)); }
+;
+
+table_name:
+	PARS_ID_TOKEN		{ $$ = $1; }
+	| PARS_TABLE_NAME_TOKEN	{ $$ = $1; }
 ;
 
 commit_statement:
@@ -567,6 +652,7 @@ rollback_statement:
 type_name:
 	PARS_INT_TOKEN		{ $$ = &pars_int_token; }
 	| PARS_INTEGER_TOKEN	{ $$ = &pars_int_token; }
+	| PARS_BIGINT_TOKEN	{ $$ = &pars_bigint_token; }
 	| PARS_CHAR_TOKEN	{ $$ = &pars_char_token; }
 	| PARS_BINARY_TOKEN	{ $$ = &pars_binary_token; }
 	| PARS_BLOB_TOKEN	{ $$ = &pars_blob_token; }
@@ -574,11 +660,15 @@ type_name:
 
 parameter_declaration:
 	PARS_ID_TOKEN PARS_IN_TOKEN type_name
-				{ $$ = pars_parameter_declaration($1,
-							PARS_INPUT, $3); }
+				{ $$ = pars_parameter_declaration(
+					static_cast<sym_node_t*>($1),
+					PARS_INPUT,
+					static_cast<pars_res_word_t*>($3)); }
 	| PARS_ID_TOKEN PARS_OUT_TOKEN type_name
-				{ $$ = pars_parameter_declaration($1,
-							PARS_OUTPUT, $3); }
+				{ $$ = pars_parameter_declaration(
+					static_cast<sym_node_t*>($1),
+					PARS_OUTPUT,
+					static_cast<pars_res_word_t*>($3)); }
 ;
 
 parameter_declaration_list:
@@ -590,7 +680,9 @@ parameter_declaration_list:
 
 variable_declaration:
 	PARS_ID_TOKEN type_name ';'
-				{ $$ = pars_variable_declaration($1, $2); }
+				{ $$ = pars_variable_declaration(
+					static_cast<sym_node_t*>($1),
+					static_cast<pars_res_word_t*>($2)); }
 ;
 
 variable_declaration_list:
@@ -602,12 +694,15 @@ variable_declaration_list:
 cursor_declaration:
 	PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
 	PARS_IS_TOKEN select_statement ';'
-				{ $$ = pars_cursor_declaration($3, $5); }
+				{ $$ = pars_cursor_declaration(
+					static_cast<sym_node_t*>($3),
+					static_cast<sel_node_t*>($5)); }
 ;
 
 function_declaration:
 	PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
-				{ $$ = pars_function_declaration($3); }
+				{ $$ = pars_function_declaration(
+					static_cast<sym_node_t*>($3)); }
 ;
 
 declaration:
@@ -628,8 +723,10 @@ procedure_definition:
 	declaration_list
 	PARS_BEGIN_TOKEN
 	statement_list
-	PARS_END_TOKEN		{ $$ = pars_procedure_definition($2, $4,
-								$10); }
+	PARS_END_TOKEN		{ $$ = pars_procedure_definition(
+					static_cast<sym_node_t*>($2),
+					static_cast<sym_node_t*>($4),
+					$10); }
 ;
 
 %%
diff --git a/storage/xtradb/pars/pars0lex.l b/storage/xtradb/pars/pars0lex.l
index a79f7a9e0ca..f800410fa3f 100644
--- a/storage/xtradb/pars/pars0lex.l
+++ b/storage/xtradb/pars/pars0lex.l
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -64,7 +64,9 @@ Created 12/14/1997 Heikki Tuuri
 #define realloc(P, A)	ut_realloc(P, A)
 #define exit(A) 	ut_error
 
-#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+/* Note: We cast &result to int* from yy_size_t* */
+#define YY_INPUT(buf, result, max_size) \
+	pars_get_lex_chars(buf, (int*) &result, max_size)
 
 /* String buffer for removing quotes */
 static ulint	stringbuf_len_alloc = 0; /* Allocated length */
@@ -79,7 +81,7 @@ string_append(
 	ulint		len)	/*!< in: length of the string */
 {
 	if (stringbuf == NULL) {
-		stringbuf = malloc(1);
+		stringbuf = static_cast<char*>(malloc(1));
 		stringbuf_len_alloc = 1;
 	}
 
@@ -87,7 +89,9 @@ string_append(
 		while (stringbuf_len + len > stringbuf_len_alloc) {
 			stringbuf_len_alloc <<= 1;
 		}
-		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+
+		stringbuf = static_cast<char*>(
+			realloc(stringbuf, stringbuf_len_alloc));
 	}
 
 	memcpy(stringbuf + stringbuf_len, str, len);
@@ -96,8 +100,9 @@ string_append(
 
 %}
 
-DIGIT	[0-9]
-ID	[a-z_A-Z][a-z_A-Z0-9]*
+DIGIT		[0-9]
+ID		[a-z_A-Z][a-z_A-Z0-9]*
+TABLE_NAME	[a-z_A-Z][@a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]*
 BOUND_LIT	\:[a-z_A-Z0-9]+
 BOUND_ID	\$[a-z_A-Z0-9]+
 
@@ -249,27 +254,27 @@ In the state 'id', only two actions are possible (defined below). */
 }
 
 "BINARY"	{
-	 		return(PARS_BINARY_TOKEN);
+			return(PARS_BINARY_TOKEN);
 }
 
 "BLOB"		{
-	 		return(PARS_BLOB_TOKEN);
+			return(PARS_BLOB_TOKEN);
 }
 
 "INT"		{
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 
 "INTEGER"	{
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 
 "FLOAT"		{
-	 		return(PARS_FLOAT_TOKEN);
+			return(PARS_FLOAT_TOKEN);
 }
 
 "CHAR"		{
-	 		return(PARS_CHAR_TOKEN);
+			return(PARS_CHAR_TOKEN);
 }
 
 "IS"		{
@@ -400,16 +405,24 @@ In the state 'id', only two actions are possible (defined below). */
 			return(PARS_TABLE_TOKEN);
 }
 
+"COMPACT"	{
+			return(PARS_COMPACT_TOKEN);
+}
+
+"BLOCK_SIZE"	{
+			return(PARS_BLOCK_SIZE_TOKEN);
+}
+
 "INDEX"		{
-	 		return(PARS_INDEX_TOKEN);
+			return(PARS_INDEX_TOKEN);
 }
 
 "UNIQUE"	{
-	 		return(PARS_UNIQUE_TOKEN);
+			return(PARS_UNIQUE_TOKEN);
 }
 
 "CLUSTERED"	{
-	 		return(PARS_CLUSTERED_TOKEN);
+			return(PARS_CLUSTERED_TOKEN);
 }
 
 "DOES_NOT_FIT_IN_MEMORY"	{
@@ -417,7 +430,7 @@ In the state 'id', only two actions are possible (defined below). */
 }
 
 "ON"		{
-	 		return(PARS_ON_TOKEN);
+			return(PARS_ON_TOKEN);
 }
 
 "DECLARE"	{
@@ -540,13 +553,28 @@ In the state 'id', only two actions are possible (defined below). */
 			return(PARS_MODE_TOKEN);
 }
 
+"LIKE"  {
+                        return(PARS_LIKE_TOKEN);
+}
+
+"BIGINT"	{
+			return(PARS_BIGINT_TOKEN);
+}
+
 {ID}		{
 			yylval = sym_tab_add_id(pars_sym_tab_global,
-							(byte*)yytext,
+							(byte*) yytext,
 							ut_strlen(yytext));
 			return(PARS_ID_TOKEN);
 }
 
+{TABLE_NAME}	{
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_TABLE_NAME_TOKEN);
+}
+
 ".."		{
 			return(PARS_DDOT_TOKEN);
 }
diff --git a/storage/xtradb/pars/pars0opt.c b/storage/xtradb/pars/pars0opt.cc
index 7f98e95ac3f..cbed2b39eeb 100644
--- a/storage/xtradb/pars/pars0opt.c
+++ b/storage/xtradb/pars/pars0opt.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file pars/pars0opt.c
+@file pars/pars0opt.cc
 Simple SQL optimizer
 
 Created 12/21/1997 Heikki Tuuri
@@ -68,6 +68,7 @@ opt_invert_cmp_op(
 	} else if (op == PARS_GE_TOKEN) {
 		return(PARS_LE_TOKEN);
 	} else {
+		/* TODO: LIKE operator */
 		ut_error;
 	}
 
@@ -96,7 +97,7 @@ opt_check_exp_determined_before(
 	ut_ad(exp && sel_node);
 
 	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
-		func_node = exp;
+		func_node = static_cast<func_node_t*>(exp);
 
 		arg = func_node->args;
 
@@ -114,7 +115,7 @@ opt_check_exp_determined_before(
 
 	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
 
-	sym_node = exp;
+	sym_node = static_cast<sym_node_t*>(exp);
 
 	if (sym_node->token_type != SYM_COLUMN) {
 
@@ -165,11 +166,18 @@ opt_look_for_col_in_comparison_before(
 	     || (search_cond->func == '>')
 	     || (search_cond->func == '=')
 	     || (search_cond->func == PARS_GE_TOKEN)
-	     || (search_cond->func == PARS_LE_TOKEN));
+	     || (search_cond->func == PARS_LE_TOKEN)
+	     || (search_cond->func == PARS_LIKE_TOKEN_EXACT)
+	     || (search_cond->func == PARS_LIKE_TOKEN_PREFIX)
+	     || (search_cond->func == PARS_LIKE_TOKEN_SUFFIX)
+	     || (search_cond->func == PARS_LIKE_TOKEN_SUBSTR));
 
 	table = sel_node_get_nth_plan(sel_node, nth_table)->table;
 
-	if ((cmp_type == OPT_EQUAL) && (search_cond->func != '=')) {
+	if ((cmp_type == OPT_EQUAL)
+	    && (search_cond->func != '=')
+	    && (search_cond->func != PARS_LIKE_TOKEN_EXACT)
+            && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)) {
 
 		return(NULL);
 
@@ -177,7 +185,9 @@ opt_look_for_col_in_comparison_before(
 		   && (search_cond->func != '<')
 		   && (search_cond->func != '>')
 		   && (search_cond->func != PARS_GE_TOKEN)
-		   && (search_cond->func != PARS_LE_TOKEN)) {
+		   && (search_cond->func != PARS_LE_TOKEN)
+		   && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)
+                   && (search_cond->func != PARS_LIKE_TOKEN_SUFFIX)) {
 
 		return(NULL);
 	}
@@ -185,7 +195,7 @@ opt_look_for_col_in_comparison_before(
 	arg = search_cond->args;
 
 	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
-		sym_node = arg;
+		sym_node = static_cast<sym_node_t*>(arg);
 
 		if ((sym_node->token_type == SYM_COLUMN)
 		    && (sym_node->table == table)
@@ -211,7 +221,7 @@ opt_look_for_col_in_comparison_before(
 	arg = que_node_get_next(arg);
 
 	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
-		sym_node = arg;
+		sym_node = static_cast<sym_node_t*>(arg);
 
 		if ((sym_node->token_type == SYM_COLUMN)
 		    && (sym_node->table == table)
@@ -262,7 +272,7 @@ opt_look_for_col_in_cond_before(
 	ut_a(search_cond->func != PARS_NOT_TOKEN);
 
 	if (search_cond->func == PARS_AND_TOKEN) {
-		new_cond = search_cond->args;
+		new_cond = static_cast<func_node_t*>(search_cond->args);
 
 		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
 						      new_cond, sel_node,
@@ -272,7 +282,8 @@ opt_look_for_col_in_cond_before(
 			return(exp);
 		}
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
 						      new_cond, sel_node,
@@ -332,6 +343,12 @@ opt_calc_index_goodness(
 	ulint		op;
 	ulint		j;
 
+	/* At least for now, queries done through InnoDB's own SQL parser use
+	neither FTS indexes nor indexes under online DDL: goodness is 0. */
+	if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+		return(0);
+	}
+
 	goodness = 0;
 
 	/* Note that as higher level node pointers in the B-tree contain
@@ -346,7 +363,8 @@ opt_calc_index_goodness(
 		col_no = dict_index_get_nth_col_no(index, j);
 
 		exp = opt_look_for_col_in_cond_before(
-			OPT_EQUAL, col_no, sel_node->search_cond,
+			OPT_EQUAL, col_no,
+			static_cast<func_node_t*>(sel_node->search_cond),
 			sel_node, nth_table, &op);
 		if (exp) {
 			/* The value for this column is exactly known already
@@ -359,7 +377,9 @@ opt_calc_index_goodness(
 			/* Look for non-equality comparisons */
 
 			exp = opt_look_for_col_in_cond_before(
-				OPT_COMPARISON, col_no, sel_node->search_cond,
+				OPT_COMPARISON, col_no,
+				static_cast<func_node_t*>(
+					sel_node->search_cond),
 				sel_node, nth_table, &op);
 			if (exp) {
 				index_plan[j] = exp;
@@ -380,7 +400,7 @@ opt_calc_index_goodness(
 		}
 	}
 
-	/* We have to test for goodness here, as last_op may note be set */
+	/* We have to test for goodness here, as last_op may not be set */
 	if (goodness && dict_index_is_clust(index)) {
 
 		goodness++;
@@ -413,7 +433,12 @@ opt_op_to_search_mode(
 			ascending order */
 	ulint	op)	/*!< in: operator '=', PARS_GE_TOKEN, ... */
 {
-	if (op == '=') {
+	if (op == '='
+	    || op == PARS_LIKE_TOKEN_EXACT
+	    || op == PARS_LIKE_TOKEN_PREFIX
+	    || op == PARS_LIKE_TOKEN_SUFFIX
+	    || op == PARS_LIKE_TOKEN_SUBSTR) {
+
 		if (asc) {
 			return(PAGE_CUR_GE);
 		} else {
@@ -583,12 +608,18 @@ opt_search_plan_for_table(
 					    n_fields);
 		dict_index_copy_types(plan->tuple, plan->index, n_fields);
 
-		plan->tuple_exps = mem_heap_alloc(pars_sym_tab_global->heap,
-						  n_fields * sizeof(void*));
+		plan->tuple_exps = static_cast<que_node_t**>(
+			mem_heap_alloc(
+				pars_sym_tab_global->heap,
+				n_fields * sizeof(void*)));
 
 		ut_memcpy(plan->tuple_exps, best_index_plan,
 			  n_fields * sizeof(void*));
-		if (best_last_op == '=') {
+		if (best_last_op == '='
+		    || best_last_op == PARS_LIKE_TOKEN_EXACT
+                    || best_last_op == PARS_LIKE_TOKEN_PREFIX
+                    || best_last_op == PARS_LIKE_TOKEN_SUFFIX
+                    || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
 			plan->n_exact_match = n_fields;
 		} else {
 			plan->n_exact_match = n_fields - 1;
@@ -717,7 +748,7 @@ opt_find_test_conds(
 					conditions or NULL */
 {
 	func_node_t*	new_cond;
-	ulint		class;
+	ulint		fclass;
 	plan_t*		plan;
 
 	if (cond == NULL) {
@@ -726,11 +757,12 @@ opt_find_test_conds(
 	}
 
 	if (cond->func == PARS_AND_TOKEN) {
-		new_cond = cond->args;
+		new_cond = static_cast<func_node_t*>(cond->args);
 
 		opt_find_test_conds(sel_node, i, new_cond);
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		opt_find_test_conds(sel_node, i, new_cond);
 
@@ -739,12 +771,12 @@ opt_find_test_conds(
 
 	plan = sel_node_get_nth_plan(sel_node, i);
 
-	class = opt_classify_comparison(sel_node, i, cond);
+	fclass = opt_classify_comparison(sel_node, i, cond);
 
-	if (class == OPT_END_COND) {
+	if (fclass == OPT_END_COND) {
 		UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond);
 
-	} else if (class == OPT_TEST_COND) {
+	} else if (fclass == OPT_TEST_COND) {
 		UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond);
 
 	}
@@ -772,7 +804,7 @@ opt_normalize_cmp_conds(
 
 		if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) {
 
-			sym_node = arg2;
+			sym_node = static_cast<sym_node_t*>(arg2);
 
 			if ((sym_node->token_type == SYM_COLUMN)
 			    && (sym_node->table == table)) {
@@ -812,7 +844,10 @@ opt_determine_and_normalize_test_conds(
 
 	/* Recursively go through the conjuncts and classify them */
 
-	opt_find_test_conds(sel_node, i, sel_node->search_cond);
+	opt_find_test_conds(
+		sel_node,
+		i,
+		static_cast<func_node_t*>(sel_node->search_cond));
 
 	opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
 				plan->table);
@@ -852,14 +887,14 @@ opt_find_all_cols(
 	}
 
 	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
-		func_node = exp;
+		func_node = static_cast<func_node_t*>(exp);
 
-		arg = func_node->args;
+		for (arg = func_node->args;
+		     arg != 0;
+		     arg = que_node_get_next(arg)) {
 
-		while (arg) {
-			opt_find_all_cols(copy_val, index, col_list, plan,
-					  arg);
-			arg = que_node_get_next(arg);
+			opt_find_all_cols(
+				copy_val, index, col_list, plan, arg);
 		}
 
 		return;
@@ -867,7 +902,7 @@ opt_find_all_cols(
 
 	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
 
-	sym_node = exp;
+	sym_node = static_cast<sym_node_t*>(exp);
 
 	if (sym_node->token_type != SYM_COLUMN) {
 
@@ -953,11 +988,12 @@ opt_find_copy_cols(
 	ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
 
 	if (search_cond->func == PARS_AND_TOKEN) {
-		new_cond = search_cond->args;
+		new_cond = static_cast<func_node_t*>(search_cond->args);
 
 		opt_find_copy_cols(sel_node, i, new_cond);
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		opt_find_copy_cols(sel_node, i, new_cond);
 
@@ -1004,21 +1040,23 @@ opt_classify_cols(
 	/* All select list columns should be copied: therefore TRUE as the
 	first argument */
 
-	exp = sel_node->select_list;
+	for (exp = sel_node->select_list;
+	     exp != 0;
+	     exp = que_node_get_next(exp)) {
 
-	while (exp) {
-		opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
-				  exp);
-		exp = que_node_get_next(exp);
+		opt_find_all_cols(
+			TRUE, plan->index, &(plan->columns), plan, exp);
 	}
 
-	opt_find_copy_cols(sel_node, i, sel_node->search_cond);
+	opt_find_copy_cols(
+		sel_node, i, static_cast<func_node_t*>(sel_node->search_cond));
 
 	/* All remaining columns in the search condition are temporary
 	columns: therefore FALSE */
 
-	opt_find_all_cols(FALSE, plan->index, &(plan->columns), plan,
-			  sel_node->search_cond);
+	opt_find_all_cols(
+		FALSE, plan->index, &plan->columns, plan,
+		static_cast<func_node_t*>(sel_node->search_cond));
 }
 
 /*******************************************************************//**
@@ -1068,7 +1106,8 @@ opt_clust_access(
 
 	dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
 
-	plan->clust_map = mem_heap_alloc(heap, n_fields * sizeof(ulint));
+	plan->clust_map = static_cast<ulint*>(
+		mem_heap_alloc(heap, n_fields * sizeof(ulint)));
 
 	for (i = 0; i < n_fields; i++) {
 		pos = dict_index_get_nth_field_pos(index, clust_index, i);
@@ -1082,7 +1121,7 @@ opt_clust_access(
 		    || dict_index_get_nth_field(clust_index, i)
 		    ->prefix_len != 0) {
 			fprintf(stderr,
-				"InnoDB: Error in pars0opt.c:"
+				"InnoDB: Error in pars0opt.cc:"
 				" table %s has prefix_len != 0\n",
 				index->table_name);
 		}
@@ -1108,8 +1147,10 @@ opt_search_plan(
 	order_node_t*	order_by;
 	ulint		i;
 
-	sel_node->plans = mem_heap_alloc(pars_sym_tab_global->heap,
-					 sel_node->n_tables * sizeof(plan_t));
+	sel_node->plans = static_cast<plan_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap,
+			sel_node->n_tables * sizeof(plan_t)));
 
 	/* Analyze the search condition to find out what we know at each
 	join stage about the conditions that the columns of a table should
@@ -1138,7 +1179,8 @@ opt_search_plan(
 
 		opt_determine_and_normalize_test_conds(sel_node, i);
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 
 	table_node = sel_node->table_list;
@@ -1155,7 +1197,8 @@ opt_search_plan(
 
 		opt_clust_access(sel_node, i);
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 
 	/* Check that the plan obeys a possible order-by clause: if not,
diff --git a/storage/xtradb/pars/pars0pars.c b/storage/xtradb/pars/pars0pars.cc
index 343a1130d0c..e0bc00fad0d 100644
--- a/storage/xtradb/pars/pars0pars.c
+++ b/storage/xtradb/pars/pars0pars.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+Fifth Floor, Boston, MA 02110-1301 USA
 
 *****************************************************************************/
 
@@ -81,6 +81,7 @@ UNIV_INTERN pars_res_word_t	pars_distinct_token = {PARS_DISTINCT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_binary_token = {PARS_BINARY_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_blob_token = {PARS_BLOB_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_int_token = {PARS_INT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_bigint_token = {PARS_BIGINT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_char_token = {PARS_CHAR_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_float_token = {PARS_FLOAT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_update_token = {PARS_UPDATE_TOKEN};
@@ -95,6 +96,95 @@ UNIV_INTERN pars_res_word_t	pars_clustered_token = {PARS_CLUSTERED_TOKEN};
 /** Global variable used to denote the '*' in SELECT * FROM.. */
 UNIV_INTERN ulint	pars_star_denoter	= 12345678;
 
+/********************************************************************
+Look up a user function by exact name (linear scan of info->funcs).*/
+UNIV_INLINE
+pars_user_func_t*
+pars_info_lookup_user_func(
+/*=======================*/
+					/* out: matching user func, or NULL
+					if info/info->funcs is NULL or no match */
+	pars_info_t*		info,	/* in: info struct, may be NULL */
+	const char*		name)	/* in: function name to find */
+{
+	if (info && info->funcs) {
+		ulint		i;
+		ib_vector_t*	vec = info->funcs;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_user_func_t*	puf;
+
+			puf = static_cast<pars_user_func_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(puf->name, name) == 0) {
+				return(puf);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/********************************************************************
+Look up a bound identifier by exact name (linear scan of info->bound_ids).*/
+UNIV_INLINE
+pars_bound_id_t*
+pars_info_lookup_bound_id(
+/*======================*/
+					/* out: bound id, or NULL if
+					not found */
+	pars_info_t*		info,	/* in: info struct, may be NULL */
+	const char*		name)	/* in: bound id name to find */
+{
+	if (info && info->bound_ids) {
+		ulint		i;
+		ib_vector_t*	vec = info->bound_ids;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_bound_id_t*	bid;
+
+		       	bid = static_cast<pars_bound_id_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(bid->name, name) == 0) {
+				return(bid);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/********************************************************************
+Look up a bound literal by exact name (linear scan of info->bound_lits).*/
+UNIV_INLINE
+pars_bound_lit_t*
+pars_info_lookup_bound_lit(
+/*=======================*/
+					/* out: bound literal, or NULL if
+					not found */
+	pars_info_t*		info,	/* in: info struct, may be NULL */
+	const char*		name)	/* in: bound literal name to find */
+{
+	if (info && info->bound_lits) {
+		ulint		i;
+		ib_vector_t*	vec = info->bound_lits;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_bound_lit_t*	pbl;
+
+			pbl = static_cast<pars_bound_lit_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(pbl->name, name) == 0) {
+				return(pbl);
+			}
+		}
+	}
+
+	return(NULL);
+}
 
 /*********************************************************************//**
 Determines the class of a function code.
@@ -153,7 +243,8 @@ pars_func_low(
 {
 	func_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t));
+	node = static_cast<func_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t)));
 
 	node->common.type = QUE_NODE_FUNC;
 	dfield_set_data(&(node->common.val), NULL, 0);
@@ -161,7 +252,7 @@ pars_func_low(
 
 	node->func = func;
 
-	node->class = pars_func_get_class(func);
+	node->fclass = pars_func_get_class(func);
 
 	node->args = arg;
 
@@ -180,9 +271,183 @@ pars_func(
 	que_node_t*	res_word,/*!< in: function name reserved word */
 	que_node_t*	arg)	/*!< in: first argument in the argument list */
 {
-	return(pars_func_low(((pars_res_word_t*)res_word)->code, arg));
+	return(pars_func_low(((pars_res_word_t*) res_word)->code, arg));
+}
+
+/*************************************************************************
+Rebind a LIKE search string and classify it by its leading/trailing '%'.
+NOTE: We ignore any '%' characters embedded within the search string.*/
+
+int
+pars_like_rebind(
+/*=============*/
+				/* out: PARS_LIKE_TOKEN_* code for the pattern */
+	sym_node_t*	node,	/* in: The search string node.*/
+	const byte*	ptr,	/* in: literal to (re) bind */
+	ulint		ptr_len)/* in: length of literal to (re) bind*/
+{
+	dtype_t*	dtype;
+	dfield_t*	dfield;
+	ib_like_t	op_check;
+	sym_node_t*	like_node;
+	sym_node_t*	str_node = NULL;
+	ib_like_t	op = IB_LIKE_EXACT;
+	int		func = PARS_LIKE_TOKEN_EXACT;
+
+	/* Is this a STRING% ? (the length check avoids reading ptr[-1]) */
+	if (ptr_len > 0 && ptr[ptr_len - 1] == '%') {
+		op = IB_LIKE_PREFIX;
+	}
+
+	/* Is this a '%STRING' or %STRING% ? A lone "%" stays a PREFIX,
+	otherwise ptr_len - 2 below would wrap around. */
+	if (ptr_len > 1 && *ptr == '%') {
+		op = (op == IB_LIKE_PREFIX) ? IB_LIKE_SUBSTR : IB_LIKE_SUFFIX;
+	}
+
+	if (node->like_node == NULL) {
+		/* Add the LIKE operator info node to the node list.
+		This will be used during the comparison phase to determine
+		how to match.*/
+		like_node = sym_tab_add_int_lit(node->sym_table, op);
+		que_node_list_add_last(NULL, like_node);
+		node->like_node = like_node;
+		str_node = sym_tab_add_str_lit(node->sym_table, ptr, ptr_len);
+		que_node_list_add_last(like_node, str_node);
+	} else {
+		like_node = node->like_node;
+
+		/* Change the value of the string in the existing
+		string node of like node */
+		str_node = static_cast<sym_node_t*>(
+			que_node_list_get_last(like_node));
+
+		/* Must find the string node */
+		ut_a(str_node);
+		ut_a(str_node != like_node);
+		ut_a(str_node->token_type == SYM_LIT);
+
+		dfield = que_node_get_val(str_node);
+		dfield_set_data(dfield, ptr, ptr_len);
+	}
+
+	dfield = que_node_get_val(like_node);
+	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_INT);
+	op_check = static_cast<ib_like_t>(
+		mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))));
+
+	switch (op_check) {
+	case	IB_LIKE_PREFIX:
+	case	IB_LIKE_SUFFIX:
+	case	IB_LIKE_SUBSTR:
+	case	IB_LIKE_EXACT:
+		break;
+
+	default:
+		ut_error;
+	}
+
+	mach_write_to_4(static_cast<byte*>(dfield_get_data(dfield)), op);
+
+	dfield = que_node_get_val(node);
+
+	/* Adjust the length of the search value so the '%' is not
+	visible. Then create and add a search string node to the
+	search value node. Searching for %SUFFIX and %SUBSTR% requires
+	a full table scan and so we set the search value to ''.
+	For PREFIX% we simply remove the trailing '%'.*/
+
+	switch (op) {
+	case	IB_LIKE_EXACT:
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr, ptr_len);
+		break;
+
+	case	IB_LIKE_PREFIX:
+		func = PARS_LIKE_TOKEN_PREFIX;
+
+		/* Modify the original node */
+		dfield_set_len(dfield, ptr_len - 1);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr, ptr_len - 1);
+		break;
+
+	case	IB_LIKE_SUFFIX:
+		func = PARS_LIKE_TOKEN_SUFFIX;
+
+		/* Modify the original node */
+		/* Make it an '' empty string */
+		dfield_set_len(dfield, 0);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr + 1, ptr_len - 1);
+		break;
+
+	case	IB_LIKE_SUBSTR:
+		func = PARS_LIKE_TOKEN_SUBSTR;
+
+		/* Modify the original node */
+		/* Make it an '' empty string */
+		dfield_set_len(dfield, 0);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr + 1, ptr_len - 2);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(func);
 }
 
+/*************************************************************************
+Classifies the pattern of a LIKE comparison via pars_like_rebind(). */
+static
+int
+pars_like_op(
+/*=========*/
+				/* out: PARS_LIKE_TOKEN_* code for the pattern */
+	que_node_t*	arg)	/* in: LIKE comparison string.*/
+{
+	char*		ptr;
+	ulint		ptr_len;
+	int		func = PARS_LIKE_TOKEN_EXACT;
+	dfield_t*	dfield = que_node_get_val(arg);
+	dtype_t*	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+	     || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+	ptr = static_cast<char*>(dfield_get_data(dfield));
+	ptr_len = strlen(ptr);	/* assumes NUL-terminated data - TODO confirm */
+
+	if (ptr_len) {	/* an empty pattern stays PARS_LIKE_TOKEN_EXACT */
+
+		func = pars_like_rebind(
+			static_cast<sym_node_t*>(arg), (byte*) ptr, ptr_len);
+	}
+
+	return(func);
+}
 /*********************************************************************//**
 Parses an operator expression.
 @return	own: function node in a query tree */
@@ -201,6 +466,20 @@ pars_op(
 		que_node_list_add_last(arg1, arg2);
 	}
 
+	/* We need to parse the string and determine whether it's a
+	PREFIX, SUFFIX or SUBSTRING comparison */
+	if (func == PARS_LIKE_TOKEN) {
+
+		ut_a(que_node_get_type(arg2) == QUE_NODE_SYMBOL);
+
+		func = pars_like_op(arg2);
+
+		ut_a(func == PARS_LIKE_TOKEN_EXACT
+		     || func == PARS_LIKE_TOKEN_PREFIX
+		     || func == PARS_LIKE_TOKEN_SUFFIX
+		     || func == PARS_LIKE_TOKEN_SUBSTR);
+	}
+
 	return(pars_func_low(func, arg1));
 }
 
@@ -216,7 +495,9 @@ pars_order_by(
 {
 	order_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(order_node_t));
+	node = static_cast<order_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(order_node_t)));
 
 	node->common.type = QUE_NODE_ORDER;
 
@@ -339,6 +620,14 @@ pars_resolve_func_data_type(
 		dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
 		break;
 
+	case PARS_LIKE_TOKEN_EXACT:
+	case PARS_LIKE_TOKEN_PREFIX:
+	case PARS_LIKE_TOKEN_SUFFIX:
+	case PARS_LIKE_TOKEN_SUBSTR:
+		dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+			  DATA_ENGLISH, 0);
+		break;
+
 	default:
 		ut_error;
 	}
@@ -365,7 +654,7 @@ pars_resolve_exp_variables_and_types(
 	ut_a(exp_node);
 
 	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
-		func_node = exp_node;
+		func_node = static_cast<func_node_t*>(exp_node);
 
 		arg = func_node->args;
 
@@ -382,7 +671,7 @@ pars_resolve_exp_variables_and_types(
 
 	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
 
-	sym_node = exp_node;
+	sym_node = static_cast<sym_node_t*>(exp_node);
 
 	if (sym_node->resolved) {
 
@@ -473,7 +762,7 @@ pars_resolve_exp_columns(
 	ut_a(exp_node);
 
 	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
-		func_node = exp_node;
+		func_node = static_cast<func_node_t*>(exp_node);
 
 		arg = func_node->args;
 
@@ -488,7 +777,7 @@ pars_resolve_exp_columns(
 
 	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
 
-	sym_node = exp_node;
+	sym_node = static_cast<sym_node_t*>(exp_node);
 
 	if (sym_node->resolved) {
 
@@ -530,7 +819,7 @@ pars_resolve_exp_columns(
 			}
 		}
 
-		t_node = que_node_get_next(t_node);
+		t_node = static_cast<sym_node_t*>(que_node_get_next(t_node));
 	}
 }
 
@@ -559,19 +848,22 @@ pars_retrieve_table_def(
 /*====================*/
 	sym_node_t*	sym_node)	/*!< in: table node */
 {
-	const char*	table_name;
-
 	ut_a(sym_node);
 	ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
 
-	sym_node->resolved = TRUE;
-	sym_node->token_type = SYM_TABLE;
+	/* Open the table only if it is not already opened. */
+	if (sym_node->token_type != SYM_TABLE_REF_COUNTED) {
 
-	table_name = (const char*) sym_node->name;
+		ut_a(sym_node->table == NULL);
 
-	sym_node->table = dict_table_get_low(table_name, DICT_ERR_IGNORE_NONE);
+		sym_node->resolved = TRUE;
+		sym_node->token_type = SYM_TABLE_REF_COUNTED;
 
-	ut_a(sym_node->table);
+		sym_node->table = dict_table_open_on_name(
+			sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+		ut_a(sym_node->table != NULL);
+	}
 }
 
 /*********************************************************************//**
@@ -595,7 +887,8 @@ pars_retrieve_table_list_defs(
 
 		count++;
 
-		sym_node = que_node_get_next(sym_node);
+		sym_node = static_cast<sym_node_t*>(
+			que_node_get_next(sym_node));
 	}
 
 	return(count);
@@ -627,14 +920,15 @@ pars_select_all_columns(
 				table, i);
 
 			col_node = sym_tab_add_id(pars_sym_tab_global,
-						  (byte*)col_name,
+						  (byte*) col_name,
 						  ut_strlen(col_name));
 
 			select_node->select_list = que_node_list_add_last(
 				select_node->select_list, col_node);
 		}
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 }
 
@@ -684,9 +978,9 @@ pars_check_aggregate(
 
 		if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
 
-			func_node = exp_node;
+			func_node = static_cast<func_node_t*>(exp_node);
 
-			if (func_node->class == PARS_FUNC_AGGREGATE) {
+			if (func_node->fclass == PARS_FUNC_AGGREGATE) {
 
 				n_aggregate_nodes++;
 			}
@@ -822,8 +1116,8 @@ pars_function_declaration(
 	sym_node->token_type = SYM_FUNCTION;
 
 	/* Check that the function exists. */
-	ut_a(pars_info_get_user_func(pars_sym_tab_global->info,
-				     sym_node->name));
+	ut_a(pars_info_lookup_user_func(
+		pars_sym_tab_global->info, sym_node->name));
 
 	return(sym_node);
 }
@@ -864,8 +1158,9 @@ pars_column_assignment(
 {
 	col_assign_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(col_assign_node_t));
+	node = static_cast<col_assign_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(col_assign_node_t)));
 	node->common.type = QUE_NODE_COL_ASSIGNMENT;
 
 	node->col = column;
@@ -894,7 +1189,8 @@ pars_process_assign_list(
 	ulint			i;
 
 	table_sym = node->table_sym;
-	col_assign_list = node->col_assign_list;
+	col_assign_list = static_cast<col_assign_node_t*>(
+		 node->col_assign_list);
 	clust_index = dict_table_get_first_index(node->table);
 
 	assign_node = col_assign_list;
@@ -920,7 +1216,8 @@ pars_process_assign_list(
 				  assign_node->val);
 		n_assigns++;
 
-		assign_node = que_node_get_next(assign_node);
+		assign_node = static_cast<col_assign_node_t*>(
+				que_node_get_next(assign_node));
 	}
 
 	node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
@@ -946,7 +1243,8 @@ pars_process_assign_list(
 			changes_field_size = 0;
 		}
 
-		assign_node = que_node_get_next(assign_node);
+		assign_node = static_cast<col_assign_node_t*>(
+				que_node_get_next(assign_node));
 	}
 
 	/* Find out if the update can modify an ordering field in any index */
@@ -1129,16 +1427,20 @@ pars_set_dfield_type(
 		flags |= DATA_UNSIGNED;
 	}
 
-	if (type == &pars_int_token) {
+	if (type == &pars_bigint_token) {
+		ut_a(len == 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_INT, flags, 8);
+	} else if (type == &pars_int_token) {
 		ut_a(len == 0);
 
 		dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
 
 	} else if (type == &pars_char_token) {
-		ut_a(len == 0);
+		//ut_a(len == 0);
 
 		dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
-			  DATA_ENGLISH | flags, 0);
+			  DATA_ENGLISH | flags, len);
 	} else if (type == &pars_binary_token) {
 		ut_a(len != 0);
 
@@ -1209,12 +1511,12 @@ pars_set_parent_in_list(
 {
 	que_common_t*	common;
 
-	common = node_list;
+	common = static_cast<que_common_t*>(node_list);
 
 	while (common) {
 		common->parent = parent;
 
-		common = que_node_get_next(common);
+		common = static_cast<que_common_t*>(que_node_get_next(common));
 	}
 }
 
@@ -1230,7 +1532,9 @@ pars_elsif_element(
 {
 	elsif_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(elsif_node_t));
+	node = static_cast<elsif_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(elsif_node_t)));
 
 	node->common.type = QUE_NODE_ELSIF;
 
@@ -1258,7 +1562,9 @@ pars_if_statement(
 	if_node_t*	node;
 	elsif_node_t*	elsif_node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(if_node_t));
+	node = static_cast<if_node_t*>(
+		 mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(if_node_t)));
 
 	node->common.type = QUE_NODE_IF;
 
@@ -1273,14 +1579,15 @@ pars_if_statement(
 		/* There is a list of elsif conditions */
 
 		node->else_part = NULL;
-		node->elsif_list = else_part;
+		node->elsif_list = static_cast<elsif_node_t*>(else_part);
 
-		elsif_node = else_part;
+		elsif_node = static_cast<elsif_node_t*>(else_part);
 
 		while (elsif_node) {
 			pars_set_parent_in_list(elsif_node->stat_list, node);
 
-			elsif_node = que_node_get_next(elsif_node);
+			elsif_node = static_cast<elsif_node_t*>(
+				que_node_get_next(elsif_node));
 		}
 	} else {
 		node->else_part = else_part;
@@ -1306,7 +1613,9 @@ pars_while_statement(
 {
 	while_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(while_node_t));
+	node = static_cast<while_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(while_node_t)));
 
 	node->common.type = QUE_NODE_WHILE;
 
@@ -1335,7 +1644,8 @@ pars_for_statement(
 {
 	for_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t));
+	node = static_cast<for_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t)));
 
 	node->common.type = QUE_NODE_FOR;
 
@@ -1367,7 +1677,8 @@ pars_exit_statement(void)
 {
 	exit_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t));
+	node = static_cast<exit_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t)));
 	node->common.type = QUE_NODE_EXIT;
 
 	return(node);
@@ -1383,8 +1694,9 @@ pars_return_statement(void)
 {
 	return_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(return_node_t));
+	node = static_cast<return_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(return_node_t)));
 	node->common.type = QUE_NODE_RETURN;
 
 	return(node);
@@ -1402,8 +1714,9 @@ pars_assignment_statement(
 {
 	assign_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(assign_node_t));
+	node = static_cast<assign_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(assign_node_t)));
 	node->common.type = QUE_NODE_ASSIGNMENT;
 
 	node->var = var;
@@ -1455,7 +1768,9 @@ pars_fetch_statement(
 	/* Logical XOR. */
 	ut_a(!into_list != !user_func);
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(fetch_node_t));
+	node = static_cast<fetch_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(fetch_node_t)));
 
 	node->common.type = QUE_NODE_FETCH;
 
@@ -1468,8 +1783,9 @@ pars_fetch_statement(
 	} else {
 		pars_resolve_exp_variables_and_types(NULL, user_func);
 
-		node->func = pars_info_get_user_func(pars_sym_tab_global->info,
-						     user_func->name);
+		node->func = pars_info_lookup_user_func(
+			pars_sym_tab_global->info, user_func->name);
+
 		ut_a(node->func);
 
 		node->into_list = NULL;
@@ -1503,7 +1819,9 @@ pars_open_statement(
 	sym_node_t*	cursor_decl;
 	open_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(open_node_t));
+	node = static_cast<open_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(open_node_t)));
 
 	node->common.type = QUE_NODE_OPEN;
 
@@ -1513,7 +1831,7 @@ pars_open_statement(
 
 	ut_a(cursor_decl->token_type == SYM_CURSOR);
 
-	node->op_type = type;
+	node->op_type = static_cast<open_node_op>(type);
 	node->cursor_def = cursor_decl->cursor_def;
 
 	return(node);
@@ -1530,8 +1848,9 @@ pars_row_printf_statement(
 {
 	row_printf_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(row_printf_node_t));
+	node = static_cast<row_printf_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(row_printf_node_t)));
 	node->common.type = QUE_NODE_ROW_PRINTF;
 
 	node->sel_node = sel_node;
@@ -1549,7 +1868,7 @@ commit_node_t*
 pars_commit_statement(void)
 /*=======================*/
 {
-	return(commit_node_create(pars_sym_tab_global->heap));
+	return(trx_commit_node_create(pars_sym_tab_global->heap));
 }
 
 /*********************************************************************//**
@@ -1604,6 +1923,8 @@ pars_create_table(
 	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
 					table */
 	sym_node_t*	column_defs,	/*!< in: list of column names */
+	sym_node_t*	compact,	/* in: non-NULL if COMPACT table. */
+	sym_node_t*	block_size,	/* in: block size (can be NULL) */
 	void*		not_fit_in_memory __attribute__((unused)))
 					/*!< in: a non-NULL pointer means that
 					this is a table which in simulations
@@ -1621,13 +1942,56 @@ pars_create_table(
 	tab_node_t*	node;
 	const dtype_t*	dtype;
 	ulint		n_cols;
+	ulint		flags = 0;
+	ulint		flags2 = 0;
+
+	if (compact != NULL) {
+
+		/* System tables currently only use the REDUNDANT row
+		format therefore the check for srv_file_per_table should be
+		safe for now. */
+
+		flags |= DICT_TF_COMPACT;
+
+		/* FIXME: Ideally this should be part of the SQL syntax
+		or use some other mechanism. We want to reduce dependency
+		on global variables. There is an inherent race here but
+		that has always existed around this variable. */
+		if (srv_file_per_table) {
+			flags2 |= DICT_TF2_USE_TABLESPACE;
+		}
+	}
+
+	if (block_size != NULL) {
+		ulint		size;
+		dfield_t*	dfield;
+
+		dfield = que_node_get_val(block_size);
+
+		ut_a(dfield_get_len(dfield) == 4);
+		size = mach_read_from_4(static_cast<byte*>(
+			dfield_get_data(dfield)));
+
+
+		switch (size) {
+		case 0:
+			break;
+
+		case 1: case 2: case 4: case 8: case 16:
+			flags |= DICT_TF_COMPACT;
+			/* FTS-FIXME: needs the zip changes */
+			/* flags |= size << DICT_TF_COMPRESSED_SHIFT; */
+			break;
+
+		default:
+			ut_error;
+		}
+	}
 
 	n_cols = que_node_list_get_len(column_defs);
 
-	/* As the InnoDB SQL parser is for internal use only,
-	for creating some system tables, this function will only
-	create tables in the old (not compact) record format. */
-	table = dict_mem_table_create(table_sym->name, 0, n_cols, 0);
+	table = dict_mem_table_create(
+		table_sym->name, 0, n_cols, flags, flags2);
 
 #ifdef UNIV_DEBUG
 	if (not_fit_in_memory != NULL) {
@@ -1645,10 +2009,10 @@ pars_create_table(
 		column->resolved = TRUE;
 		column->token_type = SYM_COLUMN;
 
-		column = que_node_get_next(column);
+		column = static_cast<sym_node_t*>(que_node_get_next(column));
 	}
 
-	node = tab_create_graph_create(table, pars_sym_tab_global->heap);
+	node = tab_create_graph_create(table, pars_sym_tab_global->heap, true);
 
 	table_sym->resolved = TRUE;
 	table_sym->token_type = SYM_TABLE;
@@ -1699,10 +2063,10 @@ pars_create_index(
 		column->resolved = TRUE;
 		column->token_type = SYM_COLUMN;
 
-		column = que_node_get_next(column);
+		column = static_cast<sym_node_t*>(que_node_get_next(column));
 	}
 
-	node = ind_create_graph_create(index, pars_sym_tab_global->heap);
+	node = ind_create_graph_create(index, pars_sym_tab_global->heap, true);
 
 	table_sym->resolved = TRUE;
 	table_sym->token_type = SYM_TABLE;
@@ -1737,7 +2101,8 @@ pars_procedure_definition(
 
 	thr = que_thr_create(fork, heap);
 
-	node = mem_heap_alloc(heap, sizeof(proc_node_t));
+	node = static_cast<proc_node_t*>(
+		mem_heap_alloc(heap, sizeof(proc_node_t)));
 
 	node->common.type = QUE_NODE_PROC;
 	node->common.parent = thr;
@@ -1780,11 +2145,10 @@ pars_stored_procedure_call(
 /*************************************************************//**
 Retrieves characters to the lexical analyzer. */
 UNIV_INTERN
-void
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
-	int*	result,		/*!< out: number of characters copied or EOF */
 	int	max_size)	/*!< in: maximum number of characters which fit
 				in the buffer */
 {
@@ -1796,9 +2160,7 @@ pars_get_lex_chars(
 #ifdef YYDEBUG
 		/* fputs("SQL string ends\n", stderr); */
 #endif
-		*result = 0;
-
-		return;
+		return(0);
 	}
 
 	if (len > max_size) {
@@ -1820,9 +2182,10 @@ pars_get_lex_chars(
 
 	ut_memcpy(buf, pars_sym_tab_global->sql_string
 		  + pars_sym_tab_global->next_char_pos, len);
-	*result = len;
 
 	pars_sym_tab_global->next_char_pos += len;
+
+	return(len);
 }
 
 /*************************************************************//**
@@ -1865,8 +2228,8 @@ pars_sql(
 	pars_sym_tab_global = sym_tab_create(heap);
 
 	pars_sym_tab_global->string_len = strlen(str);
-	pars_sym_tab_global->sql_string = mem_heap_dup(
-		heap, str, pars_sym_tab_global->string_len + 1);
+	pars_sym_tab_global->sql_string = static_cast<char*>(
+		mem_heap_dup(heap, str, pars_sym_tab_global->string_len + 1));
 	pars_sym_tab_global->next_char_pos = 0;
 	pars_sym_tab_global->info = info;
 
@@ -1885,6 +2248,8 @@ pars_sql(
 	graph->sym_tab = pars_sym_tab_global;
 	graph->info = info;
 
+	pars_sym_tab_global = NULL;
+
 	/* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */
 
 	return(graph);
@@ -1900,7 +2265,7 @@ que_thr_t*
 pars_complete_graph_for_exec(
 /*=========================*/
 	que_node_t*	node,	/*!< in: root node for an incomplete
-				query graph */
+				query graph, or NULL for dummy graph */
 	trx_t*		trx,	/*!< in: transaction handle */
 	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
 {
@@ -1914,7 +2279,9 @@ pars_complete_graph_for_exec(
 
 	thr->child = node;
 
-	que_node_set_parent(node, thr);
+	if (node) {
+		que_node_set_parent(node, thr);
+	}
 
 	trx->graph = NULL;
 
@@ -1934,7 +2301,7 @@ pars_info_create(void)
 
 	heap = mem_heap_create(512);
 
-	info = mem_heap_alloc(heap, sizeof(*info));
+	info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info)));
 
 	info->heap = heap;
 	info->funcs = NULL;
@@ -1974,16 +2341,22 @@ pars_info_add_literal(
 
 	ut_ad(!pars_info_get_bound_lit(info, name));
 
-	pbl = mem_heap_alloc(info->heap, sizeof(*pbl));
+	pbl = static_cast<pars_bound_lit_t*>(
+		mem_heap_alloc(info->heap, sizeof(*pbl)));
 
 	pbl->name = name;
+
 	pbl->address = address;
 	pbl->length = length;
 	pbl->type = type;
 	pbl->prtype = prtype;
 
 	if (!info->bound_lits) {
-		info->bound_lits = ib_vector_create(info->heap, 8);
+		ib_alloc_t*     heap_alloc;
+
+		heap_alloc = ib_heap_allocator_create(info->heap);
+
+		info->bound_lits = ib_vector_create(heap_alloc, sizeof(*pbl), 8);
 	}
 
 	ib_vector_push(info->bound_lits, pbl);
@@ -2004,6 +2377,63 @@ pars_info_add_str_literal(
 			      DATA_VARCHAR, DATA_ENGLISH);
 }
 
+/********************************************************************
+Binds a named literal: rebinds address/length if the name is already
+bound (type and prtype are then ignored), else adds a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/* in: info struct */
+	const char*	name,		/* in: name */
+	const void*	address,	/* in: address */
+	ulint		length,		/* in: length of data */
+	ulint		type,		/* in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype)		/* in: precise type, e.g. DATA_ENGLISH */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, address, length, type, prtype);
+	} else {
+		pbl->address = address;
+		pbl->length = length;
+
+		sym_tab_rebind_lit(pbl->node, address, length);
+	}
+}
+
+/********************************************************************
+Binds a named DATA_VARCHAR/DATA_ENGLISH literal: rebinds the string if
+the name is already bound, otherwise adds a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const byte*	str,		/*!< in: string (pointer is stored) */
+	ulint		str_len)	/*!< in: string length */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, str, str_len, DATA_VARCHAR, DATA_ENGLISH);
+	} else {
+
+		pbl->address = str;
+		pbl->length = str_len;
+
+		sym_tab_rebind_lit(pbl->node, str, str_len);
+	}
+}
+
 /****************************************************************//**
 Equivalent to:
 
@@ -2021,12 +2451,65 @@ pars_info_add_int4_literal(
 	const char*	name,		/*!< in: name */
 	lint		val)		/*!< in: value */
 {
-	byte*	buf = mem_heap_alloc(info->heap, 4);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4));
 
 	mach_write_to_4(buf, val);
 	pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
 }
 
+/********************************************************************
+If a literal with the given name is already bound then rebind it to the
+new value, otherwise add a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*========================*/
+	pars_info_t*		info,   /* in: info struct */
+	const char*		name,   /* in: name */
+	const ib_uint32_t*	val)    /* in: value */
+{
+	pars_bound_lit_t*       pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(info, name, val, 4, DATA_INT, 0);
+	} else {
+
+		pbl->address = val;
+		pbl->length = sizeof(*val);
+
+		sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+	}
+}
+
+/********************************************************************
+If a literal with the given name is already bound then rebind it to the
+new value, otherwise add a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*========================*/
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name,	/* in: name */
+	const ib_uint64_t*	val)	/* in: value */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, val, sizeof(*val), DATA_INT, 0);
+	} else {
+
+		pbl->address = val;
+		pbl->length = sizeof(*val);
+
+		sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+	}
+}
+
 /****************************************************************//**
 Equivalent to:
 
@@ -2044,7 +2527,7 @@ pars_info_add_ull_literal(
 	const char*	name,		/*!< in: name */
 	ib_uint64_t	val)		/*!< in: value */
 {
-	byte*	buf = mem_heap_alloc(info->heap, 8);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(info->heap, 8));
 
 	mach_write_to_8(buf, val);
 
@@ -2052,11 +2535,38 @@ pars_info_add_ull_literal(
 }
 
 /****************************************************************//**
+If a literal with the given name is already bound then rebind it to the
+new value, otherwise add a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint64_t*	val)		/*!< in: value */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, val, sizeof(*val), DATA_FIXBINARY, 0);
+	} else {
+
+		pbl->address = val;
+		pbl->length = sizeof(*val);
+
+		sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+	}
+}
+
+/****************************************************************//**
 Add user function. */
 UNIV_INTERN
 void
-pars_info_add_function(
-/*===================*/
+pars_info_bind_function(
+/*====================*/
 	pars_info_t*		info,	/*!< in: info struct */
 	const char*		name,	/*!< in: function name */
 	pars_user_func_cb_t	func,	/*!< in: function address */
@@ -2064,75 +2574,77 @@ pars_info_add_function(
 {
 	pars_user_func_t*	puf;
 
-	ut_ad(!pars_info_get_user_func(info, name));
+	puf = pars_info_lookup_user_func(info, name);
 
-	puf = mem_heap_alloc(info->heap, sizeof(*puf));
+	if (!puf) {
+		if (!info->funcs) {
+			ib_alloc_t*     heap_alloc;
 
-	puf->name = name;
-	puf->func = func;
-	puf->arg = arg;
+			heap_alloc = ib_heap_allocator_create(info->heap);
+
+			info->funcs = ib_vector_create(
+				heap_alloc, sizeof(*puf), 8);
+		}
 
-	if (!info->funcs) {
-		info->funcs = ib_vector_create(info->heap, 8);
+		/* Create a "new" element */
+		puf = static_cast<pars_user_func_t*>(
+			ib_vector_push(info->funcs, NULL));
+		puf->name = name;
 	}
 
-	ib_vector_push(info->funcs, puf);
+	puf->arg = arg;
+	puf->func = func;
 }
 
-/****************************************************************//**
+/********************************************************************
 Add bound id. */
 UNIV_INTERN
 void
-pars_info_add_id(
-/*=============*/
+pars_info_bind_id(
+/*==============*/
 	pars_info_t*	info,		/*!< in: info struct */
+	ibool		copy_name,	/* in: copy name if TRUE */
 	const char*	name,		/*!< in: name */
 	const char*	id)		/*!< in: id */
 {
 	pars_bound_id_t*	bid;
 
-	ut_ad(!pars_info_get_bound_id(info, name));
+	bid = pars_info_lookup_bound_id(info, name);
 
-	bid = mem_heap_alloc(info->heap, sizeof(*bid));
+	if (!bid) {
 
-	bid->name = name;
-	bid->id = id;
+		if (!info->bound_ids) {
+			ib_alloc_t*     heap_alloc;
 
-	if (!info->bound_ids) {
-		info->bound_ids = ib_vector_create(info->heap, 8);
-	}
+			heap_alloc = ib_heap_allocator_create(info->heap);
 
-	ib_vector_push(info->bound_ids, bid);
-}
+			info->bound_ids = ib_vector_create(
+				heap_alloc, sizeof(*bid), 8);
+		}
 
-/****************************************************************//**
-Get user function with the given name.
-@return	user func, or NULL if not found */
-UNIV_INTERN
-pars_user_func_t*
-pars_info_get_user_func(
-/*====================*/
-	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name)	/*!< in: function name to find*/
-{
-	ulint		i;
-	ib_vector_t*	vec;
+		/* Create a "new" element */
+		bid = static_cast<pars_bound_id_t*>(
+			ib_vector_push(info->bound_ids, NULL));
 
-	if (!info || !info->funcs) {
-		return(NULL);
+		bid->name = (copy_name)
+		    ? mem_heap_strdup(info->heap, name) : name;
 	}
 
-	vec = info->funcs;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_user_func_t*	puf = ib_vector_get(vec, i);
+	bid->id = id;
+}
 
-		if (strcmp(puf->name, name) == 0) {
-			return(puf);
-		}
-	}
+/********************************************************************
+Get bound identifier with the given name.*/
 
-	return(NULL);
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+					/* out: bound id, or NULL if not
+					found */
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name)	/* in: bound id name to find */
+{
+	return(pars_info_lookup_bound_id(info, name));
 }
 
 /****************************************************************//**
@@ -2145,52 +2657,5 @@ pars_info_get_bound_lit(
 	pars_info_t*		info,	/*!< in: info struct */
 	const char*		name)	/*!< in: bound literal name to find */
 {
-	ulint		i;
-	ib_vector_t*	vec;
-
-	if (!info || !info->bound_lits) {
-		return(NULL);
-	}
-
-	vec = info->bound_lits;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_bound_lit_t*	pbl = ib_vector_get(vec, i);
-
-		if (strcmp(pbl->name, name) == 0) {
-			return(pbl);
-		}
-	}
-
-	return(NULL);
-}
-
-/****************************************************************//**
-Get bound id with the given name.
-@return	bound id, or NULL if not found */
-UNIV_INTERN
-pars_bound_id_t*
-pars_info_get_bound_id(
-/*===================*/
-	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name)	/*!< in: bound id name to find */
-{
-	ulint		i;
-	ib_vector_t*	vec;
-
-	if (!info || !info->bound_ids) {
-		return(NULL);
-	}
-
-	vec = info->bound_ids;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_bound_id_t*	bid = ib_vector_get(vec, i);
-
-		if (strcmp(bid->name, name) == 0) {
-			return(bid);
-		}
-	}
-
-	return(NULL);
+	return(pars_info_lookup_bound_lit(info, name));
 }
diff --git a/storage/xtradb/pars/pars0sym.c b/storage/xtradb/pars/pars0sym.cc
index 783598fdd1e..b01a69cb33a 100644
--- a/storage/xtradb/pars/pars0sym.c
+++ b/storage/xtradb/pars/pars0sym.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file pars/pars0sym.c
+@file pars/pars0sym.cc
 SQL parser symbol table
 
 Created 12/15/1997 Heikki Tuuri
@@ -49,7 +49,8 @@ sym_tab_create(
 {
 	sym_tab_t*	sym_tab;
 
-	sym_tab = mem_heap_alloc(heap, sizeof(sym_tab_t));
+	sym_tab = static_cast<sym_tab_t*>(
+		mem_heap_alloc(heap, sizeof(sym_tab_t)));
 
 	UT_LIST_INIT(sym_tab->sym_list);
 	UT_LIST_INIT(sym_tab->func_node_list);
@@ -59,6 +60,7 @@ sym_tab_create(
 	return(sym_tab);
 }
 
+
 /******************************************************************//**
 Frees the memory allocated dynamically AFTER parsing phase for variables
 etc. in the symbol table. Does not free the mem heap where the table was
@@ -72,9 +74,23 @@ sym_tab_free_private(
 	sym_node_t*	sym;
 	func_node_t*	func;
 
-	sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+	     sym != NULL;
+	     sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+		/* Close the tables opened in pars_retrieve_table_def(). */
+
+		if (sym->token_type == SYM_TABLE_REF_COUNTED) {
+
+			dict_table_close(sym->table, TRUE, FALSE);
+
+			sym->table = NULL;
+			sym->resolved = FALSE;
+			sym->token_type = SYM_UNSET;
+		}
 
-	while (sym) {
 		eval_node_free_val_buf(sym);
 
 		if (sym->prefetch_buf) {
@@ -84,16 +100,13 @@ sym_tab_free_private(
 		if (sym->cursor_def) {
 			que_graph_free_recursive(sym->cursor_def);
 		}
-
-		sym = UT_LIST_GET_NEXT(sym_list, sym);
 	}
 
-	func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+	for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+	     func != NULL;
+	     func = UT_LIST_GET_NEXT(func_node_list, func)) {
 
-	while (func) {
 		eval_node_free_val_buf(func);
-
-		func = UT_LIST_GET_NEXT(func_node_list, func);
 	}
 }
 
@@ -110,10 +123,12 @@ sym_tab_add_int_lit(
 	sym_node_t*	node;
 	byte*		data;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -121,7 +136,7 @@ sym_tab_add_int_lit(
 
 	dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4);
 
-	data = mem_heap_alloc(sym_tab->heap, 4);
+	data = static_cast<byte*>(mem_heap_alloc(sym_tab->heap, 4));
 	mach_write_to_4(data, val);
 
 	dfield_set_data(&(node->common.val), data, 4);
@@ -132,6 +147,8 @@ sym_tab_add_int_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -145,17 +162,19 @@ sym_node_t*
 sym_tab_add_str_lit(
 /*================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
-	byte*		str,		/*!< in: string with no quotes around
+	const byte*	str,		/*!< in: string with no quotes around
 					it */
 	ulint		len)		/*!< in: string length */
 {
 	sym_node_t*	node;
 	byte*		data;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -164,12 +183,8 @@ sym_tab_add_str_lit(
 	dtype_set(dfield_get_type(&node->common.val),
 		  DATA_VARCHAR, DATA_ENGLISH, 0);
 
-	if (len) {
-		data = mem_heap_alloc(sym_tab->heap, len);
-		ut_memcpy(data, str, len);
-	} else {
-		data = NULL;
-	}
+	data = (len) ? static_cast<byte*>(mem_heap_dup(sym_tab->heap, str, len))
+	      	     : NULL;
 
 	dfield_set_data(&(node->common.val), data, len);
 
@@ -179,6 +194,8 @@ sym_tab_add_str_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -202,10 +219,13 @@ sym_tab_add_bound_lit(
 	blit = pars_info_get_bound_lit(sym_tab->info, name);
 	ut_a(blit);
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
+	node->common.brother = node->common.parent = NULL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -255,11 +275,57 @@ sym_tab_add_bound_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	blit->node = node;
+	node->like_node = NULL;
 	node->sym_table = sym_tab;
 
 	return(node);
 }
 
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+					/* out: symbol table node */
+	sym_node_t*	node,		/* in: node that is bound to literal*/
+	const void*	address,	/* in: pointer to data */
+	ulint		length)		/* in: length of data */
+{
+	dfield_t*	dfield = que_node_get_val(node);
+	dtype_t*	dtype = dfield_get_type(dfield);
+
+	ut_a(node->token_type == SYM_LIT);
+
+	dfield_set_data(&node->common.val, address, length);
+
+	if (node->like_node) {
+
+	    ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+		 || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		/* Don't force [FALSE] creation of sub-nodes (for LIKE) */
+		pars_like_rebind(
+			node,static_cast<const byte*>(address), length);
+	}
+
+	/* FIXME: What's this ? */
+	node->common.val_buf_size = 0;
+
+	if (node->prefetch_buf) {
+		sel_col_prefetch_buf_free(node->prefetch_buf);
+		node->prefetch_buf = NULL;
+	}
+
+	if (node->cursor_def) {
+		que_graph_free_recursive(node->cursor_def);
+		node->cursor_def = NULL;
+	}
+
+	return(node);
+}
+
 /******************************************************************//**
 Adds an SQL null literal to a symbol table.
 @return	symbol table node */
@@ -271,10 +337,12 @@ sym_tab_add_null_lit(
 {
 	sym_node_t*	node;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -290,6 +358,8 @@ sym_tab_add_null_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -308,13 +378,11 @@ sym_tab_add_id(
 {
 	sym_node_t*	node;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_zalloc(sym_tab->heap, sizeof(*node)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
-	node->resolved = FALSE;
-	node->indirection = NULL;
-
 	node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len);
 	node->name_len = len;
 
@@ -322,10 +390,6 @@ sym_tab_add_id(
 
 	dfield_set_null(&node->common.val);
 
-	node->common.val_buf_size = 0;
-	node->prefetch_buf = NULL;
-	node->cursor_def = NULL;
-
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -337,7 +401,7 @@ Add a bound identifier to a symbol table.
 UNIV_INTERN
 sym_node_t*
 sym_tab_add_bound_id(
-/*===========*/
+/*=================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
 	const char*	name)		/*!< in: name of bound id */
 {
@@ -347,11 +411,14 @@ sym_tab_add_bound_id(
 	bid = pars_info_get_bound_id(sym_tab->info, name);
 	ut_a(bid);
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = FALSE;
+	node->token_type = SYM_UNSET;
 	node->indirection = NULL;
 
 	node->name = mem_heap_strdup(sym_tab->heap, bid->id);
@@ -365,6 +432,8 @@ sym_tab_add_bound_id(
 	node->prefetch_buf = NULL;
 	node->cursor_def = NULL;
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.cc
index a9d76ce4706..8d9b8fac776 100644
--- a/storage/xtradb/que/que0que.c
+++ b/storage/xtradb/que/que0que.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file que/que0que.c
+@file que/que0que.cc
 Query graph
 
 Created 5/27/1996 Heikki Tuuri
@@ -40,11 +40,10 @@ Created 5/27/1996 Heikki Tuuri
 #include "dict0crea.h"
 #include "log0log.h"
 #include "eval0proc.h"
+#include "lock0lock.h"
 #include "eval0eval.h"
 #include "pars0types.h"
 
-#define QUE_PARALLELIZE_LIMIT	(64 * 256 * 256 * 256)
-#define QUE_ROUND_ROBIN_LIMIT	(64 * 256 * 256 * 256)
 #define QUE_MAX_LOOPS_WITHOUT_CHECK	16
 
 #ifdef UNIV_DEBUG
@@ -101,22 +100,10 @@ A = assign_node_t, W = while_node_t. */
 is executed?
 
 The commit or rollback can be seen as a subprocedure call.
-The problem is that if there are several query threads
-currently running within the transaction, their action could
-mess the commit or rollback operation. Or, at the least, the
-operation would be difficult to visualize and keep in control.
-
-Therefore the query thread requesting a commit or a rollback
-sends to the transaction a signal, which moves the transaction
-to TRX_QUE_SIGNALED state. All running query threads of the
-transaction will eventually notice that the transaction is now in
-this state and voluntarily suspend themselves. Only the last
-query thread which suspends itself will trigger handling of
-the signal.
-
-When the transaction starts to handle a rollback or commit
-signal, it builds a query graph which, when executed, will
-roll back or commit the incomplete transaction. The transaction
+
+When the transaction starts to handle a rollback or commit,
+it builds a query graph which, when executed, will roll back
+or commit the incomplete transaction. The transaction
 is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
 If specified, the SQL cursors opened by the transaction are closed.
 When the execution of the graph completes, it is like returning
@@ -135,20 +122,6 @@ que_thr_move_to_run_state(
 	que_thr_t*	thr);	/*!< in: an query thread */
 
 /***********************************************************************//**
-Adds a query graph to the session's list of graphs. */
-UNIV_INTERN
-void
-que_graph_publish(
-/*==============*/
-	que_t*	graph,	/*!< in: graph */
-	sess_t*	sess)	/*!< in: session */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-
-	UT_LIST_ADD_LAST(graphs, sess->graphs, graph);
-}
-
-/***********************************************************************//**
 Creates a query graph fork node.
 @return	own: fork node */
 UNIV_INTERN
@@ -166,30 +139,19 @@ que_fork_create(
 
 	ut_ad(heap);
 
-	fork = mem_heap_alloc(heap, sizeof(que_fork_t));
+	fork = static_cast<que_fork_t*>(mem_heap_zalloc(heap, sizeof(*fork)));
 
-	fork->common.type = QUE_NODE_FORK;
-	fork->n_active_thrs = 0;
-
-	fork->state = QUE_FORK_COMMAND_WAIT;
-
-	if (graph != NULL) {
-		fork->graph = graph;
-	} else {
-		fork->graph = fork;
-	}
+	fork->heap = heap;
 
-	fork->common.parent = parent;
 	fork->fork_type = fork_type;
 
-	fork->caller = NULL;
+	fork->common.parent = parent;
 
-	UT_LIST_INIT(fork->thrs);
+	fork->common.type = QUE_NODE_FORK;
 
-	fork->sym_tab = NULL;
-	fork->info = NULL;
+	fork->state = QUE_FORK_COMMAND_WAIT;
 
-	fork->heap = heap;
+	fork->graph = (graph != NULL) ? graph : fork;
 
 	return(fork);
 }
@@ -208,21 +170,18 @@ que_thr_create(
 
 	ut_ad(parent && heap);
 
-	thr = mem_heap_alloc(heap, sizeof(que_thr_t));
+	thr = static_cast<que_thr_t*>(mem_heap_zalloc(heap, sizeof(*thr)));
+
+	thr->graph = parent->graph;
 
-	thr->common.type = QUE_NODE_THR;
 	thr->common.parent = parent;
 
 	thr->magic_n = QUE_THR_MAGIC_N;
 
-	thr->graph = parent->graph;
+	thr->common.type = QUE_NODE_THR;
 
 	thr->state = QUE_THR_COMMAND_WAIT;
 
-	thr->is_active = FALSE;
-
-	thr->run_node = NULL;
-	thr->resource = 0;
 	thr->lock_state = QUE_THR_LOCK_NOLOCK;
 
 	UT_LIST_ADD_LAST(thrs, parent->thrs, thr);
@@ -232,87 +191,53 @@ que_thr_create(
 
 /**********************************************************************//**
 Moves a suspended query thread to the QUE_THR_RUNNING state and may release
-a single worker thread to execute it. This function should be used to end
+a worker thread to execute it. This function should be used to end
 the wait state of a query thread waiting for a lock or a stored procedure
-completion. */
+completion.
+@return the query thread that needs to be released. */
 UNIV_INTERN
-void
-que_thr_end_wait(
-/*=============*/
-	que_thr_t*	thr,		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if NULL is passed
-					as the parameter, it is ignored */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+	trx_t*		trx)	/*!< in: transaction with que_state in
+				TRX_QUE_LOCK_WAIT */
 {
-	ibool	was_active;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(thr);
-	ut_ad((thr->state == QUE_THR_LOCK_WAIT)
-	      || (thr->state == QUE_THR_PROCEDURE_WAIT)
-	      || (thr->state == QUE_THR_SIG_REPLY_WAIT));
-	ut_ad(thr->run_node);
-
-	thr->prev_node = thr->run_node;
-
-	was_active = thr->is_active;
-
-	que_thr_move_to_run_state(thr);
+	que_thr_t*	thr;
+	ibool		was_active;
+	ulint		sec;
+	ulint		ms;
+	ib_uint64_t	now;
 
-	if (was_active) {
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
 
-		return;
-	}
+	thr = trx->lock.wait_thr;
 
-	if (next_thr && *next_thr == NULL) {
-		*next_thr = thr;
-	} else {
-		ut_a(0);
-		srv_que_task_enqueue_low(thr);
-	}
-}
-
-/**********************************************************************//**
-Same as que_thr_end_wait, but no parameter next_thr available. */
-UNIV_INTERN
-void
-que_thr_end_wait_no_next_thr(
-/*=========================*/
-	que_thr_t*	thr)	/*!< in: query thread in the QUE_THR_LOCK_WAIT,
-				or QUE_THR_PROCEDURE_WAIT, or
-				QUE_THR_SIG_REPLY_WAIT state */
-{
-	ibool	was_active;
+	ut_ad(thr != NULL);
 
-	ut_a(thr->state == QUE_THR_LOCK_WAIT);	/* In MySQL this is the
-						only possible state here */
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(thr);
-	ut_ad((thr->state == QUE_THR_LOCK_WAIT)
-	      || (thr->state == QUE_THR_PROCEDURE_WAIT)
-	      || (thr->state == QUE_THR_SIG_REPLY_WAIT));
+	ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+	/* In MySQL this is the only possible state here */
+	ut_a(thr->state == QUE_THR_LOCK_WAIT);
 
 	was_active = thr->is_active;
 
 	que_thr_move_to_run_state(thr);
 
-	if (was_active) {
-
-		return;
+	if (UNIV_UNLIKELY(trx->take_stats)) {
+		ut_usectime(&sec, &ms);
+		now = (ib_uint64_t)sec * 1000000 + ms;
+		trx->lock_que_wait_timer
+			+= (ulint)(now - trx->lock_que_wait_ustarted);
 	}
 
+	trx->lock.que_state = TRX_QUE_RUNNING;
+
+	trx->lock.wait_thr = NULL;
+
 	/* In MySQL we let the OS thread (not just the query thread) to wait
 	for the lock to be released: */
 
-	srv_release_mysql_thread_if_suspended(thr);
-
-	/* srv_que_task_enqueue_low(thr); */
+	return((!was_active && thr != NULL) ? thr : NULL);
 }
 
 /**********************************************************************//**
@@ -330,6 +255,53 @@ que_thr_init_command(
 }
 
 /**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+	que_fork_t*	fork,		/*!< in: a query fork */
+	que_thr_t*	thr)		/*!< in: current pos */
+{
+	trx_mutex_enter(fork->trx);
+
+	/* If no current, start first available. */
+	if (thr == NULL) {
+		thr = UT_LIST_GET_FIRST(fork->thrs);
+	} else {
+		thr = UT_LIST_GET_NEXT(thrs, thr);
+	}
+
+	if (thr) {
+
+		fork->state = QUE_FORK_ACTIVE;
+
+		fork->last_sel_node = NULL;
+
+		switch (thr->state) {
+		case QUE_THR_COMMAND_WAIT:
+		case QUE_THR_COMPLETED:
+			ut_a(!thr->is_active);
+			que_thr_init_command(thr);
+			break;
+
+		case QUE_THR_SUSPENDED:
+		case QUE_THR_LOCK_WAIT:
+		default:
+			ut_error;
+
+		}
+	}
+
+	trx_mutex_exit(fork->trx);
+
+	return(thr);
+}
+
+/**********************************************************************//**
 Starts execution of a command in a query fork. Picks a query thread which
 is not in the QUE_THR_RUNNING state and moves it to that state. If none
 can be chosen, a situation which may arise in parallelized fetches, NULL
@@ -363,11 +335,12 @@ que_fork_start_command(
 	state, finally we try to find a query thread in the QUE_THR_COMPLETED
 	state */
 
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
 	/* We make a single pass over the thr list within which we note which
 	threads are ready to run. */
-	while (thr) {
+	for (thr = UT_LIST_GET_FIRST(fork->thrs);
+	     thr != NULL;
+	     thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
 		switch (thr->state) {
 		case QUE_THR_COMMAND_WAIT:
 
@@ -399,8 +372,6 @@ que_fork_start_command(
 			ut_error;
 
 		}
-
-		thr = UT_LIST_GET_NEXT(thrs, thr);
 	}
 
 	if (suspended_thr) {
@@ -412,52 +383,13 @@ que_fork_start_command(
 
 		thr = completed_thr;
 		que_thr_init_command(thr);
+	} else {
+		ut_error;
 	}
 
 	return(thr);
 }
 
-/**********************************************************************//**
-After signal handling is finished, returns control to a query graph error
-handling routine. (Currently, just returns the control to the root of the
-graph so that the graph can communicate an error message to the client.) */
-UNIV_INTERN
-void
-que_fork_error_handle(
-/*==================*/
-	trx_t*	trx __attribute__((unused)),	/*!< in: trx */
-	que_t*	fork)	/*!< in: query graph which was run before signal
-			handling started, NULL not allowed */
-{
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->sess->state == SESS_ERROR);
-	ut_ad(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
-	while (thr != NULL) {
-		ut_ad(!thr->is_active);
-		ut_ad(thr->state != QUE_THR_SIG_REPLY_WAIT);
-		ut_ad(thr->state != QUE_THR_LOCK_WAIT);
-
-		thr->run_node = thr;
-		thr->prev_node = thr->child;
-		thr->state = QUE_THR_COMPLETED;
-
-		thr = UT_LIST_GET_NEXT(thrs, thr);
-	}
-
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
-	que_thr_move_to_run_state(thr);
-
-	ut_a(0);
-	srv_que_task_enqueue_low(thr);
-}
-
 /****************************************************************//**
 Tests if all the query threads in the same fork have a given state.
 @return TRUE if all the query threads in the same fork were in the
@@ -471,15 +403,14 @@ que_fork_all_thrs_in_state(
 {
 	que_thr_t*	thr_node;
 
-	thr_node = UT_LIST_GET_FIRST(fork->thrs);
+	for (thr_node = UT_LIST_GET_FIRST(fork->thrs);
+	     thr_node != NULL;
+	     thr_node = UT_LIST_GET_NEXT(thrs, thr_node)) {
 
-	while (thr_node != NULL) {
 		if (thr_node->state != state) {
 
 			return(FALSE);
 		}
-
-		thr_node = UT_LIST_GET_NEXT(thrs, thr_node);
 	}
 
 	return(TRUE);
@@ -527,7 +458,7 @@ que_graph_free_recursive(
 	switch (que_node_get_type(node)) {
 
 	case QUE_NODE_FORK:
-		fork = node;
+		fork = static_cast<que_fork_t*>(node);
 
 		thr = UT_LIST_GET_FIRST(fork->thrs);
 
@@ -540,7 +471,7 @@ que_graph_free_recursive(
 		break;
 	case QUE_NODE_THR:
 
-		thr = node;
+		thr = static_cast<que_thr_t*>(node);
 
 		if (thr->magic_n != QUE_THR_MAGIC_N) {
 			fprintf(stderr,
@@ -558,21 +489,21 @@ que_graph_free_recursive(
 		break;
 	case QUE_NODE_UNDO:
 
-		undo = node;
+		undo = static_cast<undo_node_t*>(node);
 
 		mem_heap_free(undo->heap);
 
 		break;
 	case QUE_NODE_SELECT:
 
-		sel = node;
+		sel = static_cast<sel_node_t*>(node);
 
 		sel_node_free_private(sel);
 
 		break;
 	case QUE_NODE_INSERT:
 
-		ins = node;
+		ins = static_cast<ins_node_t*>(node);
 
 		que_graph_free_recursive(ins->select);
 
@@ -580,7 +511,7 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_PURGE:
-		purge = node;
+		purge = static_cast<purge_node_t*>(node);
 
 		mem_heap_free(purge->heap);
 
@@ -588,7 +519,7 @@ que_graph_free_recursive(
 
 	case QUE_NODE_UPDATE:
 
-		upd = node;
+		upd = static_cast<upd_node_t*>(node);
 
 		if (upd->in_mysql_interface) {
 
@@ -607,7 +538,7 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_CREATE_TABLE:
-		cre_tab = node;
+		cre_tab = static_cast<tab_node_t*>(node);
 
 		que_graph_free_recursive(cre_tab->tab_def);
 		que_graph_free_recursive(cre_tab->col_def);
@@ -617,45 +548,35 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_CREATE_INDEX:
-		cre_ind = node;
+		cre_ind = static_cast<ind_node_t*>(node);
 
 		que_graph_free_recursive(cre_ind->ind_def);
 		que_graph_free_recursive(cre_ind->field_def);
-		if (srv_use_sys_stats_table)
-			que_graph_free_recursive(cre_ind->stats_def);
 		que_graph_free_recursive(cre_ind->commit_node);
 
 		mem_heap_free(cre_ind->heap);
 
 		break;
-	case QUE_NODE_INSERT_STATS:
-		cre_ind = node;
-
-		que_graph_free_recursive(cre_ind->stats_def);
-		que_graph_free_recursive(cre_ind->commit_node);
-
-		mem_heap_free(cre_ind->heap);
-		break;
 	case QUE_NODE_PROC:
-		que_graph_free_stat_list(((proc_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((proc_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_IF:
-		que_graph_free_stat_list(((if_node_t*)node)->stat_list);
-		que_graph_free_stat_list(((if_node_t*)node)->else_part);
-		que_graph_free_stat_list(((if_node_t*)node)->elsif_list);
+		que_graph_free_stat_list(((if_node_t*) node)->stat_list);
+		que_graph_free_stat_list(((if_node_t*) node)->else_part);
+		que_graph_free_stat_list(((if_node_t*) node)->elsif_list);
 
 		break;
 	case QUE_NODE_ELSIF:
-		que_graph_free_stat_list(((elsif_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((elsif_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_WHILE:
-		que_graph_free_stat_list(((while_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((while_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_FOR:
-		que_graph_free_stat_list(((for_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((for_node_t*) node)->stat_list);
 
 		break;
 
@@ -734,11 +655,11 @@ que_thr_node_step(
 		return(thr);
 	}
 
-	mutex_enter(&kernel_mutex);
+	trx_mutex_enter(thr_get_trx(thr));
 
 	if (que_thr_peek_stop(thr)) {
 
-		mutex_exit(&kernel_mutex);
+		trx_mutex_exit(thr_get_trx(thr));
 
 		return(thr);
 	}
@@ -747,7 +668,7 @@ que_thr_node_step(
 
 	thr->state = QUE_THR_COMPLETED;
 
-	mutex_exit(&kernel_mutex);
+	trx_mutex_exit(thr_get_trx(thr));
 
 	return(NULL);
 }
@@ -764,35 +685,75 @@ que_thr_move_to_run_state(
 /*======================*/
 	que_thr_t*	thr)	/*!< in: an query thread */
 {
-	trx_t*	trx;
-
 	ut_ad(thr->state != QUE_THR_RUNNING);
 
-	trx = thr_get_trx(thr);
-
 	if (!thr->is_active) {
+		trx_t*	trx;
 
-		(thr->graph)->n_active_thrs++;
+		trx = thr_get_trx(thr);
 
-		trx->n_active_thrs++;
+		thr->graph->n_active_thrs++;
 
-		thr->is_active = TRUE;
+		trx->lock.n_active_thrs++;
 
-		ut_ad((thr->graph)->n_active_thrs == 1);
-		ut_ad(trx->n_active_thrs == 1);
+		thr->is_active = TRUE;
 	}
 
 	thr->state = QUE_THR_RUNNING;
 }
 
 /**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx.
+@return	TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_t*		graph;
+	trx_t*		trx = thr_get_trx(thr);
+
+	graph = thr->graph;
+
+	ut_ad(trx_mutex_own(trx));
+
+	if (graph->state == QUE_FORK_COMMAND_WAIT) {
+
+		thr->state = QUE_THR_SUSPENDED;
+
+	} else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+		trx->lock.wait_thr = thr;
+		thr->state = QUE_THR_LOCK_WAIT;
+
+	} else if (trx->error_state != DB_SUCCESS
+		   && trx->error_state != DB_LOCK_WAIT) {
+
+		/* Error handling built for the MySQL interface */
+		thr->state = QUE_THR_COMPLETED;
+
+	} else if (graph->fork_type == QUE_FORK_ROLLBACK) {
+
+		thr->state = QUE_THR_SUSPENDED;
+	} else {
+		ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
 Decrements the query thread reference counts in the query graph and the
-transaction. May start signal handling, e.g., a rollback.
+transaction.
 *** NOTE ***:
 This and que_thr_stop_for_mysql are the only functions where the reference
 count can be decremented and this function may only be called from inside
-que_run_threads or que_thr_check_if_switch! These restrictions exist to make
-the rollback code easier to maintain. */
+que_run_threads! These restrictions exist to make the rollback code easier
+to maintain. */
 static
 void
 que_thr_dec_refer_count(
@@ -804,161 +765,57 @@ que_thr_dec_refer_count(
 					calling function can start running
 					a new query thread */
 {
-	que_fork_t*	fork;
 	trx_t*		trx;
-	ulint		fork_type;
-	ibool		stopped;
+	que_fork_t*	fork;
 
-	fork = thr->common.parent;
 	trx = thr_get_trx(thr);
 
-	mutex_enter(&kernel_mutex);
-
 	ut_a(thr->is_active);
+	ut_ad(trx_mutex_own(trx));
 
 	if (thr->state == QUE_THR_RUNNING) {
 
-		stopped = que_thr_stop(thr);
+		if (!que_thr_stop(thr)) {
+
+			ut_a(next_thr != NULL && *next_thr == NULL);
 
-		if (!stopped) {
 			/* The reason for the thr suspension or wait was
 			already canceled before we came here: continue
-			running the thread */
-
-			/* fputs("!!!!!!!! Wait already ended: continue thr\n",
-			stderr); */
-
-			if (next_thr && *next_thr == NULL) {
-				/* Normally srv_suspend_mysql_thread resets
-				the state to DB_SUCCESS before waiting, but
-				in this case we have to do it here,
-				otherwise nobody does it. */
-				trx->error_state = DB_SUCCESS;
-
-				*next_thr = thr;
-			} else {
-				ut_error;
-				srv_que_task_enqueue_low(thr);
-			}
-
-			mutex_exit(&kernel_mutex);
-
-			return;
-		}
-	}
-
-	ut_ad(fork->n_active_thrs == 1);
-	ut_ad(trx->n_active_thrs == 1);
-
-	fork->n_active_thrs--;
-	trx->n_active_thrs--;
-
-	thr->is_active = FALSE;
-
-	if (trx->n_active_thrs > 0) {
-
-		mutex_exit(&kernel_mutex);
-
-		return;
-	}
-
-	fork_type = fork->fork_type;
+			running the thread.
 
-	/* Check if all query threads in the same fork are completed */
+			This is also possible because in trx_commit_step() we
+			assume a single query thread. We set the query thread
+			state to QUE_THR_RUNNING. */
 
-	if (que_fork_all_thrs_in_state(fork, QUE_THR_COMPLETED)) {
+			/* fprintf(stderr,
+		       		"Wait already ended: trx: %p\n", trx); */
 
-		switch (fork_type) {
-		case QUE_FORK_ROLLBACK:
-			/* This is really the undo graph used in rollback,
-			no roll_node in this graph */
+			/* Normally srv_suspend_mysql_thread resets
+			the state to DB_SUCCESS before waiting, but
+			in this case we have to do it here,
+			otherwise nobody does it. */
 
-			ut_ad(UT_LIST_GET_LEN(trx->signals) > 0);
-			ut_ad(trx->handling_signals == TRUE);
+			trx->error_state = DB_SUCCESS;
 
-			trx_finish_rollback_off_kernel(fork, trx, next_thr);
-			break;
-
-		case QUE_FORK_PURGE:
-		case QUE_FORK_RECOVERY:
-		case QUE_FORK_MYSQL_INTERFACE:
+			*next_thr = thr;
 
-			/* Do nothing */
-			break;
-
-		default:
-			ut_error;	/*!< not used in MySQL */
+			return;
 		}
 	}
 
-	if (UT_LIST_GET_LEN(trx->signals) > 0 && trx->n_active_thrs == 0) {
-
-		/* If the trx is signaled and its query thread count drops to
-		zero, then we start processing a signal; from it we may get
-		a new query thread to run */
-
-		trx_sig_start_handle(trx, next_thr);
-	}
-
-	if (trx->handling_signals && UT_LIST_GET_LEN(trx->signals) == 0) {
-
-		trx_end_signal_handling(trx);
-	}
-
-	mutex_exit(&kernel_mutex);
-}
-
-/**********************************************************************//**
-Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
-to be reserved.
-@return	TRUE if stopped */
-UNIV_INTERN
-ibool
-que_thr_stop(
-/*=========*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	trx_t*	trx;
-	que_t*	graph;
-	ibool	ret	= TRUE;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	graph = thr->graph;
-	trx = graph->trx;
-
-	if (graph->state == QUE_FORK_COMMAND_WAIT) {
-		thr->state = QUE_THR_SUSPENDED;
-
-	} else if (trx->que_state == TRX_QUE_LOCK_WAIT) {
-
-		UT_LIST_ADD_FIRST(trx_thrs, trx->wait_thrs, thr);
-		thr->state = QUE_THR_LOCK_WAIT;
-
-	} else if (trx->error_state != DB_SUCCESS
-		   && trx->error_state != DB_LOCK_WAIT) {
+	fork = static_cast<que_fork_t*>(thr->common.parent);
 
-		/* Error handling built for the MySQL interface */
-		thr->state = QUE_THR_COMPLETED;
+	--trx->lock.n_active_thrs;
 
-	} else if (UT_LIST_GET_LEN(trx->signals) > 0
-		   && graph->fork_type != QUE_FORK_ROLLBACK) {
-
-		thr->state = QUE_THR_SUSPENDED;
-	} else {
-		ut_ad(graph->state == QUE_FORK_ACTIVE);
+	--fork->n_active_thrs;
 
-		ret = FALSE;
-	}
-
-	return(ret);
+	thr->is_active = FALSE;
 }
 
 /**********************************************************************//**
 A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
 query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.c, but the lock has already
+it was put to the lock wait state in lock0lock.cc, but the lock has already
 been granted or the transaction chosen as a victim in deadlock resolution. */
 UNIV_INTERN
 void
@@ -970,7 +827,10 @@ que_thr_stop_for_mysql(
 
 	trx = thr_get_trx(thr);
 
-	mutex_enter(&kernel_mutex);
+	/* Can't be the purge transaction. */
+	ut_a(trx->id != 0);
+
+	trx_mutex_enter(trx);
 
 	if (thr->state == QUE_THR_RUNNING) {
 
@@ -984,22 +844,22 @@ que_thr_stop_for_mysql(
 			already released, or this transaction was chosen
 			as a victim in selective deadlock resolution */
 
-			mutex_exit(&kernel_mutex);
+			trx_mutex_exit(trx);
 
 			return;
 		}
 	}
 
 	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(trx->lock.n_active_thrs == 1);
 	ut_ad(thr->graph->n_active_thrs == 1);
 
 	thr->is_active = FALSE;
-	(thr->graph)->n_active_thrs--;
+	thr->graph->n_active_thrs--;
 
-	trx->n_active_thrs--;
+	trx->lock.n_active_thrs--;
 
-	mutex_exit(&kernel_mutex);
+	trx_mutex_exit(trx);
 }
 
 /**********************************************************************//**
@@ -1027,7 +887,7 @@ que_thr_move_to_run_state_for_mysql(
 
 		thr->graph->n_active_thrs++;
 
-		trx->n_active_thrs++;
+		trx->lock.n_active_thrs++;
 
 		thr->is_active = TRUE;
 	}
@@ -1046,8 +906,9 @@ que_thr_stop_for_mysql_no_error(
 	trx_t*		trx)	/*!< in: transaction */
 {
 	ut_ad(thr->state == QUE_THR_RUNNING);
+	ut_ad(thr_get_trx(thr)->id != 0);
 	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(trx->lock.n_active_thrs == 1);
 	ut_ad(thr->graph->n_active_thrs == 1);
 
 	if (thr->magic_n != QUE_THR_MAGIC_N) {
@@ -1063,9 +924,9 @@ que_thr_stop_for_mysql_no_error(
 	thr->state = QUE_THR_COMPLETED;
 
 	thr->is_active = FALSE;
-	(thr->graph)->n_active_thrs--;
+	thr->graph->n_active_thrs--;
 
-	trx->n_active_thrs--;
+	trx->lock.n_active_thrs--;
 }
 
 /****************************************************************//**
@@ -1148,8 +1009,6 @@ que_node_print_info(
 		str = "CREATE TABLE";
 	} else if (type == QUE_NODE_CREATE_INDEX) {
 		str = "CREATE INDEX";
-	} else if (type == QUE_NODE_INSERT_STATS) {
-		str = "INSERT TO SYS_STATS";
 	} else if (type == QUE_NODE_FOR) {
 		str = "FOR LOOP";
 	} else if (type == QUE_NODE_RETURN) {
@@ -1246,9 +1105,6 @@ que_thr_step(
 	} else if (type == QUE_NODE_LOCK) {
 
 		ut_error;
-		/*
-		thr = que_lock_step(thr);
-		*/
 	} else if (type == QUE_NODE_THR) {
 		thr = que_thr_node_step(thr);
 	} else if (type == QUE_NODE_COMMIT) {
@@ -1267,8 +1123,6 @@ que_thr_step(
 		thr = dict_create_table_step(thr);
 	} else if (type == QUE_NODE_CREATE_INDEX) {
 		thr = dict_create_index_step(thr);
-	} else if (type == QUE_NODE_INSERT_STATS) {
-		thr = dict_insert_stats_step(thr);
 	} else if (type == QUE_NODE_ROW_PRINTF) {
 		thr = row_printf_step(thr);
 	} else {
@@ -1296,51 +1150,56 @@ que_run_threads_low(
 /*================*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	trx_t*		trx;
 	que_thr_t*	next_thr;
-	ulint		loop_count;
 
 	ut_ad(thr->state == QUE_THR_RUNNING);
 	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
-	ut_ad(!mutex_own(&kernel_mutex));
+	ut_ad(!trx_mutex_own(thr_get_trx(thr)));
 
-	loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
-loop:
-	/* Check that there is enough space in the log to accommodate
-	possible log entries by this query step; if the operation can touch
-	more than about 4 pages, checks must be made also within the query
-	step! */
+	/* Cache the transaction: thr may be replaced by next_thr in the loop
+	below, but it must always belong to this same transaction. */
 
-	log_free_check();
+	trx = thr_get_trx(thr);
 
-	/* Perform the actual query step: note that the query thread
-	may change if, e.g., a subprocedure call is made */
+	do {
+		/* Check that there is enough space in the log to accommodate
+		possible log entries by this query step; if the operation can
+		touch more than about 4 pages, checks must be made also within
+		the query step! */
 
-	/*-------------------------*/
-	next_thr = que_thr_step(thr);
-	/*-------------------------*/
+		log_free_check();
 
-	ut_a(!next_thr || (thr_get_trx(next_thr)->error_state == DB_SUCCESS));
+		/* Perform the actual query step: note that the query thread
+		may change if, e.g., a subprocedure call is made */
 
-	loop_count++;
+		/*-------------------------*/
+		next_thr = que_thr_step(thr);
+		/*-------------------------*/
 
-	if (next_thr != thr) {
-		ut_a(next_thr == NULL);
+		trx_mutex_enter(trx);
 
-		/* This can change next_thr to a non-NULL value if there was
-		a lock wait that already completed. */
-		que_thr_dec_refer_count(thr, &next_thr);
+		ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS);
 
-		if (next_thr == NULL) {
+		if (next_thr != thr) {
+			ut_a(next_thr == NULL);
 
-			return;
+			/* This can change next_thr to a non-NULL value
+			if there was a lock wait that already completed. */
+
+			que_thr_dec_refer_count(thr, &next_thr);
+
+			if (next_thr != NULL) {
+
+				thr = next_thr;
+			}
 		}
 
-		loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
+		ut_ad(trx == thr_get_trx(thr));
 
-		thr = next_thr;
-	}
+		trx_mutex_exit(trx);
 
-	goto loop;
+	} while (next_thr != NULL);
 }
 
 /**********************************************************************//**
@@ -1351,11 +1210,12 @@ que_run_threads(
 /*============*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
 loop:
 	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
-	que_run_threads_low(thr);
 
-	mutex_enter(&kernel_mutex);
+	que_run_threads_low(thr);
 
 	switch (thr->state) {
 
@@ -1363,27 +1223,25 @@ loop:
 		/* There probably was a lock wait, but it already ended
 		before we came here: continue running thr */
 
-		mutex_exit(&kernel_mutex);
-
 		goto loop;
 
 	case QUE_THR_LOCK_WAIT:
-		mutex_exit(&kernel_mutex);
+		lock_wait_suspend_thread(thr);
 
-		/* The ..._mysql_... function works also for InnoDB's
-		internal threads. Let us wait that the lock wait ends. */
+		trx_mutex_enter(thr_get_trx(thr));
 
-		srv_suspend_mysql_thread(thr);
+		ut_a(thr_get_trx(thr)->id != 0);
 
 		if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
 			/* thr was chosen as a deadlock victim or there was
 			a lock wait timeout */
 
 			que_thr_dec_refer_count(thr, NULL);
-
-			return;
+			trx_mutex_exit(thr_get_trx(thr));
+			break;
 		}
 
+		trx_mutex_exit(thr_get_trx(thr));
 		goto loop;
 
 	case QUE_THR_COMPLETED:
@@ -1394,15 +1252,13 @@ loop:
 	default:
 		ut_error;
 	}
-
-	mutex_exit(&kernel_mutex);
 }
 
 /*********************************************************************//**
 Evaluate the given SQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 que_eval_sql(
 /*=========*/
 	pars_info_t*	info,	/*!< in: info struct, or NULL */
@@ -1444,7 +1300,35 @@ que_eval_sql(
 
 	que_run_threads(thr);
 
+	if (reserve_dict_mutex) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
 	que_graph_free(graph);
 
+	if (reserve_dict_mutex) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
 	return(trx->error_state);
 }
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void)
+/*==========*/
+{
+	/* No op */
+}
+
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void)
+/*===========*/
+{
+	/* No op */
+}
diff --git a/storage/xtradb/read/read0read.c b/storage/xtradb/read/read0read.cc
index 9efb268da79..887e1717769 100644
--- a/storage/xtradb/read/read0read.c
+++ b/storage/xtradb/read/read0read.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,19 +11,20 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file read/read0read.c
+@file read/read0read.cc
 Cursor read
 
 Created 2/16/1997 Heikki Tuuri
 *******************************************************/
 
 #include "read0read.h"
+#include "read0i_s.h"
 
 #ifdef UNIV_NONINL
 #include "read0read.ic"
@@ -70,7 +71,7 @@ cluster record is accessed.  Because trx_id of the creating
 transaction is stored when this view was created to the list of
 trx_ids not seen by this read view previous version of the
 record is requested to be built. This is build using clustered record.
-If the secondary key record is delete  marked it's corresponding
+If the secondary key record is delete-marked, its corresponding
 clustered record can be already be purged only if records
 trx_id < low_limit_no. Purge can't remove any record deleted by a
 transaction which was active when cursor was created. But, we still
@@ -129,12 +130,49 @@ in the view. If this is not true we build based on undo_rec previous
 version of the record. This record is found because purge can't remove
 records accessed by active transaction. Thus we see correct version. Q. E. D.
 -------------------------------------------------------------------------------
-FACT C: Purge does not remove any delete marked row that is visible
+FACT C: Purge does not remove any delete-marked row that is visible
 -------
-to cursor view.
-
-TODO: proof this
-
+in any cursor read view.
+
+PROOF: We know that:
+ 1: Currently active read views in trx_sys_t::view_list are ordered by
+    read_view_t::low_limit_no in descending order, that is,
+    newest read view first.
+
+ 2: Purge clones the oldest read view and uses that to determine whether there
+    are any active transactions that can see the to-be-purged records.
+
+Therefore any joining or active transaction will not have a view older
+than the purge view, according to 1.
+
+When purge needs to remove a delete-marked row from a secondary index,
+it will first check that the DB_TRX_ID value of the corresponding
+record in the clustered index is older than the purge view. It will
+also check if there is a newer version of the row (clustered index
+record) that is not delete-marked in the secondary index. If such a
+row exists and is collation-equal to the delete-marked secondary index
+record then purge will not remove the secondary index record.
+
+Delete-marked clustered index records will be removed by
+row_purge_remove_clust_if_poss(), unless the clustered index record
+(and its DB_ROLL_PTR) has been updated. Every new version of the
+clustered index record will update DB_ROLL_PTR, pointing to a new UNDO
+log entry that allows the old version to be reconstructed. The
+DB_ROLL_PTR in the oldest remaining version in the old-version chain
+may be pointing to garbage (an undo log record discarded by purge),
+but it will never be dereferenced, because the purge view is older
+than any active transaction.
+
+For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
+
+Some additional issues:
+
+What if trx_sys->view_list is empty and some transaction T1 and Purge both
+try to open a read view at the same time? Only one can acquire trx_sys->mutex.
+In which order will the views be opened? Should it matter? If not, why?
+
+The order does not matter. No new transactions can be created and no running
+transaction can commit or rollback (or free views).
 */
 
 /*********************************************************************//**
@@ -145,12 +183,14 @@ read_view_t*
 read_view_create_low(
 /*=================*/
 	ulint		n,	/*!< in: number of cells in the trx_ids array */
-	read_view_t*	view)	/*!< in: pre-allocated view array or NULL if a
-				new one needs to be created */
+	read_view_t*&	view)	/*!< in,out: pre-allocated view array or NULL if
+				a new one needs to be created */
 {
 	if (view == NULL) {
-		view = ut_malloc(sizeof(read_view_t));
-		srv_read_views_memory += sizeof(read_view_t);
+		view = static_cast<read_view_t*>(
+			ut_malloc(sizeof(read_view_t)));
+		os_atomic_increment_ulint(&srv_read_views_memory,
+					  sizeof(read_view_t));
 		view->max_descr = 0;
 		view->descriptors = NULL;
 	}
@@ -160,12 +200,14 @@ read_view_create_low(
 		/* avoid frequent re-allocations by extending the array to the
 		desired size + 10% */
 
-		srv_read_views_memory += (n + n / 10 - view->max_descr) *
-			sizeof(trx_id_t);
+		os_atomic_increment_ulint(&srv_read_views_memory,
+					  (n + n / 10 - view->max_descr) *
+					  sizeof(trx_id_t));
 		view->max_descr = n + n / 10;
-		view->descriptors = ut_realloc(view->descriptors,
-					       view->max_descr *
-					       sizeof(trx_id_t));
+		view->descriptors = static_cast<trx_id_t*>(
+			ut_realloc(view->descriptors,
+				   view->max_descr *
+				   sizeof *view->descriptors));
 	}
 
 	view->n_descr = n;
@@ -174,128 +216,108 @@ read_view_create_low(
 }
 
 /*********************************************************************//**
-Makes a copy of the oldest existing read view, with the exception that also
-the creating trx of the oldest view is set as not visible in the 'copied'
-view. Opens a new view if no views currently exist. The view must be closed
-with ..._close. This is used in purge.
-@return	own: read view struct */
-UNIV_INTERN
+Clones a read view object into prebuilt_clone, which is allocated first if
+it is NULL. All fields of the source view are copied except descriptors and
+max_descr: the clone keeps its own descriptors array, into which the
+view->n_descr trx ids of the source view are copied.
+@return	read view struct */
+UNIV_INLINE
 read_view_t*
-read_view_oldest_copy_or_open_new(
-/*==============================*/
-	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
-					transaction, or 0 used in purge */
-	read_view_t*	view)		/*!< in: pre-allocated view array or
-					NULL if a new one needs to be created */
+read_view_clone(
+/*============*/
+	const read_view_t*	view,		/*!< in: view to clone */
+	read_view_t*&		prebuilt_clone)	/*!< in,out: prebuilt view or
+						NULL */
 {
-	read_view_t*	old_view;
-	read_view_t*	view_copy;
-	ibool		needs_insert	= TRUE;
-	ulint		insert_done	= 0;
-	ulint		n;
-	ulint		i;
+	read_view_t*	clone;
+	trx_id_t*	old_descriptors;
+	ulint		old_max_descr;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	old_view = UT_LIST_GET_LAST(trx_sys->view_list);
+	clone = read_view_create_low(view->n_descr, prebuilt_clone);
 
-	if (old_view == NULL) {
+	old_descriptors = clone->descriptors;
+	old_max_descr = clone->max_descr;
 
-		return(read_view_open_now(cr_trx_id, view, TRUE));
-	}
+	memcpy(clone, view, sizeof(*view));
 
-	n = old_view->n_descr;
+	clone->descriptors = old_descriptors;
+	clone->max_descr = old_max_descr;
 
-	if (old_view->creator_trx_id) {
-		n++;
-	} else {
-		needs_insert = FALSE;
+	if (view->n_descr) {
+		memcpy(clone->descriptors, view->descriptors,
+		       view->n_descr * sizeof(trx_id_t));
 	}
 
-	view_copy = read_view_create_low(n, view);
-
-	/* Insert the id of the creator in the right place of the descending
-	array of ids, if needs_insert is TRUE: */
+	return(clone);
+}
 
-	i = 0;
-	while (i < n) {
-		if (needs_insert
-		    && (i >= old_view->n_descr
-			|| old_view->creator_trx_id
-			> read_view_get_nth_trx_id(old_view, i))) {
+/*********************************************************************//**
+Insert the view in the proper order into the trx_sys->view_list. The
+read view list is ordered by read_view_t::low_limit_no in descending order. */
+static
+void
+read_view_add(
+/*==========*/
+	read_view_t*	view)		/*!< in: view to add to */
+{
+	read_view_t*	elem;
+	read_view_t*	prev_elem;
 
-			read_view_set_nth_trx_id(view_copy, i,
-						 old_view->creator_trx_id);
-			needs_insert = FALSE;
-			insert_done = 1;
-		} else {
-			read_view_set_nth_trx_id(view_copy, i,
-						 read_view_get_nth_trx_id(
-							 old_view,
-							 i - insert_done));
-		}
+	ut_ad(mutex_own(&trx_sys->mutex));
+	ut_ad(read_view_validate(view));
 
-		i++;
+	/* Find the correct slot for insertion. */
+	for (elem = UT_LIST_GET_FIRST(trx_sys->view_list), prev_elem = NULL;
+	     elem != NULL && view->low_limit_no < elem->low_limit_no;
+	     prev_elem = elem, elem = UT_LIST_GET_NEXT(view_list, elem)) {
+		/* No op */
 	}
 
-	view_copy->creator_trx_id = cr_trx_id;
-
-	view_copy->low_limit_no = old_view->low_limit_no;
-	view_copy->low_limit_id = old_view->low_limit_id;
-
-
-	if (n > 0) {
-		/* The last active transaction has the smallest id: */
-		view_copy->up_limit_id = read_view_get_nth_trx_id(
-			view_copy, n - 1);
+	if (prev_elem == NULL) {
+		UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
 	} else {
-		view_copy->up_limit_id = old_view->up_limit_id;
+		UT_LIST_INSERT_AFTER(
+			view_list, trx_sys->view_list, prev_elem, view);
 	}
 
-	UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy);
-
-	return(view_copy);
+	ut_ad(read_view_list_validate());
 }
 
 /*********************************************************************//**
 Opens a read view where exactly the transactions serialized before this
 point in time are seen in the view.
 @return	own: read view struct */
-UNIV_INTERN
+static
 read_view_t*
-read_view_open_now(
-/*===============*/
+read_view_open_now_low(
+/*===================*/
 	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
 					transaction, or 0 used in purge */
-	read_view_t*	view,		/*!< in: current read view or NULL if it
-					doesn't exist yet */
-	ibool		exclude_self)	/*!< in: TRUE, if cr_trx_id should be
-					excluded from the resulting view */
+	read_view_t*&	view)		/*!< in,out: pre-allocated view array or
+					NULL if a new one needs to be created */
 {
 	trx_id_t*	descr;
 	ulint		i;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
 	view = read_view_create_low(trx_sys->descr_n_used, view);
 
-	view->creator_trx_id = cr_trx_id;
-	view->type = VIEW_NORMAL;
 	view->undo_no = 0;
+	view->type = VIEW_NORMAL;
+	view->creator_trx_id = cr_trx_id;
 
 	/* No future transactions should be visible in the view */
 
 	view->low_limit_no = trx_sys->max_trx_id;
 	view->low_limit_id = view->low_limit_no;
 
-	/* No active transaction should be visible */
-
 	descr = trx_find_descriptor(trx_sys->descriptors,
 				    trx_sys->descr_n_used,
 				    cr_trx_id);
-
-	if (UNIV_LIKELY(exclude_self && descr != NULL)) {
-
+	if (UNIV_LIKELY(descr != NULL)) {
 		ut_ad(trx_sys->descr_n_used > 0);
 		ut_ad(view->n_descr > 0);
 
@@ -307,24 +329,22 @@ read_view_open_now(
 	}
 
 	if (UNIV_LIKELY(i > 0)) {
-
 		/* Copy the [0; i-1] range */
 		memcpy(view->descriptors, trx_sys->descriptors,
 		       i * sizeof(trx_id_t));
 	}
 
 	if (UNIV_UNLIKELY(i + 1 < trx_sys->descr_n_used)) {
-
 		/* Copy the [i+1; descr_n_used-1] range */
 		memcpy(view->descriptors + i,
 		       trx_sys->descriptors + i + 1,
 		       (trx_sys->descr_n_used - i - 1) *
 		       sizeof(trx_id_t));
-	}
+ 	}
 
 	/* NOTE that a transaction whose trx number is < trx_sys->max_trx_id can
 	still be active, if it is in the middle of its commit! Note that when a
-	transaction starts, we initialize trx->no to IB_ULONGLONG_MAX. */
+	transaction starts, we initialize trx->no to TRX_ID_MAX. */
 
 	if (UT_LIST_GET_LEN(trx_sys->trx_serial_list) > 0) {
 
@@ -344,43 +364,121 @@ read_view_open_now(
 		view->up_limit_id = view->low_limit_id;
 	}
 
-
-	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+	/* Purge views are not added to the view list. */
+	if (cr_trx_id > 0) {
+		read_view_add(view);
+	}
 
 	return(view);
 }
 
 /*********************************************************************//**
-Closes a read view. */
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return	own: read view struct */
 UNIV_INTERN
-void
-read_view_close(
-/*============*/
-	read_view_t*	view)	/*!< in: read view */
+read_view_t*
+read_view_open_now(
+/*===============*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or 0 used in purge */
+	read_view_t*&	view)		/*!< in,out: pre-allocated view array or
+					NULL if a new one needs to be created */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	mutex_enter(&trx_sys->mutex);
+
+	view = read_view_open_now_low(cr_trx_id, view);
 
-	UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+	mutex_exit(&trx_sys->mutex);
+
+	return(view);
 }
 
 /*********************************************************************//**
-Frees resource allocated by a read view. */
+Makes a copy of the oldest existing read view, with the exception that also
+the creating trx of the oldest view is set as not visible in the 'copied'
+view. Opens a new view if no views currently exist. The view must be closed
+with ..._close. This is used in purge.
+@return	own: read view struct */
 UNIV_INTERN
-void
-read_view_free(
-/*===========*/
-	read_view_t*	view)	/*< in: read view */
+read_view_t*
+read_view_purge_open(
+/*=================*/
+	read_view_t*&	prebuilt_clone,	/*!< in,out: pre-allocated view that
+					will be used to clone the oldest view if
+					exists */
+	read_view_t*&	prebuilt_view)	/*!< in,out: pre-allocated view array or
+					NULL if a new one needs to be created */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ulint		i;
+	read_view_t*	view;
+	read_view_t*	oldest_view;
+	trx_id_t	creator_trx_id;
+	ulint		insert_done	= 0;
 
-	srv_read_views_memory -= sizeof(read_view_t) +
-		view->max_descr * sizeof(trx_id_t);
+	mutex_enter(&trx_sys->mutex);
 
-	if (view->descriptors != NULL) {
-		ut_free(view->descriptors);
+	oldest_view = UT_LIST_GET_LAST(trx_sys->view_list);
+
+	if (oldest_view == NULL) {
+
+		view = read_view_open_now_low(0, prebuilt_view);
+
+		mutex_exit(&trx_sys->mutex);
+
+		return(view);
 	}
 
-	ut_free(view);
+	/* Clone the oldest view to a pre-allocated clone view */
+
+	oldest_view = read_view_clone(oldest_view, prebuilt_clone);
+
+	ut_ad(read_view_validate(oldest_view));
+
+	mutex_exit(&trx_sys->mutex);
+
+	ut_a(oldest_view->creator_trx_id > 0);
+	creator_trx_id = oldest_view->creator_trx_id;
+
+	view = read_view_create_low(oldest_view->n_descr + 1, prebuilt_view);
+
+	/* Add the creator transaction id in the trx_ids array in the
+	correct slot. */
+
+	for (i = 0; i < oldest_view->n_descr; ++i) {
+		trx_id_t	id;
+
+		id = oldest_view->descriptors[i - insert_done];
+
+		if (insert_done == 0 && creator_trx_id < id) {
+			id = creator_trx_id;
+			insert_done = 1;
+		}
+
+		view->descriptors[i] = id;
+	}
+
+	if (insert_done == 0) {
+		view->descriptors[i] = creator_trx_id;
+	} else {
+		ut_a(i > 0);
+		view->descriptors[i] = oldest_view->descriptors[i - 1];
+	}
+
+	view->creator_trx_id = 0;
+
+	view->low_limit_no = oldest_view->low_limit_no;
+	view->low_limit_id = oldest_view->low_limit_id;
+
+	if (view->n_descr > 0) {
+		/* The first descriptor holds the smallest trx id: */
+
+		view->up_limit_id = view->descriptors[0];
+	} else {
+		view->up_limit_id = oldest_view->up_limit_id;
+	}
+
+	return(view);
 }
 
 /*********************************************************************//**
@@ -390,27 +488,23 @@ UNIV_INTERN
 void
 read_view_close_for_mysql(
 /*======================*/
-	trx_t*	trx)	/*!< in: trx which has a read view */
+	trx_t*		trx)	/*!< in: trx which has a read view */
 {
 	ut_a(trx->global_read_view);
 
-	mutex_enter(&kernel_mutex);
-
-	read_view_close(trx->global_read_view);
+	read_view_remove(trx->global_read_view, false);
 
 	trx->read_view = NULL;
 	trx->global_read_view = NULL;
-
-	mutex_exit(&kernel_mutex);
 }
 
 /*********************************************************************//**
-Prints a read view to stderr. */
+Prints a read view to file. */
 UNIV_INTERN
 void
 read_view_print(
 /*============*/
-	FILE*			file,
+	FILE*			file,	/*!< in: file to print to */
 	const read_view_t*	view)	/*!< in: read view */
 {
 	ulint	n_ids;
@@ -418,20 +512,20 @@ read_view_print(
 
 	if (view->type == VIEW_HIGH_GRANULARITY) {
 		fprintf(file,
-			"High-granularity read view undo_n:o %llu\n",
-			(ullint) view->undo_no);
+			"High-granularity read view undo_n:o " TRX_ID_FMT "\n",
+			view->undo_no);
 	} else {
 		fprintf(file, "Normal read view\n");
 	}
 
 	fprintf(file, "Read view low limit trx n:o " TRX_ID_FMT "\n",
-		(ullint) view->low_limit_no);
+		view->low_limit_no);
 
 	fprintf(file, "Read view up limit trx id " TRX_ID_FMT "\n",
-		(ullint) view->up_limit_id);
+		view->up_limit_id);
 
 	fprintf(file, "Read view low limit trx id " TRX_ID_FMT "\n",
-		(ullint) view->low_limit_id);
+		view->low_limit_id);
 
 	fprintf(file, "Read view individually stored trx ids:\n");
 
@@ -439,10 +533,66 @@ read_view_print(
 
 	for (i = 0; i < n_ids; i++) {
 		fprintf(file, "Read view trx id " TRX_ID_FMT "\n",
-			(ullint) read_view_get_nth_trx_id(view, i));
+			view->descriptors[i]);
 	}
 }
 
+UNIV_INTERN	/* Copies fields of the last read view in the list into rv */
+i_s_xtradb_read_view_t*	/* out: rv, or NULL if the view list is empty */
+read_fill_i_s_xtradb_read_view(i_s_xtradb_read_view_t* rv) /* out: filled */
+{
+	read_view_t*    view;	/* last element of trx_sys->view_list */
+
+	mutex_enter(&trx_sys->mutex);	/* held while the view is read */
+
+	if (UT_LIST_GET_LEN(trx_sys->view_list)) {
+		view = UT_LIST_GET_LAST(trx_sys->view_list);
+	} else {
+		mutex_exit(&trx_sys->mutex);	/* rv is left untouched */
+		return NULL;
+	}
+
+	if (view->type == VIEW_HIGH_GRANULARITY) {
+		rv->undo_no = view->undo_no;
+	} else {
+		rv->undo_no = ULINT_UNDEFINED;	/* other view types */
+	}
+
+	rv->low_limit_no = view->low_limit_no;
+	rv->up_limit_id = view->up_limit_id;
+	rv->low_limit_id = view->low_limit_id;
+
+	mutex_exit(&trx_sys->mutex);
+
+	return rv;
+}
+
+/*********************************************************************//**
+Frees a read view and its descriptor array, and NULLs the caller's pointer. */
+UNIV_INTERN
+void
+read_view_free(
+/*===========*/
+	read_view_t*&	view)	/*!< in,out: read view; set to NULL on return */
+{
+	if (view == NULL) {
+
+		return;
+	}
+	/* Deduct this view's size from srv_read_views_memory. */
+	os_atomic_decrement_ulint(&srv_read_views_memory,
+				 sizeof(read_view_t) +
+				 view->max_descr * sizeof(trx_id_t));
+
+	if (view->descriptors != NULL) {
+		ut_free(view->descriptors);
+	}
+
+	ut_free(view);
+
+	view = NULL;	/* by-reference: clears the caller's pointer */
+}
+
 /*********************************************************************//**
 Create a high-granularity consistent cursor view for mysql to be used
 in cursors. In this consistent read view modifications done by the
@@ -452,36 +602,38 @@ UNIV_INTERN
 cursor_view_t*
 read_cursor_view_create_for_mysql(
 /*==============================*/
-	trx_t*	cr_trx)	/*!< in: trx where cursor view is created */
+	trx_t*		cr_trx)	/*!< in: trx where cursor view is created */
 {
-	cursor_view_t*	curview;
 	read_view_t*	view;
 	mem_heap_t*	heap;
-
-	ut_a(cr_trx);
+	cursor_view_t*	curview;
 
 	/* Use larger heap than in trx_create when creating a read_view
 	because cursors are quite long. */
 
 	heap = mem_heap_create(512);
 
-	curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t));
+	curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(*curview));
+
 	curview->heap = heap;
 
-	/* Drop cursor tables from consideration when evaluating the need of
-	auto-commit */
+	/* Drop cursor tables from consideration when evaluating the
+	need of auto-commit */
+
 	curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
-	cr_trx->n_mysql_tables_in_use = 0;
 
-	mutex_enter(&kernel_mutex);
+	cr_trx->n_mysql_tables_in_use = 0;
 
-	curview->read_view = read_view_open_now(cr_trx->id, NULL, FALSE);
+	mutex_enter(&trx_sys->mutex);
 
-	mutex_exit(&kernel_mutex);
+	curview->read_view = NULL;
+	read_view_open_now_low(UINT64_UNDEFINED, curview->read_view);
 
 	view = curview->read_view;
-	view->type = VIEW_HIGH_GRANULARITY;
 	view->undo_no = cr_trx->undo_no;
+	view->type = VIEW_HIGH_GRANULARITY;
+
+	mutex_exit(&trx_sys->mutex);
 
 	return(curview);
 }
@@ -504,15 +656,11 @@ read_cursor_view_close_for_mysql(
 	belong to this transaction */
 	trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use;
 
-	mutex_enter(&kernel_mutex);
-
-	read_view_close(curview->read_view);
+	read_view_remove(curview->read_view, false);
 	read_view_free(curview->read_view);
 
 	trx->read_view = trx->global_read_view;
 
-	mutex_exit(&kernel_mutex);
-
 	mem_heap_free(curview->heap);
 }
 
@@ -529,7 +677,7 @@ read_cursor_set_for_mysql(
 {
 	ut_a(trx);
 
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&trx_sys->mutex);
 
 	if (UNIV_LIKELY(curview != NULL)) {
 		trx->read_view = curview->read_view;
@@ -537,5 +685,7 @@ read_cursor_set_for_mysql(
 		trx->read_view = trx->global_read_view;
 	}
 
-	mutex_exit(&kernel_mutex);
+	ut_ad(read_view_validate(trx->read_view));
+
+	mutex_exit(&trx_sys->mutex);
 }
diff --git a/storage/xtradb/rem/rem0cmp.c b/storage/xtradb/rem/rem0cmp.cc
index a49a42e8c3f..db0fdf3ee21 100644
--- a/storage/xtradb/rem/rem0cmp.c
+++ b/storage/xtradb/rem/rem0cmp.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file rem/rem0cmp.c
+@file rem/rem0cmp.cc
 Comparison services for records
 
 Created 7/1/1994 Heikki Tuuri
@@ -29,6 +29,8 @@ Created 7/1/1994 Heikki Tuuri
 #include "rem0cmp.ic"
 #endif
 
+#include "ha_prototypes.h"
+#include "handler0alter.h"
 #include "srv0srv.h"
 
 /*		ALPHABETICAL ORDER
@@ -68,10 +70,12 @@ cmp_debug_dtuple_rec_with_match(
 				has an equal number or more fields than
 				dtuple */
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
-	ulint*		matched_fields);/*!< in/out: number of already
+	ulint		n_cmp,	/*!< in: number of fields to compare */
+	ulint*		matched_fields)/*!< in/out: number of already
 				completely  matched fields; when function
 				returns, contains the value for current
 				comparison */
+	__attribute__((nonnull, warn_unused_result));
 #endif /* UNIV_DEBUG */
 /*************************************************************//**
 This function is used to compare two data fields for which the data type
@@ -90,6 +94,23 @@ innobase_mysql_cmp(
 	const unsigned char* b,		/*!< in: data field */
 	unsigned int	b_length);	/*!< in: data field length,
 					not UNIV_SQL_NULL */
+/*************************************************************//**
+Used by cmp_dfield_dfield_like_prefix() to compare two data fields whose
+data type is such that we must use MySQL code to compare them. The
+prototype here must be a copy of the one in ha_innodb.cc!
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length);	/*!< in: data field length,
+					not UNIV_SQL_NULL */
 /*********************************************************************//**
 Transforms the character code so that it is ordered appropriately for the
 language. This is only used for the latin1 char set. MySQL does the
@@ -184,8 +205,8 @@ cmp_whole_field(
 
 	case DATA_DECIMAL:
 		/* Remove preceding spaces */
-		for (; a_length && *a == ' '; a++, a_length--);
-		for (; b_length && *b == ' '; b++, b_length--);
+		for (; a_length && *a == ' '; a++, a_length--) { }
+		for (; b_length && *b == ' '; b++, b_length--) { }
 
 		if (*a == '-') {
 			if (*b != '-') {
@@ -271,7 +292,7 @@ cmp_whole_field(
 	case DATA_MYSQL:
 		return(innobase_mysql_cmp(
 			       (int)(prtype & DATA_MYSQL_TYPE_MASK),
-			       (uint)dtype_get_charset_coll(prtype),
+			       (uint) dtype_get_charset_coll(prtype),
 			       a, a_length, b, b_length));
 	default:
 		fprintf(stderr,
@@ -283,6 +304,44 @@ cmp_whole_field(
 	return(0);
 }
 
+/*****************************************************************
+Compares two dfields for the LIKE prefix match. At least the first
+dfield must have its data type field set. */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2)/* in: data field */
+{
+	const dtype_t*	type;
+	int		ret;	/* signed: both comparators return int */
+
+	ut_ad(dfield_check_typed(dfield1));
+
+	type = dfield_get_type(dfield1);
+
+	if (type->mtype >= DATA_FLOAT) {
+		ret = innobase_mysql_cmp_prefix(
+			(int)(type->prtype & DATA_MYSQL_TYPE_MASK),
+			(uint) dtype_get_charset_coll(type->prtype),
+			static_cast<byte*>(dfield_get_data(dfield1)),
+			dfield_get_len(dfield1),
+			static_cast<byte*>(dfield_get_data(dfield2)),
+			dfield_get_len(dfield2));
+	} else {
+		ret = (cmp_data_data_like_prefix(
+			static_cast<byte*>(dfield_get_data(dfield1)),
+			dfield_get_len(dfield1),
+			static_cast<byte*>(dfield_get_data(dfield2)),
+			dfield_get_len(dfield2)));
+	}
+
+	return(ret);
+}
+
 /*************************************************************//**
 This function is used to compare two data fields for which we know the
 data type.
@@ -396,6 +455,162 @@ next_byte:
 	return(0);		/* Not reached */
 }
 
+/*****************************************************************
+Compares two data fields known to be VARCHAR: bytewise in cmp_collate()
+order over the common prefix, then by length (no padding is applied). */
+
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+				/* out: 1, 0, -1, if lhs is greater, equal,
+				less than rhs, respectively */
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		lhs_len,/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		rhs_len)/* in: data field length; not UNIV_SQL_NULL */
+{
+	ulint	i;
+
+	ut_a(rhs_len != UNIV_SQL_NULL);
+
+	if (lhs_len == UNIV_SQL_NULL) {
+
+		/* We define the SQL null to be the smallest possible
+		value of a field in the alphabetical order */
+
+		return(-1);
+	}
+
+	/* Compare byte by byte over the common prefix. */
+
+	for (i = 0; i < lhs_len && i < rhs_len; ++i, ++rhs, ++lhs) {
+		ulint	lhs_byte = *lhs;
+		ulint	rhs_byte = *rhs;
+
+		if (lhs_byte != rhs_byte) {
+			/* The raw bytes differ, but they may still collate
+			equal; in that case keep scanning. */
+
+			lhs_byte = cmp_collate(lhs_byte);
+			rhs_byte = cmp_collate(rhs_byte);
+
+			if (lhs_byte > rhs_byte) {
+
+				return(1);
+			} else if (lhs_byte < rhs_byte) {
+
+				return(-1);
+			}
+		}
+	}
+	/* Equal common prefix: the longer field is the greater one. */
+	return(lhs_len == rhs_len ? 0 : (lhs_len > rhs_len ? 1 : -1));
+}
+
+/*****************************************************************
+Compares two data fields of known type for the LIKE prefix match: rhs is
+the prefix that lhs is tested against, in cmp_collate() byte order. */
+
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+				/* out: 0 if all len2 bytes of rhs match lhs,
+				-1 if lhs is NULL, else 1 or -1 (see body) */
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2)	/* in: data field length; not UNIV_SQL_NULL */
+{
+	ulint	i;
+
+	ut_a(len2 != UNIV_SQL_NULL);
+
+	if (len1 == UNIV_SQL_NULL) {
+
+		/* We define the SQL null to be the smallest possible
+		value of a field in the alphabetical order */
+
+		return(-1);
+	}
+
+	/* Compare byte by byte over the common prefix. */
+
+	for (i = 0; i < len1 && i < len2; ++i, ++rhs, ++lhs) {
+		ulint	lhs_byte = *lhs;
+		ulint	rhs_byte = *rhs;
+
+		if (lhs_byte != rhs_byte) {
+			/* The raw bytes differ, but they may still collate
+			equal; in that case keep scanning. */
+
+			lhs_byte = cmp_collate(lhs_byte);
+			rhs_byte = cmp_collate(rhs_byte);
+
+			if (lhs_byte > rhs_byte) {
+
+				return(1);
+			} else if (lhs_byte < rhs_byte) {
+
+				return(-1);
+			}
+		}
+	}
+
+	return(i == len2 ? 0 : 1);	/* 1: lhs ended before rhs did */
+}
+
+/*****************************************************************
+Placeholder for the LIKE suffix comparison of two data fields of known
+type. Not implemented: the body only raises ut_error. */
+
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+				/* out: intended to be 1, 0, -1 as for the
+				other comparators; the stub returns 1 */
+				/* in: data field (== a pointer to a
+				memory buffer) */
+	const byte*	data1 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len1 UNIV_UNUSED,
+				/* in: data field (== a pointer to a memory
+				buffer) */
+	const byte*	data2 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len2 UNIV_UNUSED)
+
+{
+	ut_error;	// FIXME: implement the suffix comparison
+	return(1);	/* dummy value following ut_error */
+}
+
+/*****************************************************************
+Placeholder for the LIKE substring comparison of two data fields of known
+type. Not implemented: the body only raises ut_error. */
+
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+				/* out: intended to be 1, 0, -1 as for the
+				other comparators; the stub returns 1 */
+				/* in: data field (== a pointer to a
+				memory buffer) */
+	const byte*	data1 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len1 UNIV_UNUSED,
+				/* in: data field (== a pointer to a memory
+				buffer) */
+	const byte*	data2 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len2 UNIV_UNUSED)
+{
+	ut_error;	// FIXME: implement the substring comparison
+	return(1);	/* dummy value following ut_error */
+}
 /*************************************************************//**
 This function is used to compare a data tuple to a physical record.
 Only dtuple->n_fields_cmp first fields are taken into account for
@@ -409,14 +624,15 @@ respectively, when only the common first fields are compared, or until
 the first externally stored field in rec */
 UNIV_INTERN
 int
-cmp_dtuple_rec_with_match(
-/*======================*/
+cmp_dtuple_rec_with_match_low(
+/*==========================*/
 	const dtuple_t*	dtuple,	/*!< in: data tuple */
 	const rec_t*	rec,	/*!< in: physical record which differs from
 				dtuple in some of the common fields, or which
 				has an equal number or more fields than
 				dtuple */
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n_cmp,	/*!< in: number of fields to compare */
 	ulint*		matched_fields, /*!< in/out: number of already completely
 				matched fields; when function returns,
 				contains the value for current comparison */
@@ -440,7 +656,7 @@ cmp_dtuple_rec_with_match(
 	ulint		cur_field;	/* current field number */
 	ulint		cur_bytes;	/* number of already matched bytes
 					in current field */
-	int		ret = 3333;	/* return value */
+	int		ret;		/* return value */
 
 	ut_ad(dtuple && rec && matched_fields && matched_bytes);
 	ut_ad(dtuple_check_typed(dtuple));
@@ -449,7 +665,9 @@ cmp_dtuple_rec_with_match(
 	cur_field = *matched_fields;
 	cur_bytes = *matched_bytes;
 
-	ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(n_cmp > 0);
+	ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+	ut_ad(cur_field <= n_cmp);
 	ut_ad(cur_field <= rec_offs_n_fields(offsets));
 
 	if (cur_bytes == 0 && cur_field == 0) {
@@ -469,7 +687,7 @@ cmp_dtuple_rec_with_match(
 	/* Match fields in a loop; stop if we run out of fields in dtuple
 	or find an externally stored field */
 
-	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+	while (cur_field < n_cmp) {
 
 		ulint	mtype;
 		ulint	prtype;
@@ -527,10 +745,12 @@ cmp_dtuple_rec_with_match(
 			&& dtype_get_charset_coll(prtype)
 			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
 
-			ret = cmp_whole_field(mtype, prtype,
-					      dfield_get_data(dtuple_field),
-					      (unsigned) dtuple_f_len,
-					      rec_b_ptr, (unsigned) rec_f_len);
+			ret = cmp_whole_field(
+				mtype, prtype,
+				static_cast<const byte*>(
+					dfield_get_data(dtuple_field)),
+				(unsigned) dtuple_f_len,
+				rec_b_ptr, (unsigned) rec_f_len);
 
 			if (ret != 0) {
 				cur_bytes = 0;
@@ -544,7 +764,7 @@ cmp_dtuple_rec_with_match(
 		/* Set the pointers at the current byte */
 
 		rec_b_ptr = rec_b_ptr + cur_bytes;
-		dtuple_b_ptr = (byte*)dfield_get_data(dtuple_field)
+		dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)
 			+ cur_bytes;
 		/* Compare then the fields */
 
@@ -624,7 +844,7 @@ next_field:
 order_resolved:
 	ut_ad((ret >= - 1) && (ret <= 1));
 	ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets,
-						     matched_fields));
+						     n_cmp, matched_fields));
 	ut_ad(*matched_fields == cur_field); /* In the debug version, the
 					     above cmp_debug_... sets
 					     *matched_fields to a value */
@@ -695,156 +915,181 @@ cmp_dtuple_is_prefix_of_rec(
 }
 
 /*************************************************************//**
-Compare two physical records that contain the same number of columns,
-none of which are stored externally.
-@return	1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
-UNIV_INTERN
+Compare two physical record fields.
+@retval 1 if rec1 field is greater than rec2
+@retval -1 if rec1 field is less than rec2
+@retval 0 if rec1 field equals to rec2 */
+static __attribute__((nonnull, warn_unused_result))
 int
-cmp_rec_rec_simple(
-/*===============*/
+cmp_rec_rec_simple_field(
+/*=====================*/
 	const rec_t*		rec1,	/*!< in: physical record */
 	const rec_t*		rec2,	/*!< in: physical record */
 	const ulint*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
 	const ulint*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
 	const dict_index_t*	index,	/*!< in: data dictionary index */
-	ibool*			null_eq)/*!< out: set to TRUE if
-					found matching null values */
+	ulint			n)	/*!< in: field to compare */
 {
-	ulint		rec1_f_len;	/*!< length of current field in rec1 */
-	const byte*	rec1_b_ptr;	/*!< pointer to the current byte
-					in rec1 field */
-	ulint		rec1_byte;	/*!< value of current byte to be
-					compared in rec1 */
-	ulint		rec2_f_len;	/*!< length of current field in rec2 */
-	const byte*	rec2_b_ptr;	/*!< pointer to the current byte
-					in rec2 field */
-	ulint		rec2_byte;	/*!< value of current byte to be
-					compared in rec2 */
-	ulint		cur_field;	/*!< current field number */
-	ulint		n_uniq;
-
-	n_uniq = dict_index_get_n_unique(index);
-	ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
-	ut_ad(rec_offs_n_fields(offsets2) >= n_uniq);
-
-	ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+	const byte*	rec1_b_ptr;
+	const byte*	rec2_b_ptr;
+	ulint		rec1_f_len;
+	ulint		rec2_f_len;
+	const dict_col_t*	col	= dict_index_get_nth_col(index, n);
 
-	for (cur_field = 0; cur_field < n_uniq; cur_field++) {
+	ut_ad(!rec_offs_nth_extern(offsets1, n));
+	ut_ad(!rec_offs_nth_extern(offsets2, n));
 
-		ulint	cur_bytes;
-		ulint	mtype;
-		ulint	prtype;
+	rec1_b_ptr = rec_get_nth_field(rec1, offsets1, n, &rec1_f_len);
+	rec2_b_ptr = rec_get_nth_field(rec2, offsets2, n, &rec2_f_len);
 
-		{
-			const dict_col_t*	col
-				= dict_index_get_nth_col(index, cur_field);
-
-			mtype = col->mtype;
-			prtype = col->prtype;
+	if (rec1_f_len == UNIV_SQL_NULL || rec2_f_len == UNIV_SQL_NULL) {
+		if (rec1_f_len == rec2_f_len) {
+			return(0);
 		}
+		/* We define the SQL null to be the smallest possible
+		value of a field in the alphabetical order */
+		return(rec1_f_len == UNIV_SQL_NULL ? -1 : 1);
+	}
 
-		ut_ad(!rec_offs_nth_extern(offsets1, cur_field));
-		ut_ad(!rec_offs_nth_extern(offsets2, cur_field));
+	if (col->mtype >= DATA_FLOAT
+	    || (col->mtype == DATA_BLOB
+		&& !(col->prtype & DATA_BINARY_TYPE)
+		&& dtype_get_charset_coll(col->prtype)
+		!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+		return(cmp_whole_field(col->mtype, col->prtype,
+				       rec1_b_ptr, (unsigned) rec1_f_len,
+				       rec2_b_ptr, (unsigned) rec2_f_len));
+	}
 
-		rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
-					       cur_field, &rec1_f_len);
-		rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
-					       cur_field, &rec2_f_len);
+	/* Compare the fields */
+	for (ulint cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
+		ulint		rec1_byte;
+		ulint		rec2_byte;
 
-		if (rec1_f_len == UNIV_SQL_NULL
-		    || rec2_f_len == UNIV_SQL_NULL) {
+		if (rec2_f_len <= cur_bytes) {
+			if (rec1_f_len <= cur_bytes) {
+				return(0);
+			}
 
-			if (rec1_f_len == rec2_f_len) {
-				if (null_eq) {
-					*null_eq = TRUE;
-				}
+			rec2_byte = dtype_get_pad_char(
+				col->mtype, col->prtype);
 
-				goto next_field;
+			if (rec2_byte == ULINT_UNDEFINED) {
+				return(1);
+			}
+		} else {
+			rec2_byte = *rec2_b_ptr;
+		}
 
-			} else if (rec2_f_len == UNIV_SQL_NULL) {
+		if (rec1_f_len <= cur_bytes) {
+			rec1_byte = dtype_get_pad_char(
+				col->mtype, col->prtype);
 
-				/* We define the SQL null to be the
-				smallest possible value of a field
-				in the alphabetical order */
-
-				return(1);
-			} else {
+			if (rec1_byte == ULINT_UNDEFINED) {
 				return(-1);
 			}
+		} else {
+			rec1_byte = *rec1_b_ptr;
 		}
 
-		if (mtype >= DATA_FLOAT
-		    || (mtype == DATA_BLOB
-			&& 0 == (prtype & DATA_BINARY_TYPE)
-			&& dtype_get_charset_coll(prtype)
-			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
-			int ret = cmp_whole_field(mtype, prtype,
-						  rec1_b_ptr,
-						  (unsigned) rec1_f_len,
-						  rec2_b_ptr,
-						  (unsigned) rec2_f_len);
-			if (ret) {
-				return(ret);
-			}
+		if (rec1_byte == rec2_byte) {
+			/* If the bytes are equal, they will remain such
+			even after the collation transformation below */
+			continue;
+		}
 
-			goto next_field;
+		if (col->mtype <= DATA_CHAR
+		    || (col->mtype == DATA_BLOB
+			&& !(col->prtype & DATA_BINARY_TYPE))) {
+
+			rec1_byte = cmp_collate(rec1_byte);
+			rec2_byte = cmp_collate(rec2_byte);
 		}
 
-		/* Compare the fields */
-		for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
-			if (rec2_f_len <= cur_bytes) {
+		if (rec1_byte < rec2_byte) {
+			return(-1);
+		} else if (rec1_byte > rec2_byte) {
+			return(1);
+		}
+	}
+}
 
-				if (rec1_f_len <= cur_bytes) {
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval 1 if rec1 (including non-ordering columns) is greater than rec2
+@retval -1 if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const ulint*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const ulint*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	struct TABLE*		table)	/*!< in: MySQL table, for reporting
+					duplicate key value if applicable,
+					or NULL */
+{
+	ulint		n;
+	ulint		n_uniq	= dict_index_get_n_unique(index);
+	bool		null_eq	= false; /* equal NULL fields were seen */
 
-					goto next_field;
-				}
+	ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
+	ut_ad(rec_offs_n_fields(offsets1) == rec_offs_n_fields(offsets2));
 
-				rec2_byte = dtype_get_pad_char(mtype, prtype);
+	ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
 
-				if (rec2_byte == ULINT_UNDEFINED) {
-					return(1);
-				}
-			} else {
-				rec2_byte = *rec2_b_ptr;
-			}
+	for (n = 0; n < n_uniq; n++) {
+		int cmp = cmp_rec_rec_simple_field(
+			rec1, rec2, offsets1, offsets2, index, n);
 
-			if (rec1_f_len <= cur_bytes) {
-				rec1_byte = dtype_get_pad_char(mtype, prtype);
+		if (cmp) {
+			return(cmp);
+		}
 
-				if (rec1_byte == ULINT_UNDEFINED) {
-					return(-1);
-				}
-			} else {
-				rec1_byte = *rec1_b_ptr;
-			}
+		/* If the fields are internally equal, they must both
+		be NULL or non-NULL. */
+		ut_ad(rec_offs_nth_sql_null(offsets1, n)
+		      == rec_offs_nth_sql_null(offsets2, n));
 
-			if (rec1_byte == rec2_byte) {
-				/* If the bytes are equal, they will remain
-				such even after the collation transformation
-				below */
+		if (rec_offs_nth_sql_null(offsets1, n)) {
+			ut_ad(!(dict_index_get_nth_col(index, n)->prtype
+				& DATA_NOT_NULL));
+			null_eq = true;
+		}
+	}
 
-				continue;
-			}
+	/* If we ran out of fields, the ordering columns of rec1 were
+	equal to rec2. Issue a duplicate key error if needed. */
 
-			if (mtype <= DATA_CHAR
-			    || (mtype == DATA_BLOB
-				&& !(prtype & DATA_BINARY_TYPE))) {
+	if (!null_eq && table && dict_index_is_unique(index)) {
+		/* Report erroneous row using new version of table. */
+		innobase_rec_to_mysql(table, rec1, index, offsets1);
+		return(0);
+	}
 
-				rec1_byte = cmp_collate(rec1_byte);
-				rec2_byte = cmp_collate(rec2_byte);
-			}
+	/* Else, keep comparing so that we have the full internal
+	order. */
+	for (; n < dict_index_get_n_fields(index); n++) {
+		int cmp = cmp_rec_rec_simple_field(
+			rec1, rec2, offsets1, offsets2, index, n);
 
-			if (rec1_byte < rec2_byte) {
-				return(-1);
-			} else if (rec1_byte > rec2_byte) {
-				return(1);
-			}
+		if (cmp) {
+			return(cmp);
 		}
-next_field:
-		continue;
+
+		/* If the fields are internally equal, they must both
+		be NULL or non-NULL. */
+		ut_ad(rec_offs_nth_sql_null(offsets1, n)
+		      == rec_offs_nth_sql_null(offsets2, n));
 	}
 
-	/* If we ran out of fields, rec1 was equal to rec2. */
+	/* This should never be reached. Internally, an index must
+	never contain duplicate entries. */
+	ut_ad(0);
 	return(0);
 }
 
@@ -912,7 +1157,7 @@ cmp_rec_rec_with_match(
 		ulint	mtype;
 		ulint	prtype;
 
-		if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		if (dict_index_is_univ(index)) {
 			/* This is for the insert buffer B-tree. */
 			mtype = DATA_BINARY;
 			prtype = 0;
@@ -1113,6 +1358,7 @@ cmp_debug_dtuple_rec_with_match(
 				has an equal number or more fields than
 				dtuple */
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n_cmp,	/*!< in: number of fields to compare */
 	ulint*		matched_fields) /*!< in/out: number of already
 				completely matched fields; when function
 				returns, contains the value for current
@@ -1125,14 +1371,16 @@ cmp_debug_dtuple_rec_with_match(
 					field data */
 	ulint		rec_f_len;	/* length of current field in rec */
 	const byte*	rec_f_data;	/* pointer to the current rec field */
-	int		ret = 3333;	/* return value */
+	int		ret;		/* return value */
 	ulint		cur_field;	/* current field number */
 
 	ut_ad(dtuple && rec && matched_fields);
 	ut_ad(dtuple_check_typed(dtuple));
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
 
-	ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(n_cmp > 0);
+	ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+	ut_ad(*matched_fields <= n_cmp);
 	ut_ad(*matched_fields <= rec_offs_n_fields(offsets));
 
 	cur_field = *matched_fields;
@@ -1158,7 +1406,7 @@ cmp_debug_dtuple_rec_with_match(
 
 	/* Match fields in a loop; stop if we run out of fields in dtuple */
 
-	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+	while (cur_field < n_cmp) {
 
 		ulint	mtype;
 		ulint	prtype;
@@ -1172,7 +1420,9 @@ cmp_debug_dtuple_rec_with_match(
 			prtype = type->prtype;
 		}
 
-		dtuple_f_data = dfield_get_data(dtuple_field);
+		dtuple_f_data = static_cast<const byte*>(
+			dfield_get_data(dtuple_field));
+
 		dtuple_f_len = dfield_get_len(dtuple_field);
 
 		rec_f_data = rec_get_nth_field(rec, offsets,
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.cc
index d938aa696dd..43072159b9e 100644
--- a/storage/xtradb/rem/rem0rec.c
+++ b/storage/xtradb/rem/rem0rec.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file rem/rem0rec.c
+@file rem/rem0rec.cc
 Record manager
 
 Created 5/30/1994 Heikki Tuuri
@@ -29,8 +29,10 @@ Created 5/30/1994 Heikki Tuuri
 #include "rem0rec.ic"
 #endif
 
+#include "page0page.h"
 #include "mtr0mtr.h"
 #include "mtr0log.h"
+#include "fts0fts.h"
 
 /*			PHYSICAL RECORD (OLD STYLE)
 			===========================
@@ -161,9 +163,9 @@ UNIV_INTERN
 ulint
 rec_get_n_extern_new(
 /*=================*/
-	const rec_t*	rec,	/*!< in: compact physical record */
-	dict_index_t*	index,	/*!< in: record descriptor */
-	ulint		n)	/*!< in: number of columns to scan */
+	const rec_t*		rec,	/*!< in: compact physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			n)	/*!< in: number of columns to scan */
 {
 	const byte*	nulls;
 	const byte*	lens;
@@ -245,7 +247,7 @@ rec_init_offsets_comp_ordinary(
 /*===========================*/
 	const rec_t*		rec,	/*!< in: physical record in
 					ROW_FORMAT=COMPACT */
-	ibool			temp,	/*!< in: whether to use the
+	bool			temp,	/*!< in: whether to use the
 					format for temporary files in
 					index creation */
 	const dict_index_t*	index,	/*!< in: record descriptor */
@@ -255,15 +257,15 @@ rec_init_offsets_comp_ordinary(
 	ulint		i		= 0;
 	ulint		offs		= 0;
 	ulint		any_ext		= 0;
+	ulint		n_null		= index->n_nullable;
 	const byte*	nulls		= temp
 		? rec - 1
 		: rec - (1 + REC_N_NEW_EXTRA_BYTES);
-	const byte*	lens		= nulls
-		- UT_BITS_IN_BYTES(index->n_nullable);
+	const byte*	lens		= nulls - UT_BITS_IN_BYTES(n_null);
 	ulint		null_mask	= 1;
 
 #ifdef UNIV_DEBUG
-	/* We cannot invoke rec_offs_make_valid() here if temp=TRUE.
+	/* We cannot invoke rec_offs_make_valid() here if temp=true.
 	Similarly, rec_offs_validate() will fail in that case, because
 	it invokes rec_get_status(). */
 	offsets[2] = (ulint) rec;
@@ -275,7 +277,7 @@ rec_init_offsets_comp_ordinary(
 	if (temp && dict_table_is_comp(index->table)) {
 		/* No need to do adjust fixed_len=0. We only need to
 		adjust it for ROW_FORMAT=REDUNDANT. */
-		temp = FALSE;
+		temp = false;
 	}
 
 	/* read the lengths of fields 0..n */
@@ -288,6 +290,7 @@ rec_init_offsets_comp_ordinary(
 
 		if (!(col->prtype & DATA_NOT_NULL)) {
 			/* nullable field => read the null flag */
+			ut_ad(n_null--);
 
 			if (UNIV_UNLIKELY(!(byte) null_mask)) {
 				nulls--;
@@ -403,7 +406,7 @@ rec_init_offsets(
 			break;
 		case REC_STATUS_ORDINARY:
 			rec_init_offsets_comp_ordinary(
-				rec, FALSE, index, offsets);
+				rec, false, index, offsets);
 			return;
 		}
 
@@ -558,6 +561,9 @@ rec_get_offsets_func(
 			n = dict_index_get_n_fields(index);
 			break;
 		case REC_STATUS_NODE_PTR:
+			/* Node pointer records consist of the
+			uniquely identifying fields of the record
+			followed by a child page number field. */
 			n = dict_index_get_n_unique_in_tree(index) + 1;
 			break;
 		case REC_STATUS_INFIMUM:
@@ -577,6 +583,8 @@ rec_get_offsets_func(
 		n = n_fields;
 	}
 
+	/* The offsets header consists of the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes. */
 	size = n + (1 + REC_OFFS_HEADER_SIZE);
 
 	if (UNIV_UNLIKELY(!offsets)
@@ -586,7 +594,9 @@ rec_get_offsets_func(
 						     MEM_HEAP_DYNAMIC,
 						     file, line);
 		}
-		offsets = mem_heap_alloc(*heap, size * sizeof(ulint));
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(*heap, size * sizeof(ulint)));
+
 		rec_offs_set_n_alloc(offsets, size);
 	}
 
@@ -785,28 +795,27 @@ rec_get_converted_size_comp_prefix_low(
 	const dfield_t*		fields,	/*!< in: array of data fields */
 	ulint			n_fields,/*!< in: number of data fields */
 	ulint*			extra,	/*!< out: extra size */
-	ibool			temp)	/*!< in: whether this is a
+	bool			temp)	/*!< in: whether this is a
 					temporary file record */
 {
 	ulint	extra_size;
 	ulint	data_size;
 	ulint	i;
-	ut_ad(index);
-	ut_ad(fields);
+	ulint	n_null	= index->n_nullable;
 	ut_ad(n_fields > 0);
 	ut_ad(n_fields <= dict_index_get_n_fields(index));
 	ut_ad(!temp || extra);
 
 	extra_size = temp
-		? UT_BITS_IN_BYTES(index->n_nullable)
+		? UT_BITS_IN_BYTES(n_null)
 		: REC_N_NEW_EXTRA_BYTES
-		+ UT_BITS_IN_BYTES(index->n_nullable);
+		+ UT_BITS_IN_BYTES(n_null);
 	data_size = 0;
 
 	if (temp && dict_table_is_comp(index->table)) {
 		/* No need to do adjust fixed_len=0. We only need to
 		adjust it for ROW_FORMAT=REDUNDANT. */
-		temp = FALSE;
+		temp = false;
 	}
 
 	/* read the lengths of fields 0..n */
@@ -822,6 +831,8 @@ rec_get_converted_size_comp_prefix_low(
 
 		ut_ad(dict_col_type_assert_equal(col,
 						 dfield_get_type(&fields[i])));
+		/* All NULLable fields must be included in the n_null count. */
+		ut_ad((col->prtype & DATA_NOT_NULL) || n_null--);
 
 		if (dfield_is_null(&fields[i])) {
 			/* No length is stored for NULL fields. */
@@ -895,7 +906,7 @@ rec_get_converted_size_comp_prefix(
 {
 	ut_ad(dict_table_is_comp(index->table));
 	return(rec_get_converted_size_comp_prefix_low(
-		       index, fields, n_fields, extra, FALSE));
+		       index, fields, n_fields, extra, false));
 }
 
 /**********************************************************//**
@@ -915,8 +926,6 @@ rec_get_converted_size_comp(
 	ulint*			extra)	/*!< out: extra size */
 {
 	ulint	size;
-	ut_ad(index);
-	ut_ad(fields);
 	ut_ad(n_fields > 0);
 
 	switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
@@ -943,7 +952,7 @@ rec_get_converted_size_comp(
 	}
 
 	return(size + rec_get_converted_size_comp_prefix_low(
-		       index, fields, n_fields, extra, FALSE));
+		       index, fields, n_fields, extra, false));
 }
 
 /***********************************************************//**
@@ -1129,7 +1138,7 @@ rec_convert_dtuple_to_rec_comp(
 	const dfield_t*		fields,	/*!< in: array of data fields */
 	ulint			n_fields,/*!< in: number of data fields */
 	ulint			status,	/*!< in: status bits of the record */
-	ibool			temp)	/*!< in: whether to use the
+	bool			temp)	/*!< in: whether to use the
 					format for temporary files in
 					index creation */
 {
@@ -1143,6 +1152,8 @@ rec_convert_dtuple_to_rec_comp(
 	ulint		n_node_ptr_field;
 	ulint		fixed_len;
 	ulint		null_mask	= 1;
+	ulint		n_null;
+
 	ut_ad(temp || dict_table_is_comp(index->table));
 	ut_ad(n_fields > 0);
 
@@ -1154,7 +1165,7 @@ rec_convert_dtuple_to_rec_comp(
 		if (dict_table_is_comp(index->table)) {
 			/* No need to do adjust fixed_len=0. We only
 			need to adjust it for ROW_FORMAT=REDUNDANT. */
-			temp = FALSE;
+			temp = false;
 		}
 	} else {
 		nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
@@ -1181,7 +1192,8 @@ rec_convert_dtuple_to_rec_comp(
 	}
 
 	end = rec;
-	lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+	n_null = index->n_nullable;
+	lens = nulls - UT_BITS_IN_BYTES(n_null);
 	/* clear the SQL-null flags */
 	memset(lens + 1, 0, nulls - lens);
 
@@ -1203,7 +1215,7 @@ rec_convert_dtuple_to_rec_comp(
 
 		if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
 			/* nullable field */
-			ut_ad(index->n_nullable > 0);
+			ut_ad(n_null--);
 
 			if (UNIV_UNLIKELY(!(byte) null_mask)) {
 				nulls--;
@@ -1257,7 +1269,9 @@ rec_convert_dtuple_to_rec_comp(
 			*lens-- = (byte) len;
 		} else {
 			ut_ad(len <= dtype_get_len(type)
-			      || dtype_get_mtype(type) == DATA_BLOB);
+			      || dtype_get_mtype(type) == DATA_BLOB
+			      || !strcmp(index->name,
+					 FTS_INDEX_TABLE_IND_NAME));
 			if (len < 128
 			    || (dtype_get_len(type) < 256
 				&& dtype_get_mtype(type) != DATA_BLOB)) {
@@ -1293,13 +1307,12 @@ rec_convert_dtuple_to_rec_new(
 	rec_t*	rec;
 
 	status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK;
-	rec_get_converted_size_comp(index, status,
-				    dtuple->fields, dtuple->n_fields,
-				    &extra_size);
+	rec_get_converted_size_comp(
+		index, status, dtuple->fields, dtuple->n_fields, &extra_size);
 	rec = buf + extra_size;
 
 	rec_convert_dtuple_to_rec_comp(
-		rec, index, dtuple->fields, dtuple->n_fields, status, FALSE);
+		rec, index, dtuple->fields, dtuple->n_fields, status, false);
 
 	/* Set the info bits of the record */
 	rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple));
@@ -1375,7 +1388,7 @@ rec_get_converted_size_temp(
 	ulint*			extra)	/*!< out: extra size */
 {
 	return(rec_get_converted_size_comp_prefix_low(
-		       index, fields, n_fields, extra, TRUE));
+		       index, fields, n_fields, extra, true));
 }
 
 /******************************************************//**
@@ -1390,7 +1403,7 @@ rec_init_offsets_temp(
 	ulint*			offsets)/*!< in/out: array of offsets;
 					in: n=rec_offs_n_fields(offsets) */
 {
-	rec_init_offsets_comp_ordinary(rec, TRUE, index, offsets);
+	rec_init_offsets_comp_ordinary(rec, true, index, offsets);
 }
 
 /*********************************************************//**
@@ -1406,7 +1419,7 @@ rec_convert_dtuple_to_temp(
 	ulint			n_fields)	/*!< in: number of fields */
 {
 	rec_convert_dtuple_to_rec_comp(rec, index, fields, n_fields,
-				       REC_STATUS_ORDINARY, TRUE);
+				       REC_STATUS_ORDINARY, true);
 }
 
 /**************************************************************//**
@@ -1486,7 +1499,7 @@ rec_copy_prefix_to_buf_old(
 			mem_free(*buf);
 		}
 
-		*buf = mem_alloc2(prefix_len, buf_size);
+		*buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
 	}
 
 	ut_memcpy(*buf, rec - area_start, prefix_len);
@@ -1612,7 +1625,7 @@ rec_copy_prefix_to_buf(
 			mem_free(*buf);
 		}
 
-		*buf = mem_alloc2(prefix_len, buf_size);
+		*buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
 	}
 
 	memcpy(*buf, lens + 1, prefix_len);
@@ -1826,6 +1839,13 @@ rec_print_comp(
 			if (len <= 30) {
 
 				ut_print_buf(file, data, len);
+			} else if (rec_offs_nth_extern(offsets, i)) {
+				ut_print_buf(file, data, 30);
+				fprintf(file, " (total %lu bytes, external)",
+					(ulong) len);
+				ut_print_buf(file, data + len
+					     - BTR_EXTERN_FIELD_REF_SIZE,
+					     BTR_EXTERN_FIELD_REF_SIZE);
 			} else {
 				ut_print_buf(file, data, 30);
 
@@ -1896,4 +1916,47 @@ rec_print(
 		}
 	}
 }
+
+# ifdef UNIV_DEBUG
+/************************************************************//**
+Reads the DB_TRX_ID of a clustered index record.
+@return	the value of DB_TRX_ID */
+UNIV_INTERN
+trx_id_t
+rec_get_trx_id(
+/*===========*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index)	/*!< in: clustered index */
+{
+	const page_t*	page
+		= page_align(rec);
+	ulint		trx_id_col
+		= dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+	const byte*	trx_id;
+	ulint		len;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+	      == index->id);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(trx_id_col > 0);
+	ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+	offsets = rec_get_offsets(rec, index, offsets, trx_id_col + 1, &heap);
+
+	trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+
+	ut_ad(len == DATA_TRX_ID_LEN);
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+
+	return(trx_read_trx_id(trx_id));
+}
+# endif /* UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/row/row0ext.c b/storage/xtradb/row/row0ext.cc
index aa3b14e06f2..32b78391d6a 100644
--- a/storage/xtradb/row/row0ext.c
+++ b/storage/xtradb/row/row0ext.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0ext.c
+@file row/row0ext.cc
 Caching of externally stored column prefixes
 
 Created September 2006 Marko Makela
@@ -42,7 +42,8 @@ row_ext_cache_fill(
 	ulint		zip_size,/*!< compressed page size in bytes, or 0 */
 	const dfield_t*	dfield)	/*!< in: data field */
 {
-	const byte*	field	= dfield_get_data(dfield);
+	const byte*	field	= static_cast<const byte*>(
+					dfield_get_data(dfield));
 	ulint		f_len	= dfield_get_len(dfield);
 	byte*		buf	= ext->buf + i * ext->max_len;
 
@@ -57,14 +58,28 @@ row_ext_cache_fill(
 		/* The BLOB pointer is not set: we cannot fetch it */
 		ext->len[i] = 0;
 	} else {
-		/* Fetch at most ext->max_len of the column.
-		The column should be non-empty.  However,
-		trx_rollback_or_clean_all_recovered() may try to
-		access a half-deleted BLOB if the server previously
-		crashed during the execution of
-		btr_free_externally_stored_field(). */
-		ext->len[i] = btr_copy_externally_stored_field_prefix(
-			buf, ext->max_len, zip_size, field, f_len);
+		if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN
+		    && f_len > BTR_EXTERN_FIELD_REF_SIZE) {
+			/* In this case, the field is in B format or beyond,
+			(refer to the definition of row_ext_t.max_len)
+			and the field is already filled with the prefix, otherwise
+			f_len would be BTR_EXTERN_FIELD_REF_SIZE.
+			So there is no need to re-read the prefix externally,
+			but just copy the local prefix to buf. Please note
+			if the ext->len[i] is zero, it means an error
+			as above. */
+			memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE);
+			ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE;
+		} else {
+			/* Fetch at most ext->max_len of the column.
+			The column should be non-empty.  However,
+			trx_rollback_or_clean_all_recovered() may try to
+			access a half-deleted BLOB if the server previously
+			crashed during the execution of
+			btr_free_externally_stored_field(). */
+			ext->len[i] = btr_copy_externally_stored_field_prefix(
+				buf, ext->max_len, zip_size, field, f_len);
+		}
 	}
 }
 
@@ -90,19 +105,26 @@ row_ext_create(
 	mem_heap_t*	heap)	/*!< in: heap where created */
 {
 	ulint		i;
-	ulint		zip_size = dict_table_flags_to_zip_size(flags);
+	ulint		zip_size = dict_tf_get_zip_size(flags);
 
-	row_ext_t*	ret = mem_heap_alloc(heap, (sizeof *ret)
-					     + (n_ext - 1) * sizeof ret->len);
+	row_ext_t*	ret;
+
+	ut_ad(n_ext > 0);
+
+	ret = static_cast<row_ext_t*>(
+		mem_heap_alloc(heap,
+			       (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	ret->n_ext = n_ext;
 	ret->ext = ext;
 	ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
 
-	ret->buf = mem_heap_alloc(heap, n_ext * ret->max_len);
+	ret->buf = static_cast<byte*>(
+		mem_heap_alloc(heap, n_ext * ret->max_len));
+
 #ifdef UNIV_DEBUG
 	memset(ret->buf, 0xaa, n_ext * ret->max_len);
 	UNIV_MEM_ALLOC(ret->buf, n_ext * ret->max_len);
diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc
new file mode 100644
index 00000000000..c0d996e24ea
--- /dev/null
+++ b/storage/xtradb/row/row0ftsort.cc
@@ -0,0 +1,1528 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h" /* dict_table_stats_lock() */
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "row0ftsort.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+
+/** Read the next record to buffer N.
+@param N	index into array of merge info structure */
+#define ROW_MERGE_READ_GET_NEXT(N)					\
+	do {								\
+		b[N] = row_merge_read_rec(				\
+			block[N], buf[N], b[N], index,			\
+			fd[N], &foffs[N], &mrec[N], offsets[N]);	\
+		if (UNIV_UNLIKELY(!b[N])) {				\
+			if (mrec[N]) {					\
+				goto exit;				\
+			}						\
+		}							\
+	} while (0)
+
+/** Parallel sort degree */
+UNIV_INTERN ulong	fts_sort_pll_degree	= 2;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depending on the number of records to sort, it can be a 4-byte or
+8-byte integer value)
+3) Word's position in original doc.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*		index,	/*!< in: Original FTS index
+					based on which this sort index
+					is created */
+	const dict_table_t*	table,	/*!< in: table that FTS index
+					is being created on */
+	ibool*			opt_doc_id_size)
+					/*!< out: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+{
+	dict_index_t*   new_index;
+	dict_field_t*   field;
+	dict_field_t*   idx_field;
+	CHARSET_INFO*	charset;
+
+	// FIXME: This name shouldn't be hard coded here.
+	new_index = dict_mem_index_create(
+		index->table->name, "tmp_fts_idx", 0, DICT_FTS, 3);
+
+	new_index->id = index->id;
+	new_index->table = (dict_table_t*) table;
+	new_index->n_uniq = FTS_NUM_FIELDS_SORT;
+	new_index->n_def = FTS_NUM_FIELDS_SORT;
+	new_index->cached = TRUE;
+
+	btr_search_index_init(new_index);
+
+	idx_field = dict_index_get_nth_field(index, 0);
+	charset = fts_index_get_charset(index);
+
+	/* The first field is on the Tokenized Word */
+	field = dict_index_get_nth_field(new_index, 0);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->len = FTS_MAX_WORD_LEN;
+
+	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+		field->col->mtype = DATA_VARCHAR;
+	} else {
+		field->col->mtype = DATA_VARMYSQL;
+	}
+
+	field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+	field->col->mbminmaxlen = idx_field->col->mbminmaxlen;
+	field->fixed_len = 0;
+
+	/* Doc ID */
+	field = dict_index_get_nth_field(new_index, 1);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->mtype = DATA_INT;
+	*opt_doc_id_size = FALSE;
+
+	/* Check whether we can use 4 bytes instead of 8 bytes integer
+	field to hold the Doc ID, thus reduce the overall sort size */
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		/* If Doc ID column is being added by this create
+		index, then just check the number of rows in the table */
+		if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) {
+			*opt_doc_id_size = TRUE;
+		}
+	} else {
+		doc_id_t	max_doc_id;
+
+		/* If the Doc ID column is supplied by user, then
+		check the maximum Doc ID in the table */
+		max_doc_id = fts_get_max_doc_id((dict_table_t*) table);
+
+		if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) {
+			*opt_doc_id_size = TRUE;
+		}
+	}
+
+	if (*opt_doc_id_size) {
+		field->col->len = sizeof(ib_uint32_t);
+		field->fixed_len = sizeof(ib_uint32_t);
+	} else {
+		field->col->len = FTS_DOC_ID_LEN;
+		field->fixed_len = FTS_DOC_ID_LEN;
+	}
+
+	field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+
+	field->col->mbminmaxlen = 0;
+
+	/* The third field is on the word's position in the original doc */
+	field = dict_index_get_nth_field(new_index, 2);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->mtype = DATA_INT;
+	field->col->len = 4 ;
+	field->fixed_len = 4;
+	field->col->prtype = DATA_NOT_NULL;
+	field->col->mbminmaxlen = 0;
+
+	return(new_index);
+}
+/*********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	row_merge_dup_t*	dup,	/*!< in,own: descriptor of
+					FTS index being created */
+	const dict_table_t*	new_table,/*!< in: table on which indexes are
+					created */
+	ibool			opt_doc_id_size,
+					/*!< in: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+	fts_psort_t**		psort,	/*!< out: parallel sort info to be
+					instantiated */
+	fts_psort_t**		merge)	/*!< out: parallel merge info
+					to be instantiated */
+{
+	ulint			i;
+	ulint			j;
+	fts_psort_common_t*	common_info = NULL;
+	fts_psort_t*		psort_info = NULL;
+	fts_psort_t*		merge_info = NULL;
+	ulint			block_size;
+	ibool			ret = TRUE;
+
+	block_size = 3 * srv_sort_buf_size;
+
+	*psort = psort_info = static_cast<fts_psort_t*>(mem_zalloc(
+		 fts_sort_pll_degree * sizeof *psort_info));
+
+	if (!psort_info) {
+		ut_free(dup);
+		return(FALSE);
+	}
+
+	/* Common Info for all sort threads */
+	common_info = static_cast<fts_psort_common_t*>(
+		mem_alloc(sizeof *common_info));
+
+	if (!common_info) {
+		ut_free(dup);
+		mem_free(psort_info);
+		return(FALSE);
+	}
+
+	common_info->dup = dup;
+	common_info->new_table = (dict_table_t*) new_table;
+	common_info->trx = trx;
+	common_info->all_info = psort_info;
+	common_info->sort_event = os_event_create();
+	common_info->merge_event = os_event_create();
+	common_info->opt_doc_id_size = opt_doc_id_size;
+
+	/* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for
+	each parallel sort thread. Each "sort bucket" holds records for
+	a particular "FTS index partition" */
+	for (j = 0; j < fts_sort_pll_degree; j++) {
+
+		UT_LIST_INIT(psort_info[j].fts_doc_list);
+
+		for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+			psort_info[j].merge_file[i] =
+				 static_cast<merge_file_t*>(
+					mem_zalloc(sizeof(merge_file_t)));
+
+			if (!psort_info[j].merge_file[i]) {
+				ret = FALSE;
+				goto func_exit;
+			}
+
+			psort_info[j].merge_buf[i] = row_merge_buf_create(
+				dup->index);
+
+			if (row_merge_file_create(psort_info[j].merge_file[i])
+			    < 0) {
+				goto func_exit;
+			}
+
+			/* Need to align memory for O_DIRECT write */
+			psort_info[j].block_alloc[i] =
+				static_cast<row_merge_block_t*>(ut_malloc(
+					block_size + 1024));
+
+			psort_info[j].merge_block[i] =
+				static_cast<row_merge_block_t*>(
+					ut_align(
+					psort_info[j].block_alloc[i], 1024));
+
+			if (!psort_info[j].merge_block[i]) {
+				ret = FALSE;
+				goto func_exit;
+			}
+		}
+
+		psort_info[j].child_status = 0;
+		psort_info[j].state = 0;
+		psort_info[j].psort_common = common_info;
+	}
+
+	/* Initialize merge_info structures for parallel merge and insert
+	into auxiliary FTS tables (FTS_INDEX_TABLE) */
+	*merge = merge_info = static_cast<fts_psort_t*>(
+		mem_alloc(FTS_NUM_AUX_INDEX * sizeof *merge_info));
+
+	for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+
+		merge_info[j].child_status = 0;
+		merge_info[j].state = 0;
+		merge_info[j].psort_common = common_info;
+	}
+
+func_exit:
+	if (!ret) {
+		row_fts_psort_info_destroy(psort_info, merge_info);
+	}
+
+	return(ret);
+}
+/*********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+merge sort files  */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info)	/*!< parallel merge info */
+{
+	ulint	i;
+	ulint	j;
+
+	if (psort_info) {
+		for (j = 0; j < fts_sort_pll_degree; j++) {
+			for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+				if (psort_info[j].merge_file[i]) {
+					row_merge_file_destroy(
+						psort_info[j].merge_file[i]);
+				}
+
+				if (psort_info[j].block_alloc[i]) {
+					ut_free(psort_info[j].block_alloc[i]);
+				}
+				mem_free(psort_info[j].merge_file[i]);
+			}
+		}
+
+		os_event_free(merge_info[0].psort_common->sort_event);
+		os_event_free(merge_info[0].psort_common->merge_event);
+		ut_free(merge_info[0].psort_common->dup);
+		mem_free(merge_info[0].psort_common);
+		mem_free(psort_info);
+	}
+
+	if (merge_info) {
+		mem_free(merge_info);
+	}
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info)	/*!< in: parallel sort info */
+{
+	ulint	j;
+	ulint	i;
+
+	if (!psort_info) {
+		return;
+	}
+
+	for (j = 0; j < fts_sort_pll_degree; j++) {
+		for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+			row_merge_buf_free(psort_info[j].merge_buf[i]);
+		}
+	}
+
+	return;
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@return	TRUE if the record passed, FALSE if out of space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+	row_merge_buf_t**	sort_buf,	/*!< in/out: sort buffer */
+	doc_id_t		doc_id,		/*!< in: Doc ID */
+	fts_doc_t*		doc,		/*!< in: Doc to be tokenized */
+	dtype_t*		word_dtype,	/*!< in: data structure for
+						word col */
+	merge_file_t**		merge_file,	/*!< in/out: merge file */
+	ibool			opt_doc_id_size,/*!< in: whether to use 4 bytes
+						instead of 8 bytes integer to
+						store Doc ID during sort*/
+	fts_tokenize_ctx_t*	t_ctx)          /*!< in/out: tokenize context */
+{
+	ulint		i;
+	ulint		inc;
+	fts_string_t	str;
+	ulint		len;
+	row_merge_buf_t* buf;
+	dfield_t*	field;
+	fts_string_t	t_str;
+	ibool		buf_full = FALSE;
+	byte		str_buf[FTS_MAX_WORD_LEN + 1];
+	ulint		data_size[FTS_NUM_AUX_INDEX];
+	ulint		n_tuple[FTS_NUM_AUX_INDEX];
+
+	t_str.f_n_char = 0;
+	t_ctx->buf_used = 0;
+
+	memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+	memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+	/* Tokenize the data and add each word string, its corresponding
+	doc id and position to sort buffer */
+	for (i = t_ctx->processed_len; i < doc->text.f_len; i += inc) {
+		ib_rbt_bound_t	parent;
+		ulint		idx = 0;
+		ib_uint32_t	position;
+		ulint           offset = 0;
+		ulint		cur_len = 0;
+		doc_id_t	write_doc_id;
+
+		inc = innobase_mysql_fts_get_token(
+			doc->charset, doc->text.f_str + i,
+			doc->text.f_str + doc->text.f_len, &str, &offset);
+
+		ut_a(inc > 0);
+
+		/* Ignore string whose character number is less than
+		"fts_min_token_size" or more than "fts_max_token_size" */
+		if (str.f_n_char < fts_min_token_size
+		    || str.f_n_char > fts_max_token_size) {
+
+			t_ctx->processed_len += inc;
+			continue;
+		}
+
+		t_str.f_len = innobase_fts_casedn_str(
+			doc->charset, (char*) str.f_str, str.f_len,
+			(char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+		t_str.f_str = (byte*) &str_buf;
+
+		/* if "cached_stopword" is defined, ignore words in the
+		stopword list */
+		if (t_ctx->cached_stopword
+		    && rbt_search(t_ctx->cached_stopword,
+				  &parent, &t_str) == 0) {
+
+			t_ctx->processed_len += inc;
+			continue;
+		}
+
+		/* There are FTS_NUM_AUX_INDEX auxiliary tables, find
+		out which sort buffer to put this word record in */
+		t_ctx->buf_used = fts_select_index(
+			doc->charset, t_str.f_str, t_str.f_len);
+
+		buf = sort_buf[t_ctx->buf_used];
+
+		ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+		idx = t_ctx->buf_used;
+
+		mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]];
+
+		field = mtuple->fields = static_cast<dfield_t*>(
+			mem_heap_alloc(buf->heap,
+				       FTS_NUM_FIELDS_SORT * sizeof *field));
+
+		/* The first field is the tokenized word */
+		dfield_set_data(field, t_str.f_str, t_str.f_len);
+		len = dfield_get_len(field);
+
+		field->type.mtype = word_dtype->mtype;
+		field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+
+		/* Variable length field, set to max size. */
+		field->type.len = FTS_MAX_WORD_LEN;
+		field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+		field++;
+
+		/* The second field is the Doc ID */
+
+		ib_uint32_t	doc_id_32_bit;
+
+		if (!opt_doc_id_size) {
+			fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+			dfield_set_data(
+				field, &write_doc_id, sizeof(write_doc_id));
+		} else {
+			mach_write_to_4(
+				(byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+			dfield_set_data(
+				field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+		}
+
+		len = field->len;
+		ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
+
+		field->type.mtype = DATA_INT;
+		field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+		field->type.len = len;
+		field->type.mbminmaxlen = 0;
+
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+
+		++field;
+
+		/* The third field is the position */
+		mach_write_to_4(
+			(byte*) &position,
+			(i + offset + inc - str.f_len + t_ctx->init_pos));
+
+		dfield_set_data(field, &position, sizeof(position));
+		len = dfield_get_len(field);
+		ut_ad(len == sizeof(ib_uint32_t));
+
+		field->type.mtype = DATA_INT;
+		field->type.prtype = DATA_NOT_NULL;
+		field->type.len = len;
+		field->type.mbminmaxlen = 0;
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+
+		/* One variable-length column (the word, whose length is less
+		than fts_max_token_size): add one extra size and one extra byte */
+		cur_len += 2;
+
+		/* Reserve one byte for the end marker of row_merge_block_t. */
+		if (buf->total_size + data_size[idx] + cur_len
+		    >= srv_sort_buf_size - 1) {
+
+			buf_full = TRUE;
+			break;
+		}
+
+		/* Increment the number of tuples */
+		n_tuple[idx]++;
+		t_ctx->processed_len += inc;
+		data_size[idx] += cur_len;
+	}
+
+	/* Update the data length and the number of new word tuples
+	added in this round of tokenization */
+	for (i = 0; i <  FTS_NUM_AUX_INDEX; i++) {
+		/* The computation of total_size below assumes that no
+		delete-mark flags will be stored and that all fields
+		are NOT NULL and fixed-length. */
+
+		sort_buf[i]->total_size += data_size[i];
+
+		sort_buf[i]->n_tuples += n_tuple[i];
+
+		merge_file[i]->n_rec += n_tuple[i];
+		t_ctx->rows_added[i] += n_tuple[i];
+	}
+
+	if (!buf_full) {
+		/* we pad one byte between text across two fields */
+		t_ctx->init_pos += doc->text.f_len + 1;
+	}
+
+	return(!buf_full);
+}
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+It also performs the initial in memory sort of the parsed records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+	void*		arg)	/*!< in: psort_info for the thread */
+{
+	fts_psort_t*		psort_info = (fts_psort_t*) arg;
+	ulint			i;
+	fts_doc_item_t*		doc_item = NULL;
+	fts_doc_item_t*		prev_doc_item = NULL;
+	row_merge_buf_t**	buf;
+	ibool			processed = FALSE;
+	merge_file_t**		merge_file;
+	row_merge_block_t**	block;
+	int			tmpfd[FTS_NUM_AUX_INDEX];
+	ulint			mycount[FTS_NUM_AUX_INDEX];
+	ib_uint64_t		total_rec = 0;
+	ulint			num_doc_processed = 0;
+	doc_id_t		last_doc_id = 0;
+	ulint			zip_size;
+	mem_heap_t*		blob_heap = NULL;
+	fts_doc_t		doc;
+	dict_table_t*		table = psort_info->psort_common->new_table;
+	dtype_t			word_dtype;
+	dict_field_t*		idx_field;
+	fts_tokenize_ctx_t	t_ctx;
+	ulint			retried = 0;
+	ut_ad(psort_info);
+
+	ut_ad(psort_info);
+
+	buf = psort_info->merge_buf;
+	merge_file = psort_info->merge_file;
+	blob_heap = mem_heap_create(512);
+	memset(&doc, 0, sizeof(doc));
+	memset(&t_ctx, 0, sizeof(t_ctx));
+	memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int));
+
+	doc.charset = fts_index_get_charset(
+		psort_info->psort_common->dup->index);
+
+	idx_field = dict_index_get_nth_field(
+		psort_info->psort_common->dup->index, 0);
+	word_dtype.prtype = idx_field->col->prtype;
+	word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
+	word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
+				? DATA_VARCHAR : DATA_VARMYSQL;
+
+	block = psort_info->merge_block;
+	zip_size = dict_table_zip_size(table);
+
+	doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+
+	if (doc_item) {
+		prev_doc_item = doc_item;
+	}
+
+	t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword;
+	processed = TRUE;
+loop:
+	while (doc_item) {
+		dfield_t*	dfield = doc_item->field;
+
+		last_doc_id = doc_item->doc_id;
+
+		if (!(dfield->data)
+		    || dfield_get_len(dfield) == UNIV_SQL_NULL) {
+			num_doc_processed++;
+			doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+
+			/* Always remember the last doc_item we processed */
+			if (doc_item) {
+				prev_doc_item = doc_item;
+			}
+			continue;
+		}
+
+		/* If finish processing the last item, update "doc" with
+		strings in the doc_item, otherwise continue processing last
+		item */
+		if (processed) {
+			byte*		data;
+			ulint		data_len;
+
+			dfield = doc_item->field;
+			data = static_cast<byte*>(dfield_get_data(dfield));
+			data_len = dfield_get_len(dfield);
+
+			if (dfield_is_ext(dfield)) {
+				doc.text.f_str =
+					btr_copy_externally_stored_field(
+						&doc.text.f_len, data,
+						zip_size, data_len, blob_heap);
+			} else {
+				doc.text.f_str = data;
+				doc.text.f_len = data_len;
+			}
+
+			doc.tokens = 0;
+			t_ctx.processed_len = 0;
+		} else {
+			/* Not yet finish processing the "doc" on hand,
+			continue processing it */
+			ut_ad(doc.text.f_str);
+			ut_ad(t_ctx.processed_len < doc.text.f_len);
+		}
+
+		processed = row_merge_fts_doc_tokenize(
+			buf, doc_item->doc_id, &doc,
+			&word_dtype,
+			merge_file, psort_info->psort_common->opt_doc_id_size,
+			&t_ctx);
+
+		/* Current sort buffer full, need to recycle */
+		if (!processed) {
+			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+			break;
+		}
+
+		num_doc_processed++;
+
+		if (fts_enable_diag_print && num_doc_processed % 10000 == 1) {
+			fprintf(stderr, "number of doc processed %d\n",
+				(int) num_doc_processed);
+#ifdef FTS_INTERNAL_DIAG_PRINT
+			for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+				fprintf(stderr, "ID %d, partition %d, word "
+					"%d\n",(int) psort_info->psort_id,
+					(int) i, (int) mycount[i]);
+			}
+#endif
+		}
+
+		mem_heap_empty(blob_heap);
+
+		if (doc_item->field->data) {
+			ut_free(doc_item->field->data);
+			doc_item->field->data = NULL;
+		}
+
+		doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+
+		/* Always remember the last doc_item we processed */
+		if (doc_item) {
+			prev_doc_item = doc_item;
+			if (last_doc_id != doc_item->doc_id) {
+				t_ctx.init_pos = 0;
+			}
+		}
+	}
+
+	/* If we run out of current sort buffer, need to sort
+	and flush the sort buffer to disk */
+	if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+		row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+		row_merge_buf_write(buf[t_ctx.buf_used],
+				    merge_file[t_ctx.buf_used],
+				    block[t_ctx.buf_used]);
+		row_merge_write(merge_file[t_ctx.buf_used]->fd,
+				merge_file[t_ctx.buf_used]->offset++,
+				block[t_ctx.buf_used]);
+		UNIV_MEM_INVALID(block[t_ctx.buf_used][0], srv_sort_buf_size);
+		buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+		mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+		t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+		ut_a(doc_item);
+		goto loop;
+	}
+
+	/* Parent done scanning, and if finish processing all the docs, exit */
+	if (psort_info->state == FTS_PARENT_COMPLETE) {
+	    	if (num_doc_processed >= UT_LIST_GET_LEN(
+			psort_info->fts_doc_list)) {
+			goto exit;
+		} else if (retried > 10000) {
+			ut_ad(!doc_item);
+			/* retried too many times and cannot get new record */
+			fprintf(stderr, "InnoDB: FTS parallel sort processed "
+					"%lu records, the sort queue has "
+					"%lu records. But sort cannot get "
+					"the next records", num_doc_processed,
+					UT_LIST_GET_LEN(
+						psort_info->fts_doc_list));
+			goto exit;
+		}
+	}
+
+	if (doc_item) {
+		doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+	} else if (prev_doc_item) {
+		os_thread_yield();
+		doc_item = UT_LIST_GET_NEXT(doc_list, prev_doc_item);
+	} else {
+		os_thread_yield();
+		doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+	}
+
+	if (doc_item) {
+		prev_doc_item = doc_item;
+
+		if (last_doc_id != doc_item->doc_id) {
+			t_ctx.init_pos = 0;
+		}
+
+		retried = 0;
+	} else if (psort_info->state == FTS_PARENT_COMPLETE) {
+		retried++;
+	}
+
+	goto loop;
+
+exit:
+	/* Do a final sort of the last (or latest) batch of records
+	in block memory. Flush them to temp file if records cannot
+	be hold in one block memory */
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+		if (t_ctx.rows_added[i]) {
+			row_merge_buf_sort(buf[i], NULL);
+			row_merge_buf_write(
+				buf[i], merge_file[i], block[i]);
+
+			/* Write to temp file, only if records have
+			been flushed to temp file before (offset > 0):
+			The pseudo code for sort is following:
+
+				while (there are rows) {
+					tokenize rows, put result in block[]
+					if (block[] runs out) {
+						sort rows;
+						write to temp file with
+						row_merge_write();
+						offset++;
+					}
+				}
+
+				# write out the last batch
+				if (offset > 0) {
+					row_merge_write();
+					offset++;
+				} else {
+					# no need to write anything
+					offset stay as 0
+				}
+
+			so if merge_file[i]->offset is 0 when we come to
+			here as the last batch, this means rows have
+			never flush to temp file, it can be held all in
+			memory */
+			if (merge_file[i]->offset != 0) {
+				row_merge_write(merge_file[i]->fd,
+						merge_file[i]->offset++,
+						block[i]);
+
+				UNIV_MEM_INVALID(block[i][0],
+						 srv_sort_buf_size);
+			}
+
+			buf[i] = row_merge_buf_empty(buf[i]);
+			t_ctx.rows_added[i] = 0;
+		}
+	}
+
+	if (fts_enable_diag_print) {
+		DEBUG_FTS_SORT_PRINT("  InnoDB_FTS: start merge sort\n");
+	}
+
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+		if (!merge_file[i]->offset) {
+			continue;
+		}
+
+		tmpfd[i] = row_merge_file_create_low();
+		if (tmpfd[i] < 0) {
+			goto func_exit;
+		}
+
+		row_merge_sort(psort_info->psort_common->trx,
+			       psort_info->psort_common->dup,
+			       merge_file[i], block[i], &tmpfd[i]);
+		total_rec += merge_file[i]->n_rec;
+		close(tmpfd[i]);
+	}
+
+func_exit:
+	if (fts_enable_diag_print) {
+		DEBUG_FTS_SORT_PRINT("  InnoDB_FTS: complete merge sort\n");
+	}
+
+	mem_heap_free(blob_heap);
+
+	psort_info->child_status = FTS_CHILD_COMPLETE;
+	os_event_set(psort_info->psort_common->sort_event);
+	psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+	CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Spawn one fts_parallel_tokenization thread per parallel sort context */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info)	/*!< in/out: per-thread sort contexts */
+{
+	os_thread_id_t	thd_id;
+
+	for (ulint slot = 0; slot < fts_sort_pll_degree; slot++) {
+		fts_psort_t*	ctx = &psort_info[slot];
+
+		ctx->psort_id = slot;
+		ctx->thread_hdl = os_thread_create(
+			fts_parallel_tokenization, (void*) ctx, &thd_id);
+	}
+}
+
+/*********************************************************************//**
+Thread entry point: merge and insert the sorted records of one auxiliary index.
+@return OS_THREAD_DUMMY_RETURN; the row_fts_merge_insert() result is discarded */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+	void*		arg)		/*!< in: parallel merge info (fts_psort_t*) */
+{
+	fts_psort_t*	psort_info = (fts_psort_t*) arg;
+	ulint		id;
+
+	ut_ad(psort_info);
+
+	id = psort_info->psort_id;	/* passed on as the "id" argument */
+
+	row_fts_merge_insert(psort_info->psort_common->dup->index,
+			     psort_info->psort_common->new_table,
+			     psort_info->psort_common->all_info, id);
+
+	psort_info->child_status = FTS_CHILD_COMPLETE;
+	os_event_set(psort_info->psort_common->merge_event);
+	psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+	CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Launch one fts_parallel_merge thread for each of the FTS_NUM_AUX_INDEX slots */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info)	/*!< in/out: per-thread merge contexts */
+{
+	os_thread_id_t	thd_id;
+
+	/* Number each context and clear its status before its thread starts */
+	for (int aux = 0; aux < FTS_NUM_AUX_INDEX; aux++) {
+		fts_psort_t*	ctx = merge_info + aux;
+
+		ctx->psort_id = aux;
+		ctx->child_status = 0;
+		ctx->thread_hdl = os_thread_create(
+			fts_parallel_merge, (void*) ctx, &thd_id);
+	}
+}
+
+/********************************************************************//**
+Insert processed FTS data to auxiliary index tables.
+@return	DB_SUCCESS, or the last error returned by fts_write_node() */
+static __attribute__((nonnull))
+dberr_t
+row_merge_write_fts_word(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		ins_graph,	/*!< in: Insert query graphs */
+	fts_tokenizer_word_t* word,	/*!< in/out: sorted and tokenized
+					word; its nodes are popped */
+	fts_table_t*	fts_table,	/*!< in/out: fts aux table instance */
+	CHARSET_INFO*	charset)	/*!< in: charset */
+{
+	ulint	selected;
+	dberr_t	ret = DB_SUCCESS;
+
+	selected = fts_select_index(
+		charset, word->text.f_str, word->text.f_len);
+	fts_table->suffix = fts_get_suffix(selected);
+
+	/* Pop each fts_node off word->nodes and write it to its aux table */
+	while (ib_vector_size(word->nodes) > 0) {
+		dberr_t		error;
+		fts_node_t*	fts_node;
+
+		fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes));
+
+		error = fts_write_node(
+			trx, &ins_graph[selected], fts_table, &word->text,
+			fts_node);
+
+		if (error != DB_SUCCESS) {
+			fprintf(stderr, "InnoDB: failed to write"
+				" word %s to FTS auxiliary index"
+				" table, error (%s) \n",
+				word->text.f_str, ut_strerr(error));
+			ret = error;	/* remembered; the loop keeps going */
+		}
+
+		ut_free(fts_node->ilist);
+		fts_node->ilist = NULL;
+	}
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Accumulate one sorted FTS tuple into the current word; a completed word is
+written to its auxiliary table. Write errors are not returned to the caller */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+	fts_psort_insert_t*
+			ins_ctx,	/*!< in: insert context */
+	fts_tokenizer_word_t* word,	/*!< in/out: last processed
+					tokenized word */
+	ib_vector_t*	positions,	/*!< in/out: word position */
+	doc_id_t*	in_doc_id,	/*!< in/out: last item doc id */
+	dtuple_t*	dtuple)		/*!< in: entry to insert, or NULL */
+{
+	fts_node_t*	fts_node = NULL;
+	dfield_t*	dfield;
+	doc_id_t	doc_id;
+	ulint		position;
+	fts_string_t	token_word;
+	ulint		i;
+
+	/* Get fts_node for the FTS auxiliary INDEX table */
+	if (ib_vector_size(word->nodes) > 0) {
+		fts_node = static_cast<fts_node_t*>(
+			ib_vector_last(word->nodes));
+	}
+
+	if (fts_node == NULL
+	    || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+		fts_node = static_cast<fts_node_t*>(
+			ib_vector_push(word->nodes, NULL));
+
+		memset(fts_node, 0x0, sizeof(*fts_node));
+	}
+
+	/* If dtuple == NULL, this is the last word to be processed */
+	if (!dtuple) {
+		if (fts_node && ib_vector_size(positions) > 0) {
+			fts_cache_node_add_positions(
+				NULL, fts_node, *in_doc_id,
+				positions);
+
+			/* Write out the current word */
+			row_merge_write_fts_word(ins_ctx->trx,
+						 ins_ctx->ins_graph, word,
+						 &ins_ctx->fts_table,
+						 ins_ctx->charset);
+
+		}
+
+		return;
+	}
+
+	/* Get the first field for the tokenized word */
+	dfield = dtuple_get_nth_field(dtuple, 0);
+
+	token_word.f_n_char = 0;
+	token_word.f_len = dfield->len;
+	token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (!word->text.f_str) {
+		fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+	}
+
+	/* compare to the last word, to see if they are the same
+	word */
+	if (innobase_fts_text_cmp(ins_ctx->charset,
+				  &word->text, &token_word) != 0) {
+		ulint	num_item;
+
+		/* Getting a new word, flush the last position info
+		for the current word in fts_node */
+		if (ib_vector_size(positions) > 0) {
+			fts_cache_node_add_positions(
+				NULL, fts_node, *in_doc_id, positions);
+		}
+
+		/* Write out the current word */
+		row_merge_write_fts_word(ins_ctx->trx, ins_ctx->ins_graph,
+					 word, &ins_ctx->fts_table,
+					 ins_ctx->charset);
+
+		/* Copy the new word */
+		fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+
+		num_item = ib_vector_size(positions);
+
+		/* Clean up position queue */
+		for (i = 0; i < num_item; i++) {
+			ib_vector_pop(positions);
+		}
+
+		/* Reset Doc ID */
+		*in_doc_id = 0;
+		memset(fts_node, 0x0, sizeof(*fts_node));
+	}
+
+	/* Get the word's Doc ID */
+	dfield = dtuple_get_nth_field(dtuple, 1);
+
+	if (!ins_ctx->opt_doc_id_size) {
+		doc_id = fts_read_doc_id(
+			static_cast<byte*>(dfield_get_data(dfield)));
+	} else {
+		doc_id = (doc_id_t) mach_read_from_4(
+			static_cast<byte*>(dfield_get_data(dfield)));
+	}
+
+	/* Get the word's position info */
+	dfield = dtuple_get_nth_field(dtuple, 2);
+	position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)));
+
+	/* If this is the same word as the last word, and they
+	have the same Doc ID, we just need to add its position
+	info. Otherwise, we will flush position info to the
+	fts_node and initiate a new position vector */
+	if (!(*in_doc_id) || *in_doc_id == doc_id) {
+		ib_vector_push(positions, &position);
+	} else {
+		ulint	num_pos = ib_vector_size(positions);
+
+		fts_cache_node_add_positions(NULL, fts_node,
+					     *in_doc_id, positions);
+		for (i = 0; i < num_pos; i++) {
+			ib_vector_pop(positions);
+		}
+		ib_vector_push(positions, &position);
+	}
+
+	/* record the current Doc ID */
+	*in_doc_id = doc_id;
+}
+
+/*********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+static
+int
+row_fts_sel_tree_propagate(
+/*=======================*/
+	int		propogated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in/out: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in/out: FTS index */
+{
+	ulint	parent;
+	int	child_left;
+	int	child_right;
+	int	selected;
+
+	/* Find which parent this value will be propagated to */
+	parent = (propogated - 1) / 2;
+
+	/* Pick the smaller child; -1 or a NULL mrec means no record there */
+	child_left = sel_tree[parent * 2 + 1];
+	child_right = sel_tree[parent * 2 + 2];
+
+	if (child_left == -1 || mrec[child_left] == NULL) {
+		if (child_right == -1
+		    || mrec[child_right] == NULL) {
+			selected = -1;
+		} else {
+			selected = child_right ;
+		}
+	} else if (child_right == -1
+		   || mrec[child_right] == NULL) {
+		selected = child_left;
+	} else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right],
+				      offsets[child_left],
+				      offsets[child_right],
+				      index, NULL) < 0) {
+		selected = child_left;
+	} else {
+		selected = child_right;
+	}
+
+	sel_tree[parent] = selected;
+
+	return(parent);
+}
+
+/*********************************************************************//**
+Re-run the selection at each ancestor of a changed node, "height" times
+@return the value now stored at the root, sel_tree[0] */
+static
+int
+row_fts_sel_tree_update(
+/*====================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		propagated,	/*!< in: node to propagate up */
+	ulint		height,		/*!< in: tree height */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+	ulint	levels_left = height;
+
+	while (levels_left-- > 0) {
+		propagated = row_fts_sel_tree_propagate(
+			propagated, sel_tree, mrec, offsets, index);
+	}
+
+	return(sel_tree[0]);
+}
+
+/*********************************************************************//**
+Build selection tree at a specified level: fill each node from its two children */
+static
+void
+row_fts_build_sel_tree_level(
+/*=========================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+	ulint	start;
+	int	child_left;
+	int	child_right;
+	ulint	i;
+	ulint	num_item;
+
+	start = (1 << level) - 1;
+	num_item = (1 << level);
+
+	for (i = 0; i < num_item;  i++) {
+		child_left = sel_tree[(start + i) * 2 + 1];
+		child_right = sel_tree[(start + i) * 2 + 2];
+
+		if (child_left == -1) {
+			if (child_right == -1) {
+				sel_tree[start + i] = -1;
+			} else {
+				sel_tree[start + i] =  child_right;
+			}
+			continue;
+		} else if (child_right == -1) {
+			sel_tree[start + i] = child_left;
+			continue;
+		}
+
+		/* Deal with NULL child conditions */
+		if (!mrec[child_left]) {
+			if (!mrec[child_right]) {
+				sel_tree[start + i] = -1;
+			} else {
+				sel_tree[start + i] = child_right;
+			}
+			continue;
+		} else if (!mrec[child_right]) {
+			sel_tree[start + i] = child_left;
+			continue;
+		}
+
+		/* Select the smaller one to set parent pointer */
+		int cmp = cmp_rec_rec_simple(
+			mrec[child_left], mrec[child_right],
+			offsets[child_left], offsets[child_right],
+			index, NULL);
+
+		sel_tree[start + i] = cmp < 0 ? child_left : child_right;
+	}
+}
+
+/*********************************************************************//**
+Build a selection tree for merge. The selection tree is a binary tree whose
+leaves sit at level ceil(log2(fts_sort_pll_degree)). With root as level 0
+@return leaf level of the tree, or 0 if fts_sort_pll_degree <= 2 (no tree) */
+static
+ulint
+row_fts_build_sel_tree(
+/*===================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+	ulint	treelevel = 1;
+	ulint	num = 2;
+	int	i = 0;
+	ulint	start;
+
+	/* No need to build selection tree if we only have two merge threads */
+	if (fts_sort_pll_degree <= 2) {
+		return(0);
+	}
+
+	while (num < fts_sort_pll_degree) {
+		num = num << 1;
+		treelevel++;
+	}
+
+	start = (1 << treelevel) - 1;	/* index of the first leaf */
+
+	for (i = 0; i < (int) fts_sort_pll_degree; i++) {
+		sel_tree[i + start] = i;
+	}
+	/* NOTE(review): leaf slots past fts_sort_pll_degree are not set here */
+	for (i = treelevel - 1; i >=0; i--) {
+		row_fts_build_sel_tree_level(sel_tree, i, mrec, offsets, index);
+	}
+
+	return(treelevel);
+}
+
+/*********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*		index,	/*!< in: index */
+	dict_table_t*		table,	/*!< in: new table */
+	fts_psort_t*		psort_info, /*!< in: parallel sort info */
+	ulint			id)	/*!< in: which auxiliary table's data
+					to insert to */
+{
+	const byte**		b;
+	mem_heap_t*		tuple_heap;
+	mem_heap_t*		heap;
+	dberr_t			error = DB_SUCCESS;
+	ulint*			foffs;
+	ulint**			offsets;
+	fts_tokenizer_word_t	new_word;
+	ib_vector_t*		positions;
+	doc_id_t		last_doc_id;
+	ib_alloc_t*		heap_alloc;
+	ulint			n_bytes;
+	ulint			i;
+	mrec_buf_t**		buf;
+	int*			fd;
+	byte**			block;
+	const mrec_t**		mrec;
+	ulint			count = 0;
+	int*			sel_tree;
+	ulint			height;
+	ulint			start;
+	fts_psort_insert_t	ins_ctx;
+	ulint			count_diag = 0;
+
+	ut_ad(index);
+	ut_ad(table);
+
+	/* We use the insert query graph as the dummy graph
+	needed in the row module call */
+
+	ins_ctx.trx = trx_allocate_for_background();
+
+	ins_ctx.trx->op_info = "inserting index entries";
+
+	ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size;
+
+	heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+
+	b = (const byte**) mem_heap_alloc(
+		heap, sizeof (*b) * fts_sort_pll_degree);
+	foffs = (ulint*) mem_heap_alloc(
+		heap, sizeof(*foffs) * fts_sort_pll_degree);
+	offsets = (ulint**) mem_heap_alloc(
+		heap, sizeof(*offsets) * fts_sort_pll_degree);
+	buf = (mrec_buf_t**) mem_heap_alloc(
+		heap, sizeof(*buf) * fts_sort_pll_degree);
+	fd = (int*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree);
+	block = (byte**) mem_heap_alloc(
+		heap, sizeof(*block) * fts_sort_pll_degree);
+	mrec = (const mrec_t**) mem_heap_alloc(
+		heap, sizeof(*mrec) * fts_sort_pll_degree);
+	sel_tree = (int*) mem_heap_alloc(
+		heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2));
+
+	tuple_heap = mem_heap_create(1000);
+
+	ins_ctx.charset = fts_index_get_charset(index);
+	ins_ctx.heap = heap;
+
+	for (i = 0; i < fts_sort_pll_degree; i++) {
+		ulint	num;
+
+		num = 1 + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		offsets[i] = static_cast<ulint*>(mem_heap_zalloc(
+			heap, num * sizeof *offsets[i]));
+		offsets[i][0] = num;
+		offsets[i][1] = dict_index_get_n_fields(index);
+		block[i] = psort_info[i].merge_block[id];
+		b[i] = psort_info[i].merge_block[id];
+		fd[i] = psort_info[i].merge_file[id]->fd;
+		foffs[i] = 0;
+
+		buf[i] = static_cast<mrec_buf_t*>(
+			mem_heap_alloc(heap, sizeof *buf[i]));
+		count_diag += (int) psort_info[i].merge_file[id]->n_rec;
+	}
+
+	if (fts_enable_diag_print) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB_FTS: to inserted %lu records\n",
+			(ulong) count_diag);
+	}
+
+	/* Initialize related variables if creating FTS indexes */
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	memset(&new_word, 0, sizeof(new_word));
+
+	new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4);
+	positions = ib_vector_create(heap_alloc, sizeof(ulint), 32);
+	last_doc_id = 0;
+
+	/* Allocate insert query graphs for FTS auxiliary
+	Index Table, note we have FTS_NUM_AUX_INDEX such index tables */
+	n_bytes = sizeof(que_t*) * (FTS_NUM_AUX_INDEX + 1);
+	ins_ctx.ins_graph = static_cast<que_t**>(mem_heap_alloc(heap, n_bytes));
+	memset(ins_ctx.ins_graph, 0x0, n_bytes);
+
+	ins_ctx.fts_table.type = FTS_INDEX_TABLE;
+	ins_ctx.fts_table.index_id = index->id;
+	ins_ctx.fts_table.table_id = table->id;
+	ins_ctx.fts_table.parent = index->table->name;
+	ins_ctx.fts_table.table = NULL;
+
+	for (i = 0; i < fts_sort_pll_degree; i++) {
+		if (psort_info[i].merge_file[id]->n_rec == 0) {
+			/* No Rows to read */
+			mrec[i] = b[i] = NULL;
+		} else {
+			/* Read from temp file only if it has been
+			written to. Otherwise, block memory holds
+			all the sorted records */
+			if (psort_info[i].merge_file[id]->offset > 0
+			    && (!row_merge_read(
+					fd[i], foffs[i],
+					(row_merge_block_t*) block[i]))) {
+				error = DB_CORRUPTION;
+				goto exit;
+			}
+
+			ROW_MERGE_READ_GET_NEXT(i);
+		}
+	}
+
+	height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec,
+					offsets, index);
+
+	start = (1 << height) - 1;
+
+	/* Fetch sorted records from sort buffer and insert them into
+	corresponding FTS index auxiliary tables */
+	for (;;) {
+		dtuple_t*	dtuple;
+		ulint		n_ext;
+		int		min_rec = 0;
+
+		if (fts_sort_pll_degree <= 2) {
+			while (!mrec[min_rec]) {
+				min_rec++;
+
+				if (min_rec >= (int) fts_sort_pll_degree) {
+					row_fts_insert_tuple(
+						&ins_ctx, &new_word,
+						positions, &last_doc_id,
+						NULL);
+
+					goto exit;
+				}
+			}
+
+			for (i = min_rec + 1; i < fts_sort_pll_degree; i++) {
+				if (!mrec[i]) {
+					continue;
+				}
+
+				if (cmp_rec_rec_simple(
+					    mrec[i], mrec[min_rec],
+					    offsets[i], offsets[min_rec],
+					    index, NULL) < 0) {
+					min_rec = i;
+				}
+			}
+		} else {
+			min_rec = sel_tree[0];
+
+			if (min_rec ==  -1) {
+				row_fts_insert_tuple(
+					&ins_ctx, &new_word,
+					positions, &last_doc_id,
+					NULL);
+
+				goto exit;
+			}
+		}
+
+		dtuple = row_rec_to_index_entry_low(
+			mrec[min_rec], index, offsets[min_rec], &n_ext,
+			tuple_heap);
+
+		row_fts_insert_tuple(
+			&ins_ctx, &new_word, positions,
+			&last_doc_id, dtuple);
+
+		/* presumably advances mrec[min_rec] to that run's next record */
+		ROW_MERGE_READ_GET_NEXT(min_rec);
+
+		if (fts_sort_pll_degree > 2) {
+			if (!mrec[min_rec]) {
+				sel_tree[start + min_rec] = -1;
+			}
+
+			row_fts_sel_tree_update(sel_tree, start + min_rec,
+						height, mrec,
+						offsets, index);
+		}
+
+		count++;
+
+		mem_heap_empty(tuple_heap);
+	}
+
+exit:
+	fts_sql_commit(ins_ctx.trx);
+
+	ins_ctx.trx->op_info = "";
+
+	mem_heap_free(tuple_heap);
+
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+		if (ins_ctx.ins_graph[i]) {
+			fts_que_graph_free(ins_ctx.ins_graph[i]);
+		}
+	}
+
+	trx_free_for_background(ins_ctx.trx);
+
+	mem_heap_free(heap);
+
+	if (fts_enable_diag_print) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB_FTS: inserted %lu records\n",
+			(ulong) count);
+	}
+
+	return(error);
+}
diff --git a/storage/xtradb/row/row0import.cc b/storage/xtradb/row/row0import.cc
new file mode 100644
index 00000000000..b753574158a
--- /dev/null
+++ b/storage/xtradb/row/row0import.cc
@@ -0,0 +1,3806 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0import.cc
+Import a tablespace to a running instance.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0import.h"
+
+#ifdef UNIV_NONINL
+#include "row0import.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "que0que.h"
+#include "dict0boot.h"
+#include "ibuf0ibuf.h"
+#include "pars0pars.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "row0quiesce.h"
+
+#include <vector>
+
+/** The size of the buffer to use for IO. Note: os_file_read() doesn't expect
+reads to fail. If you set the buffer size to be greater than a multiple of the
+file size then it will assert. TODO: Fix this limitation of the IO functions.
+@param n - page size of the tablespace.
+@retval number of pages */
+#define IO_BUFFER_SIZE(n)	((1024 * 1024) / (n))
+
+/** For gathering stats on records during phase I */
+struct row_stats_t {
+	ulint		m_n_deleted;		/*!< Number of deleted records
+						found in the index */
+
+	ulint		m_n_purged;		/*!< Number of records purged
+						optimistically */
+
+	ulint		m_n_rows;		/*!< Number of rows */
+
+	ulint		m_n_purge_failed;	/*!< Number of deleted rows
+						that could not be purged */
+};
+
+/** Index information required by IMPORT (element type of row_import::m_indexes). */
+struct row_index_t {
+	index_id_t	m_id;			/*!< Index id of the table
+						in the exporting server */
+	byte*		m_name;			/*!< Index name */
+
+	ulint		m_space;		/*!< Space where it is placed */
+
+	ulint		m_page_no;		/*!< Root page number */
+
+	ulint		m_type;			/*!< Index type */
+
+	ulint		m_trx_id_offset;	/*!< Relevant only for clustered
+						indexes, offset of transaction
+						id system column */
+
+	ulint		m_n_user_defined_cols;	/*!< User defined columns */
+
+	ulint		m_n_uniq;		/*!< Number of columns that can
+						uniquely identify the row */
+
+	ulint		m_n_nullable;		/*!< Number of nullable
+						columns */
+
+	ulint		m_n_fields;		/*!< Total number of fields */
+
+	dict_field_t*	m_fields;		/*!< Index fields */
+
+	const dict_index_t*
+			m_srv_index;		/*!< Index instance in the
+						importing server */
+
+	row_stats_t	m_stats;		/*!< Statistics gathered during
+						the import phase */
+
+};
+
+/** Meta data required by IMPORT. */
+struct row_import {
+	row_import() UNIV_NOTHROW
+		:
+		m_table(),
+		m_version(),
+		m_hostname(),
+		m_table_name(),
+		m_autoinc(),
+		m_page_size(),
+		m_flags(),
+		m_n_cols(),
+		m_cols(),
+		m_col_names(),
+		m_n_indexes(),
+		m_indexes(),
+		m_missing(true) { }
+
+	~row_import() UNIV_NOTHROW;
+
+	/**
+	Find the index entry in the indexes array.
+	@param name - index name
+	@return instance if found else 0. */
+	row_index_t* get_index(const char* name) const UNIV_NOTHROW;
+
+	/**
+	Get the number of rows in the index.
+	@param name - index name
+	@return number of rows (doesn't include delete marked rows). */
+	ulint	get_n_rows(const char* name) const UNIV_NOTHROW;
+
+	/**
+	Find the ordinal value of the column name in the cfg table columns.
+	@param name - of column to look for.
+	@return ULINT_UNDEFINED if not found. */
+	ulint find_col(const char* name) const UNIV_NOTHROW;
+
+	/**
+	Find the index field entry in the fields of a cfg index.
+	@param name - of the field to look for (within cfg_index)
+	@return instance if found else 0. */
+	const dict_field_t* find_field(
+		const row_index_t*	cfg_index,
+		const char* 		name) const UNIV_NOTHROW;
+
+	/**
+	Get the number of rows for which purge failed during the convert phase.
+	@param name - index name
+	@return number of rows for which purge failed. */
+	ulint	get_n_purge_failed(const char* name) const UNIV_NOTHROW;
+
+	/**
+	Check if the index still needs purging, i.e. get_n_purge_failed() > 0.
+	@param name - index name
+	@return true if index needs to be purged. */
+	bool requires_purge(const char* name) const UNIV_NOTHROW
+	{
+		return(get_n_purge_failed(name) > 0);
+	}
+
+	/**
+	Set the index root <space, pageno> using the index name */
+	void set_root_by_name() UNIV_NOTHROW;
+
+	/**
+	Set the index root <space, pageno> using a heuristic
+	@return DB_SUCCESS or error code */
+	dberr_t set_root_by_heuristic() UNIV_NOTHROW;
+
+	/** Check if the index schema that was read from the .cfg file
+	matches the in memory index definition.
+	Note: It will update row_index_t::m_srv_index to map the meta-data
+	read from the .cfg file to the server index instance.
+	@return DB_SUCCESS or error code. */
+	dberr_t match_index_columns(
+		THD*			thd,
+		const dict_index_t*	index) UNIV_NOTHROW;
+
+	/**
+	Check if the table schema that was read from the .cfg file matches the
+	in memory table definition.
+	@param thd - MySQL session variable
+	@return DB_SUCCESS or error code. */
+	dberr_t match_table_columns(
+		THD*			thd) UNIV_NOTHROW;
+
+	/**
+	Check if the table (and index) schema that was read from the .cfg file
+	matches the in memory table definition.
+	@param thd - MySQL session variable
+	@return DB_SUCCESS or error code. */
+	dberr_t match_schema(
+		THD*			thd) UNIV_NOTHROW;
+
+	dict_table_t*	m_table;		/*!< Table instance */
+
+	ulint		m_version;		/*!< Version of config file */
+
+	byte*		m_hostname;		/*!< Hostname where the
+						tablespace was exported */
+	byte*		m_table_name;		/*!< Exporting instance table
+						name */
+
+	ib_uint64_t	m_autoinc;		/*!< Next autoinc value */
+
+	ulint		m_page_size;		/*!< Tablespace page size */
+
+	ulint		m_flags;		/*!< Table flags */
+
+	ulint		m_n_cols;		/*!< Number of columns in the
+						meta-data file */
+
+	dict_col_t*	m_cols;			/*!< Column data */
+
+	byte**		m_col_names;		/*!< Column names, we store the
+						column names separately because
+						there is no field to store the
+						value in dict_col_t */
+
+	ulint		m_n_indexes;		/*!< Number of indexes,
+						including clustered index */
+
+	row_index_t*	m_indexes;		/*!< Index meta data */
+
+	bool		m_missing;		/*!< starts out true; presumably
+						false once a .cfg file is read */
+};
+
+/** Use the page cursor to iterate over records in a block. */
+class RecIterator {
+public:
+	/**
+	Default constructor: zero-fills the page cursor */
+	RecIterator() UNIV_NOTHROW
+	{
+		memset(&m_cur, 0x0, sizeof(m_cur));
+	}
+
+	/**
+	Position the cursor on the first user record. */
+	void	open(buf_block_t* block) UNIV_NOTHROW
+	{
+		page_cur_set_before_first(block, &m_cur);
+
+		if (!end()) {
+			next();
+		}
+	}
+
+	/**
+	Move to the next record. */
+	void	next() UNIV_NOTHROW
+	{
+		page_cur_move_to_next(&m_cur);
+	}
+
+	/**
+	@return the current record; must not be called when end() is true */
+	rec_t*	current() UNIV_NOTHROW
+	{
+		ut_ad(!end());
+		return(page_cur_get_rec(&m_cur));
+	}
+
+	/**
+	@return true if cursor is at the end */
+	bool	end() UNIV_NOTHROW
+	{
+		return(page_cur_is_after_last(&m_cur) == TRUE);
+	}
+
+	/** Remove the current record; refused when the page has <= 1 records
+	@return true on success, false if the record was not removed */
+	bool remove(
+		const dict_index_t*	index,
+		page_zip_des_t*		page_zip,
+		ulint*			offsets) UNIV_NOTHROW
+	{
+		/* We can't end up with an empty page unless it is root. */
+		if (page_get_n_recs(m_cur.block->frame) <= 1) {
+			return(false);
+		}
+
+		return(page_delete_rec(index, &m_cur, page_zip, offsets));
+	}
+
+private:
+	page_cur_t	m_cur;
+};
+
+/** Class that purges delete marked reocords from indexes, both secondary
+and cluster. It does a pessimistic delete. This should only be done if we
+couldn't purge the delete marked reocrds during Phase I. */
+class IndexPurge {
+public:
+	/** Constructor
+	@param trx - the user transaction covering the import tablespace
+	@param index - to be imported
+	@param space_id - space id of the tablespace */
+	IndexPurge(
+		trx_t*		trx,
+		dict_index_t*	index) UNIV_NOTHROW
+		:
+		m_trx(trx),
+		m_index(index),
+		m_n_rows(0)
+	{
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Phase II - Purge records from index %s",
+			index->name);
+	}
+
+	/** Descructor */
+	~IndexPurge() UNIV_NOTHROW { }
+
+	/** Purge delete marked records.
+	@return DB_SUCCESS or error code. */
+	dberr_t	garbage_collect() UNIV_NOTHROW;
+
+	/** The number of records that are not delete marked.
+	@return total records in the index after purge */
+	ulint	get_n_rows() const UNIV_NOTHROW
+	{
+		return(m_n_rows);
+	}
+
+private:
+	/**
+	Begin import, position the cursor on the first record. */
+	void	open() UNIV_NOTHROW;
+
+	/**
+	Close the persistent cursor and commit the mini-transaction. */
+	void	close() UNIV_NOTHROW;
+
+	/**
+	Position the cursor on the next record.
+	@return DB_SUCCESS or error code */
+	dberr_t	next() UNIV_NOTHROW;
+
+	/**
+	Store the persistent cursor position and reopen the
+	B-tree cursor in BTR_MODIFY_TREE mode, because the
+	tree structure may be changed during a pessimistic delete. */
+	void	purge_pessimistic_delete() UNIV_NOTHROW;
+
+	/**
+	Purge the delete-marked record that the persistent cursor is
+	currently positioned on. */
+	void	purge() UNIV_NOTHROW;
+
+protected:
+	// Disable copying
+	IndexPurge();
+	IndexPurge(const IndexPurge&);
+	IndexPurge &operator=(const IndexPurge&);
+
+private:
+	trx_t*			m_trx;		/*!< User transaction */
+	mtr_t			m_mtr;		/*!< Mini-transaction */
+	btr_pcur_t		m_pcur;		/*!< Persistent cursor */
+	dict_index_t*		m_index;	/*!< Index to be processed */
+	ulint			m_n_rows;	/*!< Records in index */
+};
+
+/** Functor that is called for each physical page that is read from the
+tablespace file.  */
+class AbstractCallback : public PageCallback {
+public:
+	/** Constructor
+	@param trx - covering transaction */
+	AbstractCallback(trx_t* trx)
+		:
+		m_trx(trx),
+		m_space(ULINT_UNDEFINED),
+		m_xdes(),
+		m_xdes_page_no(ULINT_UNDEFINED),
+		m_space_flags(ULINT_UNDEFINED),
+		m_table_flags(ULINT_UNDEFINED) UNIV_NOTHROW { }
+
+	/**
+	Free any extent descriptor instance */
+	virtual ~AbstractCallback()
+	{
+		delete [] m_xdes;
+	}
+
+	/** Determine the page size to use for traversing the tablespace
+	@param file_size - size of the tablespace file in bytes
+	@param block - contents of the first page in the tablespace file.
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t init(
+		os_offset_t		file_size,
+		const buf_block_t*	block) UNIV_NOTHROW;
+
+	/** @return true if compressed table. */
+	bool is_compressed_table() const UNIV_NOTHROW
+	{
+		return(get_zip_size() > 0);
+	}
+
+protected:
+	/**
+	Get the data page depending on the table type, compressed or not.
+	@param block - block read from disk
+	@retval the buffer frame */
+	buf_frame_t* get_frame(buf_block_t* block) const UNIV_NOTHROW
+	{
+		if (is_compressed_table()) {
+			return(block->page.zip.data);
+		}
+
+		return(buf_block_get_frame(block));
+	}
+
+	/** Check for session interrupt. If required we could
+	even flush to disk here every N pages.
+	@retval DB_SUCCESS or error code */
+	dberr_t periodic_check() UNIV_NOTHROW
+	{
+		if (trx_is_interrupted(m_trx)) {
+			return(DB_INTERRUPTED);
+		}
+
+		return(DB_SUCCESS);
+	}
+
+	/**
+	Get the physical offset of the extent descriptor within the page.
+	@param page_no - page number of the extent descriptor
+	@param page - contents of the page containing the extent descriptor.
+	@return the start of the xdes array in a page */
+	const xdes_t* xdes(
+		ulint		page_no,
+		const page_t*	page) const UNIV_NOTHROW
+	{
+		ulint	offset;
+
+		offset = xdes_calc_descriptor_index(get_zip_size(), page_no);
+
+		return(page + XDES_ARR_OFFSET + XDES_SIZE * offset);
+	}
+
+	/**
+	Set the current page directory (xdes). If the extent descriptor is
+	marked as free then free the current extent descriptor and set it to
+	0. This implies that all pages that are covered by this extent
+	descriptor are also freed.
+
+	@param page_no - offset of page within the file
+	@param page - page contents
+	@return DB_SUCCESS or error code. */
+	dberr_t	set_current_xdes(
+		ulint		page_no,
+		const page_t*	page) UNIV_NOTHROW
+	{
+		m_xdes_page_no = page_no;
+
+		delete[] m_xdes;
+
+		m_xdes = 0;
+
+		ulint		state;
+		const xdes_t*	xdesc = page + XDES_ARR_OFFSET;
+
+		state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES);
+
+		if (state != XDES_FREE) {
+
+			m_xdes = new(std::nothrow) xdes_t[m_page_size];
+
+			/* Trigger OOM */
+			DBUG_EXECUTE_IF("ib_import_OOM_13",
+					delete [] m_xdes; m_xdes = 0;);
+
+			if (m_xdes == 0) {
+				return(DB_OUT_OF_MEMORY);
+			}
+
+			memcpy(m_xdes, page, m_page_size);
+		}
+
+		return(DB_SUCCESS);
+	}
+
+	/**
+	@return true if it is a root page */
+	bool is_root_page(const page_t* page) const UNIV_NOTHROW
+	{
+		ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
+		return(mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL
+		       && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL);
+	}
+
+	/**
+	Check if the page is marked as free in the extent descriptor.
+	@param page_no - page number to check in the extent descriptor.
+	@return true if the page is marked as free */
+	bool is_free(ulint page_no) const UNIV_NOTHROW
+	{
+		ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no)
+		     == m_xdes_page_no);
+
+		if (m_xdes != 0) {
+			const xdes_t*	xdesc = xdes(page_no, m_xdes);
+			ulint		pos = page_no % FSP_EXTENT_SIZE;
+
+			return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos));
+		}
+
+		/* If the current xdes was free, the page must be free. */
+		return(true);
+	}
+
+protected:
+	/** Covering transaction. */
+	trx_t*			m_trx;
+
+	/** Space id of the file being iterated over. */
+	ulint			m_space;
+
+	/** Minimum page number for which the free list has not been
+	initialized: the pages >= this limit are, by definition, free;
+	note that in a single-table tablespace where size < 64 pages,
+	this number is 64, i.e., we have initialized the space about
+	the first extent, but have not physically allocated those pages
+	to the file. @see FSP_LIMIT. */
+	ulint			m_free_limit;
+
+	/** Current size of the space in pages */
+	ulint			m_size;
+
+	/** Current extent descriptor page */
+	xdes_t*			m_xdes;
+
+	/** Physical page offset in the file of the extent descriptor */
+	ulint			m_xdes_page_no;
+
+	/** Flags value read from the header page */
+	ulint			m_space_flags;
+
+	/** Derived from m_space_flags and row format type, the row format
+	type is determined from the page header. */
+	ulint			m_table_flags;
+};
+
+/** Read the tablespace header from the first page and determine the page
+size to use for traversing the tablespace. Also records the space flags,
+size, free limit and space id found in the header, and caches the extent
+descriptor stored in the first page.
+@param file_size - size of the tablespace file in bytes
+@param block - contents of the first page in the tablespace file.
+@retval DB_SUCCESS or error code. */
+dberr_t
+AbstractCallback::init(
+	os_offset_t		file_size,
+	const buf_block_t*	block) UNIV_NOTHROW
+{
+	const page_t*		page = block->frame;
+
+	m_space_flags = fsp_header_get_flags(page);
+
+	/* Since we don't know whether it is a compressed table
+	or not, the data is always read into the block->frame. */
+
+	dberr_t	err = set_zip_size(block->frame);
+
+	if (err != DB_SUCCESS) {
+		return(DB_CORRUPTION);
+	}
+
+	/* Set the page size used to traverse the tablespace. */
+
+	m_page_size = (is_compressed_table())
+		? get_zip_size() : fsp_flags_get_page_size(m_space_flags);
+
+	if (m_page_size == 0) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "Page size is 0");
+		return(DB_CORRUPTION);
+	} else if (!is_compressed_table() && m_page_size != UNIV_PAGE_SIZE) {
+
+		/* Cast the arguments so that they match the %lu
+		conversions, as the file size message below does. */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Page size %lu of ibd file is not the same "
+			"as the server page size %lu",
+			(ulong) m_page_size, (ulong) UNIV_PAGE_SIZE);
+
+		return(DB_CORRUPTION);
+
+	} else if ((file_size % m_page_size)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"File size " UINT64PF " is not a multiple "
+			"of the page size %lu",
+			(ib_uint64_t) file_size, (ulong) m_page_size);
+
+		return(DB_CORRUPTION);
+	}
+
+	/* init() must only be called once per callback instance. */
+	ut_a(m_space == ULINT_UNDEFINED);
+
+	/* NOTE(review): FSP_SIZE and FSP_FREE_LIMIT are taken to be, like
+	FSP_SPACE_ID, offsets relative to the space header and therefore
+	need FSP_HEADER_OFFSET added - confirm against fsp0fsp.h. */
+	m_size  = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE);
+	m_free_limit = mach_read_from_4(
+		page + FSP_HEADER_OFFSET + FSP_FREE_LIMIT);
+	m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID);
+
+	/* The first page also carries the first extent descriptor. */
+	return(set_current_xdes(0, page));
+}
+
+/**
+Try and determine the index root pages by checking if the next/prev
+pointers are both FIL_NULL. We need to ensure that we skip deleted pages. */
+struct FetchIndexRootPages : public AbstractCallback {
+
+	/** Index information gathered from the .ibd file. */
+	struct Index {
+
+		Index(index_id_t id, ulint page_no)
+			:
+			m_id(id),
+			m_page_no(page_no) { }
+
+		index_id_t	m_id;		/*!< Index id */
+		ulint		m_page_no;	/*!< Root page number */
+	};
+
+	typedef std::vector<Index> Indexes;
+
+	/** Constructor
+	@param table - table definition in server
+	@param trx - covering (user) transaction */
+	FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+		:
+		AbstractCallback(trx),
+		m_table(table) UNIV_NOTHROW { }
+
+	/** Destructor */
+	virtual ~FetchIndexRootPages() UNIV_NOTHROW { }
+
+	/**
+	@retval the space id of the tablespace being iterated over */
+	virtual ulint get_space_id() const UNIV_NOTHROW
+	{
+		return(m_space);
+	}
+
+	/**
+	Check if the .ibd file row format is the same as the table's.
+	Any mismatch is reported to the session as
+	ER_TABLE_SCHEMA_MISMATCH.
+	@param ibd_table_flags - determined from space and page.
+	@return DB_SUCCESS, or DB_CORRUPTION if the flags are invalid or
+	the row formats differ. */
+	dberr_t check_row_format(ulint ibd_table_flags) UNIV_NOTHROW
+	{
+		dberr_t		err;
+		rec_format_t	ibd_rec_format;
+		rec_format_t	table_rec_format;
+
+		if (!dict_tf_is_valid(ibd_table_flags)) {
+
+			/* The cast makes the argument match %lx. */
+			ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				".ibd file has invalid table flags: %lx",
+				(ulong) ibd_table_flags);
+
+			return(DB_CORRUPTION);
+		}
+
+		ibd_rec_format = dict_tf_get_rec_format(ibd_table_flags);
+		table_rec_format = dict_tf_get_rec_format(m_table->flags);
+
+		if (table_rec_format != ibd_rec_format) {
+
+			ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				"Table has %s row format, .ibd "
+				"file has %s row format.",
+				dict_tf_to_row_format_string(m_table->flags),
+				dict_tf_to_row_format_string(ibd_table_flags));
+
+			err = DB_CORRUPTION;
+		} else {
+			err = DB_SUCCESS;
+		}
+
+		return(err);
+	}
+
+	/**
+	Called for each block as it is read from the file.
+	@param offset - physical offset in the file
+	@param block - block to convert, it is not from the buffer pool.
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t operator() (
+		os_offset_t	offset,
+		buf_block_t*	block) UNIV_NOTHROW;
+
+	/** Update the import configuration that will be used to import
+	the tablespace.
+	@param cfg - import configuration to fill in
+	@return DB_SUCCESS or error code */
+	dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
+
+	/** Table definition in server. */
+	const dict_table_t*	m_table;
+
+	/** Index information */
+	Indexes			m_indexes;
+};
+
+/**
+Called for each block as it is read from the file. Check index pages to
+determine the exact row format. We can't get that from the tablespace
+header flags alone.
+
+Extent descriptor pages replace the cached descriptor; index pages that
+are not marked free and look like a root page are recorded in m_indexes.
+The row format is checked when the first root page is recorded.
+
+@param offset - physical offset in the file
+@param block - block to convert, it is not from the buffer pool.
+@retval DB_SUCCESS or error code. */
+dberr_t
+FetchIndexRootPages::operator() (
+	os_offset_t	offset,
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	dberr_t		err;
+
+	/* On return from here err is DB_SUCCESS, which is the result
+	for every page that needs no further processing. */
+	if ((err = periodic_check()) != DB_SUCCESS) {
+		return(err);
+	}
+
+	const page_t*	page = get_frame(block);
+
+	ulint	page_type = fil_page_get_type(page);
+
+	/* Widen before multiplying so that the product cannot wrap
+	where ulint is narrower than os_offset_t. */
+	const os_offset_t	page_offset
+		= static_cast<os_offset_t>(block->page.offset) * m_page_size;
+
+	if (page_offset != offset) {
+		/* The casts make the arguments match %lu. */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Page offset doesn't match file offset: "
+			"page offset: %lu, file offset: %lu",
+			(ulong) block->page.offset,
+			(ulong) (offset / m_page_size));
+
+		err = DB_CORRUPTION;
+	} else if (page_type == FIL_PAGE_TYPE_XDES) {
+		err = set_current_xdes(block->page.offset, page);
+	} else if (page_type == FIL_PAGE_INDEX
+		   && !is_free(block->page.offset)
+		   && is_root_page(page)) {
+
+		index_id_t	id = btr_page_get_index_id(page);
+		ulint		page_no = buf_block_get_page_no(block);
+
+		m_indexes.push_back(Index(id, page_no));
+
+		if (m_indexes.size() == 1) {
+
+			m_table_flags = dict_sys_tables_type_to_tf(
+				m_space_flags,
+				page_is_comp(page) ? DICT_N_COLS_COMPACT : 0);
+
+			err = check_row_format(m_table_flags);
+		}
+	}
+
+	return(err);
+}
+
+/**
+Fill in the import configuration from the root pages that were collected
+while scanning the tablespace. Every index gets a generated name made of
+"index" followed by its id.
+@param cfg - import configuration to update
+@return DB_SUCCESS, DB_CORRUPTION if no root page was found or
+DB_OUT_OF_MEMORY */
+dberr_t
+FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW
+{
+	ut_a(cfg->m_table == m_table);
+
+	cfg->m_page_size = m_page_size;
+	cfg->m_n_indexes = m_indexes.size();
+
+	if (cfg->m_n_indexes == 0) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR, "No B+Tree found in tablespace");
+
+		return(DB_CORRUPTION);
+	}
+
+	cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_11",
+			delete [] cfg->m_indexes; cfg->m_indexes = 0;);
+
+	if (cfg->m_indexes == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	/* Zero the array so that unset members read as 0. */
+	memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+	for (ulint i = 0; i < cfg->m_n_indexes; ++i) {
+
+		const Index&	root = m_indexes[i];
+		row_index_t*	cfg_index = &cfg->m_indexes[i];
+		char		name[BUFSIZ];
+
+		ut_snprintf(name, sizeof(name), "index" IB_ID_FMT, root.m_id);
+
+		/* Include the terminating NUL in the copy. */
+		const ulint	len = strlen(name) + 1;
+
+		cfg_index->m_name = new(std::nothrow) byte[len];
+
+		/* Trigger OOM */
+		DBUG_EXECUTE_IF("ib_import_OOM_12",
+				delete [] cfg_index->m_name;
+				cfg_index->m_name = 0;);
+
+		if (cfg_index->m_name == 0) {
+			return(DB_OUT_OF_MEMORY);
+		}
+
+		memcpy(cfg_index->m_name, name, len);
+
+		cfg_index->m_id = root.m_id;
+		cfg_index->m_space = m_space;
+		cfg_index->m_page_no = root.m_page_no;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/* Functor that is called for each physical page that is read from the
+tablespace file.
+
+  1. Check each page for corruption.
+
+  2. Update the space id and LSN on every page
+     * For the header page
+       - Validate the flags
+       - Update the LSN
+
+  3. On Btree pages
+     * Set the index id
+     * Update the max trx id
+     * In a cluster index, update the system columns
+     * In a cluster index, update the BLOB ptr, set the space id
+     * Purge delete marked records, but only if they can be easily
+       removed from the page
+     * Keep a counter of number of rows, ie. non-delete-marked rows
+     * Keep a counter of number of delete marked rows
+     * Keep a counter of number of purge failure
+     * If a page is stamped with an index id that isn't in the .cfg file
+       we assume it is deleted and the page can be ignored.
+
+   4. Set the page state to dirty so that it will be written to disk.
+*/
+class PageConverter : public AbstractCallback {
+public:
+	/** Constructor
+	* @param cfg - config of table being imported.
+	* @param trx - transaction covering the import */
+	PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW;
+
+	virtual ~PageConverter() UNIV_NOTHROW
+	{
+		if (m_heap != 0) {
+			mem_heap_free(m_heap);
+		}
+	}
+
+	/**
+	@retval the server space id of the tablespace being iterated over */
+	virtual ulint get_space_id() const UNIV_NOTHROW
+	{
+		return(m_cfg->m_table->space);
+	}
+
+	/**
+	Called for each block as it is read from the file.
+	@param offset - physical offset in the file
+	@param block - block to convert, it is not from the buffer pool.
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t operator() (
+		os_offset_t	offset,
+		buf_block_t*	block) UNIV_NOTHROW;
+private:
+
+	/** Status returned by PageConverter::validate() */
+	enum import_page_status_t {
+		IMPORT_PAGE_STATUS_OK,		/*!< Page is OK */
+		IMPORT_PAGE_STATUS_ALL_ZERO,	/*!< Page is all zeros */
+		IMPORT_PAGE_STATUS_CORRUPTED	/*!< Page is corrupted */
+	};
+
+	/**
+	Update the page, set the space id, max trx id and index id.
+	@param block - block read from file
+	@param page_type - type of the page
+	@retval DB_SUCCESS or error code */
+	dberr_t update_page(
+		buf_block_t*	block,
+		ulint&		page_type) UNIV_NOTHROW;
+
+#if defined UNIV_DEBUG
+	/**
+	@return true error condition is enabled. */
+	bool trigger_corruption() UNIV_NOTHROW
+	{
+		return(false);
+	}
+	#else
+#define trigger_corruption()	(false)
+#endif /* UNIV_DEBUG */
+
+	/**
+	Update the space, index id, trx id.
+	@param block - block to convert
+	@return DB_SUCCESS or error code */
+	dberr_t	update_index_page(buf_block_t*	block) UNIV_NOTHROW;
+
+	/** Update the BLOB references and write UNDO log entries for
+	rows that can't be purged optimistically.
+	@param block - block to update
+	@retval DB_SUCCESS or error code */
+	dberr_t	update_records(buf_block_t* block) UNIV_NOTHROW;
+
+	/**
+	Validate the page, check for corruption.
+	@param offset - physical offset within file.
+	@param page - page read from file.
+	@return IMPORT_PAGE_STATUS_OK, _ALL_ZERO or _CORRUPTED */
+	import_page_status_t validate(
+		os_offset_t	offset,
+		buf_block_t*	page) UNIV_NOTHROW;
+
+	/**
+	Validate the space flags and update tablespace header page.
+	@param block - block read from file, not from the buffer pool.
+	@retval DB_SUCCESS or error code */
+	dberr_t	update_header(buf_block_t* block) UNIV_NOTHROW;
+
+	/**
+	Adjust the BLOB reference for a single column that is externally stored
+	@param rec - record to update
+	@param offsets - column offsets for the record
+	@param i - column ordinal value
+	@return DB_SUCCESS or error code */
+	dberr_t	adjust_cluster_index_blob_column(
+		rec_t*		rec,
+		const ulint*	offsets,
+		ulint		i) UNIV_NOTHROW;
+
+	/**
+	Adjusts the BLOB reference in the clustered index row for all
+	externally stored columns.
+	@param rec - record to update
+	@param offsets - column offsets for the record
+	@return DB_SUCCESS or error code */
+	dberr_t	adjust_cluster_index_blob_columns(
+		rec_t*		rec,
+		const ulint*	offsets) UNIV_NOTHROW;
+
+	/**
+	In the clustered index, adjust the BLOB pointers as needed.
+	Also update the BLOB reference, write the new space id.
+	@param rec - record to update
+	@param offsets - column offsets for the record
+	@return DB_SUCCESS or error code */
+	dberr_t	adjust_cluster_index_blob_ref(
+		rec_t*		rec,
+		const ulint*	offsets) UNIV_NOTHROW;
+
+	/**
+	Purge delete-marked records, only if it is possible to do
+	so without re-organising the B+tree.
+	@param offsets - current row offsets.
+	@retval true if purged */
+	bool	purge(const ulint* offsets) UNIV_NOTHROW;
+
+	/**
+	Adjust the BLOB references and sys fields for the current record.
+	@param index - the index being converted
+	@param rec - record to update
+	@param offsets - column offsets for the record
+	@param deleted - true if row is delete marked
+	@return DB_SUCCESS or error code. */
+	dberr_t	adjust_cluster_record(
+		const dict_index_t*	index,
+		rec_t*			rec,
+		const ulint*		offsets,
+		bool			deleted) UNIV_NOTHROW;
+
+	/**
+	Find an index with the matching id.
+	@return row_index_t* instance or 0 */
+	row_index_t* find_index(index_id_t id) UNIV_NOTHROW
+	{
+		row_index_t*	index = &m_cfg->m_indexes[0];
+
+		for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) {
+			if (id == index->m_id) {
+				return(index);
+			}
+		}
+
+		return(0);
+
+	}
+private:
+	/** Config for table that is being imported. */
+	row_import*		m_cfg;
+
+	/** Current index whose pages are being imported */
+	row_index_t*		m_index;
+
+	/** Current system LSN */
+	lsn_t			m_current_lsn;
+
+	/** Alias for m_page_zip, only set for compressed pages. */
+	page_zip_des_t*		m_page_zip_ptr;
+
+	/** Iterator over records in a block */
+	RecIterator		m_rec_iter;
+
+	/** Record offset */
+	ulint			m_offsets_[REC_OFFS_NORMAL_SIZE];
+
+	/** Pointer to m_offsets_ */
+	ulint*			m_offsets;
+
+	/** Memory heap for the record offsets */
+	mem_heap_t*		m_heap;
+
+	/** Cluster index instance */
+	dict_index_t*		m_cluster_index;
+};
+
+/**
+row_import destructor. Releases the index descriptors together with their
+names and field names, the column names, and the remaining arrays owned by
+the instance. */
+row_import::~row_import() UNIV_NOTHROW
+{
+	if (m_indexes != 0) {
+
+		for (ulint i = 0; i < m_n_indexes; ++i) {
+
+			row_index_t&	cfg_index = m_indexes[i];
+
+			delete [] cfg_index.m_name;
+
+			if (cfg_index.m_fields != 0) {
+
+				const ulint	n_fields = cfg_index.m_n_fields;
+
+				for (ulint j = 0; j < n_fields; ++j) {
+					delete [] cfg_index.m_fields[j].name;
+				}
+
+				delete [] cfg_index.m_fields;
+			}
+		}
+	}
+
+	if (m_col_names != 0) {
+
+		for (ulint i = 0; i < m_n_cols; ++i) {
+			delete [] m_col_names[i];
+		}
+	}
+
+	delete [] m_cols;
+	delete [] m_indexes;
+	delete [] m_col_names;
+	delete [] m_table_name;
+	delete [] m_hostname;
+}
+
+/**
+Find the index entry in the indexes array.
+@param name - index name
+@return instance if found else 0. */
+row_index_t*
+row_import::get_index(
+	const char*	name) const UNIV_NOTHROW
+{
+	row_index_t*	cfg_index = m_indexes;
+
+	for (ulint n = m_n_indexes; n > 0; --n, ++cfg_index) {
+
+		const char*	cfg_name
+			= reinterpret_cast<const char*>(cfg_index->m_name);
+
+		if (strcmp(cfg_name, name) == 0) {
+			return(cfg_index);
+		}
+	}
+
+	return(0);
+}
+
+/**
+Get the number of rows in the index.
+@param name - index name, must not be 0 and must name a known index
+@return number of rows (doesn't include delete marked rows). */
+ulint
+row_import::get_n_rows(
+	const char*	name) const UNIV_NOTHROW
+{
+	/* Check the name before get_index() passes it to strcmp(). */
+	ut_a(name != 0);
+	const row_index_t*	index = get_index(name);
+	ut_a(index != 0);
+	return(index->m_stats.m_n_rows);
+}
+
+/**
+Get the number of rows for which purge failed during the convert phase.
+@param name - index name, must not be 0 and must name a known index
+@return number of rows for which purge failed. */
+ulint
+row_import::get_n_purge_failed(
+	const char*	name) const UNIV_NOTHROW
+{
+	/* Check the name before get_index() passes it to strcmp(). */
+	ut_a(name != 0);
+	const row_index_t*	index = get_index(name);
+	ut_a(index != 0);
+	return(index->m_stats.m_n_purge_failed);
+}
+
+/**
+Find the ordinal value of the column name in the cfg table columns.
+@param name - of column to look for.
+@return position of the column in m_col_names, or ULINT_UNDEFINED if
+not found. */
+ulint
+row_import::find_col(
+	const char*	name) const UNIV_NOTHROW
+{
+	ulint	pos = 0;
+
+	while (pos < m_n_cols) {
+
+		const char*	candidate
+			= reinterpret_cast<const char*>(m_col_names[pos]);
+
+		if (!strcmp(candidate, name)) {
+			return(pos);
+		}
+
+		++pos;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**
+Find the index field entry in the cfg indexes fields.
+@param cfg_index - index read from the meta-data whose fields are searched
+@param name - of the field to look for
+@return instance if found else 0. */
+const dict_field_t*
+row_import::find_field(
+	const row_index_t*	cfg_index,
+	const char*		name) const UNIV_NOTHROW
+{
+	const ulint	n_fields = cfg_index->m_n_fields;
+
+	for (ulint pos = 0; pos < n_fields; ++pos) {
+
+		const dict_field_t*	candidate = &cfg_index->m_fields[pos];
+
+		if (!strcmp(reinterpret_cast<const char*>(candidate->name),
+			    name)) {
+			return(candidate);
+		}
+	}
+
+	return(0);
+}
+
+/**
+Check if the index schema that was read from the .cfg file matches the
+in memory index definition. On a name match the cfg index is also linked
+to the server index through m_srv_index. Every mismatch is reported to
+the session; the fields are all checked even after a mismatch was found.
+@param thd - MySQL session variable
+@param index - server index to compare against the meta-data
+@return DB_SUCCESS or DB_ERROR. */
+dberr_t
+row_import::match_index_columns(
+	THD*			thd,
+	const dict_index_t*	index) UNIV_NOTHROW
+{
+	row_index_t*		cfg_index;
+	dberr_t			err = DB_SUCCESS;
+
+	cfg_index = get_index(index->name);
+
+	if (cfg_index == 0) {
+		ib_errf(thd, IB_LOG_LEVEL_ERROR,
+			 ER_TABLE_SCHEMA_MISMATCH,
+			 "Index %s not found in tablespace meta-data file.",
+			 index->name);
+
+		return(DB_ERROR);
+	}
+
+	cfg_index->m_srv_index = index;
+
+	const dict_field_t*	field = index->fields;
+
+	for (ulint i = 0; i < index->n_fields; ++i, ++field) {
+
+		const dict_field_t*	cfg_field;
+
+		/* Fields are matched by name, not by position. */
+		cfg_field = find_field(cfg_index, field->name);
+
+		if (cfg_field == 0) {
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				 ER_TABLE_SCHEMA_MISMATCH,
+				 "Index %s field %s not found in tablespace "
+				 "meta-data file.",
+				 index->name, field->name);
+
+			err = DB_ERROR;
+		} else {
+
+			if (cfg_field->prefix_len != field->prefix_len) {
+				ib_errf(thd, IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Index %s field %s prefix len %lu "
+					 "doesn't match meta-data file value "
+					 "%lu",
+					 index->name, field->name,
+					 (ulong) field->prefix_len,
+					 (ulong) cfg_field->prefix_len);
+
+				err = DB_ERROR;
+			}
+
+			if (cfg_field->fixed_len != field->fixed_len) {
+				ib_errf(thd, IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Index %s field %s fixed len %lu "
+					 "doesn't match meta-data file value "
+					 "%lu",
+					 index->name, field->name,
+					 (ulong) field->fixed_len,
+					 (ulong) cfg_field->fixed_len);
+
+				err = DB_ERROR;
+			}
+		}
+	}
+
+	return(err);
+}
+
+/**
+Check if the table schema that was read from the .cfg file matches the
+in memory table definition. Columns are looked up by name; every mismatch
+is reported to the session and the remaining columns are still checked.
+@param thd - MySQL session variable
+@return DB_SUCCESS or DB_ERROR. */
+dberr_t
+row_import::match_table_columns(
+	THD*			thd) UNIV_NOTHROW
+{
+	dberr_t			err = DB_SUCCESS;
+	const dict_col_t*	col = m_table->cols;
+
+	for (ulint i = 0; i < m_table->n_cols; ++i, ++col) {
+
+		const char*	col_name;
+		ulint		cfg_col_index;
+
+		col_name = dict_table_get_col_name(
+			m_table, dict_col_get_no(col));
+
+		cfg_col_index = find_col(col_name);
+
+		if (cfg_col_index == ULINT_UNDEFINED) {
+
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				 ER_TABLE_SCHEMA_MISMATCH,
+				 "Column %s not found in tablespace.",
+				 col_name);
+
+			err = DB_ERROR;
+		} else if (cfg_col_index != col->ind) {
+
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				 ER_TABLE_SCHEMA_MISMATCH,
+				 "Column %s ordinal value mismatch, it's at "
+				 "%lu in the table and %lu in the tablespace "
+				 "meta-data file",
+				 col_name,
+				 (ulong) col->ind, (ulong) cfg_col_index);
+
+			err = DB_ERROR;
+		} else {
+			/* Same name at the same position: compare the
+			column attributes one by one. */
+			const dict_col_t*	cfg_col;
+
+			cfg_col = &m_cols[cfg_col_index];
+			ut_a(cfg_col->ind == cfg_col_index);
+
+			if (cfg_col->prtype != col->prtype) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s precise type mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->mtype != col->mtype) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s main type mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->len != col->len) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s length mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->mbminmaxlen != col->mbminmaxlen) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s multi-byte len mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+
+			/* In this branch cfg_col_index equals col->ind and
+			the assertion above ties cfg_col->ind to
+			cfg_col_index, so this check is not expected to
+			fire; it reports no message of its own. */
+			if (cfg_col->ind != col->ind) {
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->ord_part != col->ord_part) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s ordering mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->max_prefix != col->max_prefix) {
+				ib_errf(thd,
+					 IB_LOG_LEVEL_ERROR,
+					 ER_TABLE_SCHEMA_MISMATCH,
+					 "Column %s max prefix mismatch.",
+					 col_name);
+				err = DB_ERROR;
+			}
+		}
+	}
+
+	return(err);
+}
+
+/**
+Check if the table (and index) schema that was read from the .cfg file
+matches the in memory table definition. The table flags, the number of
+columns and the number of indexes are compared first; only if they agree
+are the columns and then the indexes compared in detail.
+@param thd - MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_schema(
+	THD*		thd) UNIV_NOTHROW
+{
+	/* Do some simple checks. */
+
+	if (m_flags != m_table->flags) {
+		/* Report the flags of the server table, not its number
+		of columns. */
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			 "Table flags don't match, server table has 0x%lx "
+			 "and the meta-data file has 0x%lx",
+			 (ulong) m_table->flags, (ulong) m_flags);
+
+		return(DB_ERROR);
+	} else if (m_table->n_cols != m_n_cols) {
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			 "Number of columns don't match, table has %lu "
+			 "columns but the tablespace meta-data file has "
+			 "%lu columns",
+			 (ulong) m_table->n_cols, (ulong) m_n_cols);
+
+		return(DB_ERROR);
+	} else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		/* If the number of indexes don't match then it is better
+		to abort the IMPORT. It is easy for the user to create a
+		table matching the IMPORT definition. */
+
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			 "Number of indexes don't match, table has %lu "
+			 "indexes but the tablespace meta-data file has "
+			 "%lu indexes",
+			 (ulong) UT_LIST_GET_LEN(m_table->indexes),
+			 (ulong) m_n_indexes);
+
+		return(DB_ERROR);
+	}
+
+	dberr_t	err = match_table_columns(thd);
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	/* Check if the index definitions match. All indexes are checked
+	so that every mismatch gets reported; the error of the last
+	failing index is returned. */
+
+	const dict_index_t* index;
+
+	for (index = UT_LIST_GET_FIRST(m_table->indexes);
+	     index != 0;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+		dberr_t	index_err;
+
+		index_err = match_index_columns(thd, index);
+
+		if (index_err != DB_SUCCESS) {
+			err = index_err;
+		}
+	}
+
+	return(err);
+}
+
+/**
+Set the index root <space, pageno>, using index name. Each cfg index is
+looked up in the server table by its name and must exist there. */
+void
+row_import::set_root_by_name() UNIV_NOTHROW
+{
+	for (ulint i = 0; i < m_n_indexes; ++i) {
+
+		const row_index_t&	cfg_index = m_indexes[i];
+
+		dict_index_t*	srv_index = dict_table_get_index_on_name(
+			m_table,
+			reinterpret_cast<const char*>(cfg_index.m_name));
+
+		/* We've already checked that it exists. */
+		ut_a(srv_index != 0);
+
+		/* Set the root page number and space id. */
+		srv_index->page = cfg_index.m_page_no;
+		srv_index->space = m_table->space;
+	}
+}
+
+/**
+Set the index root <space, pageno>, using a heuristic: the server indexes
+that are not FTS indexes are paired, in list order, with the cfg indexes.
+FTS indexes are flagged DICT_CORRUPT and skipped. Each paired cfg index is
+renamed after the server index it was paired with.
+@return DB_SUCCESS or DB_OUT_OF_MEMORY */
+dberr_t
+row_import::set_root_by_heuristic() UNIV_NOTHROW
+{
+	row_index_t*	cfg_index = m_indexes;
+
+	ut_a(m_n_indexes > 0);
+
+	// TODO: For now use brute force, based on ordinality
+
+	if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		char	table_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			table_name, sizeof(table_name), m_table->name, FALSE);
+
+		/* The casts make the arguments match %lu, as in the
+		other messages of this file. */
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Table %s should have %lu indexes but the tablespace "
+			"has %lu indexes",
+			table_name,
+			(ulong) UT_LIST_GET_LEN(m_table->indexes),
+			(ulong) m_n_indexes);
+	}
+
+	dict_mutex_enter_for_mysql();
+
+	ulint	i = 0;
+	dberr_t	err = DB_SUCCESS;
+
+	for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes);
+	     index != 0;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+		if (index->type & DICT_FTS) {
+			index->type |= DICT_CORRUPT;
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Skipping FTS index: %s", index->name);
+		} else if (i < m_n_indexes) {
+
+			/* Replace the cfg index name with the name of
+			the server index. */
+			delete [] cfg_index[i].m_name;
+
+			ulint	len = strlen(index->name) + 1;
+
+			cfg_index[i].m_name = new(std::nothrow) byte[len];
+
+			/* Trigger OOM */
+			DBUG_EXECUTE_IF("ib_import_OOM_14",
+					delete[] cfg_index[i].m_name;
+					cfg_index[i].m_name = 0;);
+
+			if (cfg_index[i].m_name == 0) {
+				err = DB_OUT_OF_MEMORY;
+				break;
+			}
+
+			memcpy(cfg_index[i].m_name, index->name, len);
+
+			cfg_index[i].m_srv_index = index;
+
+			index->space = m_table->space;
+			index->page = cfg_index[i].m_page_no;
+
+			++i;
+		}
+	}
+
+	dict_mutex_exit_for_mysql();
+
+	return(err);
+}
+
+/**
+Walk the index and purge every delete marked record, counting the
+records that are not delete marked in m_n_rows.
+@return DB_SUCCESS or error code. */
+dberr_t
+IndexPurge::garbage_collect() UNIV_NOTHROW
+{
+	dberr_t		err;
+	const ibool	is_comp = dict_table_is_comp(m_index->table);
+
+	/* Open the persistent cursor and start the mini-transaction. */
+	open();
+
+	for (err = next(); err == DB_SUCCESS; err = next()) {
+
+		if (rec_get_deleted_flag(btr_pcur_get_rec(&m_pcur), is_comp)) {
+			purge();
+		} else {
+			++m_n_rows;
+		}
+	}
+
+	/* Close the persistent cursor and commit the mini-transaction. */
+	close();
+
+	/* Running off the end of the index is the normal way to stop. */
+	if (err == DB_END_OF_INDEX) {
+		return(DB_SUCCESS);
+	}
+
+	return(err);
+}
+
+/**
+Begin the purge: start the mini-transaction with redo logging switched
+off and open the persistent cursor at one side of the index in
+BTR_MODIFY_LEAF mode.
+NOTE(review): the first argument presumably selects the left-most end of
+the index - confirm against btr_pcur_open_at_index_side(). */
+void
+IndexPurge::open() UNIV_NOTHROW
+{
+	mtr_start(&m_mtr);
+
+	mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+	btr_pcur_open_at_index_side(
+		true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr);
+}
+
+/**
+Close the persistent cursor and commit the mini-transaction. */
+void
+IndexPurge::close() UNIV_NOTHROW
+{
+	btr_pcur_close(&m_pcur);
+	mtr_commit(&m_mtr);
+}
+
+/**
+Position the cursor on the next record. While records remain on the
+current page the cursor is only advanced; once it moves past the last
+record of the page the mini-transaction is committed and restarted
+before the cursor moves on to the next user record.
+@return DB_SUCCESS, DB_INTERRUPTED if the transaction was interrupted, or
+DB_END_OF_INDEX if btr_pcur_move_to_next_user_rec() finds no record */
+dberr_t
+IndexPurge::next() UNIV_NOTHROW
+{
+	btr_pcur_move_to_next_on_page(&m_pcur);
+
+	/* When switching pages, commit the mini-transaction
+	in order to release the latch on the old page. */
+
+	if (!btr_pcur_is_after_last_on_page(&m_pcur)) {
+		/* Still within the page, no page switch is needed. */
+		return(DB_SUCCESS);
+	} else if (trx_is_interrupted(m_trx)) {
+		/* Check after every page because the check
+		is expensive. */
+		return(DB_INTERRUPTED);
+	}
+
+	/* Remember the position so that it can be restored in the
+	new mini-transaction. */
+	btr_pcur_store_position(&m_pcur, &m_mtr);
+
+	mtr_commit(&m_mtr);
+
+	mtr_start(&m_mtr);
+
+	mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+
+	if (!btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr)) {
+
+		return(DB_END_OF_INDEX);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**
+Restore the stored persistent cursor position with the B-tree cursor in
+BTR_MODIFY_TREE mode, because the tree structure may be changed during a
+pessimistic delete, then delete the record under the cursor and commit
+the mini-transaction. The cursor position must have been stored before
+this is called; this function does not store it. */
+void
+IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW
+{
+	dberr_t	err;
+
+	btr_pcur_restore_position(BTR_MODIFY_TREE, &m_pcur, &m_mtr);
+
+	/* Debug check: the record under the cursor must be delete-marked. */
+	ut_ad(rec_get_deleted_flag(
+			btr_pcur_get_rec(&m_pcur),
+			dict_table_is_comp(m_index->table)));
+
+	btr_cur_pessimistic_delete(
+		&err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, RB_NONE, &m_mtr);
+
+	/* Assert that the delete succeeded. */
+	ut_a(err == DB_SUCCESS);
+
+	/* Commit the mini-transaction. No new mini-transaction is started
+	here and the cursor is not reopened in BTR_MODIFY_LEAF mode by this
+	function. */
+	mtr_commit(&m_mtr);
+}
+
+/**
+Purge the delete-marked record under the cursor. The cursor position is
+stored, the record is removed by purge_pessimistic_delete() (which commits
+the mini-transaction), and then a new mini-transaction is started with
+redo logging disabled and the cursor is restored in BTR_MODIFY_LEAF
+mode. */
+void
+IndexPurge::purge() UNIV_NOTHROW
+{
+	/* Remember where we are before the cursor is reopened. */
+	btr_pcur_store_position(&m_pcur, &m_mtr);
+
+	purge_pessimistic_delete();
+
+	/* The mini-transaction was committed above: start a fresh one. */
+	mtr_start(&m_mtr);
+
+	mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+}
+
+/**
+Constructor. Starts with the first index of the config as the current
+index, records the current log sequence number, prepares the record
+offsets array and looks up the clustered index of the table.
+* @param cfg - config of table being imported.
+* @param trx - transaction covering the import */
+PageConverter::PageConverter(
+	row_import*	cfg,
+	trx_t*		trx)
+	:
+	AbstractCallback(trx),
+	m_cfg(cfg),
+	m_page_zip_ptr(0),
+	m_heap(0) UNIV_NOTHROW
+{
+	/* The current index starts out as the first one in the config. */
+	m_index = m_cfg->m_indexes;
+
+	/* The LSN captured here is the one written to the pages. */
+	m_current_lsn = log_get_lsn();
+	ut_a(m_current_lsn > 0);
+
+	/* Use the embedded offsets array until a larger one is needed. */
+	m_offsets = m_offsets_;
+	rec_offs_init(m_offsets_);
+
+	m_cluster_index = dict_table_get_first_index(m_cfg->m_table);
+}
+
+/**
+Write the space id of the tablespace being imported into the BLOB
+reference of a single externally stored column.
+@param rec - record to update
+@param offsets - column offsets for the record
+@param i - column ordinal value
+@return DB_SUCCESS, or DB_CORRUPTION if the field is shorter than a
+BLOB reference */
+dberr_t
+PageConverter::adjust_cluster_index_blob_column(
+	rec_t*		rec,
+	const ulint*	offsets,
+	ulint		i) UNIV_NOTHROW
+{
+	ulint	len;
+	byte*	field = rec_get_nth_field(rec, offsets, i, &len);
+
+	DBUG_EXECUTE_IF("ib_import_trigger_corruption_2",
+			len = BTR_EXTERN_FIELD_REF_SIZE - 1;);
+
+	if (len >= BTR_EXTERN_FIELD_REF_SIZE) {
+
+		/* Step to the space id inside the reference that sits
+		at the end of the field. */
+
+		field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len;
+
+		if (!is_compressed_table()) {
+			mlog_write_ulint(
+				field, get_space_id(), MLOG_4BYTES, 0);
+
+			return(DB_SUCCESS);
+		}
+
+		mach_write_to_4(field, get_space_id());
+
+		page_zip_write_blob_ptr(
+			m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0);
+
+		return(DB_SUCCESS);
+	}
+
+	/* The field cannot hold a BLOB reference: report it. */
+
+	char	index_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(
+		index_name, sizeof(index_name),
+		m_cluster_index->name, TRUE);
+
+	ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+		ER_INNODB_INDEX_CORRUPT,
+		"Externally stored column(%lu) has a reference "
+		"length of %lu in the cluster index %s",
+		(ulong) i, (ulong) len, index_name);
+
+	return(DB_CORRUPTION);
+}
+
+/**
+Visit every column of a clustered index record and adjust the BLOB
+reference of each one that is stored externally.
+@param rec - record to update
+@param offsets - column offsets for the record
+@return DB_SUCCESS or the first error code encountered */
+dberr_t
+PageConverter::adjust_cluster_index_blob_columns(
+	rec_t*		rec,
+	const ulint*	offsets) UNIV_NOTHROW
+{
+	ut_ad(rec_offs_any_extern(offsets));
+
+	const ulint	n_fields = rec_offs_n_fields(offsets);
+
+	for (ulint col = 0; col < n_fields; ++col) {
+
+		/* Columns stored inline need no adjustment. */
+
+		if (!rec_offs_nth_extern(offsets, col)) {
+			continue;
+		}
+
+		const dberr_t	err = adjust_cluster_index_blob_column(
+			rec, offsets, col);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**
+Adjust the BLOB references of a clustered index record, if it has any
+externally stored column. Records without external columns are left
+untouched.
+@param rec - record to update
+@param offsets - column offsets for the record
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::adjust_cluster_index_blob_ref(
+	rec_t*		rec,
+	const ulint*	offsets) UNIV_NOTHROW
+{
+	if (!rec_offs_any_extern(offsets)) {
+		return(DB_SUCCESS);
+	}
+
+	return(adjust_cluster_index_blob_columns(rec, offsets));
+}
+
+/**
+Try to remove the delete-marked record under the record iterator and
+update the purge statistics of the current index accordingly.
+Note that the member m_offsets is what is handed to the iterator; the
+offsets argument itself is not read.
+@param offsets - current row offsets.
+@return true if purge succeeded */
+bool
+PageConverter::purge(const ulint* offsets) UNIV_NOTHROW
+{
+	/* We can't have a page that is empty and not root. */
+
+	if (!m_rec_iter.remove(
+		m_index->m_srv_index, m_page_zip_ptr, m_offsets)) {
+
+		++m_index->m_stats.m_n_purge_failed;
+
+		return(false);
+	}
+
+	++m_index->m_stats.m_n_purged;
+
+	return(true);
+}
+
+/**
+Adjust the BLOB references of a clustered index record and, if that
+succeeded, rewrite its system fields via row_upd_rec_sys_fields().
+The index and deleted arguments are not read, and the system fields are
+written using the members m_cluster_index and m_offsets.
+@param index - index the record belongs to (unused)
+@param rec - record to update
+@param offsets - column offsets for the record
+@param deleted - true if row is delete marked (unused)
+@return DB_SUCCESS or error code. */
+dberr_t
+PageConverter::adjust_cluster_record(
+	const dict_index_t*	index,
+	rec_t*			rec,
+	const ulint*		offsets,
+	bool			deleted) UNIV_NOTHROW
+{
+	const dberr_t	err = adjust_cluster_index_blob_ref(rec, offsets);
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	/* Reset DB_TRX_ID and DB_ROLL_PTR.  Normally, these fields
+	are only written in conjunction with other changes to the
+	record. */
+
+	row_upd_rec_sys_fields(
+		rec, m_page_zip_ptr, m_cluster_index, m_offsets, m_trx, 0);
+
+	return(DB_SUCCESS);
+}
+
+/**
+Iterate over the records of an index page. Clustered index records get
+their BLOB references and system fields adjusted; delete-marked records
+are purged optimistically where possible. The per-index statistics
+(m_n_rows, m_n_deleted and, through purge(), the purge counters) are
+updated along the way.
+@param block - block to update
+@retval DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_records(
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	ibool	comp = dict_table_is_comp(m_cfg->m_table);
+	bool	clust_index = m_index->m_srv_index == m_cluster_index;
+
+	/* This will also position the cursor on the first user record. */
+
+	m_rec_iter.open(block);
+
+	while (!m_rec_iter.end()) {
+
+		rec_t*	rec = m_rec_iter.current();
+
+		/* FIXME: Move out of the loop */
+
+		/* Stop at the first node pointer record: the remaining
+		processing is not applied to such records. */
+		if (rec_get_status(rec) == REC_STATUS_NODE_PTR) {
+			break;
+		}
+
+		ibool	deleted = rec_get_deleted_flag(rec, comp);
+
+		/* For the clustered index we have to adjust the BLOB
+		reference and the system fields irrespective of the
+		delete marked flag. The adjustment of delete marked
+		cluster records is required for purge to work later. */
+
+		/* The offsets are only computed when they will be used,
+		i.e. for a purge attempt or a clustered index adjustment. */
+		if (deleted || clust_index) {
+			m_offsets = rec_get_offsets(
+				rec, m_index->m_srv_index, m_offsets,
+				ULINT_UNDEFINED, &m_heap);
+		}
+
+		if (clust_index) {
+
+			dberr_t err = adjust_cluster_record(
+				m_index->m_srv_index, rec, m_offsets,
+				deleted);
+
+			if (err != DB_SUCCESS) {
+				return(err);
+			}
+		}
+
+		/* If it is a delete marked record then try an
+		optimistic delete. */
+
+		if (deleted) {
+			/* A successful purge will move the cursor to the
+			next record. */
+
+			if (!purge(m_offsets)) {
+				m_rec_iter.next();
+			}
+
+			/* Counted whether or not the purge succeeded. */
+			++m_index->m_stats.m_n_deleted;
+		} else {
+			++m_index->m_stats.m_n_rows;
+			m_rec_iter.next();
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**
+Update the space, index id, trx id.
+Free pages are skipped. When the index id stored in the page differs
+from the current index, the current index is switched to the one found
+by find_index(); if none is found the page is reported as corrupt.
+@param block - block holding the index page
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_index_page(
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	index_id_t	id;
+	buf_frame_t*	page = block->frame;
+
+	if (is_free(buf_block_get_page_no(block))) {
+		return(DB_SUCCESS);
+	} else if ((id = btr_page_get_index_id(page)) != m_index->m_id) {
+
+		row_index_t*	index = find_index(id);
+
+		if (index == 0) {
+			/* NOTE(review): m_index is left as 0 here, while the
+			condition above dereferences m_index; this relies on
+			no further index page being processed after
+			DB_CORRUPTION is returned - confirm with the caller. */
+			m_index = 0;
+			return(DB_CORRUPTION);
+		}
+
+		/* Update current index */
+		m_index = index;
+	}
+
+	/* If the .cfg file is missing and there is an index mismatch
+	then ignore the error. */
+	/* NOTE(review): the m_index == 0 half of this test cannot be true
+	when reached from the branch above, which returns early. */
+	if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) {
+		return(DB_SUCCESS);
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!is_compressed_table()
+	     || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* This has to be written to uncompressed index header. Set it to
+	the current index id. */
+	btr_page_set_index_id(
+		page, m_page_zip_ptr, m_index->m_srv_index->id, 0);
+
+	/* Stamp the page with the id of the importing transaction. */
+	page_set_max_trx_id(block, m_page_zip_ptr, m_trx->id, 0);
+
+	if (page_is_empty(block->frame)) {
+
+		/* Only a root page can be empty. */
+		if (!is_root_page(block->frame)) {
+			// TODO: We should relax this and skip secondary
+			// indexes. Mark them as corrupt because they can
+			// always be rebuilt.
+			return(DB_CORRUPTION);
+		}
+
+		return(DB_SUCCESS);
+	}
+
+	return(update_records(block));
+}
+
+/**
+Check the space id and the flags stored in the tablespace header page,
+then stamp the page with the current LSN and the space id of the
+tablespace being imported.
+@param block - block read from file, not from the buffer pool.
+@retval DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_header(
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	/* A header space id of 0 is rejected; ULINT_UNDEFINED only
+	produces a warning. */
+
+	const ulint	header_space_id = fsp_header_get_space_id(
+		get_frame(block));
+
+	if (header_space_id == 0) {
+		return(DB_CORRUPTION);
+	} else if (header_space_id == ULINT_UNDEFINED) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Space id check in the header failed "
+			"- ignored");
+	}
+
+	const ulint	flags = fsp_header_get_flags(get_frame(block));
+
+	if (fsp_flags_is_valid(flags)) {
+
+		mach_write_to_8(
+			get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN,
+			m_current_lsn);
+
+		/* Space id in the tablespace header, page 0. */
+		mach_write_to_4(
+			get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID,
+			get_space_id());
+
+		/* Space id that is present on every page. */
+		mach_write_to_4(
+			get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			get_space_id());
+
+		return(DB_SUCCESS);
+	}
+
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"Unsupported tablespace format %lu",
+		(ulong) flags);
+
+	return(DB_UNSUPPORTED);
+}
+
+/**
+Update the page, set the space id, max trx id and index id.
+The work done depends on the page type read from the page header:
+the tablespace header goes through update_header(), B-tree pages are
+decompressed if needed and go through update_index_page(), and the
+remaining known page types only get the space id rewritten.
+@param block - block read from file
+@param page_type - out: type of the page as stored in its header
+@retval DB_SUCCESS or error code; DB_CORRUPTION for a system tablespace
+page, a page that fails to decompress or an unknown page type */
+dberr_t
+PageConverter::update_page(
+	buf_block_t*	block,
+	ulint&		page_type) UNIV_NOTHROW
+{
+	dberr_t		err = DB_SUCCESS;
+
+	switch (page_type = fil_page_get_type(get_frame(block))) {
+	case FIL_PAGE_TYPE_FSP_HDR:
+		/* Work directly on the uncompressed page headers. */
+		ut_a(buf_block_get_page_no(block) == 0);
+		return(update_header(block));
+
+	case FIL_PAGE_INDEX:
+		/* We need to decompress the contents into block->frame
+		before we can do any thing with Btree pages. */
+
+		if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) {
+			return(DB_CORRUPTION);
+		}
+
+		/* This is on every page in the tablespace. */
+		mach_write_to_4(
+			get_frame(block)
+			+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+		/* Only update the Btree nodes. */
+		return(update_index_page(block));
+
+	case FIL_PAGE_TYPE_SYS:
+		/* This is page 0 in the system tablespace. */
+		return(DB_CORRUPTION);
+
+	case FIL_PAGE_TYPE_XDES:
+		err = set_current_xdes(
+			buf_block_get_page_no(block), get_frame(block));
+		/* Fall through: the space id is rewritten on this page
+		too, and err is returned from the shared code below. */
+	case FIL_PAGE_INODE:
+	case FIL_PAGE_TYPE_TRX_SYS:
+	case FIL_PAGE_IBUF_FREE_LIST:
+	case FIL_PAGE_TYPE_ALLOCATED:
+	case FIL_PAGE_IBUF_BITMAP:
+	case FIL_PAGE_TYPE_BLOB:
+	case FIL_PAGE_TYPE_ZBLOB:
+	case FIL_PAGE_TYPE_ZBLOB2:
+
+		/* Work directly on the uncompressed page headers. */
+		/* This is on every page in the tablespace. */
+		mach_write_to_4(
+			get_frame(block)
+			+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+		return(err);
+	}
+
+	/* page_type is a ulint: cast it so that the argument matches the
+	%lu conversion, as done for the other messages in this file. */
+	ib_logf(IB_LOG_LEVEL_WARN, "Unknown page type (%lu)",
+		(ulong) page_type);
+
+	return(DB_CORRUPTION);
+}
+
+/**
+Classify a page read from the file as usable, all-zero or corrupted.
+@param offset - physical offset within file.
+@param block - block holding the page read from file.
+@return status */
+PageConverter::import_page_status_t
+PageConverter::validate(
+	os_offset_t	offset,
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	buf_frame_t*	page = get_frame(block);
+
+	/* The LSN check in buf_page_is_corrupted() is disabled. */
+
+	if (buf_page_is_corrupted(false, page, get_zip_size())) {
+		return(IMPORT_PAGE_STATUS_CORRUPTED);
+	}
+
+	/* The page number must correspond to the offset in the file,
+	unless it is zero. */
+
+	if (page_get_page_no(page) != offset / m_page_size
+	    && page_get_page_no(page) != 0) {
+
+		return(IMPORT_PAGE_STATUS_CORRUPTED);
+	}
+
+	if (!(offset > 0 && page_get_page_no(page) == 0)) {
+		return(IMPORT_PAGE_STATUS_OK);
+	}
+
+	/* A page number of zero at a non-zero offset is only acceptable
+	when the entire page consists of zeroes. */
+
+	const byte*	end = page + m_page_size;
+
+	for (const byte* cur = page; cur != end; ++cur) {
+
+		if (*cur && !trigger_corruption()) {
+			return(IMPORT_PAGE_STATUS_CORRUPTED);
+		}
+	}
+
+	/* The page is all zero: do nothing. */
+	return(IMPORT_PAGE_STATUS_ALL_ZERO);
+}
+
+/**
+Called for every page in the tablespace. If the page was not
+updated then its state must be set to BUF_PAGE_NOT_USED.
+The page is validated first; usable pages are converted by update_page()
+and then prepared for writing, all-zero pages are left alone and
+corrupted pages abort the import.
+@param offset - physical offset within the file
+@param block - block read from file, note it is not from the buffer pool
+@retval DB_SUCCESS or error code. */
+dberr_t
+PageConverter::operator() (
+	os_offset_t	offset,
+	buf_block_t*	block) UNIV_NOTHROW
+{
+	dberr_t		err = periodic_check();
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	if (!is_compressed_table()) {
+		ut_ad(m_page_zip_ptr == 0);
+	} else {
+		m_page_zip_ptr = &block->page.zip;
+	}
+
+	ulint		page_type;
+
+	switch(validate(offset, block)) {
+	case IMPORT_PAGE_STATUS_CORRUPTED:
+
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"%s: Page %lu at offset " UINT64PF " looks corrupted.",
+			m_filepath, (ulong) (offset / m_page_size), offset);
+
+		return(DB_CORRUPTION);
+
+	case IMPORT_PAGE_STATUS_ALL_ZERO:
+		/* The page is all zero: leave it as is. */
+		break;
+
+	case IMPORT_PAGE_STATUS_OK:
+
+		/* Compressed pages are decompressed by update_page()
+		before they are worked on. */
+
+		err = update_page(block, page_type);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		/* Note: For compressed pages this function will write to the
+		zip descriptor and for uncompressed pages it will write to
+		page (ie. the block->frame). Therefore the caller should write
+		out the descriptor contents and not block->frame for compressed
+		pages. */
+
+		if (!is_compressed_table()) {
+
+			buf_flush_init_for_writing(
+				block->frame, 0, m_current_lsn);
+
+		} else if (page_type == FIL_PAGE_INDEX) {
+
+			buf_flush_init_for_writing(
+				block->page.zip.data, m_page_zip_ptr,
+				m_current_lsn);
+		} else {
+			/* Calculate and update the checksum of non-btree
+			pages for compressed tables explicitly here. */
+
+			buf_flush_update_zip_checksum(
+				get_frame(block), get_zip_size(),
+				m_current_lsn);
+		}
+
+		break;
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Clean up after an import tablespace failure. The dictionary latches are
+acquired on behalf of the transaction if it does not hold them yet, the
+in-memory root page numbers of all indexes are reset, the table is
+flagged as having a missing .ibd file and the tablespace is closed. */
+static	__attribute__((nonnull))
+void
+row_import_discard_changes(
+/*=======================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt from handler */
+	trx_t*		trx,		/*!< in/out: transaction for import */
+	dberr_t		err)		/*!< in: error code */
+{
+	dict_table_t*	table = prebuilt->table;
+
+	ut_a(err != DB_SUCCESS);
+
+	prebuilt->trx->error_info = NULL;
+
+	char	table_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(
+		table_name, sizeof(table_name), table->name, FALSE);
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Discarding tablespace of table %s: %s",
+		table_name, ut_strerr(err));
+
+	/* Take the data dictionary latch unless it is already held
+	in exclusive mode. */
+
+	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+		ut_a(trx->dict_operation_lock_mode == 0);
+		row_mysql_lock_data_dictionary(trx);
+	}
+
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Since we update the index root page numbers on disk after
+	we've done a successful import. The table will not be loadable.
+	However, we need to ensure that the in memory root page numbers
+	are reset to "NULL". */
+
+	dict_index_t*	index = UT_LIST_GET_FIRST(table->indexes);
+
+	while (index != 0) {
+
+		index->page = FIL_NULL;
+		index->space = FIL_NULL;
+
+		index = UT_LIST_GET_NEXT(indexes, index);
+	}
+
+	table->ibd_file_missing = TRUE;
+
+	fil_close_tablespace(trx, table->space);
+}
+
+/*****************************************************************//**
+Clean up after import tablespace. On failure the changes are discarded
+first. The import transaction is then committed, the data dictionary is
+unlocked, the transaction object is freed and a checkpoint is made.
+@return the error code that was passed in */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_cleanup(
+/*===============*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt from handler */
+	trx_t*		trx,		/*!< in/out: transaction for import */
+	dberr_t		err)		/*!< in: error code */
+{
+	/* The import must run in its own transaction, not in the one
+	that belongs to the handler. */
+	ut_a(prebuilt->trx != trx);
+
+	if (err != DB_SUCCESS) {
+		row_import_discard_changes(prebuilt, trx, err);
+	}
+
+	/* The dictionary latch must be held at this point; on the
+	failure path row_import_discard_changes() has ensured that. */
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	/* trx must not be used after this point. */
+	trx_free_for_mysql(trx);
+
+	prebuilt->trx->op_info = "";
+
+	DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
+
+	log_make_checkpoint_at(LSN_MAX, TRUE);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Report error during tablespace import, unless the transaction was
+interrupted, and then clean up.
+@return the value returned by row_import_cleanup() */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_error(
+/*=============*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt from handler */
+	trx_t*		trx,		/*!< in/out: transaction for import */
+	dberr_t		err)		/*!< in: error code */
+{
+	if (trx_is_interrupted(trx)) {
+		/* Nothing is reported for an interrupted import. */
+		return(row_import_cleanup(prebuilt, trx, err));
+	}
+
+	char	table_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(
+		table_name, sizeof(table_name),
+		prebuilt->table->name, FALSE);
+
+	ib_senderrf(
+		trx->mysql_thd, IB_LOG_LEVEL_WARN,
+		ER_INNODB_IMPORT_ERROR,
+		table_name, (ulong) err, ut_strerr(err));
+
+	return(row_import_cleanup(prebuilt, trx, err));
+}
+
+/*****************************************************************//**
+Adjust the root page index node and leaf node segment headers, update
+with the new space id. For all the table's secondary indexes.
+An index whose root page cannot be adjusted, or whose row count after
+purging differs from the clustered index row count in the config, is
+flagged DICT_CORRUPT and a warning is sent to the client; the import
+itself carries on so that the data can be recovered.
+@return error code */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_adjust_root_pages_of_secondary_indexes(
+/*==============================================*/
+	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt from
+						handler; not read by this
+						function */
+	trx_t*			trx,		/*!< in: transaction used for
+						the import */
+	dict_table_t*		table,		/*!< in: table the indexes
+						belong to */
+	const row_import&	cfg)		/*!< Import context */
+{
+	dict_index_t*		index;
+	ulint			n_rows_in_table;
+	dberr_t			err = DB_SUCCESS;
+
+	/* Skip the clustered index. */
+	index = dict_table_get_first_index(table);
+
+	/* The row count of the clustered index is the reference that
+	every secondary index is compared against below. */
+	n_rows_in_table = cfg.get_n_rows(index->name);
+
+	DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure",
+			n_rows_in_table++;);
+
+	/* Adjust the root pages of the secondary indexes only. */
+	while ((index = dict_table_get_next_index(index)) != NULL) {
+		char		index_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			index_name, sizeof(index_name), index->name, TRUE);
+
+		ut_a(!dict_index_is_clust(index));
+
+		if (!(index->type & DICT_CORRUPT)
+		    && index->space != FIL_NULL
+		    && index->page != FIL_NULL) {
+
+			/* Update the Btree segment headers for index node and
+			leaf nodes in the root page. Set the new space id. */
+
+			err = btr_root_adjust_on_import(index);
+		} else {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Skip adjustment of root pages for "
+				"index %s.", index->name);
+
+			err = DB_CORRUPTION;
+		}
+
+		if (err != DB_SUCCESS) {
+
+			/* NOTE(review): this looks unreachable given the
+			ut_a(!dict_index_is_clust(index)) above - confirm. */
+			if (index->type & DICT_CLUSTERED) {
+				break;
+			}
+
+			ib_errf(trx->mysql_thd,
+				IB_LOG_LEVEL_WARN,
+				ER_INNODB_INDEX_CORRUPT,
+				"Index '%s' not found or corrupt, "
+				"you should recreate this index.",
+				index_name);
+
+			/* Do not bail out, so that the data
+			can be recovered. */
+
+			err = DB_SUCCESS;
+			index->type |= DICT_CORRUPT;
+			continue;
+		}
+
+		/* If we failed to purge any records in the index then
+		do it the hard way.
+
+		TODO: We can do this in the first pass by generating UNDO log
+		records for the failed rows. */
+
+		if (!cfg.requires_purge(index->name)) {
+			continue;
+		}
+
+		IndexPurge   purge(trx, index);
+
+		trx->op_info = "secondary: purge delete marked records";
+
+		err = purge.garbage_collect();
+
+		trx->op_info = "";
+
+		if (err != DB_SUCCESS) {
+			/* A failed purge ends the loop and is returned
+			to the caller. */
+			break;
+		} else if (purge.get_n_rows() != n_rows_in_table) {
+
+			ib_errf(trx->mysql_thd,
+				IB_LOG_LEVEL_WARN,
+				ER_INNODB_INDEX_CORRUPT,
+				"Index '%s' contains %lu entries, "
+				"should be %lu, you should recreate "
+				"this index.", index_name,
+				(ulong) purge.get_n_rows(),
+				(ulong) n_rows_in_table);
+
+			index->type |= DICT_CORRUPT;
+
+			/* Do not bail out, so that the data
+			can be recovered. */
+
+			err = DB_SUCCESS;
+                }
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID).
+The DB_ROW_ID is read from the record found at the high end of the
+clustered index; if it is not smaller than dict_sys->row_id, the latter
+is set to one more than it and flushed.
+@return DB_SUCCESS, or DB_CORRUPTION if the DB_ROW_ID field does not have
+the expected length */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_set_sys_max_row_id(
+/*==========================*/
+	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt from
+						handler */
+	const dict_table_t*	table)		/*!< in: table to import */
+{
+	dberr_t			err;
+	const rec_t*		rec;
+	mtr_t			mtr;
+	btr_pcur_t		pcur;
+	row_id_t		row_id	= 0;
+	dict_index_t*		index;
+
+	index = dict_table_get_first_index(table);
+	ut_a(dict_index_is_clust(index));
+
+	mtr_start(&mtr);
+
+	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+	btr_pcur_open_at_index_side(
+		false,		// High end
+		index,
+		BTR_SEARCH_LEAF,
+		&pcur,
+		true,		// Init cursor
+		0,		// Leaf level
+		&mtr);
+
+	/* Step back from the end of the page onto the last record. */
+	btr_pcur_move_to_prev_on_page(&pcur);
+	rec = btr_pcur_get_rec(&pcur);
+
+	/* Check for empty table. */
+	if (!page_rec_is_infimum(rec)) {
+		ulint		len;
+		const byte*	field;
+		mem_heap_t*	heap = NULL;
+		ulint		offsets_[1 + REC_OFFS_HEADER_SIZE];
+		ulint*		offsets;
+
+		rec_offs_init(offsets_);
+
+		/* heap is passed so that rec_get_offsets() can allocate;
+		it is freed below if it was used. */
+		offsets = rec_get_offsets(
+			rec, index, offsets_, ULINT_UNDEFINED, &heap);
+
+		field = rec_get_nth_field(
+			rec, offsets,
+			dict_index_get_sys_col_pos(index, DATA_ROW_ID),
+			&len);
+
+		if (len == DATA_ROW_ID_LEN) {
+			row_id = mach_read_from_6(field);
+			err = DB_SUCCESS;
+		} else {
+			err = DB_CORRUPTION;
+		}
+
+		if (heap != NULL) {
+			mem_heap_free(heap);
+		}
+	} else {
+		/* The table is empty. */
+		err = DB_SUCCESS;
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure",
+			err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		char		index_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			index_name, sizeof(index_name), index->name, TRUE);
+
+		ib_errf(prebuilt->trx->mysql_thd,
+			IB_LOG_LEVEL_WARN,
+			ER_INNODB_INDEX_CORRUPT,
+			"Index '%s' corruption detected, invalid DB_ROW_ID "
+			"in index.", index_name);
+
+		return(err);
+
+	} else if (row_id > 0) {
+
+		/* Update the system row id if the imported index row id is
+		greater than the max system row id. */
+
+		mutex_enter(&dict_sys->mutex);
+
+		if (row_id >= dict_sys->row_id) {
+			dict_sys->row_id = row_id + 1;
+			dict_hdr_flush_row_id();
+		}
+
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read a NUL terminated string from the meta data file. The read only
+succeeds if the terminating NUL is found after exactly max_len - 1
+other bytes; in every other case errno is set to EINVAL.
+@return DB_SUCCESS or DB_IO_ERROR */
+static
+dberr_t
+row_import_cfg_read_string(
+/*=======================*/
+	FILE*		file,		/*!< in/out: File to read from */
+	byte*		ptr,		/*!< out: string to read */
+	ulint		max_len)	/*!< in: maximum length of the output
+					buffer in bytes */
+{
+	DBUG_EXECUTE_IF("ib_import_string_read_error",
+			errno = EINVAL; return(DB_IO_ERROR););
+
+	ulint		n_read = 0;
+
+	for (;;) {
+		if (feof(file)) {
+			break;
+		}
+
+		const int	c = fgetc(file);
+
+		if (c == EOF) {
+			break;
+		}
+
+		if (c == 0) {
+			/* max_len includes the NUL byte */
+			if (n_read == max_len - 1) {
+				ptr[n_read] = 0;
+				return(DB_SUCCESS);
+			}
+
+			/* The string ended too early. */
+			break;
+		}
+
+		if (n_read >= max_len) {
+			/* The string does not fit in the buffer. */
+			break;
+		}
+
+		ptr[n_read++] = c;
+	}
+
+	errno = EINVAL;
+
+	return(DB_IO_ERROR);
+}
+
+/*********************************************************************//**
+Read the meta data (index user fields) from the config file. For each of
+the index->m_n_fields fields, three 4-byte values are read (prefix
+length, fixed length and name length) followed by the field name.
+The allocated arrays are stored in index->m_fields and are not freed
+here on error.
+@return DB_SUCCESS or error code. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_cfg_read_index_fields(
+/*=============================*/
+	FILE*			file,	/*!< in: file to read from */
+	THD*			thd,	/*!< in/out: session */
+	row_index_t*		index,	/*!< Index being read in */
+	row_import*		cfg)	/*!< in/out: meta-data read; not
+					used by this function */
+{
+	byte			row[sizeof(ib_uint32_t) * 3];
+	ulint			n_fields = index->m_n_fields;
+
+	index->m_fields = new(std::nothrow) dict_field_t[n_fields];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_4",
+			delete [] index->m_fields; index->m_fields = 0;);
+
+	if (index->m_fields == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	dict_field_t*	field = index->m_fields;
+
+	memset(field, 0x0, sizeof(*field) * n_fields);
+
+	for (ulint i = 0; i < n_fields; ++i, ++field) {
+		byte*		ptr = row;
+
+		/* Trigger EOF */
+		DBUG_EXECUTE_IF("ib_import_io_read_error_1",
+				(void) fseek(file, 0L, SEEK_END););
+
+		if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno),
+				"while reading index fields.");
+
+			return(DB_IO_ERROR);
+		}
+
+		field->prefix_len = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		field->fixed_len = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		/* Include the NUL byte in the length. */
+		/* NOTE(review): len comes straight from the file and is
+		not range checked before the allocation below. */
+		ulint	len = mach_read_from_4(ptr);
+
+		byte*	name = new(std::nothrow) byte[len];
+
+		/* Trigger OOM */
+		DBUG_EXECUTE_IF("ib_import_OOM_5", delete [] name; name = 0;);
+
+		if (name == 0) {
+			return(DB_OUT_OF_MEMORY);
+		}
+
+		/* The field takes over the buffer before it is filled in. */
+		field->name = reinterpret_cast<const char*>(name);
+
+		dberr_t	err = row_import_cfg_read_string(file, name, len);
+
+		if (err != DB_SUCCESS) {
+
+			/* NOTE(review): the message says "table name"
+			although an index field name is being read. */
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno),
+				"while parsing table name.");
+
+			return(err);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the meta-data of every index from the config file and store it in
+cfg->m_indexes.
+Row format: [index id (8 bytes), then nine 4-byte values: space, root
+page no, type, trx id offset, n user defined cols, n uniq, n nullable,
+n fields, len of name], followed by the NUL terminated index name and
+the index fields.
+The caller must have set cfg->m_n_indexes to a value in (0, 1024).
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_index_data(
+/*=======================*/
+	FILE*		file,		/*!< in: File to read from */
+	THD*		thd,		/*!< in: session */
+	row_import*	cfg)		/*!< in/out: meta-data read */
+{
+	byte*		ptr;
+	row_index_t*	cfg_index;
+	byte		row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9];
+
+	/* FIXME: What is the max value? */
+	ut_a(cfg->m_n_indexes > 0);
+	ut_a(cfg->m_n_indexes < 1024);
+
+	cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_6",
+			delete [] cfg->m_indexes; cfg->m_indexes = 0;);
+
+	if (cfg->m_indexes == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+	cfg_index = cfg->m_indexes;
+
+	for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) {
+		/* Trigger EOF */
+		DBUG_EXECUTE_IF("ib_import_io_read_error_2",
+				(void) fseek(file, 0L, SEEK_END););
+
+		/* Read the index data. */
+		size_t	n_bytes = fread(row, 1, sizeof(row), file);
+
+		/* Trigger EOF */
+		DBUG_EXECUTE_IF("ib_import_io_read_error",
+				(void) fseek(file, 0L, SEEK_END););
+
+		if (n_bytes != sizeof(row)) {
+			char	msg[BUFSIZ];
+
+			ut_snprintf(msg, sizeof(msg),
+				    "while reading index meta-data, expected "
+				    "to read %lu bytes but read only %lu "
+				    "bytes",
+				    (ulong) sizeof(row), (ulong) n_bytes);
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno), msg);
+
+			ib_logf(IB_LOG_LEVEL_ERROR, "IO Error: %s", msg);
+
+			return(DB_IO_ERROR);
+		}
+
+		ptr = row;
+
+		cfg_index->m_id = mach_read_from_8(ptr);
+		ptr += sizeof(index_id_t);
+
+		cfg_index->m_space = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_page_no = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_type = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		/* Read the value back after storing it: if the stored
+		value differs, it did not fit in m_trx_id_offset. */
+		cfg_index->m_trx_id_offset = mach_read_from_4(ptr);
+		if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) {
+			ut_ad(0);
+			/* Overflow. Pretend that the clustered index
+			has a variable-length PRIMARY KEY. */
+			cfg_index->m_trx_id_offset = 0;
+		}
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_n_uniq = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_n_nullable = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		cfg_index->m_n_fields = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		/* The NUL byte is included in the name length. */
+		ulint	len = mach_read_from_4(ptr);
+
+		if (len > OS_FILE_MAX_PATH) {
+			/* len is a ulint: cast it so that the argument
+			matches the %lu conversion. */
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				ER_INNODB_INDEX_CORRUPT,
+				"Index name length (%lu) is too long, "
+				"the meta-data is corrupt", (ulong) len);
+
+			return(DB_CORRUPTION);
+		}
+
+		cfg_index->m_name = new(std::nothrow) byte[len];
+
+		/* Trigger OOM */
+		DBUG_EXECUTE_IF("ib_import_OOM_7",
+				delete [] cfg_index->m_name;
+				cfg_index->m_name = 0;);
+
+		if (cfg_index->m_name == 0) {
+			return(DB_OUT_OF_MEMORY);
+		}
+
+		dberr_t	err;
+
+		err = row_import_cfg_read_string(file, cfg_index->m_name, len);
+
+		if (err != DB_SUCCESS) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno),
+				"while parsing index name.");
+
+			return(err);
+		}
+
+		/* The fields of this index follow its name in the file. */
+		err = row_import_cfg_read_index_fields(
+			file, thd, cfg_index, cfg);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the number of indexes from the config file, validate it and then
+read the meta-data of the indexes themselves.
+@return DB_SUCCESS or error code; DB_CORRUPTION if the number of indexes
+is 0 or too high */
+static
+dberr_t
+row_import_read_indexes(
+/*====================*/
+	FILE*		file,		/*!< in: File to read from */
+	THD*		thd,		/*!< in: session */
+	row_import*	cfg)		/*!< in/out: meta-data read */
+{
+	byte		row[sizeof(ib_uint32_t)];
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_3",
+			(void) fseek(file, 0L, SEEK_END););
+
+	/* Read the number of indexes. */
+	if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading number of indexes.");
+
+		return(DB_IO_ERROR);
+	}
+
+	cfg->m_n_indexes = mach_read_from_4(row);
+
+	if (cfg->m_n_indexes == 0) {
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			"Number of indexes in meta-data file is 0");
+
+		return(DB_CORRUPTION);
+
+	} else if (cfg->m_n_indexes >= 1024) {
+		/* FIXME: What is the upper limit?
+		The bound has to agree with the assertion
+		m_n_indexes < 1024 in row_import_read_index_data(),
+		otherwise a file declaring exactly 1024 indexes would
+		trip that assertion instead of being rejected here. */
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			"Number of indexes in meta-data file is too high: %lu",
+			(ulong) cfg->m_n_indexes);
+		cfg->m_n_indexes = 0;
+
+		return(DB_CORRUPTION);
+	}
+
+	return(row_import_read_index_data(file, thd, cfg));
+}
+
+/*********************************************************************//**
+Read the table column meta-data from the config file: the dict_col_t fields
+and the name of every column. @return DB_SUCCESS or error code. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_columns(
+/*====================*/
+	FILE*			file,	/*!< in: File to read from */
+	THD*			thd,	/*!< in/out: session */
+	row_import*		cfg)	/*!< in/out: meta-data read */
+{
+	dict_col_t*		col;
+	byte			row[sizeof(ib_uint32_t) * 8];
+
+	/* FIXME: What should the upper limit be? */
+	ut_a(cfg->m_n_cols > 0);
+	ut_a(cfg->m_n_cols < 1024);
+
+	cfg->m_cols = new(std::nothrow) dict_col_t[cfg->m_n_cols];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_8",
+			delete [] cfg->m_cols; cfg->m_cols = 0;);
+
+	if (cfg->m_cols == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	cfg->m_col_names = new(std::nothrow) byte* [cfg->m_n_cols];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_9",
+			delete [] cfg->m_col_names; cfg->m_col_names = 0;);
+
+	if (cfg->m_col_names == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+	/* Clear whole elements: sizeof(*array), not the size of the pointer. */
+	memset(cfg->m_cols, 0x0, sizeof(*cfg->m_cols) * cfg->m_n_cols);
+	memset(cfg->m_col_names, 0x0, sizeof(*cfg->m_col_names) * cfg->m_n_cols);
+
+	col = cfg->m_cols;
+
+	for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) {
+		byte*		ptr = row;
+
+		/* Trigger EOF */
+		DBUG_EXECUTE_IF("ib_import_io_read_error_4",
+				(void) fseek(file, 0L, SEEK_END););
+
+		/* One fixed-size record: seven column fields + name length. */
+		if (fread(row, 1,  sizeof(row), file) != sizeof(row)) {
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno),
+				"while reading table column meta-data.");
+
+			return(DB_IO_ERROR);
+		}
+
+		col->prtype = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->mtype = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->len = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->mbminmaxlen = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->ind = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->ord_part = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		col->max_prefix = mach_read_from_4(ptr);
+		ptr += sizeof(ib_uint32_t);
+
+		/* Read in the column name as [len, byte array]. The len
+		includes the NUL byte. */
+
+		ulint		len = mach_read_from_4(ptr);
+
+		/* FIXME: What is the maximum column name length? */
+		if (len == 0 || len > 128) {
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				ER_IO_READ_ERROR,
+				"Column name length %lu, is invalid",
+				(ulong) len);
+
+			return(DB_CORRUPTION);
+		}
+
+		cfg->m_col_names[i] = new(std::nothrow) byte[len];
+
+		/* Trigger OOM */
+		DBUG_EXECUTE_IF("ib_import_OOM_10",
+				delete [] cfg->m_col_names[i];
+				cfg->m_col_names[i] = 0;);
+
+		if (cfg->m_col_names[i] == 0) {
+			return(DB_OUT_OF_MEMORY);
+		}
+
+		dberr_t	err;
+
+		err = row_import_cfg_read_string(
+			file, cfg->m_col_names[i], len);
+
+		if (err != DB_SUCCESS) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+				errno, strerror(errno),
+				"while parsing table column name.");
+
+			return(err);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the contents of a version 1 <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_v1(
+/*===============*/
+	FILE*		file,		/*!< in: File to read from */
+	THD*		thd,		/*!< in: session */
+	row_import*	cfg)		/*!< out: meta data */
+{
+	byte		value[sizeof(ib_uint32_t)];
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_5",
+			(void) fseek(file, 0L, SEEK_END););
+
+	/* Read the length of the hostname where the tablespace was exported. */
+	if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading meta-data export hostname length.");
+
+		return(DB_IO_ERROR);
+	}
+
+	ulint	len = mach_read_from_4(value);
+
+	/* NUL byte is part of name length. */
+	cfg->m_hostname = new(std::nothrow) byte[len];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_1",
+			delete [] cfg->m_hostname; cfg->m_hostname = 0;);
+
+	if (cfg->m_hostname == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	dberr_t	err = row_import_cfg_read_string(file, cfg->m_hostname, len);
+
+	if (err != DB_SUCCESS) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while parsing export hostname.");
+
+		return(err);
+	}
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_6",
+			(void) fseek(file, 0L, SEEK_END););
+
+	/* Read the length of the name of the table that was exported. */
+	if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading meta-data table name length.");
+
+		return(DB_IO_ERROR);
+	}
+
+	len = mach_read_from_4(value);
+
+	/* NUL byte is part of name length. */
+	cfg->m_table_name = new(std::nothrow) byte[len];
+
+	/* Trigger OOM */
+	DBUG_EXECUTE_IF("ib_import_OOM_2",
+			delete [] cfg->m_table_name; cfg->m_table_name = 0;);
+
+	if (cfg->m_table_name == 0) {
+		return(DB_OUT_OF_MEMORY);
+	}
+
+	err = row_import_cfg_read_string(file, cfg->m_table_name, len);
+
+	if (err != DB_SUCCESS) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while parsing table name.");
+
+		return(err);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Importing tablespace for table '%s' that was exported "
+		"from host '%s'", cfg->m_table_name, cfg->m_hostname);
+
+	byte		row[sizeof(ib_uint32_t) * 3];
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_7",
+			(void) fseek(file, 0L, SEEK_END););
+
+	/* Read the autoinc value (8 bytes; row[] is large enough). */
+	if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading autoinc value.");
+
+		return(DB_IO_ERROR);
+	}
+
+	cfg->m_autoinc = mach_read_from_8(row);
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_8",
+			(void) fseek(file, 0L, SEEK_END););
+
+	/* Read the page size, the table flags and the number of columns. */
+	if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading meta-data header.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte*		ptr = row;
+
+	cfg->m_page_size = mach_read_from_4(ptr);
+	ptr += sizeof(ib_uint32_t);
+
+	if (cfg->m_page_size != UNIV_PAGE_SIZE) {
+
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			"Tablespace to be imported has a different "
+			"page size than this server. Server page size "
+			"is %lu, whereas tablespace page size is %lu",
+			(ulong) UNIV_PAGE_SIZE, (ulong) cfg->m_page_size);
+
+		return(DB_ERROR);
+	}
+
+	cfg->m_flags = mach_read_from_4(ptr);
+	ptr += sizeof(ib_uint32_t);
+
+	cfg->m_n_cols = mach_read_from_4(ptr);
+
+	if (!dict_tf_is_valid(cfg->m_flags)) {
+
+		return(DB_CORRUPTION);
+
+	} else if ((err = row_import_read_columns(file, thd, cfg))
+		   != DB_SUCCESS) {
+
+		return(err);
+
+	} else  if ((err = row_import_read_indexes(file, thd, cfg))
+		   != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	ut_a(err == DB_SUCCESS);
+	return(err);
+}
+
+/**
+Read the meta-data version from the .cfg file and parse the rest accordingly.
+@return DB_SUCCESS or error code. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_meta_data(
+/*======================*/
+	dict_table_t*	table,		/*!< in: table (not used here) */
+	FILE*		file,		/*!< in: File to read from */
+	THD*		thd,		/*!< in: session */
+	row_import&	cfg)		/*!< out: contents of the .cfg file */
+{
+	byte		row[sizeof(ib_uint32_t)];
+
+	/* Trigger EOF */
+	DBUG_EXECUTE_IF("ib_import_io_read_error_9",
+			(void) fseek(file, 0L, SEEK_END););
+
+	if (fread(&row, 1, sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			errno, strerror(errno),
+			"while reading meta-data version.");
+
+		return(DB_IO_ERROR);
+	}
+
+	cfg.m_version = mach_read_from_4(row);
+
+	/* Check the version number; only version 1 is understood. */
+	switch (cfg.m_version) {
+	case IB_EXPORT_CFG_VERSION_V1:
+
+		return(row_import_read_v1(file, thd, &cfg));
+	default:
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+			"Unsupported meta-data version number (%lu), "
+			"file ignored", (ulong) cfg.m_version);
+	}
+
+	return(DB_ERROR);
+}
+
+/**
+Read the contents of the <tablename>.cfg file.
+@return DB_SUCCESS or error code; DB_FAIL if the file cannot be opened. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_cfg(
+/*================*/
+	dict_table_t*	table,	/*!< in: table */
+	THD*		thd,	/*!< in: session */
+	row_import&	cfg)	/*!< out: contents of the .cfg file */
+{
+	dberr_t		err;
+	char		name[OS_FILE_MAX_PATH];
+
+	cfg.m_table = table;
+
+	srv_get_meta_data_filename(table, name, sizeof(name));
+
+	FILE*	file = fopen(name, "rb");
+
+	if (file == NULL) {
+		char	msg[BUFSIZ];
+		int	open_errno = errno;	/* save before it is clobbered */
+
+		ut_snprintf(msg, sizeof(msg),
+			    "Error opening '%s', will attempt to import "
+			    "without schema verification", name);
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
+			open_errno, strerror(open_errno), msg);
+
+		cfg.m_missing = true;
+		err = DB_FAIL;
+	} else {
+
+		cfg.m_missing = false;
+
+		err = row_import_read_meta_data(table, file, thd, cfg);
+		fclose(file);
+	}
+
+	return(err);
+}
+
+/*****************************************************************//**
+Update the <space, root page, type> of a table's indexes in SYS_INDEXES
+from the values in the dict_index_t objects of the table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_import_update_index_root(
+/*=========================*/
+	trx_t*			trx,		/*!< in/out: transaction that
+						covers the update */
+	const dict_table_t*	table,		/*!< in: Table for which we want
+						to set the root page_no */
+	bool			reset,		/*!< in: if true then set space
+						and page to FIL_NULL */
+	bool			dict_locked)	/*!< in: Set to true if the
+						caller already owns the
+						dict_sys_t:: mutex. */
+
+{
+	const dict_index_t*	index;
+	que_t*			graph = 0;
+	dberr_t			err = DB_SUCCESS;
+
+	static const char	sql[] = {
+		"PROCEDURE UPDATE_INDEX_ROOT() IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_INDEXES\n"
+		"SET SPACE = :space,\n"
+		"    PAGE_NO = :page,\n"
+		"    TYPE = :type\n"
+		"WHERE TABLE_ID = :table_id AND ID = :index_id;\n"
+		"END;\n"};
+
+	if (!dict_locked) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	for (index = dict_table_get_first_index(table);
+	     index != 0;
+	     index = dict_table_get_next_index(index)) {
+
+		pars_info_t*	info;
+		ib_uint32_t	page;
+		ib_uint32_t	space;
+		ib_uint32_t	type;
+		index_id_t	index_id;
+		table_id_t	table_id;
+
+		info = (graph != 0) ? graph->info : pars_info_create();
+
+		mach_write_to_4(
+			reinterpret_cast<byte*>(&type),
+			index->type);
+
+		mach_write_to_4(
+			reinterpret_cast<byte*>(&page),
+			reset ? FIL_NULL : index->page);
+
+		mach_write_to_4(
+			reinterpret_cast<byte*>(&space),
+			reset ? FIL_NULL : index->space);
+
+		mach_write_to_8(
+			reinterpret_cast<byte*>(&index_id),
+			index->id);
+
+		mach_write_to_8(
+			reinterpret_cast<byte*>(&table_id),
+			table->id);
+
+		/* If we set the corrupt bit during the IMPORT phase then
+		we need to update the system tables. */
+		pars_info_bind_int4_literal(info, "type", &type);
+		pars_info_bind_int4_literal(info, "space", &space);
+		pars_info_bind_int4_literal(info, "page", &page);
+		pars_info_bind_ull_literal(info, "index_id", &index_id);
+		pars_info_bind_ull_literal(info, "table_id", &table_id);
+
+		if (graph == 0) {
+			graph = pars_sql(info, sql);
+			ut_a(graph);
+			graph->trx = trx;
+		}
+
+		que_thr_t*	thr;
+
+		graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+		ut_a(thr = que_fork_start_command(graph));
+
+		que_run_threads(thr);
+
+		DBUG_EXECUTE_IF("ib_import_internal_error",
+				trx->error_state = DB_ERROR;);
+
+		err = trx->error_state;
+
+		if (err != DB_SUCCESS) {
+			char		index_name[MAX_FULL_NAME_LEN + 1];
+
+			innobase_format_name(
+				index_name, sizeof(index_name),
+				index->name, TRUE);
+
+			ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_INTERNAL_ERROR,
+				"While updating the <space, root page "
+				"number> of index %s - %s",
+				index_name, ut_strerr(err));
+
+			break;
+		}
+	}
+
+	que_graph_free(graph);
+
+	if (!dict_locked) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	return(err);
+}
+
+/** Callback arg for row_import_set_discarded(). */
+struct discard_t {
+	ib_uint32_t	flags2;			/*!< Column value, flag updated */
+	bool		state;			/*!< New state of the flag */
+	ulint		n_recs;			/*!< Number of recs processed */
+};
+
+/******************************************************************//**
+Fetch callback that sets or unsets the DISCARDED tablespace flag in
+SYS_TABLES. The flag is stored in the MIX_LEN column.
+@return FALSE if all OK */
+static
+ibool
+row_import_set_discarded(
+/*=====================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in/out: discard_t* */
+{
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	discard_t*	discard = static_cast<discard_t*>(user_arg);
+	dfield_t*	dfield = que_node_get_val(node->select_list);
+	dtype_t*	type = dfield_get_type(dfield);
+	ulint		len = dfield_get_len(dfield);
+
+	ut_a(dtype_get_mtype(type) == DATA_INT);
+	ut_a(len == sizeof(ib_uint32_t));
+
+	ulint	flags2 = mach_read_from_4(
+		static_cast<byte*>(dfield_get_data(dfield)));
+
+	if (discard->state) {
+		flags2 |= DICT_TF2_DISCARDED;
+	} else {
+		flags2 &= ~DICT_TF2_DISCARDED;
+	}
+
+	mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2);
+
+	++discard->n_recs;
+
+	/* There should be at most one matching record. */
+	ut_a(discard->n_recs == 1);
+
+	return(FALSE);
+}
+
+/*****************************************************************//**
+Update the DICT_TF2_DISCARDED flag in SYS_TABLES.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_import_update_discarded_flag(
+/*=============================*/
+	trx_t*		trx,		/*!< in/out: transaction that
+					covers the update */
+	table_id_t	table_id,	/*!< in: ID of the table whose
+					SYS_TABLES.MIX_LEN is updated */
+	bool		discarded,	/*!< in: set MIX_LEN column bit
+					to discarded, if true */
+	bool		dict_locked)	/*!< in: set to true if the
+					caller already owns the
+					dict_sys_t:: mutex. */
+
+{
+	pars_info_t*		info;
+	discard_t		discard;
+
+	static const char	sql[] =
+		"PROCEDURE UPDATE_DISCARDED_FLAG() IS\n"
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS\n"
+		" SELECT MIX_LEN "
+		" FROM SYS_TABLES "
+		" WHERE ID = :table_id FOR UPDATE;"
+		"\n"
+		"BEGIN\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"UPDATE SYS_TABLES"
+		" SET MIX_LEN = :flags2"
+		" WHERE ID = :table_id;\n"
+		"CLOSE c;\n"
+		"END;\n";
+
+	discard.n_recs = 0;
+	discard.state = discarded;
+	discard.flags2 = ULINT32_UNDEFINED;
+
+	info = pars_info_create();
+
+	pars_info_add_ull_literal(info, "table_id", table_id);
+	pars_info_bind_int4_literal(info, "flags2", &discard.flags2);
+
+	pars_info_bind_function(
+		info, "my_func", row_import_set_discarded, &discard);
+
+	dberr_t	err = que_eval_sql(info, sql, !dict_locked, trx);
+
+	ut_a(discard.n_recs == 1);
+	ut_a(discard.flags2 != ULINT32_UNDEFINED);
+
+	return(err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_import_for_mysql(
+/*=================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL */
+{
+	dberr_t		err;
+	trx_t*		trx;
+	ib_uint64_t	autoinc = 0;
+	char		table_name[MAX_FULL_NAME_LEN + 1];
+	char*		filepath = NULL;
+
+	ut_ad(!srv_read_only_mode);
+
+	innobase_format_name(
+		table_name, sizeof(table_name), table->name, FALSE);
+
+	ut_a(table->space);
+	ut_ad(prebuilt->trx);
+	ut_a(table->ibd_file_missing);
+
+	trx_start_if_not_started(prebuilt->trx);
+
+	trx = trx_allocate_for_mysql();
+
+	/* So that the table is not DROPped during recovery. */
+	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+	trx_start_if_not_started(trx);
+
+	/* So that we can send error messages to the user. */
+	trx->mysql_thd = prebuilt->trx->mysql_thd;
+
+	/* Ensure that the table will be dropped by trx_rollback_active()
+	in case of a crash. */
+
+	trx->table_id = table->id;
+
+	/* Assign an undo segment for the transaction, so that the
+	transaction will be recovered after a crash. */
+
+	mutex_enter(&trx->undo_mutex);
+
+	err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+	mutex_exit(&trx->undo_mutex);
+
+	DBUG_EXECUTE_IF("ib_import_undo_assign_failure",
+			err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+	if (err != DB_SUCCESS) {
+
+		return(row_import_cleanup(prebuilt, trx, err));
+
+	} else if (trx->update_undo == 0) {
+
+		err = DB_TOO_MANY_CONCURRENT_TRXS;
+		return(row_import_cleanup(prebuilt, trx, err));
+	}
+
+	prebuilt->trx->op_info = "read meta-data file";
+
+	/* Prevent DDL operations while we are checking. Every branch of
+	the if/else chain below releases this S-latch again. */
+	rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__);
+
+	row_import	cfg;
+
+	memset(&cfg, 0x0, sizeof(cfg));
+
+	err = row_import_read_cfg(table, trx->mysql_thd, cfg);
+
+	/* Check if the table column definitions match the contents
+	of the config file. */
+
+	if (err == DB_SUCCESS) {
+
+		/* We have a schema file, try and match it with our
+		data dictionary. */
+
+		err = cfg.match_schema(trx->mysql_thd);
+
+		/* Update index->page and SYS_INDEXES.PAGE_NO to match the
+		B-tree root page numbers in the tablespace. Use the index
+		name from the .cfg file to find a match. */
+
+		if (err == DB_SUCCESS) {
+			cfg.set_root_by_name();
+			autoinc = cfg.m_autoinc;
+		}
+
+		rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+		DBUG_EXECUTE_IF("ib_import_set_index_root_failure",
+				err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+	} else if (cfg.m_missing) {
+
+		rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+		/* We don't have a schema file, we will have to discover
+		the index root pages from the .ibd file and skip the schema
+		matching step. */
+
+		ut_a(err == DB_FAIL);
+
+		cfg.m_page_size = UNIV_PAGE_SIZE;
+
+		FetchIndexRootPages	fetchIndexRootPages(table, trx);
+
+		err = fil_tablespace_iterate(
+			table, IO_BUFFER_SIZE(cfg.m_page_size),
+			fetchIndexRootPages);
+
+		if (err == DB_SUCCESS) {
+
+			err = fetchIndexRootPages.build_row_import(&cfg);
+
+			/* Update index->page and SYS_INDEXES.PAGE_NO
+			to match the B-tree root page numbers in the
+			tablespace. */
+
+			if (err == DB_SUCCESS) {
+				err = cfg.set_root_by_heuristic();
+			}
+		}
+
+	} else {
+		rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+	}
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	prebuilt->trx->op_info = "importing tablespace";
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Phase I - Update all pages");
+
+	/* Iterate over all the pages and do the sanity checking and
+	the conversion required to import the tablespace. */
+
+	PageConverter	converter(&cfg, trx);
+
+	/* Set the IO buffer size in pages. */
+
+	err = fil_tablespace_iterate(
+		table, IO_BUFFER_SIZE(cfg.m_page_size), converter);
+
+	DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure",
+			err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+	if (err != DB_SUCCESS) {
+		char	table_name[MAX_FULL_NAME_LEN + 1]; /* shadows outer */
+
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
+
+		ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_INTERNAL_ERROR,
+			"Cannot reset LSNs in table '%s' : %s",
+			table_name, ut_strerr(err));
+
+		return(row_import_cleanup(prebuilt, trx, err));
+	}
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* If the table is stored in a remote tablespace, we need to
+	determine that filepath from the link file and system tables.
+	Find the space ID in SYS_TABLES since this is an ALTER TABLE. */
+	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+		dict_get_and_save_data_dir_path(table, true);
+		ut_a(table->data_dir_path);
+
+		filepath = os_file_make_remote_pathname(
+			table->data_dir_path, table->name, "ibd");
+	} else {
+		filepath = fil_make_ibd_name(table->name, false);
+	}
+	ut_a(filepath);
+
+	/* Open the tablespace so that we can access via the buffer pool.
+	We set the 2nd param (fix_dict = true) here because we already
+	have an x-lock on dict_operation_lock and dict_sys->mutex. */
+
+	err = fil_open_single_table_tablespace(
+		true, true, table->space,
+		dict_tf_to_fsp_flags(table->flags),
+		table->name, filepath);
+
+	DBUG_EXECUTE_IF("ib_import_open_tablespace_failure",
+			err = DB_TABLESPACE_NOT_FOUND;);
+
+	if (err != DB_SUCCESS) {
+		row_mysql_unlock_data_dictionary(trx);
+
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_FILE_NOT_FOUND,
+			filepath, err, ut_strerr(err));
+
+		mem_free(filepath);
+
+		return(row_import_cleanup(prebuilt, trx, err));
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	mem_free(filepath);
+
+	err = ibuf_check_bitmap_on_import(trx, table->space);
+
+	DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_cleanup(prebuilt, trx, err));
+	}
+
+	/* The first index must always be the clustered index. */
+
+	dict_index_t*	index = dict_table_get_first_index(table);
+
+	if (!dict_index_is_clust(index)) {
+		return(row_import_error(prebuilt, trx, DB_CORRUPTION));
+	}
+
+	/* Update the Btree segment headers for index node and
+	leaf nodes in the root page. Set the new space id. */
+
+	err = btr_root_adjust_on_import(index);
+
+	DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure",
+			err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	if (err != DB_SUCCESS) { /* NOTE(review): dead, checked just above */
+		return(row_import_error(prebuilt, trx, err));
+	} else if (cfg.requires_purge(index->name)) {
+
+		/* Purge any delete-marked records that couldn't be
+		purged during the page conversion phase from the
+		cluster index. */
+
+		IndexPurge	purge(trx, index);
+
+		trx->op_info = "cluster: purging delete marked records";
+
+		err = purge.garbage_collect();
+
+		trx->op_info = "";
+	}
+
+	DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	/* For secondary indexes, purge any records that couldn't be purged
+	during the page conversion phase. */
+
+	err = row_import_adjust_root_pages_of_secondary_indexes(
+		prebuilt, trx, table, cfg);
+
+	DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure",
+			err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	/* Ensure that the next available DB_ROW_ID is not smaller than
+	any DB_ROW_ID stored in the table. */
+
+	if (prebuilt->clust_index_was_generated) {
+
+		err = row_import_set_sys_max_row_id(prebuilt, table);
+
+		if (err != DB_SUCCESS) {
+			return(row_import_error(prebuilt, trx, err));
+		}
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush changes to disk");
+
+	/* Ensure that all pages dirtied during the IMPORT make it to disk.
+	The only dirty pages generated should be from the pessimistic purge
+	of delete marked records that couldn't be purged in Phase I. */
+
+	buf_LRU_flush_or_remove_pages(
+		prebuilt->table->space, BUF_REMOVE_FLUSH_WRITE, trx);
+
+	if (trx_is_interrupted(trx)) {
+		ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush interrupted");
+		return(row_import_error(prebuilt, trx, DB_INTERRUPTED));
+	} else {
+		ib_logf(IB_LOG_LEVEL_INFO, "Phase IV - Flush complete");
+	}
+
+	/* The dictionary latches will be released in row_import_cleanup()
+	after the transaction commit, for both success and error. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Update the root pages of the table's indexes. */
+	err = row_import_update_index_root(trx, table, false, true);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	/* Update the table's discarded flag, unset it. */
+	err = row_import_update_discarded_flag(trx, table->id, false, true);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	table->ibd_file_missing = false;
+	table->flags2 &= ~DICT_TF2_DISCARDED;
+
+	if (autoinc != 0) {
+		char	table_name[MAX_FULL_NAME_LEN + 1]; /* shadows outer */
+
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
+
+		ib_logf(IB_LOG_LEVEL_INFO, "%s autoinc value set to " IB_ID_FMT,
+			table_name, autoinc);
+
+		dict_table_autoinc_lock(table);
+		dict_table_autoinc_initialize(table, autoinc);
+		dict_table_autoinc_unlock(table);
+	}
+
+	ut_a(err == DB_SUCCESS);
+
+	return(row_import_cleanup(prebuilt, trx, err));
+}
+
diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.cc
index 0d9db2f6d1f..34e34925b9a 100644
--- a/storage/xtradb/row/row0ins.c
+++ b/storage/xtradb/row/row0ins.cc
@@ -17,14 +17,12 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0ins.c
+@file row/row0ins.cc
 Insert into a table
 
 Created 4/20/1996 Heikki Tuuri
 *******************************************************/
 
-#include "m_string.h" /* for my_sys.h */
-#include "my_sys.h" /* DEBUG_SYNC_C */
 #include "row0ins.h"
 
 #ifdef UNIV_NONINL
@@ -34,6 +32,7 @@ Created 4/20/1996 Heikki Tuuri
 #include "ha_prototypes.h"
 #include "dict0dict.h"
 #include "dict0boot.h"
+#include "trx0rec.h"
 #include "trx0undo.h"
 #include "btr0btr.h"
 #include "btr0cur.h"
@@ -42,6 +41,7 @@ Created 4/20/1996 Heikki Tuuri
 #include "row0upd.h"
 #include "row0sel.h"
 #include "row0row.h"
+#include "row0log.h"
 #include "rem0cmp.h"
 #include "lock0lock.h"
 #include "log0log.h"
@@ -49,11 +49,9 @@ Created 4/20/1996 Heikki Tuuri
 #include "data0data.h"
 #include "usr0sess.h"
 #include "buf0lru.h"
+#include "fts0fts.h"
+#include "fts0types.h"
 #include "m_string.h"
-#include "my_sys.h"
-
-#define	ROW_INS_PREV	1
-#define	ROW_INS_NEXT	2
 
 /*************************************************************************
 IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -78,7 +76,8 @@ ins_node_create(
 {
 	ins_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(ins_node_t));
+	node = static_cast<ins_node_t*>(
+		mem_heap_alloc(heap, sizeof(ins_node_t)));
 
 	node->common.type = QUE_NODE_INSERT;
 
@@ -102,7 +101,7 @@ ins_node_create(
 
 /***********************************************************//**
 Creates an entry template for each index of a table. */
-UNIV_INTERN
+static
 void
 ins_node_create_entry_list(
 /*=======================*/
@@ -115,17 +114,18 @@ ins_node_create_entry_list(
 
 	UT_LIST_INIT(node->entry_list);
 
-	index = dict_table_get_first_index(node->table);
+	/* We will include all indexes (include those corrupted
+	secondary indexes) in the entry list. Filteration of
+	these corrupted index will be done in row_ins() */
 
-	while (index != NULL) {
-		entry = row_build_index_entry(node->row, NULL, index,
-					      node->entry_sys_heap);
-		UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+	for (index = dict_table_get_first_index(node->table);
+	     index != 0;
+	     index = dict_table_get_next_index(index)) {
 
-		/* We will include all indexes (include those corrupted
-		secondary indexes) in the entry list. Filteration of
-		these corrupted index will be done in row_ins() */
-		index = dict_table_get_next_index(index);
+		entry = row_build_index_entry(
+			node->row, NULL, index, node->entry_sys_heap);
+
+		UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
 	}
 }
 
@@ -157,7 +157,7 @@ row_ins_alloc_sys_fields(
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
 
-	ptr = mem_heap_zalloc(heap, DATA_ROW_ID_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROW_ID_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
 
@@ -168,7 +168,7 @@ row_ins_alloc_sys_fields(
 	col = dict_table_get_sys_col(table, DATA_TRX_ID);
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
-	ptr = mem_heap_zalloc(heap, DATA_TRX_ID_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_TRX_ID_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
 
@@ -179,7 +179,7 @@ row_ins_alloc_sys_fields(
 	col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
-	ptr = mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
 }
@@ -222,68 +222,92 @@ Does an insert operation by updating a delete-marked existing record
 in the index. This situation can occur if the delete-marked record is
 kept in the index for consistent reads.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_sec_index_entry_by_modify(
 /*==============================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
 				depending on whether mtr holds just a leaf
 				latch or also a tree latch */
 	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
+	ulint**		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+	mem_heap_t*	offsets_heap,
+				/*!< in/out: memory heap that can be emptied */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
 	const dtuple_t*	entry,	/*!< in: index entry to insert */
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr)	/*!< in: mtr; must be committed before
 				latching any further pages */
 {
 	big_rec_t*	dummy_big_rec;
-	mem_heap_t*	heap;
 	upd_t*		update;
 	rec_t*		rec;
-	ulint		err;
+	dberr_t		err;
 
 	rec = btr_cur_get_rec(cursor);
 
 	ut_ad(!dict_index_is_clust(cursor->index));
-	ut_ad(rec_get_deleted_flag(rec,
-				   dict_table_is_comp(cursor->index->table)));
+	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+	ut_ad(!entry->info_bits);
 
 	/* We know that in the alphabetical ordering, entry and rec are
 	identified. But in their binary form there may be differences if
 	there are char fields in them. Therefore we have to calculate the
 	difference. */
 
-	heap = mem_heap_create(1024);
-
 	update = row_upd_build_sec_rec_difference_binary(
-		cursor->index, entry, rec, thr_get_trx(thr), heap);
+		rec, cursor->index, *offsets, entry, heap);
+
+	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+		/* We should never insert in place of a record that
+		has not been delete-marked. The only exception is when
+		online CREATE INDEX copied the changes that we already
+		made to the clustered index, and completed the
+		secondary index creation before we got here. In this
+		case, the change would already be there. The CREATE
+		INDEX should be waiting for a MySQL meta-data lock
+		upgrade at least until this INSERT or UPDATE
+		returns. After that point, the TEMP_INDEX_PREFIX
+		would be dropped from the index name in
+		commit_inplace_alter_table(). */
+		ut_a(update->n_fields == 0);
+		ut_a(*cursor->index->name == TEMP_INDEX_PREFIX);
+		ut_ad(!dict_index_is_online_ddl(cursor->index));
+		return(DB_SUCCESS);
+	}
+
 	if (mode == BTR_MODIFY_LEAF) {
 		/* Try an optimistic updating of the record, keeping changes
 		within the page */
 
-		err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
-						update, 0, thr, mtr);
+		/* TODO: pass only *offsets */
+		err = btr_cur_optimistic_update(
+			flags | BTR_KEEP_SYS_FLAG, cursor,
+			offsets, &offsets_heap, update, 0, thr,
+			thr_get_trx(thr)->id, mtr);
 		switch (err) {
 		case DB_OVERFLOW:
 		case DB_UNDERFLOW:
 		case DB_ZIP_OVERFLOW:
 			err = DB_FAIL;
+		default:
+			break;
 		}
 	} else {
 		ut_a(mode == BTR_MODIFY_TREE);
 		if (buf_LRU_buf_pool_running_out()) {
 
-			err = DB_LOCK_TABLE_FULL;
-
-			goto func_exit;
+			return(DB_LOCK_TABLE_FULL);
 		}
 
-		err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
-						 &heap, &dummy_big_rec, update,
-						 0, thr, mtr);
+		err = btr_cur_pessimistic_update(
+			flags | BTR_KEEP_SYS_FLAG, cursor,
+			offsets, &offsets_heap,
+			heap, &dummy_big_rec, update, 0,
+			thr, thr_get_trx(thr)->id, mtr);
 		ut_ad(!dummy_big_rec);
 	}
-func_exit:
-	mem_heap_free(heap);
 
 	return(err);
 }
@@ -293,15 +317,20 @@ Does an insert operation by delete unmarking and updating a delete marked
 existing record in the index. This situation can occur if the delete marked
 record is kept in the index for consistent reads.
 @return	DB_SUCCESS, DB_FAIL, or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_clust_index_entry_by_modify(
 /*================================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
 				depending on whether mtr holds just a leaf
 				latch or also a tree latch */
 	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
-	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap that can
+				be emptied, or NULL */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
 	big_rec_t**	big_rec,/*!< out: possible big rec vector of fields
 				which have to be stored externally by the
 				caller */
@@ -310,9 +339,9 @@ row_ins_clust_index_entry_by_modify(
 	mtr_t*		mtr)	/*!< in: mtr; must be committed before
 				latching any further pages */
 {
-	rec_t*		rec;
-	upd_t*		update;
-	ulint		err;
+	const rec_t*	rec;
+	const upd_t*	update;
+	dberr_t		err;
 
 	ut_ad(dict_index_is_clust(cursor->index));
 
@@ -323,38 +352,40 @@ row_ins_clust_index_entry_by_modify(
 	ut_ad(rec_get_deleted_flag(rec,
 				   dict_table_is_comp(cursor->index->table)));
 
-	if (!*heap) {
-		*heap = mem_heap_create(1024);
-	}
-
 	/* Build an update vector containing all the fields to be modified;
 	NOTE that this vector may NOT contain system columns trx_id or
 	roll_ptr */
 
-	update = row_upd_build_difference_binary(cursor->index, entry, rec,
-						 thr_get_trx(thr), *heap);
-	if (mode == BTR_MODIFY_LEAF) {
+	update = row_upd_build_difference_binary(
+		cursor->index, entry, rec, NULL, true,
+		thr_get_trx(thr), heap);
+	if (mode != BTR_MODIFY_TREE) {
+		ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF);
+
 		/* Try optimistic updating of the record, keeping changes
 		within the page */
 
-		err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
-						mtr);
+		err = btr_cur_optimistic_update(
+			flags, cursor, offsets, offsets_heap, update, 0, thr,
+			thr_get_trx(thr)->id, mtr);
 		switch (err) {
 		case DB_OVERFLOW:
 		case DB_UNDERFLOW:
 		case DB_ZIP_OVERFLOW:
 			err = DB_FAIL;
+		default:
+			break;
 		}
 	} else {
-		ut_a(mode == BTR_MODIFY_TREE);
 		if (buf_LRU_buf_pool_running_out()) {
 
 			return(DB_LOCK_TABLE_FULL);
 
 		}
 		err = btr_cur_pessimistic_update(
-			BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update,
-			0, thr, mtr);
+			flags | BTR_KEEP_POS_FLAG,
+			cursor, offsets, offsets_heap, heap,
+			big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr);
 	}
 
 	return(err);
@@ -372,22 +403,19 @@ row_ins_cascade_ancestor_updates_table(
 	dict_table_t*	table)	/*!< in: table */
 {
 	que_node_t*	parent;
-	upd_node_t*	upd_node;
 
-	parent = que_node_get_parent(node);
+	for (parent = que_node_get_parent(node);
+	     que_node_get_type(parent) == QUE_NODE_UPDATE;
+	     parent = que_node_get_parent(parent)) {
 
-	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+		upd_node_t*	upd_node;
 
-		upd_node = parent;
+		upd_node = static_cast<upd_node_t*>(parent);
 
 		if (upd_node->table == table && upd_node->is_delete == FALSE) {
 
 			return(TRUE);
 		}
-
-		parent = que_node_get_parent(parent);
-
-		ut_a(parent);
 	}
 
 	return(FALSE);
@@ -397,7 +425,7 @@ row_ins_cascade_ancestor_updates_table(
 Returns the number of ancestor UPDATE or DELETE nodes of a
 cascaded update/delete node.
 @return	number of ancestors */
-static
+static __attribute__((nonnull, warn_unused_result))
 ulint
 row_ins_cascade_n_ancestors(
 /*========================*/
@@ -406,14 +434,11 @@ row_ins_cascade_n_ancestors(
 	que_node_t*	parent;
 	ulint		n_ancestors = 0;
 
-	parent = que_node_get_parent(node);
+	for (parent = que_node_get_parent(node);
+	     que_node_get_type(parent) == QUE_NODE_UPDATE;
+	     parent = que_node_get_parent(parent)) {
 
-	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
 		n_ancestors++;
-
-		parent = que_node_get_parent(parent);
-
-		ut_a(parent);
 	}
 
 	return(n_ancestors);
@@ -426,7 +451,7 @@ a cascaded update.
 can also be 0 if no foreign key fields changed; the returned value is
 ULINT_UNDEFINED if the column type in the child table is too short to
 fit the new value in the parent table: that means the update fails */
-static
+static __attribute__((nonnull, warn_unused_result))
 ulint
 row_ins_cascade_calc_update_vec(
 /*============================*/
@@ -434,8 +459,10 @@ row_ins_cascade_calc_update_vec(
 					table */
 	dict_foreign_t*	foreign,	/*!< in: foreign key constraint whose
 					type is != 0 */
-	mem_heap_t*	heap)		/*!< in: memory heap to use as
+	mem_heap_t*	heap,		/*!< in: memory heap to use as
 					temporary storage */
+	trx_t*		trx,		/*!< in: update transaction */
+	ibool*		fts_col_affected)/*!< out: is FTS column affected */
 {
 	upd_node_t*	cascade		= node->cascade_node;
 	dict_table_t*	table		= foreign->foreign_table;
@@ -448,6 +475,9 @@ row_ins_cascade_calc_update_vec(
 	ulint		parent_field_no;
 	ulint		i;
 	ulint		j;
+	ibool		doc_id_updated = FALSE;
+	ulint		doc_id_pos = 0;
+	doc_id_t	new_doc_id = FTS_NULL_DOC_ID;
 
 	ut_a(node);
 	ut_a(foreign);
@@ -472,6 +502,13 @@ row_ins_cascade_calc_update_vec(
 
 	n_fields_updated = 0;
 
+	*fts_col_affected = FALSE;
+
+	if (table->fts) {
+		doc_id_pos = dict_table_get_nth_col_pos(
+			table, table->fts->doc_col);
+	}
+
 	for (i = 0; i < foreign->n_fields; i++) {
 
 		parent_field_no = dict_table_get_nth_col_pos(
@@ -527,7 +564,9 @@ row_ins_cascade_calc_update_vec(
 					col->prtype, col->mbminmaxlen,
 					col->len,
 					ufield_len,
-					dfield_get_data(&ufield->new_val))
+					static_cast<char*>(
+						dfield_get_data(
+							&ufield->new_val)))
 				    < ufield_len) {
 
 					return(ULINT_UNDEFINED);
@@ -552,8 +591,9 @@ row_ins_cascade_calc_update_vec(
 					byte*	padded_data;
 					ulint	mbminlen;
 
-					padded_data = mem_heap_alloc(
-						heap, min_size);
+					padded_data = static_cast<byte*>(
+						mem_heap_alloc(
+							heap, min_size));
 
 					pad = padded_data + ufield_len;
 					pad_len = min_size - ufield_len;
@@ -582,11 +622,91 @@ row_ins_cascade_calc_update_vec(
 							padded_data, min_size);
 				}
 
+				/* Check whether the current column has
+				FTS index on it */
+				if (table->fts
+				    && dict_table_is_fts_column(
+					table->fts->indexes,
+					dict_col_get_no(col))
+					!= ULINT_UNDEFINED) {
+					*fts_col_affected = TRUE;
+				}
+
+				/* If Doc ID is updated, check whether the
+				Doc ID is valid */
+				if (table->fts
+				    && ufield->field_no == doc_id_pos) {
+					doc_id_t	n_doc_id;
+
+					n_doc_id =
+						table->fts->cache->next_doc_id;
+
+					new_doc_id = fts_read_doc_id(
+						static_cast<const byte*>(
+							dfield_get_data(
+							&ufield->new_val)));
+
+					if (new_doc_id <= 0) {
+						fprintf(stderr,
+							"InnoDB: FTS Doc ID "
+							"must be larger than "
+							"0 \n");
+						return(ULINT_UNDEFINED);
+					}
+
+					if (new_doc_id < n_doc_id) {
+						fprintf(stderr,
+						       "InnoDB: FTS Doc ID "
+						       "must be larger than "
+						       IB_ID_FMT" for table",
+						       n_doc_id -1);
+
+						ut_print_name(stderr, trx,
+							      TRUE,
+							      table->name);
+
+						putc('\n', stderr);
+						return(ULINT_UNDEFINED);
+					}
+
+					*fts_col_affected = TRUE;
+					doc_id_updated = TRUE;
+				}
+
 				n_fields_updated++;
 			}
 		}
 	}
 
+	/* Generate a new Doc ID if FTS index columns get updated */
+	if (table->fts && *fts_col_affected) {
+		if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			doc_id_t	doc_id;
+                        upd_field_t*	ufield;
+
+			ut_ad(!doc_id_updated);
+			ufield = update->fields + n_fields_updated;
+			fts_get_next_doc_id(table, &trx->fts_next_doc_id);
+			doc_id = fts_update_doc_id(table, ufield,
+						   &trx->fts_next_doc_id);
+			n_fields_updated++;
+			fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+		} else  {
+			if (doc_id_updated) {
+				ut_ad(new_doc_id);
+				fts_trx_add_op(trx, table, new_doc_id,
+					       FTS_INSERT, NULL);
+			} else {
+				fprintf(stderr, "InnoDB: FTS Doc ID must be "
+					"updated along with FTS indexed "
+					"column for table ");
+				ut_print_name(stderr, trx, TRUE, table->name);
+				putc('\n', stderr);
+				return(ULINT_UNDEFINED);
+			}
+		}
+	}
+
 	update->n_fields = n_fields_updated;
 
 	return(n_fields_updated);
@@ -602,6 +722,8 @@ row_ins_set_detailed(
 	trx_t*		trx,		/*!< in: transaction */
 	dict_foreign_t*	foreign)	/*!< in: foreign key constraint */
 {
+	ut_ad(!srv_read_only_mode);
+
 	mutex_enter(&srv_misc_tmpfile_mutex);
 	rewind(srv_misc_tmpfile);
 
@@ -619,6 +741,45 @@ row_ins_set_detailed(
 }
 
 /*********************************************************************//**
+Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
+and displays information about the given transaction.
+The caller must release dict_foreign_err_mutex. */
+static
+void
+row_ins_foreign_trx_print(
+/*======================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ulint	n_rec_locks;
+	ulint	n_trx_locks;
+	ulint	heap_size;
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	lock_mutex_enter();
+	n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+	n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	heap_size = mem_heap_get_size(trx->lock.lock_heap);
+	lock_mutex_exit();
+
+	mutex_enter(&trx_sys->mutex);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(dict_foreign_err_file);
+	ut_print_timestamp(dict_foreign_err_file);
+	fputs(" Transaction:\n", dict_foreign_err_file);
+
+	trx_print_low(dict_foreign_err_file, trx, 600,
+		      n_rec_locks, n_trx_locks, heap_size);
+
+	mutex_exit(&trx_sys->mutex);
+
+	ut_ad(mutex_own(&dict_foreign_err_mutex));
+}
+
+/*********************************************************************//**
 Reports a foreign key error associated with an update or a delete of a
 parent table index entry. */
 static
@@ -635,16 +796,16 @@ row_ins_foreign_report_err(
 	const dtuple_t*	entry)		/*!< in: index entry in the parent
 					table */
 {
+	if (srv_read_only_mode) {
+		return;
+	}
+
 	FILE*	ef	= dict_foreign_err_file;
 	trx_t*	trx	= thr_get_trx(thr);
 
 	row_ins_set_detailed(trx, foreign);
 
-	mutex_enter(&dict_foreign_err_mutex);
-	rewind(ef);
-	ut_print_timestamp(ef);
-	fputs(" Transaction:\n", ef);
-	trx_print(ef, trx, 600);
+	row_ins_foreign_trx_print(trx);
 
 	fputs("Foreign key constraint fails for table ", ef);
 	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
@@ -690,15 +851,16 @@ row_ins_foreign_report_add_err(
 	const dtuple_t*	entry)		/*!< in: index entry to insert in the
 					child table */
 {
+	if (srv_read_only_mode) {
+		return;
+	}
+
 	FILE*	ef	= dict_foreign_err_file;
 
 	row_ins_set_detailed(trx, foreign);
 
-	mutex_enter(&dict_foreign_err_mutex);
-	rewind(ef);
-	ut_print_timestamp(ef);
-	fputs(" Transaction:\n", ef);
-	trx_print(ef, trx, 600);
+	row_ins_foreign_trx_print(trx);
+
 	fputs("Foreign key constraint fails for table ", ef);
 	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
 	fputs(":\n", ef);
@@ -762,8 +924,8 @@ Perform referential actions or checks when a parent row is deleted or updated
 and the constraint had an ON DELETE or ON UPDATE condition which was not
 RESTRICT.
 @return	DB_SUCCESS, DB_LOCK_WAIT, or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_foreign_check_on_constraint(
 /*================================*/
 	que_thr_t*	thr,		/*!< in: query thread whose run_node
@@ -789,10 +951,12 @@ row_ins_foreign_check_on_constraint(
 	const buf_block_t* clust_block;
 	upd_t*		update;
 	ulint		n_to_update;
-	ulint		err;
+	dberr_t		err;
 	ulint		i;
 	trx_t*		trx;
 	mem_heap_t*	tmp_heap	= NULL;
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	ibool		fts_col_affacted = FALSE;
 
 	ut_a(thr);
 	ut_a(foreign);
@@ -804,12 +968,12 @@ row_ins_foreign_check_on_constraint(
 	/* Since we are going to delete or update a row, we have to invalidate
 	the MySQL query cache for table. A deadlock of threads is not possible
 	here because the caller of this function does not hold any latches with
-	the sync0sync.h rank above the kernel mutex. The query cache mutex has
-	a rank just above the kernel mutex. */
+	the sync0sync.h rank above the lock_sys_t::mutex. The query cache mutex
+	has a rank just above the lock_sys_t::mutex. */
 
 	row_ins_invalidate_query_cache(thr, table->name);
 
-	node = thr->run_node;
+	node = static_cast<upd_node_t*>(thr->run_node);
 
 	if (node->is_delete && 0 == (foreign->type
 				     & (DICT_FOREIGN_ON_DELETE_CASCADE
@@ -913,6 +1077,8 @@ row_ins_foreign_check_on_constraint(
 
 	rec = btr_pcur_get_rec(pcur);
 
+	tmp_heap = mem_heap_create(256);
+
 	if (dict_index_is_clust(index)) {
 		/* pcur is already positioned in the clustered index of
 		the child table */
@@ -926,8 +1092,6 @@ row_ins_foreign_check_on_constraint(
 
 		clust_index = dict_table_get_first_index(table);
 
-		tmp_heap = mem_heap_create(256);
-
 		ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
 					tmp_heap);
 		btr_pcur_open_with_no_init(clust_index, ref,
@@ -989,6 +1153,10 @@ row_ins_foreign_check_on_constraint(
 		goto nonstandard_exit_func;
 	}
 
+	if (table->fts) {
+		doc_id = fts_get_doc_id_from_rec(table, clust_rec, tmp_heap);
+	}
+
 	if (node->is_delete
 	    ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
 	    : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
@@ -1012,6 +1180,31 @@ row_ins_foreign_check_on_constraint(
 			ufield->orig_len = 0;
 			ufield->exp = NULL;
 			dfield_set_null(&ufield->new_val);
+
+			if (table->fts && dict_table_is_fts_column(
+				table->fts->indexes,
+				dict_index_get_nth_col_no(index, i))
+				!= ULINT_UNDEFINED) {
+				fts_col_affacted = TRUE;
+			}
+		}
+
+		if (fts_col_affacted) {
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+		}
+	} else if (table->fts && cascade->is_delete) {
+		/* DICT_FOREIGN_ON_DELETE_CASCADE case */
+		for (i = 0; i < foreign->n_fields; i++) {
+			if (table->fts && dict_table_is_fts_column(
+				table->fts->indexes,
+				dict_index_get_nth_col_no(index, i))
+				!= ULINT_UNDEFINED) {
+				fts_col_affacted = TRUE;
+			}
+		}
+
+		if (fts_col_affacted) {
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
 		}
 	}
 
@@ -1023,8 +1216,9 @@ row_ins_foreign_check_on_constraint(
 
 		upd_vec_heap = mem_heap_create(256);
 
-		n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
-							      upd_vec_heap);
+		n_to_update = row_ins_cascade_calc_update_vec(
+			node, foreign, upd_vec_heap, trx, &fts_col_affacted);
+
 		if (n_to_update == ULINT_UNDEFINED) {
 			err = DB_ROW_IS_REFERENCED;
 
@@ -1050,6 +1244,12 @@ row_ins_foreign_check_on_constraint(
 
 			goto nonstandard_exit_func;
 		}
+
+		/* Mark the old Doc ID as deleted */
+		if (fts_col_affacted) {
+			ut_ad(table->fts);
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+		}
 	}
 
 	/* Store pcur position and initialize or store the cascade node
@@ -1132,7 +1332,7 @@ Sets a shared lock on a record. Used in locking possible duplicate key
 records and also in checking foreign key constraints.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
 static
-enum db_err
+dberr_t
 row_ins_set_shared_rec_lock(
 /*========================*/
 	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
@@ -1143,7 +1343,7 @@ row_ins_set_shared_rec_lock(
 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	enum db_err	err;
+	dberr_t	err;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
@@ -1163,7 +1363,7 @@ Sets a exclusive lock on a record. Used in locking possible duplicate key
 records
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
 static
-enum db_err
+dberr_t
 row_ins_set_exclusive_rec_lock(
 /*===========================*/
 	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
@@ -1174,7 +1374,7 @@ row_ins_set_exclusive_rec_lock(
 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	enum db_err	err;
+	dberr_t	err;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
@@ -1195,7 +1395,7 @@ which lock either the success or the failure of the constraint. NOTE that
 the caller must have a shared latch on dict_operation_lock.
 @return	DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
 UNIV_INTERN
-ulint
+dberr_t
 row_ins_check_foreign_constraint(
 /*=============================*/
 	ibool		check_ref,/*!< in: TRUE if we want to check that
@@ -1209,13 +1409,13 @@ row_ins_check_foreign_constraint(
 	dtuple_t*	entry,	/*!< in: index entry for index */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	dberr_t		err;
 	upd_node_t*	upd_node;
 	dict_table_t*	check_table;
 	dict_index_t*	check_index;
 	ulint		n_fields_cmp;
 	btr_pcur_t	pcur;
 	int		cmp;
-	ulint		err;
 	ulint		i;
 	mtr_t		mtr;
 	trx_t*		trx		= thr_get_trx(thr);
@@ -1250,7 +1450,7 @@ run_again:
 	}
 
 	if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
-		upd_node = thr->run_node;
+		upd_node = static_cast<upd_node_t*>(thr->run_node);
 
 		if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
 			/* If a cascaded update is done as defined by a
@@ -1281,18 +1481,17 @@ run_again:
 		check_index = foreign->foreign_index;
 	}
 
-	if (check_table == NULL || check_table->ibd_file_missing
+	if (check_table == NULL
+	    || check_table->ibd_file_missing
 	    || check_index == NULL) {
-		if (check_ref) {
+
+		if (!srv_read_only_mode && check_ref) {
 			FILE*	ef = dict_foreign_err_file;
 
 			row_ins_set_detailed(trx, foreign);
 
-			mutex_enter(&dict_foreign_err_mutex);
-			rewind(ef);
-			ut_print_timestamp(ef);
-			fputs(" Transaction:\n", ef);
-			trx_print(ef, trx, 600);
+			row_ins_foreign_trx_print(trx);
+
 			fputs("Foreign key constraint fails for table ", ef);
 			ut_print_name(ef, trx, TRUE,
 				      foreign->foreign_table_name);
@@ -1468,6 +1667,8 @@ run_again:
 				} else {
 					err = DB_SUCCESS;
 				}
+			default:
+				break;
 			}
 
 			goto end_scan;
@@ -1493,18 +1694,49 @@ end_scan:
 
 do_possible_lock_wait:
 	if (err == DB_LOCK_WAIT) {
+		bool		verified = false;
+
 		trx->error_state = err;
 
 		que_thr_stop_for_mysql(thr);
 
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
-		if (trx->error_state == DB_SUCCESS) {
+		if (check_table->to_be_dropped) {
+			/* The table is being dropped, so we time out
+			this operation. */
+			err = DB_LOCK_WAIT_TIMEOUT;
+			goto exit_func;
+		}
 
-			goto run_again;
+		/* We temporarily released dict_operation_lock during the
+		lock wait above. Now that we hold it again, we must
+		re-check whether the foreign key has been dropped. This
+		is needed only when checking from the referenced table
+		(check_ref == 0), since the MDL lock prevents concurrent
+		DDL and DML on the same table. */
+		if (!check_ref) {
+			for (const dict_foreign_t* check_foreign
+				= UT_LIST_GET_FIRST( table->referenced_list);
+			     check_foreign;
+			     check_foreign = UT_LIST_GET_NEXT(
+					referenced_list, check_foreign)) {
+				if (check_foreign == foreign) {
+					verified = true;
+					break;
+				}
+			}
+		} else {
+			verified = true;
 		}
 
-		err = trx->error_state;
+		if (!verified) {
+			err = DB_DICT_CHANGED;
+		} else if (trx->error_state == DB_SUCCESS) {
+			goto run_again;
+		} else {
+			err = trx->error_state;
+		}
 	}
 
 exit_func:
@@ -1526,8 +1758,8 @@ Otherwise does searches to the indexes of referenced tables and
 sets shared locks which lock either the success or the failure of
 a constraint.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_check_foreign_constraints(
 /*==============================*/
 	dict_table_t*	table,	/*!< in: table */
@@ -1536,7 +1768,7 @@ row_ins_check_foreign_constraints(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	dict_foreign_t*	foreign;
-	ulint		err;
+	dberr_t		err;
 	trx_t*		trx;
 	ibool		got_s_lock	= FALSE;
 
@@ -1544,13 +1776,21 @@ row_ins_check_foreign_constraints(
 
 	foreign = UT_LIST_GET_FIRST(table->foreign_list);
 
+	DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+			    "foreign_constraint_check_for_ins");
+
 	while (foreign) {
 		if (foreign->foreign_index == index) {
+			dict_table_t*	ref_table = NULL;
+			dict_table_t*	foreign_table = foreign->foreign_table;
+			dict_table_t*	referenced_table
+						= foreign->referenced_table;
 
-			if (foreign->referenced_table == NULL) {
-				dict_table_get(foreign->referenced_table_name_lookup,
-					       FALSE,
-					       DICT_ERR_IGNORE_NONE);
+			if (referenced_table == NULL) {
+
+				ref_table = dict_table_open_on_name(
+					foreign->referenced_table_name_lookup,
+					FALSE, FALSE, DICT_ERR_IGNORE_NONE);
 			}
 
 			if (0 == trx->dict_operation_lock_mode) {
@@ -1559,13 +1799,10 @@ row_ins_check_foreign_constraints(
 				row_mysql_freeze_data_dictionary(trx);
 			}
 
-			if (foreign->referenced_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				(foreign->referenced_table
-				 ->n_foreign_key_checks_running)++;
-
-				mutex_exit(&(dict_sys->mutex));
+			if (referenced_table) {
+				os_inc_counter(dict_sys->mutex,
+					       foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			/* NOTE that if the thread ends up waiting for a lock
@@ -1576,22 +1813,25 @@ row_ins_check_foreign_constraints(
 			err = row_ins_check_foreign_constraint(
 				TRUE, foreign, table, entry, thr);
 
-			if (foreign->referenced_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				ut_a(foreign->referenced_table
-				     ->n_foreign_key_checks_running > 0);
-				(foreign->referenced_table
-				 ->n_foreign_key_checks_running)--;
+			DBUG_EXECUTE_IF("row_ins_dict_change_err",
+					err = DB_DICT_CHANGED;);
 
-				mutex_exit(&(dict_sys->mutex));
+			if (referenced_table) {
+				os_dec_counter(dict_sys->mutex,
+					       foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			if (got_s_lock) {
 				row_mysql_unfreeze_data_dictionary(trx);
 			}
 
+			if (ref_table != NULL) {
+				dict_table_close(ref_table, FALSE, FALSE);
+			}
+
 			if (err != DB_SUCCESS) {
+
 				return(err);
 			}
 		}
@@ -1643,8 +1883,7 @@ row_ins_dupl_error_with_rec(
 	if (!dict_index_is_clust(index)) {
 
 		for (i = 0; i < n_unique; i++) {
-			if (UNIV_SQL_NULL == dfield_get_len(
-				    dtuple_get_nth_field(entry, i))) {
+			if (dfield_is_null(dtuple_get_nth_field(entry, i))) {
 
 				return(FALSE);
 			}
@@ -1659,26 +1898,30 @@ Scans a unique non-clustered index at a given index entry to determine
 whether a uniqueness violation has occurred for the key value of the entry.
 Set shared locks on possible duplicate records.
 @return	DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_scan_sec_index_for_duplicate(
 /*=================================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	dict_index_t*	index,	/*!< in: non-clustered unique index */
 	dtuple_t*	entry,	/*!< in: index entry */
-	que_thr_t*	thr)	/*!< in: query thread */
+	que_thr_t*	thr,	/*!< in: query thread */
+	bool		s_latch,/*!< in: whether index->lock is being held */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mem_heap_t*	offsets_heap)
+				/*!< in/out: memory heap that can be emptied */
 {
 	ulint		n_unique;
-	ulint		i;
 	int		cmp;
 	ulint		n_fields_cmp;
 	btr_pcur_t	pcur;
-	ulint		err		= DB_SUCCESS;
+	dberr_t		err		= DB_SUCCESS;
 	ulint		allow_duplicates;
-	mtr_t		mtr;
-	mem_heap_t*	heap		= NULL;
-	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-	ulint*		offsets		= offsets_;
-	rec_offs_init(offsets_);
+	ulint*		offsets		= NULL;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(s_latch == rw_lock_own(&index->lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
 	n_unique = dict_index_get_n_unique(index);
 
@@ -1686,7 +1929,7 @@ row_ins_scan_sec_index_for_duplicate(
 	n_unique first fields is NULL, a unique key violation cannot occur,
 	since we define NULL != NULL in this case */
 
-	for (i = 0; i < n_unique; i++) {
+	for (ulint i = 0; i < n_unique; i++) {
 		if (UNIV_SQL_NULL == dfield_get_len(
 			    dtuple_get_nth_field(entry, i))) {
 
@@ -1694,15 +1937,17 @@ row_ins_scan_sec_index_for_duplicate(
 		}
 	}
 
-	mtr_start(&mtr);
-
 	/* Store old value on n_fields_cmp */
 
 	n_fields_cmp = dtuple_get_n_fields_cmp(entry);
 
-	dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+	dtuple_set_n_fields_cmp(entry, n_unique);
 
-	btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+	btr_pcur_open(index, entry, PAGE_CUR_GE,
+		      s_latch
+		      ? BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED
+		      : BTR_SEARCH_LEAF,
+		      &pcur, mtr);
 
 	allow_duplicates = thr_get_trx(thr)->duplicates;
 
@@ -1719,7 +1964,7 @@ row_ins_scan_sec_index_for_duplicate(
 		}
 
 		offsets = rec_get_offsets(rec, index, offsets,
-					  ULINT_UNDEFINED, &heap);
+					  ULINT_UNDEFINED, &offsets_heap);
 
 		/* If the transaction isolation level is no stronger than
 		READ COMMITTED, then avoid gap locks. */
@@ -1731,7 +1976,10 @@ row_ins_scan_sec_index_for_duplicate(
 			lock_type = LOCK_ORDINARY;
 		}
 
-		if (allow_duplicates) {
+		if (flags & BTR_NO_LOCKING_FLAG) {
+			/* Set no locks when applying log
+			in online table rebuild. */
+		} else if (allow_duplicates) {
 
 			/* If the SQL-query will update or replace
 			duplicate key we will take X-lock for
@@ -1769,43 +2017,134 @@ row_ins_scan_sec_index_for_duplicate(
 
 				thr_get_trx(thr)->error_info = index;
 
+				/* If the duplicate is on hidden FTS_DOC_ID,
+				state so in the error log */
+				if (DICT_TF2_FLAG_IS_SET(
+					index->table,
+					DICT_TF2_FTS_HAS_DOC_ID)
+				    && strcmp(index->name,
+					      FTS_DOC_ID_INDEX_NAME) == 0) {
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Duplicate FTS_DOC_ID value"
+						" on table %s",
+						index->table->name);
+				}
+
 				goto end_scan;
 			}
 		} else {
 			ut_a(cmp < 0);
 			goto end_scan;
 		}
-	} while (btr_pcur_move_to_next(&pcur, &mtr));
+	} while (btr_pcur_move_to_next(&pcur, mtr));
 
 end_scan:
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
-	mtr_commit(&mtr);
-
 	/* Restore old value */
 	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
 
 	return(err);
 }
 
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS		when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC	when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY	when entry is a duplicate of rec */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_online(
+/*=====================*/
+	ulint		n_uniq,	/*!< in: offset of DB_TRX_ID */
+	const dtuple_t*	entry,	/*!< in: entry that is being inserted */
+	const rec_t*	rec,	/*!< in: clustered index record */
+	ulint*		offsets)/*!< in/out: rec_get_offsets(rec) */
+{
+	ulint	fields	= 0;
+	ulint	bytes	= 0;
+
+	/* During rebuild, there should not be any delete-marked rows
+	in the new table. */
+	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+	ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq);
+
+	/* Compare the PRIMARY KEY fields and the
+	DB_TRX_ID, DB_ROLL_PTR. */
+	cmp_dtuple_rec_with_match_low(
+		entry, rec, offsets, n_uniq + 2, &fields, &bytes);
+
+	if (fields < n_uniq) {
+		/* Not a duplicate. */
+		return(DB_SUCCESS);
+	}
+
+	if (fields == n_uniq + 2) {
+		/* rec is an exact match of entry. */
+		ut_ad(bytes == 0);
+		return(DB_SUCCESS_LOCKED_REC);
+	}
+
+	return(DB_DUPLICATE_KEY);
+}
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS		when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC	when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY	when entry is a duplicate of rec */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust_online(
+/*====================================*/
+	ulint		n_uniq,	/*!< in: offset of DB_TRX_ID */
+	const dtuple_t*	entry,	/*!< in: entry that is being inserted */
+	const btr_cur_t*cursor,	/*!< in: cursor on insert position */
+	ulint**		offsets,/*!< in/out: rec_get_offsets(rec) */
+	mem_heap_t**	heap)	/*!< in/out: heap for offsets */
+{
+	dberr_t		err	= DB_SUCCESS;
+	const rec_t*	rec	= btr_cur_get_rec(cursor);
+
+	if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) {
+		*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+					   ULINT_UNDEFINED, heap);
+		err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	rec = page_rec_get_next_const(btr_cur_get_rec(cursor));
+
+	if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) {
+		*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+					   ULINT_UNDEFINED, heap);
+		err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+	}
+
+	return(err);
+}
+
 /***************************************************************//**
 Checks if a unique key violation error would occur at an index entry
 insert. Sets shared locks on possible duplicate records. Works only
 for a clustered index!
-@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error,
-DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
-record */
-static
-ulint
+@retval DB_SUCCESS if no error
+@retval DB_DUPLICATE_KEY if error,
+@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record
+@retval DB_SUCCESS_LOCKED_REC if an exact match of the record was found
+in online table rebuild (flags & (BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG)) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_duplicate_error_in_clust(
 /*=============================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
 	const dtuple_t*	entry,	/*!< in: entry to insert */
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ulint	err;
+	dberr_t	err;
 	rec_t*	rec;
 	ulint	n_unique;
 	trx_t*	trx		= thr_get_trx(thr);
@@ -1816,8 +2155,7 @@ row_ins_duplicate_error_in_clust(
 
 	UT_NOT_USED(mtr);
 
-	ut_a(dict_index_is_clust(cursor->index));
-	ut_ad(dict_index_is_unique(cursor->index));
+	ut_ad(dict_index_is_clust(cursor->index));
 
 	/* NOTE: For unique non-clustered indexes there may be any number
 	of delete marked records with the same value for the non-clustered
@@ -1876,6 +2214,7 @@ row_ins_duplicate_error_in_clust(
 
 			if (row_ins_dupl_error_with_rec(
 				    rec, entry, cursor->index, offsets)) {
+duplicate:
 				trx->error_info = cursor->index;
 				err = DB_DUPLICATE_KEY;
 				goto func_exit;
@@ -1920,14 +2259,12 @@ row_ins_duplicate_error_in_clust(
 
 			if (row_ins_dupl_error_with_rec(
 				    rec, entry, cursor->index, offsets)) {
-				trx->error_info = cursor->index;
-				err = DB_DUPLICATE_KEY;
-				goto func_exit;
+				goto duplicate;
 			}
 		}
 
-		ut_a(!dict_index_is_clust(cursor->index));
 		/* This should never happen */
+		ut_error;
 	}
 
 	err = DB_SUCCESS;
@@ -1939,99 +2276,403 @@ func_exit:
 }
 
 /***************************************************************//**
-Checks if an index entry has long enough common prefix with an existing
-record so that the intended insert of the entry must be changed to a modify of
-the existing record. In the case of a clustered index, the prefix must be
-n_unique fields long, and in the case of a secondary index, all fields must be
-equal.
-@return 0 if no update, ROW_INS_PREV if previous should be updated;
-currently we do the search so that only the low_match record can match
-enough to the search tuple, not the next record */
+Checks if an index entry has long enough common prefix with an
+existing record so that the intended insert of the entry must be
+changed to a modify of the existing record. In the case of a clustered
+index, the prefix must be n_unique fields long. In the case of a
+secondary index, all fields must be equal.  InnoDB never updates
+secondary index records in place, other than clearing or setting the
+delete-mark flag. We could, in principle, update the non-unique fields
+of a unique secondary index record by checking the cursor->up_match,
+but we do not do so, because it could have some locking implications.
+@return TRUE if the existing record should be updated; FALSE if not */
 UNIV_INLINE
-ulint
-row_ins_must_modify(
-/*================*/
-	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
+ibool
+row_ins_must_modify_rec(
+/*====================*/
+	const btr_cur_t*	cursor)	/*!< in: B-tree cursor */
 {
-	ulint	enough_match;
-	rec_t*	rec;
+	/* NOTE: (compare to the note in row_ins_duplicate_error_in_clust)
+	Because node pointers on upper levels of the B-tree may match more
+	to entry than to actual user records on the leaf level, we
+	have to check if the candidate record is actually a user record.
+	A clustered index node pointer contains index->n_unique first fields,
+	and a secondary index node pointer contains all index fields. */
+
+	return(cursor->low_match
+	       >= dict_index_get_n_unique_in_tree(cursor->index)
+	       && !page_rec_is_infimum(btr_cur_get_rec(cursor)));
+}
 
-	/* NOTE: (compare to the note in row_ins_duplicate_error) Because node
-	pointers on upper levels of the B-tree may match more to entry than
-	to actual user records on the leaf level, we have to check if the
-	candidate record is actually a user record. In a clustered index
-	node pointers contain index->n_unique first fields, and in the case
-	of a secondary index, all fields of the index. */
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete marked record is then
+updated to an existing record, and we must write an undo log record on
+the delete marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		n_uniq,	/*!< in: 0 or index->n_uniq */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	btr_cur_t	cursor;
+	ulint*		offsets		= NULL;
+	dberr_t		err;
+	big_rec_t*	big_rec		= NULL;
+	mtr_t		mtr;
+	mem_heap_t*	offsets_heap	= NULL;
 
-	enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(!dict_index_is_unique(index)
+	      || n_uniq == dict_index_get_n_unique(index));
+	ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index));
 
-	if (cursor->low_match >= enough_match) {
+	mtr_start(&mtr);
 
-		rec = btr_cur_get_rec(cursor);
+	if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) {
+		if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+			mode = BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED;
+		} else {
+			mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+		}
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+	} else if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+		mode = (mode & BTR_MODIFY_TREE)
+			? BTR_SEARCH_TREE : BTR_SEARCH_LEAF;
+	}
 
-		if (!page_rec_is_infimum(rec)) {
+	cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return in both low_match and up_match of the
+	cursor sensible values */
+
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, mode,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+#ifdef UNIV_DEBUG
+	{
+		page_t*	page = btr_cur_get_page(&cursor);
+		rec_t*	first_rec = page_rec_get_next(
+			page_get_infimum_rec(page));
+
+		ut_ad(page_rec_is_supremum(first_rec)
+		      || rec_get_n_fields(first_rec, index)
+		      == dtuple_get_n_fields(entry));
+	}
+#endif
+
+	if (n_uniq && (cursor.up_match >= n_uniq
+		       || cursor.low_match >= n_uniq)) {
+
+		if (flags
+		    == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+			| BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) {
+			/* Set no locks when applying log
+			in online table rebuild. Only check for duplicates. */
+			err = row_ins_duplicate_error_in_clust_online(
+				n_uniq, entry, &cursor,
+				&offsets, &offsets_heap);
+
+			switch (err) {
+			case DB_SUCCESS:
+				break;
+			default:
+				ut_ad(0);
+				/* fall through */
+			case DB_SUCCESS_LOCKED_REC:
+			case DB_DUPLICATE_KEY:
+				thr_get_trx(thr)->error_info = cursor.index;
+			}
+		} else {
+			/* Note that the following may return also
+			DB_LOCK_WAIT */
+
+			err = row_ins_duplicate_error_in_clust(
+				flags, &cursor, entry, thr, &mtr);
+		}
+
+		if (err != DB_SUCCESS) {
+err_exit:
+			mtr_commit(&mtr);
+			goto func_exit;
+		}
+	}
+
+	if (row_ins_must_modify_rec(&cursor)) {
+		/* There is already an index entry with a long enough common
+		prefix, we must convert the insert into a modify of an
+		existing record */
+		mem_heap_t*	entry_heap	= mem_heap_create(1024);
+
+		err = row_ins_clust_index_entry_by_modify(
+			flags, mode, &cursor, &offsets, &offsets_heap,
+			entry_heap, &big_rec, entry, thr, &mtr);
+
+		rec_t*		rec		= btr_cur_get_rec(&cursor);
+
+		if (big_rec && UNIV_LIKELY(!thr_get_trx(thr)->fake_changes)) {
+			ut_a(err == DB_SUCCESS);
+			/* Write out the externally stored
+			columns while still x-latching
+			index->lock and block->lock. Allocate
+			pages for big_rec in the mtr that
+			modified the B-tree, but be sure to skip
+			any pages that were freed in mtr. We will
+			write out the big_rec pages before
+			committing the B-tree mini-transaction. If
+			the system crashes so that crash recovery
+			will not replay the mtr_commit(&mtr), the
+			big_rec pages will be left orphaned until
+			the pages are allocated for something else.
+
+			TODO: If the allocation extends the
+			tablespace, it will not be redo
+			logged, in either mini-transaction.
+			Tablespace extension should be
+			redo-logged in the big_rec
+			mini-transaction, so that recovery
+			will not fail when the big_rec was
+			written to the extended portion of the
+			file, in case the file was somehow
+			truncated in the crash. */
+
+			DEBUG_SYNC_C_IF_THD(
+				thr_get_trx(thr)->mysql_thd,
+				"before_row_ins_upd_extern");
+			err = btr_store_big_rec_extern_fields(
+				index, btr_cur_get_block(&cursor),
+				rec, offsets, big_rec, &mtr,
+				BTR_STORE_INSERT_UPDATE);
+			DEBUG_SYNC_C_IF_THD(
+				thr_get_trx(thr)->mysql_thd,
+				"after_row_ins_upd_extern");
+			/* If writing big_rec fails (for
+			example, because of DB_OUT_OF_FILE_SPACE),
+			the record will be corrupted. Even if
+			we did not update any externally
+			stored columns, our update could cause
+			the record to grow so that a
+			non-updated column was selected for
+			external storage. This non-update
+			would not have been written to the
+			undo log, and thus the record cannot
+			be rolled back.
+
+			However, because we have not executed
+			mtr_commit(mtr) yet, the update will
+			not be replayed in crash recovery, and
+			the following assertion failure will
+			effectively "roll back" the operation. */
+			ut_a(err == DB_SUCCESS);
+			dtuple_big_rec_free(big_rec);
+		}
+
+		if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+			row_log_table_insert(rec, index, offsets);
+		}
+
+		mtr_commit(&mtr);
+		mem_heap_free(entry_heap);
+	} else {
+		rec_t*	insert_rec;
+
+		if (mode != BTR_MODIFY_TREE) {
+			ut_ad(((mode & ~BTR_ALREADY_S_LATCHED)
+			       == BTR_MODIFY_LEAF)
+			      || thr_get_trx(thr)->fake_changes);
+			err = btr_cur_optimistic_insert(
+				flags, &cursor, &offsets, &offsets_heap,
+				entry, &insert_rec, &big_rec,
+				n_ext, thr, &mtr);
+		} else {
+			if (buf_LRU_buf_pool_running_out()) {
+
+				err = DB_LOCK_TABLE_FULL;
+				goto err_exit;
+			}
+
+			err = btr_cur_optimistic_insert(
+				flags, &cursor,
+				&offsets, &offsets_heap,
+				entry, &insert_rec, &big_rec,
+				n_ext, thr, &mtr);
+
+			if (err == DB_FAIL) {
+				err = btr_cur_pessimistic_insert(
+					flags, &cursor,
+					&offsets, &offsets_heap,
+					entry, &insert_rec, &big_rec,
+					n_ext, thr, &mtr);
+			}
+		}
+
+		if (UNIV_LIKELY_NULL(big_rec)) {
+			mtr_commit(&mtr);
+
+			if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+
+				/* skip store extern */
+				mem_heap_free(big_rec->heap);
+				goto func_exit;
+			}
+
+			/* Online table rebuild could read (and
+			ignore) the incomplete record at this point.
+			If online rebuild is in progress,
+			row_ins_index_entry_big_rec() will write the log. */
+
+			DBUG_EXECUTE_IF(
+				"row_ins_extern_checkpoint",
+				log_make_checkpoint_at(
+					LSN_MAX, TRUE););
+			err = row_ins_index_entry_big_rec(
+				entry, big_rec, offsets, &offsets_heap, index,
+				thr_get_trx(thr)->mysql_thd,
+				__FILE__, __LINE__);
+			dtuple_convert_back_big_rec(index, entry, big_rec);
+		} else {
+			if (err == DB_SUCCESS
+			    && dict_index_is_online_ddl(index)) {
+				row_log_table_insert(
+					insert_rec, index, offsets);
+			}
 
-			return(ROW_INS_PREV);
+			mtr_commit(&mtr);
 		}
 	}
 
-	return(0);
+func_exit:
+	if (offsets_heap) {
+		mem_heap_free(offsets_heap);
+	}
+
+	return(err);
 }
 
 /***************************************************************//**
-Tries to insert an index entry to an index. If the index is clustered
-and a record with the same unique key is found, the other record is
-necessarily marked deleted by a committed transaction, or a unique key
-violation error occurs. The delete marked record is then updated to an
-existing record, and we must write an undo log record on the delete
-marked record. If the index is secondary, and a record with exactly the
+Starts a mini-transaction and checks if the index will be dropped.
+@return true if the index is to be dropped */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_ins_sec_mtr_start_and_check_if_aborted(
+/*=======================================*/
+	mtr_t*		mtr,	/*!< out: mini-transaction */
+	dict_index_t*	index,	/*!< in/out: secondary index */
+	bool		check,	/*!< in: whether to latch and check index */
+	ulint		search_mode)
+				/*!< in: flags (BTR_ALREADY_S_LATCHED) */
+{
+	ut_ad(!dict_index_is_clust(index));
+
+	mtr_start(mtr);
+
+	if (!check) {
+		return(false);	/* no index->lock latch is taken */
+	}
+
+	if (search_mode & BTR_ALREADY_S_LATCHED) {
+		mtr_s_lock(dict_index_get_lock(index), mtr);
+	} else {
+		mtr_x_lock(dict_index_get_lock(index), mtr);
+	}
+
+	switch (index->online_status) {
+	case ONLINE_INDEX_ABORTED:
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		ut_ad(*index->name == TEMP_INDEX_PREFIX);
+		return(true);
+	case ONLINE_INDEX_COMPLETE:
+		return(false);
+	case ONLINE_INDEX_CREATION:
+		break;	/* unexpected here: reaches ut_error below */
+	}
+
+	ut_error;
+	return(true);
+}
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
 same fields is found, the other record is necessarily marked deleted.
 It is then unmarked. Otherwise, the entry is just inserted to the index.
-@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed,
-or error code */
-static
-ulint
-row_ins_index_entry_low(
-/*====================*/
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
 				depending on whether we wish optimistic or
 				pessimistic descent down the index tree */
-	dict_index_t*	index,	/*!< in: index */
+	dict_index_t*	index,	/*!< in: secondary index */
+	mem_heap_t*	offsets_heap,
+				/*!< in/out: memory heap that can be emptied */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
 	dtuple_t*	entry,	/*!< in/out: index entry to insert */
-	ulint		n_ext,	/*!< in: number of externally stored columns */
+	trx_id_t	trx_id,	/*!< in: PAGE_MAX_TRX_ID during
+				row_log_table_apply(), or 0 */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	btr_cur_t	cursor;
-	ulint		search_mode;
-	ulint		modify = 0; /* remove warning */
-	rec_t*		insert_rec;
-	rec_t*		rec;
-	ulint*		offsets;
-	ulint		err;
+	ulint		search_mode	= mode | BTR_INSERT;
+	dberr_t		err		= DB_SUCCESS;
 	ulint		n_unique;
-	big_rec_t*	big_rec			= NULL;
 	mtr_t		mtr;
-	mem_heap_t*	heap			= NULL;
+	ulint*		offsets	= NULL;
 
-	log_free_check();
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
 
+	cursor.thr = thr;
+	ut_ad(thr_get_trx(thr)->id);
 	mtr_start(&mtr);
 
-	cursor.thr = thr;
+	/* Ensure that we acquire index->lock when inserting into an
+	index with index->online_status == ONLINE_INDEX_COMPLETE, but
+	could still be subject to rollback_inplace_alter_table().
+	This prevents a concurrent change of index->online_status.
+	The memory object cannot be freed as long as we have an open
+	reference to the table, or index->table->n_ref_count > 0. */
+	const bool check = *index->name == TEMP_INDEX_PREFIX;
+	if (check) {
+		DEBUG_SYNC_C("row_ins_sec_index_enter");
+		if (mode == BTR_MODIFY_LEAF) {
+			search_mode |= BTR_ALREADY_S_LATCHED;
+			mtr_s_lock(dict_index_get_lock(index), &mtr);
+		} else {
+			mtr_x_lock(dict_index_get_lock(index), &mtr);
+		}
+
+		if (row_log_online_op_try(
+			    index, entry, thr_get_trx(thr)->id)) {
+			goto func_exit;
+		}
+	}
 
 	/* Note that we use PAGE_CUR_LE as the search mode, because then
 	the function will return in both low_match and up_match of the
 	cursor sensible values */
 
-	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
-		search_mode = (mode & BTR_MODIFY_TREE)
-			? BTR_SEARCH_TREE : BTR_SEARCH_LEAF;
-	} else if (dict_index_is_clust(index)) {
-		search_mode = mode;
-	} else if (!(thr_get_trx(thr)->check_unique_secondary)) {
-		search_mode = mode | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE;
-	} else {
-		search_mode = mode | BTR_INSERT;
+	if (!thr_get_trx(thr)->check_unique_secondary) {
+		search_mode |= BTR_IGNORE_SEC_UNIQUE;
 	}
 
 	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
@@ -2039,18 +2680,11 @@ row_ins_index_entry_low(
 				    &cursor, 0, __FILE__, __LINE__, &mtr);
 
 	if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
-		/* The insertion was made to the insert buffer already during
-		the search: we are done */
-
-		ut_ad(search_mode & BTR_INSERT);
-		err = DB_SUCCESS;
-
-		goto function_exit;
+		/* The insert was buffered during the search: we are done */
+		goto func_exit;
 	}
 
 #ifdef UNIV_DEBUG
-	if (!srv_use_sys_stats_table
-	    || index != UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes))
 	{
 		page_t*	page = btr_cur_get_page(&cursor);
 		rec_t*	first_rec = page_rec_get_next(
@@ -2064,233 +2698,253 @@ row_ins_index_entry_low(
 
 	n_unique = dict_index_get_n_unique(index);
 
-	if (dict_index_is_unique(index) && (cursor.up_match >= n_unique
-					    || cursor.low_match >= n_unique)) {
+	if (dict_index_is_unique(index)
+	    && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) {
+		mtr_commit(&mtr);
 
-		if (dict_index_is_clust(index)) {
-			/* Note that the following may return also
-			DB_LOCK_WAIT */
+		DEBUG_SYNC_C("row_ins_sec_index_unique");
 
-			err = row_ins_duplicate_error_in_clust(
-				&cursor, entry, thr, &mtr);
-			if (err != DB_SUCCESS) {
+		if (row_ins_sec_mtr_start_and_check_if_aborted(
+			    &mtr, index, check, search_mode)) {
+			goto func_exit;
+		}
 
-				goto function_exit;
-			}
-		} else {
-			mtr_commit(&mtr);
-			err = row_ins_scan_sec_index_for_duplicate(
-				index, entry, thr);
-			mtr_start(&mtr);
+		err = row_ins_scan_sec_index_for_duplicate(
+			flags, index, entry, thr, check, &mtr, offsets_heap);
 
-			if (err != DB_SUCCESS) {
-				goto function_exit;
+		mtr_commit(&mtr);
+
+		switch (err) {
+		case DB_SUCCESS:
+			break;
+		case DB_DUPLICATE_KEY:
+			if (*index->name == TEMP_INDEX_PREFIX) {
+				ut_ad(!thr_get_trx(thr)
+				      ->dict_operation_lock_mode);
+				mutex_enter(&dict_sys->mutex);
+				dict_set_corrupted_index_cache_only(
+					index, index->table);
+				mutex_exit(&dict_sys->mutex);
+				/* Do not return any error to the
+				caller. The duplicate will be reported
+				by ALTER TABLE or CREATE UNIQUE INDEX.
+				Unfortunately we cannot report the
+				duplicate key value to the DDL thread,
+				because the altered_table object is
+				private to its call stack. */
+				err = DB_SUCCESS;
 			}
+			/* fall through */
+		default:
+			return(err);
+		}
 
-			/* We did not find a duplicate and we have now
-			locked with s-locks the necessary records to
-			prevent any insertion of a duplicate by another
-			transaction. Let us now reposition the cursor and
-			continue the insertion. */
-
-			btr_cur_search_to_nth_level(
-				index, 0, entry, PAGE_CUR_LE,
-				(UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)
-				 ? BTR_SEARCH_LEAF : (mode | BTR_INSERT)),
-				&cursor, 0, __FILE__, __LINE__, &mtr);
+		if (row_ins_sec_mtr_start_and_check_if_aborted(
+			    &mtr, index, check, search_mode)) {
+			goto func_exit;
 		}
-	}
 
-	modify = row_ins_must_modify(&cursor);
+		/* We did not find a duplicate and we have now
+		locked with s-locks the necessary records to
+		prevent any insertion of a duplicate by another
+		transaction. Let us now reposition the cursor and
+		continue the insertion. */
 
-	if (modify != 0) {
+		btr_cur_search_to_nth_level(
+			index, 0, entry, PAGE_CUR_LE,
+			UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)
+			? BTR_SEARCH_LEAF
+			: (btr_latch_mode)
+			(search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)),
+			&cursor, 0, __FILE__, __LINE__, &mtr);
+	}
+
+	if (row_ins_must_modify_rec(&cursor)) {
 		/* There is already an index entry with a long enough common
 		prefix, we must convert the insert into a modify of an
 		existing record */
+		offsets = rec_get_offsets(
+			btr_cur_get_rec(&cursor), index, offsets,
+			ULINT_UNDEFINED, &offsets_heap);
 
-		if (modify == ROW_INS_NEXT) {
-			rec = page_rec_get_next(btr_cur_get_rec(&cursor));
-
-			btr_cur_position(index, rec,
-					 btr_cur_get_block(&cursor),&cursor);
-		}
-
-		if (dict_index_is_clust(index)) {
-			err = row_ins_clust_index_entry_by_modify(
-				mode, &cursor, &heap, &big_rec, entry,
-				thr, &mtr);
-
-			if (big_rec) {
-				ut_a(err == DB_SUCCESS);
-				if (UNIV_UNLIKELY(thr_get_trx(thr)->
-						  fake_changes)) {
-					goto stored_big_rec;
-				}
-
-				/* Write out the externally stored
-				columns while still x-latching
-				index->lock and block->lock. Allocate
-				pages for big_rec in the mtr that
-				modified the B-tree, but be sure to skip
-				any pages that were freed in mtr. We will
-				write out the big_rec pages before
-				committing the B-tree mini-transaction. If
-				the system crashes so that crash recovery
-				will not replay the mtr_commit(&mtr), the
-				big_rec pages will be left orphaned until
-				the pages are allocated for something else.
-
-				TODO: If the allocation extends the
-				tablespace, it will not be redo
-				logged, in either mini-transaction.
-				Tablespace extension should be
-				redo-logged in the big_rec
-				mini-transaction, so that recovery
-				will not fail when the big_rec was
-				written to the extended portion of the
-				file, in case the file was somehow
-				truncated in the crash. */
-
-				rec = btr_cur_get_rec(&cursor);
-				offsets = rec_get_offsets(
-					rec, index, NULL,
-					ULINT_UNDEFINED, &heap);
-
-				DEBUG_SYNC_C("before_row_ins_upd_extern");
-				err = btr_store_big_rec_extern_fields(
-					index, btr_cur_get_block(&cursor),
-					rec, offsets, big_rec, &mtr,
-					BTR_STORE_INSERT_UPDATE);
-				DEBUG_SYNC_C("after_row_ins_upd_extern");
-				/* If writing big_rec fails (for
-				example, because of DB_OUT_OF_FILE_SPACE),
-				the record will be corrupted. Even if
-				we did not update any externally
-				stored columns, our update could cause
-				the record to grow so that a
-				non-updated column was selected for
-				external storage. This non-update
-				would not have been written to the
-				undo log, and thus the record cannot
-				be rolled back.
-
-				However, because we have not executed
-				mtr_commit(mtr) yet, the update will
-				not be replayed in crash recovery, and
-				the following assertion failure will
-				effectively "roll back" the operation. */
-				ut_a(err == DB_SUCCESS);
-				goto stored_big_rec;
-			}
-		} else {
-			ut_ad(!n_ext);
-			err = row_ins_sec_index_entry_by_modify(
-				mode, &cursor, entry, thr, &mtr);
-		}
+		err = row_ins_sec_index_entry_by_modify(
+			flags, mode, &cursor, &offsets,
+			offsets_heap, heap, entry, thr, &mtr);
 	} else {
+		rec_t*		insert_rec;
+		big_rec_t*	big_rec;
+
 		if (mode == BTR_MODIFY_LEAF) {
 			err = btr_cur_optimistic_insert(
-				0, &cursor, entry, &insert_rec, &big_rec,
-				n_ext, thr, &mtr);
+				flags, &cursor, &offsets, &offsets_heap,
+				entry, &insert_rec,
+				&big_rec, 0, thr, &mtr);
 		} else {
-			ut_a(mode == BTR_MODIFY_TREE);
+			ut_ad(mode == BTR_MODIFY_TREE);
 			if (buf_LRU_buf_pool_running_out()) {
 
 				err = DB_LOCK_TABLE_FULL;
-
-				goto function_exit;
+				goto func_exit;
 			}
 
 			err = btr_cur_optimistic_insert(
-				0, &cursor, entry, &insert_rec, &big_rec,
-				n_ext, thr, &mtr);
-
+				flags, &cursor,
+				&offsets, &offsets_heap,
+				entry, &insert_rec,
+				&big_rec, 0, thr, &mtr);
 			if (err == DB_FAIL) {
 				err = btr_cur_pessimistic_insert(
-					0, &cursor, entry, &insert_rec,
-					&big_rec, n_ext, thr, &mtr);
+					flags, &cursor,
+					&offsets, &offsets_heap,
+					entry, &insert_rec,
+					&big_rec, 0, thr, &mtr);
 			}
 		}
+
+		if (err == DB_SUCCESS && trx_id) {
+			page_update_max_trx_id(
+				btr_cur_get_block(&cursor),
+				btr_cur_get_page_zip(&cursor),
+				trx_id, &mtr);
+		}
+
+		ut_ad(!big_rec);
 	}
 
-function_exit:
+func_exit:
 	mtr_commit(&mtr);
+	return(err);
+}
 
-	if (UNIV_LIKELY_NULL(big_rec)) {
-		rec_t*	rec;
-		ulint*	offsets;
+/***************************************************************//**
+Tries to insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+row_ins_index_entry_big_rec_func(
+/*=============================*/
+	const dtuple_t*		entry,	/*!< in: index entry to insert */
+	const big_rec_t*	big_rec,/*!< in: externally stored fields */
+	ulint*			offsets,/*!< in/out: rec offsets */
+	mem_heap_t**		heap,	/*!< in/out: memory heap */
+	dict_index_t*		index,	/*!< in: index */
+	const char*		file,	/*!< in: file name of caller */
+#ifndef DBUG_OFF
+	const void*		thd,	/*!< in: connection, or NULL */
+#endif /* DBUG_OFF */
+	ulint			line)	/*!< in: line number of caller */
+{
+	mtr_t		mtr;
+	btr_cur_t	cursor;
+	rec_t*		rec;
+	dberr_t		error;
 
-		if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
-			/* skip store extern */
-			if (modify) {
-				dtuple_big_rec_free(big_rec);
-			} else {
-				dtuple_convert_back_big_rec(index, entry, big_rec);
-			}
+	ut_ad(dict_index_is_clust(index));
 
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
+	DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch");
+
+	mtr_start(&mtr);
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				    BTR_MODIFY_TREE, &cursor, 0,
+				    file, line, &mtr);
+	rec = btr_cur_get_rec(&cursor);
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, heap);
+
+	DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern");
+	error = btr_store_big_rec_extern_fields(
+		index, btr_cur_get_block(&cursor),
+		rec, offsets, big_rec, &mtr, BTR_STORE_INSERT);
+	DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern");
+
+	if (error == DB_SUCCESS
+	    && dict_index_is_online_ddl(index)) {
+		row_log_table_insert(rec, index, offsets);
+	}
+
+	mtr_commit(&mtr);
+
+	return(error);
+}
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	dberr_t	err;
+	ulint	n_uniq;
+
+	if (UT_LIST_GET_FIRST(index->table->foreign_list)) {
+		err = row_ins_check_foreign_constraints(
+			index->table, index, entry, thr);
+		if (err != DB_SUCCESS) {
 
 			return(err);
 		}
+	}
 
-		DBUG_EXECUTE_IF(
-			"row_ins_extern_checkpoint",
-			log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE););
-
-		mtr_start(&mtr);
-
-		DEBUG_SYNC_C("before_row_ins_extern_latch");
-		btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
-					    BTR_MODIFY_TREE, &cursor, 0,
-					    __FILE__, __LINE__, &mtr);
-		rec = btr_cur_get_rec(&cursor);
-		offsets = rec_get_offsets(rec, index, NULL,
-					  ULINT_UNDEFINED, &heap);
-
-		DEBUG_SYNC_C("before_row_ins_extern");
-		err = btr_store_big_rec_extern_fields(
-			index, btr_cur_get_block(&cursor),
-			rec, offsets, big_rec, &mtr, BTR_STORE_INSERT);
-		DEBUG_SYNC_C("after_row_ins_extern");
-
-stored_big_rec:
-		if (modify) {
-			dtuple_big_rec_free(big_rec);
-		} else {
-			dtuple_convert_back_big_rec(index, entry, big_rec);
-		}
+	n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0;
 
-		mtr_commit(&mtr);
+	/* Try first optimistic descent to the B-tree */
+
+	log_free_check();
+
+	err = row_ins_clust_index_entry_low(
+		0, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr);
+
+#ifdef UNIV_DEBUG
+	/* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+	Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+	if (!thr_get_trx(thr)->ddl) {
+		DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+				    "after_row_ins_clust_index_entry_leaf");
 	}
+#endif /* UNIV_DEBUG */
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
+	if (err != DB_FAIL) {
+		DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after");
+		return(err);
 	}
-	return(err);
+
+	/* Try then pessimistic descent to the B-tree */
+
+	log_free_check();
+
+	return(row_ins_clust_index_entry_low(
+		       0, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr));
 }
 
 /***************************************************************//**
-Inserts an index entry to index. Tries first optimistic, then pessimistic
-descent down the tree. If the entry matches enough to a delete marked record,
-performs the insert by updating or delete unmarking the delete marked
-record.
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
 @return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
 UNIV_INTERN
-ulint
-row_ins_index_entry(
-/*================*/
-	dict_index_t*	index,	/*!< in: index */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+	dict_index_t*	index,	/*!< in: secondary index */
 	dtuple_t*	entry,	/*!< in/out: index entry to insert */
-	ulint		n_ext,	/*!< in: number of externally stored columns */
-	ibool		foreign,/*!< in: TRUE=check foreign key constraints
-				(foreign=FALSE only during CREATE INDEX) */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err;
+	dberr_t		err;
+	mem_heap_t*	offsets_heap;
+	mem_heap_t*	heap;
 
-	if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) {
+	if (UT_LIST_GET_FIRST(index->table->foreign_list)) {
 		err = row_ins_check_foreign_constraints(index->table, index,
 							entry, thr);
 		if (err != DB_SUCCESS) {
@@ -2299,29 +2953,59 @@ row_ins_index_entry(
 		}
 	}
 
+	ut_ad(thr_get_trx(thr)->id);
+
+	offsets_heap = mem_heap_create(1024);
+	heap = mem_heap_create(1024);
+
 	/* Try first optimistic descent to the B-tree */
 
-	err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
-				      n_ext, thr);
-	if (err != DB_FAIL) {
-		if (index == dict_table_get_first_index(index->table)
-		    && thr_get_trx(thr)->mysql_thd != 0) {
-			DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after");
-		}
-		return(err);
-	}
+	log_free_check();
 
-	/* Try then pessimistic descent to the B-tree */
+	err = row_ins_sec_index_entry_low(
+		0, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, 0, thr);
+	if (err == DB_FAIL) {
+		mem_heap_empty(heap);
+
+		/* Try then pessimistic descent to the B-tree */
+
+		log_free_check();
+
+		err = row_ins_sec_index_entry_low(
+			0, BTR_MODIFY_TREE, index,
+			offsets_heap, heap, entry, 0, thr);
+	}
 
-	err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
-				      n_ext, thr);
+	mem_heap_free(heap);
+	mem_heap_free(offsets_heap);
 	return(err);
 }
 
+/***************************************************************//**
+Inserts an entry into an index by dispatching on the index type: a clustered
+index goes to row_ins_clust_index_entry() with n_ext = 0, any other index to
+row_ins_sec_index_entry(). Each of those tries first optimistic, then
+pessimistic descent down the tree, and may turn the insert into an update or
+delete-unmarking of a matching delete-marked record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+static
+dberr_t
+row_ins_index_entry(
+/*================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	if (dict_index_is_clust(index)) {
+		return(row_ins_clust_index_entry(index, entry, thr, 0));
+	} else {
+		return(row_ins_sec_index_entry(index, entry, thr));
+	}
+}
+
 /***********************************************************//**
 Sets the values of the dtuple fields in entry from the values of appropriate
 columns in row. */
-static
+static __attribute__((nonnull))
 void
 row_ins_index_entry_set_vals(
 /*=========================*/
@@ -2332,8 +3016,6 @@ row_ins_index_entry_set_vals(
 	ulint	n_fields;
 	ulint	i;
 
-	ut_ad(entry && row);
-
 	n_fields = dtuple_get_n_fields(entry);
 
 	for (i = 0; i < n_fields; i++) {
@@ -2357,7 +3039,9 @@ row_ins_index_entry_set_vals(
 			len = dtype_get_at_most_n_mbchars(
 				col->prtype, col->mbminmaxlen,
 				ind_field->prefix_len,
-				len, dfield_get_data(row_field));
+				len,
+				static_cast<const char*>(
+					dfield_get_data(row_field)));
 
 			ut_ad(!dfield_is_ext(row_field));
 		}
@@ -2374,14 +3058,14 @@ row_ins_index_entry_set_vals(
 Inserts a single index entry to the table.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins_index_entry_step(
 /*=====================*/
 	ins_node_t*	node,	/*!< in: row insert node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err;
+	dberr_t	err;
 
 	ut_ad(dtuple_check_typed(node->row));
 
@@ -2389,7 +3073,16 @@ row_ins_index_entry_step(
 
 	ut_ad(dtuple_check_typed(node->entry));
 
-	err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr);
+	err = row_ins_index_entry(node->index, node->entry, thr);
+
+#ifdef UNIV_DEBUG
+	/* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+	Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+	if (!thr_get_trx(thr)->ddl) {
+		DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+				    "after_row_ins_index_entry_step");
+	}
+#endif /* UNIV_DEBUG */
 
 	return(err);
 }
@@ -2488,16 +3181,14 @@ row_ins_get_row_from_select(
 Inserts a row to a table.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_ins(
 /*====*/
 	ins_node_t*	node,	/*!< in: row insert node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err;
-
-	ut_ad(node && thr);
+	dberr_t	err;
 
 	if (node->state == INS_NODE_ALLOC_ROW_ID) {
 
@@ -2521,17 +3212,23 @@ row_ins(
 	ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
 
 	while (node->index != NULL) {
-		err = row_ins_index_entry_step(node, thr);
+		if (node->index->type != DICT_FTS) {
+			err = row_ins_index_entry_step(node, thr);
 
-		if (err != DB_SUCCESS) {
+			if (err != DB_SUCCESS) {
 
-			return(err);
+				return(err);
+			}
 		}
 
 		node->index = dict_table_get_next_index(node->index);
 		node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
 
-		/* Skip corrupted secondar index and its entry */
+		DBUG_EXECUTE_IF(
+			"row_ins_skip_sec",
+			node->index = NULL; node->entry = NULL; break;);
+
+		/* Skip corrupted secondary index and its entry */
 		while (node->index && dict_index_is_corrupted(node->index)) {
 
 			node->index = dict_table_get_next_index(node->index);
@@ -2560,15 +3257,15 @@ row_ins_step(
 	que_node_t*	parent;
 	sel_node_t*	sel_node;
 	trx_t*		trx;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(thr);
 
 	trx = thr_get_trx(thr);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	node = thr->run_node;
+	node = static_cast<ins_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
 
@@ -2593,6 +3290,8 @@ row_ins_step(
 
 	if (node->state == INS_NODE_SET_IX_LOCK) {
 
+		node->state = INS_NODE_ALLOC_ROW_ID;
+
 		/* It may be that the current session has not yet started
 		its transaction, or it has been committed: */
 
@@ -2604,6 +3303,9 @@ row_ins_step(
 
 		err = lock_table(0, node->table, LOCK_IX, thr);
 
+		DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait",
+				err = DB_LOCK_WAIT;);
+
 		if (err != DB_SUCCESS) {
 
 			goto error_handling;
@@ -2611,8 +3313,6 @@ row_ins_step(
 
 		node->trx_id = trx->id;
 same_trx:
-		node->state = INS_NODE_ALLOC_ROW_ID;
-
 		if (node->ins_type == INS_SEARCHED) {
 			/* Reset the cursor */
 			sel_node->state = SEL_NODE_OPEN;
diff --git a/storage/xtradb/row/row0log.cc b/storage/xtradb/row/row0log.cc
new file mode 100644
index 00000000000..49f9eb842b1
--- /dev/null
+++ b/storage/xtradb/row/row0log.cc
@@ -0,0 +1,3400 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0log.cc
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#include "row0log.h"
+
+#ifdef UNIV_NONINL
+#include "row0log.ic"
+#endif
+
+#include "row0row.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0merge.h"
+#include "row0ext.h"
+#include "data0data.h"
+#include "que0que.h"
+#include "handler0alter.h"
+
+#include<map>
+
+/** Table row modification operations during online table rebuild.
+Delete-marked records are not copied to the rebuilt table.
+The value is written as the first byte of each log record by
+row_log_table_delete(), row_log_table_low() and
+row_log_table_low_redundant(). */
+enum row_tab_op {
+	/** Insert a record */
+	ROW_T_INSERT = 0x41,
+	/** Update a record in place (implicitly 0x42) */
+	ROW_T_UPDATE,
+	/** Delete (purge) a record (implicitly 0x43) */
+	ROW_T_DELETE
+};
+
+/** Index record modification operations during online index creation.
+row_log_online_op() writes the value as the first byte of a log record. */
+enum row_op {
+	/** Insert a record */
+	ROW_OP_INSERT = 0x61,
+	/** Delete a record (implicitly 0x62) */
+	ROW_OP_DELETE
+};
+
+#ifdef UNIV_DEBUG
+/** Write information about the applied record to the error log
+(the switch is only compiled into UNIV_DEBUG builds) */
+# define ROW_LOG_APPLY_PRINT
+#endif /* UNIV_DEBUG */
+
+#ifdef ROW_LOG_APPLY_PRINT
+/** When set, write information about the applied record to the error log */
+static bool row_log_apply_print;
+#endif /* ROW_LOG_APPLY_PRINT */
+
+/** Size of the modification log entry header, in bytes: one byte for the
+operation code and one byte for extra_size. The writers that use this
+constant add one more byte separately when extra_size needs the two-byte
+encoding (extra_size >= 0x80). */
+#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/
+
+/** Log block for modifications during online ALTER TABLE */
+struct row_log_buf_t {
+	byte*		block;	/*!< file block buffer; the writers
+				fill and flush it in units of
+				srv_sort_buf_size bytes */
+	mrec_buf_t	buf;	/*!< buffer for accessing a record
+				that spans two blocks */
+	ulint		blocks; /*!< current position in blocks; the
+				writers compute the file offset as
+				blocks * srv_sort_buf_size */
+	ulint		bytes;	/*!< current position within block,
+				in bytes (the writers append at
+				block + bytes) */
+	ulonglong	total;	/*!< logical position, in bytes from
+				the start of the row_log_table log;
+				0 for row_log_online_op() and
+				row_log_apply(). */
+};
+
+/** Tracks BLOB allocation during online ALTER TABLE.
+The tracked offset is either the BLOB_FREED magic value or the log
+position that was last passed to blob_alloc(). */
+class row_log_table_blob_t {
+public:
+	/** Constructor (declaring a BLOB freed)
+	@param offset_arg	row_log_t::tail::total
+	(the parameter exists only in UNIV_DEBUG builds, where it is
+	recorded as the position of the last free) */
+#ifdef UNIV_DEBUG
+	row_log_table_blob_t(ulonglong offset_arg) :
+		old_offset (0), free_offset (offset_arg),
+		offset (BLOB_FREED) {}
+#else /* UNIV_DEBUG */
+	row_log_table_blob_t() :
+		offset (BLOB_FREED) {}
+#endif /* UNIV_DEBUG */
+
+	/** Declare a BLOB freed again.
+	@param offset_arg	row_log_t::tail::total
+	(the parameter exists only in UNIV_DEBUG builds) */
+#ifdef UNIV_DEBUG
+	void blob_free(ulonglong offset_arg)
+#else /* UNIV_DEBUG */
+	void blob_free()
+#endif /* UNIV_DEBUG */
+	{
+		ut_ad(offset < offset_arg);
+		ut_ad(offset != BLOB_FREED);
+		ut_d(old_offset = offset);
+		ut_d(free_offset = offset_arg);
+		offset = BLOB_FREED;
+	}
+	/** Declare a freed BLOB reused.
+	@param offset_arg	row_log_t::tail::total */
+	void blob_alloc(ulonglong offset_arg) {
+		ut_ad(free_offset <= offset_arg);
+		ut_d(old_offset = offset);
+		offset = offset_arg;
+	}
+	/** Determine if a BLOB was freed at a given log position
+	@param offset_arg	row_log_t::head::total after the log record
+	@return true if freed, that is, if offset_arg is smaller than
+	the tracked offset (always the case while the tracked offset is
+	BLOB_FREED) */
+	bool is_freed(ulonglong offset_arg) const {
+		/* This is supposed to be the offset at the end of the
+		current log record. */
+		ut_ad(offset_arg > 0);
+		/* We should never get anywhere close to the magic value. */
+		ut_ad(offset_arg < BLOB_FREED);
+		return(offset_arg < offset);
+	}
+private:
+	/** Magic value for a freed BLOB (all bits set) */
+	static const ulonglong BLOB_FREED = ~0ULL;
+#ifdef UNIV_DEBUG
+	/** Old offset, in case a page was freed, reused, freed, ... */
+	ulonglong	old_offset;
+	/** Offset of last blob_free() */
+	ulonglong	free_offset;
+#endif /* UNIV_DEBUG */
+	/** Byte offset to the log file, or BLOB_FREED */
+	ulonglong	offset;
+};
+
+/** @brief Map of off-page column page numbers to row_log_table_blob_t
+entries, each holding either a "freed" marker or a log byte offset.
+
+If there is no mapping for a page number, it is safe to access.
+If a page number maps to an entry in the freed state (the state set by
+the row_log_table_blob_t constructor and by blob_free()), it is an
+off-page column that has been freed.
+Otherwise the entry holds a byte offset into the index->online_log,
+indicating that the page is safe to access when applying log records
+starting from that offset. */
+typedef std::map<ulint, row_log_table_blob_t> page_no_map;
+
+/** @brief Buffer for logging modifications during online index creation
+
+All modifications to an index that is being created will be logged by
+row_log_online_op() to this buffer.
+
+All modifications to a table that is being rebuilt will be logged by
+row_log_table_delete(), row_log_table_update(), row_log_table_insert()
+to this buffer.
+
+When head.blocks == tail.blocks, the reader will access tail.block
+directly. When also head.bytes == tail.bytes, both counts will be
+reset to 0 and the file will be truncated. */
+struct row_log_t {
+	int		fd;	/*!< file descriptor of the file to which
+				filled tail.block contents are written */
+	ib_mutex_t	mutex;	/*!< mutex protecting error,
+				max_trx and tail; acquired by
+				row_log_table_open() and released by
+				row_log_table_close_func() */
+	page_no_map*	blobs;	/*!< map of page numbers of off-page columns
+				that have been freed during table-rebuilding
+				ALTER TABLE (row_log_table_*); protected by
+				index->lock X-latch only */
+	dict_table_t*	table;	/*!< table that is being rebuilt,
+				or NULL when this is a secondary
+				index that is being created online */
+	bool		same_pk;/*!< whether the definition of the PRIMARY KEY
+				has remained the same */
+	const dtuple_t*	add_cols;
+				/*!< default values of added columns, or NULL */
+	const ulint*	col_map;/*!< mapping of old column numbers to
+				new ones, or NULL if !table */
+	dberr_t		error;	/*!< error that occurred during online
+				table rebuild; DB_SUCCESS while logging
+				may continue */
+	trx_id_t	max_trx;/*!< biggest observed trx_id in
+				row_log_online_op();
+				protected by mutex and index->lock S-latch,
+				or by index->lock X-latch only */
+	row_log_buf_t	tail;	/*!< writer context;
+				protected by mutex and index->lock S-latch,
+				or by index->lock X-latch only */
+	row_log_buf_t	head;	/*!< reader context; protected by MDL only;
+				modifiable by row_log_apply_ops() */
+	ulint		size;	/*!< allocated size */
+};
+
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created.
+Nothing is logged if the index is flagged corrupted. The record written
+consists of: the operation byte (ROW_OP_INSERT or ROW_OP_DELETE), the
+transaction ID (DATA_TRX_ID_LEN bytes, inserts only), extra_size encoded
+in 1 byte (< 0x80) or 2 bytes (high bit set in the first byte), and the
+tuple as converted by rec_convert_dtuple_to_temp(). When the record fills
+or overflows the current block, the block is written to the log file and
+the remainder of the record starts the next block. If that write fails,
+or if the log would reach srv_online_max_size, the index is flagged
+DICT_CORRUPT. */
+UNIV_INTERN
+void
+row_log_online_op(
+/*==============*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t* tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+{
+	byte*		b;
+	ulint		extra_size;
+	ulint		size;
+	ulint		mrec_size;
+	ulint		avail_size;
+	row_log_t*	log;
+
+	ut_ad(dtuple_validate(tuple));
+	ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+	      || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (dict_index_is_corrupted(index)) {
+		return;
+	}
+
+	ut_ad(dict_index_is_online_ddl(index));
+
+	/* Compute the size of the record. This differs from
+	row_merge_buf_encode(), because here we do not encode
+	extra_size+1 (and reserve 0 as the end-of-chunk marker). */
+
+	size = rec_get_converted_size_temp(
+		index, tuple->fields, tuple->n_fields, &extra_size);
+	ut_ad(size >= extra_size);
+	ut_ad(size <= sizeof log->tail.buf);
+
+	mrec_size = ROW_LOG_HEADER_SIZE
+		+ (extra_size >= 0x80) + size
+		+ (trx_id ? DATA_TRX_ID_LEN : 0);
+
+	log = index->online_log;
+	mutex_enter(&log->mutex);
+
+	if (trx_id > log->max_trx) {
+		log->max_trx = trx_id;
+	}
+
+	UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+
+	ut_ad(log->tail.bytes < srv_sort_buf_size);
+	avail_size = srv_sort_buf_size - log->tail.bytes;
+
+	if (mrec_size > avail_size) {
+		b = log->tail.buf;
+	} else {
+		b = log->tail.block + log->tail.bytes;
+	}
+
+	if (trx_id != 0) {
+		*b++ = ROW_OP_INSERT;
+		trx_write_trx_id(b, trx_id);
+		b += DATA_TRX_ID_LEN;
+	} else {
+		*b++ = ROW_OP_DELETE;
+	}
+
+	if (extra_size < 0x80) {
+		*b++ = (byte) extra_size;
+	} else {
+		ut_ad(extra_size < 0x8000);
+		*b++ = (byte) (0x80 | (extra_size >> 8));
+		*b++ = (byte) extra_size;
+	}
+
+	rec_convert_dtuple_to_temp(
+		b + extra_size, index, tuple->fields, tuple->n_fields);
+	b += size;
+
+	if (mrec_size >= avail_size) {
+		const os_offset_t	byte_offset
+			= (os_offset_t) log->tail.blocks
+			* srv_sort_buf_size;
+		ibool			ret;
+
+		if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+			goto write_failed;
+		}
+
+		if (mrec_size == avail_size) {
+			ut_ad(b == &log->tail.block[srv_sort_buf_size]);
+		} else {
+			ut_ad(b == log->tail.buf + mrec_size);
+			memcpy(log->tail.block + log->tail.bytes,
+			       log->tail.buf, avail_size);
+		}
+		UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
+		ret = os_file_write(
+			"(modification log)",
+			OS_FILE_FROM_FD(log->fd),
+			log->tail.block, byte_offset, srv_sort_buf_size);
+		log->tail.blocks++;
+		if (!ret) {
+write_failed:
+			/* We set the flag directly instead of invoking
+			dict_set_corrupted_index_cache_only(index) here,
+			because the index is not "public" yet.
+			This label is also reached, without writing the
+			block, when the size limit check above fails. */
+			index->type |= DICT_CORRUPT;
+		}
+		UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
+		memcpy(log->tail.block, log->tail.buf + avail_size,
+		       mrec_size - avail_size);
+		log->tail.bytes = mrec_size - avail_size;
+	} else {
+		log->tail.bytes += mrec_size;
+		ut_ad(b == log->tail.block + log->tail.bytes);
+	}
+
+	UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+	mutex_exit(&log->mutex);
+}
+
+/******************************************************//**
+Reads the error status recorded in the online rebuild log of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+
+	const row_log_t*	rebuild_log = index->online_log;
+
+	return(rebuild_log->error);
+}
+
+/******************************************************//**
+Starts logging an operation to a table that is being rebuilt.
+On success the log mutex stays held until row_log_table_close_func();
+when the log is already in an error state the mutex is released here.
+@return pointer to log, or NULL if no logging is necessary */
+static __attribute__((nonnull, warn_unused_result))
+byte*
+row_log_table_open(
+/*===============*/
+	row_log_t*	log,	/*!< in/out: online rebuild log */
+	ulint		size,	/*!< in: size of log record */
+	ulint*		avail)	/*!< out: available size for log record */
+{
+	byte*	dest = NULL;
+
+	mutex_enter(&log->mutex);
+	UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+
+	if (log->error == DB_SUCCESS) {
+		ut_ad(log->tail.bytes < srv_sort_buf_size);
+
+		const ulint	free_bytes
+			= srv_sort_buf_size - log->tail.bytes;
+
+		*avail = free_bytes;
+
+		if (size <= free_bytes) {
+			/* The record fits in the current block. */
+			dest = log->tail.block + log->tail.bytes;
+		} else {
+			/* The record will span two blocks; assemble
+			it in the auxiliary buffer first. */
+			dest = log->tail.buf;
+		}
+	} else {
+		/* Logging has failed earlier; nothing to write. */
+		mutex_exit(&log->mutex);
+	}
+
+	return(dest);
+}
+
+/******************************************************//**
+Stops logging an operation to a table that is being rebuilt.
+Must be called with log->mutex held (as left by a successful
+row_log_table_open()); the mutex is released before returning.
+If the record fills or overflows the current block (size >= avail),
+the block is written to the log file and the remainder of the record
+starts the next block. On a failed write, or when the log would reach
+srv_online_max_size, log->error is set to DB_ONLINE_LOG_TOO_BIG. */
+static __attribute__((nonnull))
+void
+row_log_table_close_func(
+/*=====================*/
+	row_log_t*	log,	/*!< in/out: online rebuild log */
+#ifdef UNIV_DEBUG
+	const byte*	b,	/*!< in: end of log record */
+#endif /* UNIV_DEBUG */
+	ulint		size,	/*!< in: size of log record */
+	ulint		avail)	/*!< in: available size for log record */
+{
+	ut_ad(mutex_own(&log->mutex));
+
+	if (size >= avail) {
+		const os_offset_t	byte_offset
+			= (os_offset_t) log->tail.blocks
+			* srv_sort_buf_size;
+		ibool			ret;
+
+		if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+			goto write_failed;
+		}
+
+		if (size == avail) {
+			ut_ad(b == &log->tail.block[srv_sort_buf_size]);
+		} else {
+			ut_ad(b == log->tail.buf + size);
+			memcpy(log->tail.block + log->tail.bytes,
+			       log->tail.buf, avail);
+		}
+		UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
+		ret = os_file_write(
+			"(modification log)",
+			OS_FILE_FROM_FD(log->fd),
+			log->tail.block, byte_offset, srv_sort_buf_size);
+		log->tail.blocks++;
+		if (!ret) {
+write_failed:
+			log->error = DB_ONLINE_LOG_TOO_BIG;
+		}
+		UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
+		memcpy(log->tail.block, log->tail.buf + avail, size - avail);
+		log->tail.bytes = size - avail;
+	} else {
+		log->tail.bytes += size;
+		ut_ad(b == log->tail.block + log->tail.bytes);
+	}
+
+	log->tail.total += size;
+	UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+	mutex_exit(&log->mutex);
+}
+
+#ifdef UNIV_DEBUG
+# define row_log_table_close(log, b, size, avail)	\
+	row_log_table_close_func(log, b, size, avail)
+#else /* UNIV_DEBUG */
+# define row_log_table_close(log, b, size, avail)	\
+	row_log_table_close_func(log, size, avail)
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete().
+Nothing is logged if the index is corrupted, is not being rebuilt online,
+or the log is already in an error state.
+The log record consists of: ROW_T_DELETE, old_pk_extra_size (1 byte),
+the size of the saved external column prefixes (2 bytes), the tuple
+(PRIMARY KEY,DB_TRX_ID) in the format of the new table, and optionally
+the row_ext_t cache of off-page column prefixes. */
+UNIV_INTERN
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	bool		purge,	/*!< in: true=purging BLOBs */
+	trx_id_t	trx_id)	/*!< in: DB_TRX_ID of the record before
+				it was deleted */
+{
+	ulint		old_pk_extra_size;
+	ulint		old_pk_size;
+	ulint		ext_size = 0;
+	ulint		mrec_size;
+	ulint		avail_size;
+	mem_heap_t*	heap		= NULL;
+	const dtuple_t*	old_pk;
+	row_ext_t*	ext		= NULL;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+	ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+	      || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (dict_index_is_corrupted(index)
+	    || !dict_index_is_online_ddl(index)
+	    || index->online_log->error != DB_SUCCESS) {
+		return;
+	}
+
+	dict_table_t* new_table = index->online_log->table;
+	dict_index_t* new_index = dict_table_get_first_index(new_table);
+
+	ut_ad(dict_index_is_clust(new_index));
+	ut_ad(!dict_index_is_online_ddl(new_index));
+
+	/* Create the tuple PRIMARY KEY, DB_TRX_ID in the new_table. */
+	if (index->online_log->same_pk) {
+		byte*		db_trx_id;
+		dtuple_t*	tuple;
+		ut_ad(new_index->n_uniq == index->n_uniq);
+
+		/* The PRIMARY KEY and DB_TRX_ID are in the first
+		fields of the record. */
+		heap = mem_heap_create(
+			DATA_TRX_ID_LEN
+			+ DTUPLE_EST_ALLOC(new_index->n_uniq + 1));
+		old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 1);
+		dict_index_copy_types(tuple, new_index, tuple->n_fields);
+		dtuple_set_n_fields_cmp(tuple, new_index->n_uniq);
+
+		for (ulint i = 0; i < new_index->n_uniq; i++) {
+			ulint		len;
+			const void*	field	= rec_get_nth_field(
+				rec, offsets, i, &len);
+			dfield_t*	dfield	= dtuple_get_nth_field(
+				tuple, i);
+			ut_ad(len != UNIV_SQL_NULL);
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+			dfield_set_data(dfield, field, len);
+		}
+
+		db_trx_id = static_cast<byte*>(
+			mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+		trx_write_trx_id(db_trx_id, trx_id);
+
+		dfield_set_data(dtuple_get_nth_field(tuple, new_index->n_uniq),
+				db_trx_id, DATA_TRX_ID_LEN);
+	} else {
+		/* The PRIMARY KEY has changed. Translate the tuple. */
+		dfield_t*	dfield;
+
+		old_pk = row_log_table_get_pk(rec, index, offsets, &heap);
+
+		if (!old_pk) {
+			ut_ad(index->online_log->error != DB_SUCCESS);
+			/* row_log_table_get_pk() may have created the
+			heap before it failed; release it so that this
+			error path does not leak memory. */
+			if (heap) {
+				mem_heap_free(heap);
+			}
+			return;
+		}
+
+		/* Remove DB_ROLL_PTR. */
+		ut_ad(dtuple_get_n_fields_cmp(old_pk)
+		      == dict_index_get_n_unique(new_index));
+		ut_ad(dtuple_get_n_fields(old_pk)
+		      == dict_index_get_n_unique(new_index) + 2);
+		const_cast<ulint&>(old_pk->n_fields)--;
+
+		/* Overwrite DB_TRX_ID with the old trx_id. */
+		dfield = dtuple_get_nth_field(old_pk, new_index->n_uniq);
+		ut_ad(dfield_get_type(dfield)->mtype == DATA_SYS);
+		ut_ad(dfield_get_type(dfield)->prtype
+		      == (DATA_NOT_NULL | DATA_TRX_ID));
+		ut_ad(dfield_get_len(dfield) == DATA_TRX_ID_LEN);
+		dfield_dup(dfield, heap);
+		trx_write_trx_id(static_cast<byte*>(dfield->data), trx_id);
+	}
+
+	ut_ad(dtuple_get_n_fields(old_pk) > 1);
+	ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+		      old_pk, old_pk->n_fields - 1)->len);
+	old_pk_size = rec_get_converted_size_temp(
+		new_index, old_pk->fields, old_pk->n_fields,
+		&old_pk_extra_size);
+	ut_ad(old_pk_extra_size < 0x100);
+
+	/* 4 = operation byte + old_pk_extra_size byte
+	+ 2 bytes for ext_size */
+	mrec_size = 4 + old_pk_size;
+
+	/* Log enough prefix of the BLOB unless both the
+	old and new table are in COMPACT or REDUNDANT format,
+	which store the prefix in the clustered index record. */
+	if (purge && rec_offs_any_extern(offsets)
+	    && (dict_table_get_format(index->table) >= UNIV_FORMAT_B
+		|| dict_table_get_format(new_table) >= UNIV_FORMAT_B)) {
+
+		/* Build a cache of those off-page column prefixes
+		that are referenced by secondary indexes. It can be
+		that none of the off-page columns are needed. */
+		row_build(ROW_COPY_DATA, index, rec,
+			  offsets, NULL, NULL, NULL, &ext, heap);
+		if (ext) {
+			/* Log the row_ext_t, ext->ext and ext->buf */
+			ext_size = ext->n_ext * ext->max_len
+				+ sizeof(*ext)
+				+ ext->n_ext * sizeof(ulint)
+				+ (ext->n_ext - 1) * sizeof ext->len;
+			mrec_size += ext_size;
+		}
+	}
+
+	if (byte* b = row_log_table_open(index->online_log,
+					 mrec_size, &avail_size)) {
+		*b++ = ROW_T_DELETE;
+		*b++ = static_cast<byte>(old_pk_extra_size);
+
+		/* Log the size of external prefix we saved */
+		mach_write_to_2(b, ext_size);
+		b += 2;
+
+		rec_convert_dtuple_to_temp(
+			b + old_pk_extra_size, new_index,
+			old_pk->fields, old_pk->n_fields);
+
+		b += old_pk_size;
+
+		if (ext_size) {
+			/* ext_size is nonzero only if row_build()
+			returned a cache above, so ext is valid here. */
+			ulint	cur_ext_size = sizeof(*ext)
+				+ (ext->n_ext - 1) * sizeof ext->len;
+
+			memcpy(b, ext, cur_ext_size);
+			b += cur_ext_size;
+
+			/* Check if we need to use col_map to adjust the
+			column number. If columns were added/removed/
+			reordered, adjust the column number. */
+			if (const ulint* col_map =
+				index->online_log->col_map) {
+				for (ulint i = 0; i < ext->n_ext; i++) {
+					const_cast<ulint&>(ext->ext[i]) =
+						col_map[ext->ext[i]];
+				}
+			}
+
+			memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext));
+			b += ext->n_ext * sizeof(*ext->ext);
+
+			/* Whatever remains of ext_size is the prefix
+			buffer ext->buf. */
+			ext_size -= cur_ext_size
+				 + ext->n_ext * sizeof(*ext->ext);
+			memcpy(b, ext->buf, ext_size);
+			b += ext_size;
+		}
+
+		row_log_table_close(
+			index->online_log, b, mrec_size, avail_size);
+	}
+
+	mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt.
+The ROW_FORMAT=REDUNDANT record is first mapped field by field into a
+tuple (carrying over the externally stored flags when the record uses
+2-byte offsets), and the tuple is then logged in the format produced by
+rec_convert_dtuple_to_temp(), preceded by the operation byte
+(ROW_T_INSERT or ROW_T_UPDATE), the old PRIMARY KEY (only when one is
+passed for an update) and extra_size (1 or 2 bytes). */
+static
+void
+row_log_table_low_redundant(
+/*========================*/
+	const rec_t*		rec,	/*!< in: clustered index leaf
+					page record in ROW_FORMAT=REDUNDANT,
+					page X-latched */
+	dict_index_t*		index,	/*!< in/out: clustered index, S-latched
+					or X-latched */
+	bool			insert,	/*!< in: true if insert,
+					false if update */
+	const dtuple_t*		old_pk,	/*!< in: old PRIMARY KEY value
+					(if !insert and a PRIMARY KEY
+					is being created) */
+	const dict_index_t*	new_index)
+					/*!< in: clustered index of the
+					new table, not latched */
+{
+	ulint		old_pk_size;
+	ulint		old_pk_extra_size;
+	ulint		size;
+	ulint		extra_size;
+	ulint		mrec_size;
+	ulint		avail_size;
+	mem_heap_t*	heap		= NULL;
+	dtuple_t*	tuple;
+
+	ut_ad(!page_is_comp(page_align(rec)));
+	ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec));
+	ut_ad(dict_tf_is_valid(index->table->flags));
+	ut_ad(!dict_table_is_comp(index->table));  /* redundant row format */
+	ut_ad(dict_index_is_clust(new_index));
+
+	heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields));
+	tuple = dtuple_create(heap, index->n_fields);
+	dict_index_copy_types(tuple, index, index->n_fields);
+	dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index));
+
+	if (rec_get_1byte_offs_flag(rec)) {
+		for (ulint i = 0; i < index->n_fields; i++) {
+			dfield_t*	dfield;
+			ulint		len;
+			const void*	field;
+
+			dfield = dtuple_get_nth_field(tuple, i);
+			field = rec_get_nth_field_old(rec, i, &len);
+
+			dfield_set_data(dfield, field, len);
+		}
+	} else {
+		for (ulint i = 0; i < index->n_fields; i++) {
+			dfield_t*	dfield;
+			ulint		len;
+			const void*	field;
+
+			dfield = dtuple_get_nth_field(tuple, i);
+			field = rec_get_nth_field_old(rec, i, &len);
+
+			dfield_set_data(dfield, field, len);
+
+			if (rec_2_is_field_extern(rec, i)) {
+				dfield_set_ext(dfield);
+			}
+		}
+	}
+
+	size = rec_get_converted_size_temp(
+		index, tuple->fields, tuple->n_fields, &extra_size);
+
+	mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80);
+
+	if (insert || index->online_log->same_pk) {
+		ut_ad(!old_pk);
+		old_pk_extra_size = old_pk_size = 0;
+	} else {
+		ut_ad(old_pk);
+		ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+		ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+			      old_pk, old_pk->n_fields - 2)->len);
+		ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+			      old_pk, old_pk->n_fields - 1)->len);
+
+		old_pk_size = rec_get_converted_size_temp(
+			new_index, old_pk->fields, old_pk->n_fields,
+			&old_pk_extra_size);
+		ut_ad(old_pk_extra_size < 0x100);
+		mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+	}
+
+	if (byte* b = row_log_table_open(index->online_log,
+					 mrec_size, &avail_size)) {
+		*b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
+
+		if (old_pk_size) {
+			*b++ = static_cast<byte>(old_pk_extra_size);
+
+			rec_convert_dtuple_to_temp(
+				b + old_pk_extra_size, new_index,
+				old_pk->fields, old_pk->n_fields);
+			b += old_pk_size;
+		}
+
+		if (extra_size < 0x80) {
+			*b++ = static_cast<byte>(extra_size);
+		} else {
+			ut_ad(extra_size < 0x8000);
+			*b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+			*b++ = static_cast<byte>(extra_size);
+		}
+
+		rec_convert_dtuple_to_temp(
+			b + extra_size, index, tuple->fields, tuple->n_fields);
+		b += size;
+
+		row_log_table_close(
+			index->online_log, b, mrec_size, avail_size);
+	}
+
+	mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt.
+Nothing is logged if the index is corrupted, is not being rebuilt online,
+or the log is already in an error state. Records that are not in a
+compact format (!rec_offs_comp(offsets)) are delegated to
+row_log_table_low_redundant(). Otherwise the log record consists of:
+ROW_T_INSERT or ROW_T_UPDATE; for an update with a changed PRIMARY KEY,
+old_pk_extra_size (1 byte) and the converted old_pk tuple; extra_size
+(1 or 2 bytes); and the record header and data copied from rec, omitting
+the last REC_N_NEW_EXTRA_BYTES bytes of the record header. */
+static __attribute__((nonnull(1,2,3)))
+void
+row_log_table_low(
+/*==============*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	bool		insert,	/*!< in: true if insert, false if update */
+	const dtuple_t*	old_pk)	/*!< in: old PRIMARY KEY value (if !insert
+				and a PRIMARY KEY is being created) */
+{
+	ulint			omit_size;
+	ulint			old_pk_size;
+	ulint			old_pk_extra_size;
+	ulint			extra_size;
+	ulint			mrec_size;
+	ulint			avail_size;
+	const dict_index_t*	new_index = dict_table_get_first_index(
+		index->online_log->table);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_clust(new_index));
+	ut_ad(!dict_index_is_online_ddl(new_index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+	ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+	      || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+	ut_ad(page_is_leaf(page_align(rec)));
+	ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets));
+
+	if (dict_index_is_corrupted(index)
+	    || !dict_index_is_online_ddl(index)
+	    || index->online_log->error != DB_SUCCESS) {
+		return;
+	}
+
+	if (!rec_offs_comp(offsets)) {
+		row_log_table_low_redundant(
+			rec, index, insert, old_pk, new_index);
+		return;
+	}
+
+	ut_ad(page_is_comp(page_align(rec)));
+	ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+
+	omit_size = REC_N_NEW_EXTRA_BYTES;
+
+	extra_size = rec_offs_extra_size(offsets) - omit_size;
+
+	mrec_size = ROW_LOG_HEADER_SIZE
+		+ (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size;
+
+	if (insert || index->online_log->same_pk) {
+		ut_ad(!old_pk);
+		old_pk_extra_size = old_pk_size = 0;
+	} else {
+		ut_ad(old_pk);
+		ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+		ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+			      old_pk, old_pk->n_fields - 2)->len);
+		ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+			      old_pk, old_pk->n_fields - 1)->len);
+
+		old_pk_size = rec_get_converted_size_temp(
+			new_index, old_pk->fields, old_pk->n_fields,
+			&old_pk_extra_size);
+		ut_ad(old_pk_extra_size < 0x100);
+		mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+	}
+
+	if (byte* b = row_log_table_open(index->online_log,
+					 mrec_size, &avail_size)) {
+		*b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
+
+		if (old_pk_size) {
+			*b++ = static_cast<byte>(old_pk_extra_size);
+
+			rec_convert_dtuple_to_temp(
+				b + old_pk_extra_size, new_index,
+				old_pk->fields, old_pk->n_fields);
+			b += old_pk_size;
+		}
+
+		if (extra_size < 0x80) {
+			*b++ = static_cast<byte>(extra_size);
+		} else {
+			ut_ad(extra_size < 0x8000);
+			*b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+			*b++ = static_cast<byte>(extra_size);
+		}
+
+		memcpy(b, rec - rec_offs_extra_size(offsets), extra_size);
+		b += extra_size;
+		memcpy(b, rec, rec_offs_data_size(offsets));
+		b += rec_offs_data_size(offsets);
+
+		row_log_table_close(
+			index->online_log, b, mrec_size, avail_size);
+	}
+}
+
+/******************************************************//**
+Logs an update to a table that is being rebuilt, by forwarding to
+row_log_table_low() with the insert flag cleared.
+This will be merged in row_log_table_apply_update(). */
+UNIV_INTERN
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk)	/*!< in: row_log_table_get_pk()
+				before the update */
+{
+	/* An update is logged as a non-insert operation. */
+	const bool	is_insert = false;
+
+	row_log_table_low(rec, index, offsets, is_insert, old_pk);
+}
+
+/** Finds the old table column that a PRIMARY KEY column of the new table
+was mapped from, by searching col_map for the new column position.
+@param table	old table (before ALTER TABLE)
+@param col_map	mapping of old column numbers to new ones
+@param col_no	column position in the new table
+@return old table column, or NULL if this is an added column */
+static
+const dict_col_t*
+row_log_table_get_pk_old_col(
+/*=========================*/
+	const dict_table_t*	table,
+	const ulint*		col_map,
+	ulint			col_no)
+{
+	ulint	old_pos = 0;
+
+	while (old_pos < table->n_cols) {
+		if (col_map[old_pos] == col_no) {
+			/* The first old column mapped to col_no wins. */
+			return(dict_table_get_nth_col(table, old_pos));
+		}
+
+		old_pos++;
+	}
+
+	/* No old column maps to col_no. */
+	return(NULL);
+}
+
+/** Fills in one PRIMARY KEY field of the new table from the
+corresponding field of an old table record. A field stored inline is
+duplicated into the heap; for an externally stored field a prefix is
+fetched into a heap buffer.
+@param col	old table column (before ALTER TABLE)
+@param ifield	clustered index field in the new table (after ALTER TABLE)
+@param dfield	clustered index tuple field in the new table
+@param heap	memory heap for allocating dfield contents
+@param rec	clustered index leaf page record in the old table
+@param offsets	rec_get_offsets(rec)
+@param i	rec field corresponding to col
+@param zip_size	compressed page size of the old table, or 0 for uncompressed
+@param max_len	maximum length of dfield
+@retval DB_SUCCESS if dfield was set
+@retval DB_INVALID_NULL if a NULL value is encountered
+@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */
+static
+dberr_t
+row_log_table_get_pk_col(
+/*=====================*/
+	const dict_col_t*	col,
+	const dict_field_t*	ifield,
+	dfield_t*		dfield,
+	mem_heap_t*		heap,
+	const rec_t*		rec,
+	const ulint*		offsets,
+	ulint			i,
+	ulint			zip_size,
+	ulint			max_len)
+{
+	ulint		len;
+	const byte*	field;
+
+	ut_ad(ut_is_2pow(zip_size));
+
+	field = rec_get_nth_field(rec, offsets, i, &len);
+
+	if (len == UNIV_SQL_NULL) {
+		return(DB_INVALID_NULL);
+	}
+
+	if (!rec_offs_nth_extern(offsets, i)) {
+		/* Stored in the record itself: duplicate the value. */
+		dfield_set_data(dfield, mem_heap_dup(heap, field, len), len);
+		return(DB_SUCCESS);
+	}
+
+	/* Externally stored: choose how long a prefix to fetch. Prefer
+	the index prefix length, then the fixed length, and finally one
+	byte more than the allowed maximum. */
+	ulint	prefix_len = ifield->prefix_len;
+
+	if (prefix_len == 0) {
+		prefix_len = ifield->fixed_len;
+	}
+
+	if (prefix_len == 0) {
+		prefix_len = max_len + 1;
+	}
+
+	byte*	blob_prefix = static_cast<byte*>(
+		mem_heap_alloc(heap, prefix_len));
+
+	len = btr_copy_externally_stored_field_prefix(
+		blob_prefix, prefix_len, zip_size, field, len);
+
+	if (len >= max_len + 1) {
+		return(DB_TOO_BIG_INDEX_COL);
+	}
+
+	dfield_set_data(dfield, blob_prefix, len);
+
+	return(DB_SUCCESS);
+}
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change (log->same_pk) or log->error is not DB_SUCCESS */
+UNIV_INTERN
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index), or NULL to compute them here */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated; created if *heap is NULL */
+{
+	dtuple_t*	tuple	= NULL;
+	row_log_t*	log	= index->online_log;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+	ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+	      || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(log);
+	ut_ad(log->table);
+
+	if (log->same_pk) {
+		/* The PRIMARY KEY columns are unchanged; no mutex is taken on this path. */
+		return(NULL);
+	}
+
+	mutex_enter(&log->mutex);
+
+	/* log->error is protected by log->mutex. */
+	if (log->error == DB_SUCCESS) {
+		dict_table_t*	new_table	= log->table;
+		dict_index_t*	new_index
+			= dict_table_get_first_index(new_table);
+		const ulint	new_n_uniq
+			= dict_index_get_n_unique(new_index);
+
+		if (!*heap) {
+			ulint	size = 0;
+
+			if (!offsets) {
+				size += (1 + REC_OFFS_HEADER_SIZE
+					 + index->n_fields)
+					* sizeof *offsets;
+			}
+
+			for (ulint i = 0; i < new_n_uniq; i++) {
+				size += dict_col_get_min_size(
+					dict_index_get_nth_col(new_index, i));
+			}
+
+			*heap = mem_heap_create(
+				DTUPLE_EST_ALLOC(new_n_uniq + 2) + size);
+		}
+
+		if (!offsets) {
+			offsets = rec_get_offsets(rec, index, NULL,
+						  ULINT_UNDEFINED, heap);
+		}
+
+		tuple = dtuple_create(*heap, new_n_uniq + 2);
+		dict_index_copy_types(tuple, new_index, tuple->n_fields);
+		dtuple_set_n_fields_cmp(tuple, new_n_uniq);
+
+		const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table);
+		const ulint zip_size = dict_table_zip_size(index->table);
+
+		for (ulint new_i = 0; new_i < new_n_uniq; new_i++) {
+			dict_field_t*	ifield;
+			dfield_t*	dfield;
+			ulint		prtype;
+			ulint		mbminmaxlen;
+
+			ifield = dict_index_get_nth_field(new_index, new_i);
+			dfield = dtuple_get_nth_field(tuple, new_i);
+
+			const ulint	col_no
+				= dict_field_get_col(ifield)->ind;
+
+			if (const dict_col_t* col
+			    = row_log_table_get_pk_old_col(
+				    index->table, log->col_map, col_no)) {
+				ulint	i = dict_col_get_clust_pos(col, index);
+
+				if (i == ULINT_UNDEFINED) {
+					ut_ad(0);
+					log->error = DB_CORRUPTION;
+					goto err_exit;
+				}
+
+				log->error = row_log_table_get_pk_col(
+					col, ifield, dfield, *heap,
+					rec, offsets, i, zip_size, max_len);
+
+				if (log->error != DB_SUCCESS) {
+err_exit:
+					tuple = NULL;
+					goto func_exit;
+				}
+
+				mbminmaxlen = col->mbminmaxlen;
+				prtype = col->prtype;
+			} else {
+				/* No matching column was found in the old
+				table, so this must be an added column.
+				Copy the default value from log->add_cols. */
+				ut_ad(log->add_cols);
+
+				dfield_copy(dfield, dtuple_get_nth_field(
+						    log->add_cols, col_no));
+				mbminmaxlen = dfield->type.mbminmaxlen;
+				prtype = dfield->type.prtype;
+			}
+
+			ut_ad(!dfield_is_ext(dfield));
+			ut_ad(!dfield_is_null(dfield));
+
+			if (ifield->prefix_len) {
+				ulint	len = dtype_get_at_most_n_mbchars(
+					prtype, mbminmaxlen,
+					ifield->prefix_len,
+					dfield_get_len(dfield),
+					static_cast<const char*>(
+						dfield_get_data(dfield)));
+
+				ut_ad(len <= dfield_get_len(dfield));
+				dfield_set_len(dfield, len);
+			}
+		}
+
+		const byte* trx_roll = rec
+			+ row_get_trx_id_offset(index, offsets);
+
+		dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq),
+				trx_roll, DATA_TRX_ID_LEN);
+		dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1),
+				trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
+	}
+
+func_exit:
+	mutex_exit(&log->mutex);
+	return(tuple);
+}
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt, via row_log_table_low()
+called with true and no old_pk. This will be merged in row_log_table_apply_insert(). */
+UNIV_INTERN
+void
+row_log_table_insert(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec,index) */
+{
+	row_log_table_low(rec, index, offsets, true, NULL);
+}
+
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE (no-op if the log has an error). */
+UNIV_INTERN
+void
+row_log_table_blob_free(
+/*====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB; must not be FIL_NULL */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(page_no != FIL_NULL);
+
+	if (index->online_log->error != DB_SUCCESS) {
+		return;
+	}
+
+	page_no_map*	blobs	= index->online_log->blobs;
+
+	if (!blobs) {
+		index->online_log->blobs = blobs = new page_no_map();
+	}
+
+#ifdef UNIV_DEBUG
+	const ulonglong	log_pos = index->online_log->tail.total;
+#else
+# define log_pos /* empty */
+#endif /* UNIV_DEBUG */
+
+	const page_no_map::value_type v(page_no,
+					row_log_table_blob_t(log_pos));
+
+	std::pair<page_no_map::iterator,bool> p = blobs->insert(v);
+
+	if (!p.second) {
+		/* page_no was already in the map: update the existing mapping. */
+		ut_ad(p.first->first == page_no);
+		p.first->second.blob_free(log_pos);
+	}
+#undef log_pos
+}
+
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE (no-op if the log has an error). */
+UNIV_INTERN
+void
+row_log_table_blob_alloc(
+/*=====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB; must not be FIL_NULL */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(page_no != FIL_NULL);
+
+	if (index->online_log->error != DB_SUCCESS) {
+		return;
+	}
+
+	/* Only track allocations if the same page has been freed
+	earlier (page_no is found in the blobs map). Double allocation without a free is not allowed. */
+	if (page_no_map* blobs = index->online_log->blobs) {
+		page_no_map::iterator p = blobs->find(page_no);
+
+		if (p != blobs->end()) {
+			ut_ad(p->first == page_no);
+			p->second.blob_alloc(index->online_log->tail.total);
+		}
+	}
+}
+
+/******************************************************//**
+Converts a log record to a table row with the columns of log->table.
+@return converted row, or NULL if the conversion fails (*error = DB_INVALID_NULL)
+or the transaction has been rolled back; NULL with *error == DB_SUCCESS is returned when a BLOB was freed */
+static __attribute__((nonnull, warn_unused_result))
+const dtuple_t*
+row_log_table_apply_convert_mrec(
+/*=============================*/
+	const mrec_t*		mrec,		/*!< in: merge record */
+	dict_index_t*		index,		/*!< in: index of mrec */
+	const ulint*		offsets,	/*!< in: offsets of mrec */
+	const row_log_t*	log,		/*!< in: rebuild context */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	trx_id_t		trx_id,		/*!< in: DB_TRX_ID of mrec (unused here) */
+	dberr_t*		error)		/*!< out: DB_SUCCESS or
+						reason of failure; set on every return path */
+{
+	dtuple_t*	row;
+
+	/* This is based on row_build(). */
+	if (log->add_cols) {
+		row = dtuple_copy(log->add_cols, heap);
+		/* dict_table_copy_types() would set the fields to NULL */
+		for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) {
+			dict_col_copy_type(
+				dict_table_get_nth_col(log->table, i),
+				dfield_get_type(dtuple_get_nth_field(row, i)));
+		}
+	} else {
+		row = dtuple_create(heap, dict_table_get_n_cols(log->table));
+		dict_table_copy_types(row, log->table);
+	}
+
+	for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+		const dict_field_t*	ind_field
+			= dict_index_get_nth_field(index, i);
+
+		if (ind_field->prefix_len) {
+			/* Column prefixes can only occur in key
+			fields, which cannot be stored externally. For
+			a column prefix, there should also be the full
+			field in the clustered index tuple. The row
+			tuple comprises full fields, not prefixes. */
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+			continue;
+		}
+
+		const dict_col_t*	col
+			= dict_field_get_col(ind_field);
+		ulint			col_no
+			= log->col_map[dict_col_get_no(col)];
+
+		if (col_no == ULINT_UNDEFINED) {
+			/* dropped column: col_map has no new position for it */
+			continue;
+		}
+
+		dfield_t*		dfield
+			= dtuple_get_nth_field(row, col_no);
+		ulint			len;
+		const byte*		data= NULL;
+
+		if (rec_offs_nth_extern(offsets, i)) {
+			ut_ad(rec_offs_any_extern(offsets));
+			rw_lock_x_lock(dict_index_get_lock(index));
+
+			if (const page_no_map* blobs = log->blobs) {
+				data = rec_get_nth_field(
+					mrec, offsets, i, &len);
+				ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+				ulint	page_no = mach_read_from_4(
+					data + len - (BTR_EXTERN_FIELD_REF_SIZE
+						      - BTR_EXTERN_PAGE_NO));
+				page_no_map::const_iterator p = blobs->find(
+					page_no);
+				if (p != blobs->end()
+				    && p->second.is_freed(log->head.total)) {
+					/* This BLOB has been freed.
+					We must not access the row. */
+					row = NULL;
+				}
+			}
+
+			if (row) {
+				data = btr_rec_copy_externally_stored_field(
+					mrec, offsets,
+					dict_table_zip_size(index->table),
+					i, &len, heap);
+				ut_a(data);
+			}
+
+			rw_lock_x_unlock(dict_index_get_lock(index));
+
+			if (!row) {
+				goto func_exit;
+			}
+		} else {
+			data = rec_get_nth_field(mrec, offsets, i, &len);
+		}
+
+		dfield_set_data(dfield, data, len);
+
+		/* See if any columns were changed to NULL or NOT NULL. */
+		const dict_col_t*	new_col
+			= dict_table_get_nth_col(log->table, col_no);
+		ut_ad(new_col->mtype == col->mtype);
+
+		/* Assert that prtype matches except for nullability. */
+		ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL));
+		ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype)
+			& ~DATA_NOT_NULL));
+
+		if (new_col->prtype == col->prtype) {
+			continue;
+		}
+
+		if ((new_col->prtype & DATA_NOT_NULL)
+		    && dfield_is_null(dfield)) {
+			/* We got a NULL value for a NOT NULL column. */
+			*error = DB_INVALID_NULL;
+			return(NULL);
+		}
+
+		/* Adjust the DATA_NOT_NULL flag in the parsed row. */
+		dfield_get_type(dfield)->prtype = new_col->prtype;
+
+		ut_ad(dict_col_type_assert_equal(new_col,
+						 dfield_get_type(dfield)));
+	}
+
+func_exit:
+	*error = DB_SUCCESS;
+	return(row);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt: clustered index first, then each non-FTS secondary index.
+@return DB_SUCCESS or error code; DB_SUCCESS_LOCKED_REC from the clustered insert is mapped to DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert_low(
+/*===========================*/
+	que_thr_t*		thr,		/*!< in: query graph */
+	const dtuple_t*		row,		/*!< in: table row
+						in the old table definition; NOTE(review): entries are built for indexes of log->table, confirm "old" */
+	trx_id_t		trx_id,		/*!< in: trx_id of the row */
+	mem_heap_t*		offsets_heap,	/*!< in/out: memory heap
+						that can be emptied */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	row_merge_dup_t*	dup)		/*!< in/out: for reporting
+						duplicate key errors */
+{
+	dberr_t		error;
+	dtuple_t*	entry;
+	const row_log_t*log	= dup->index->online_log;
+	dict_index_t*	index	= dict_table_get_first_index(log->table);
+
+	ut_ad(dtuple_validate(row));
+	ut_ad(trx_id);
+
+#ifdef ROW_LOG_APPLY_PRINT
+	if (row_log_apply_print) {
+		fprintf(stderr, "table apply insert "
+			IB_ID_FMT " " IB_ID_FMT "\n",
+			index->table->id, index->id);
+		dtuple_print(stderr, row);
+	}
+#endif /* ROW_LOG_APPLY_PRINT */
+
+	static const ulint	flags
+		= (BTR_CREATE_FLAG
+		   | BTR_NO_LOCKING_FLAG
+		   | BTR_NO_UNDO_LOG_FLAG
+		   | BTR_KEEP_SYS_FLAG);
+
+	entry = row_build_index_entry(row, NULL, index, heap);
+
+	error = row_ins_clust_index_entry_low(
+		flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr);
+
+	switch (error) {
+	case DB_SUCCESS:
+		break;
+	case DB_SUCCESS_LOCKED_REC:
+		/* The row had already been copied to the table; secondary indexes are not touched. */
+		return(DB_SUCCESS);
+	default:
+		return(error);
+	}
+
+	do {
+		if (!(index = dict_table_get_next_index(index))) {
+			break;
+		}
+
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		entry = row_build_index_entry(row, NULL, index, heap);
+		error = row_ins_sec_index_entry_low(
+			flags, BTR_MODIFY_TREE,
+			index, offsets_heap, heap, entry, trx_id, thr);
+	} while (error == DB_SUCCESS);
+
+	return(error);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt: converts mrec to a row and inserts it.
+@return DB_SUCCESS or error code; DB_SUCCESS without inserting if the conversion yields no row */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert(
+/*=======================*/
+	que_thr_t*		thr,		/*!< in: query graph */
+	const mrec_t*		mrec,		/*!< in: record to insert */
+	const ulint*		offsets,	/*!< in: offsets of mrec */
+	mem_heap_t*		offsets_heap,	/*!< in/out: memory heap
+						that can be emptied */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	row_merge_dup_t*	dup,		/*!< in/out: for reporting
+						duplicate key errors */
+	trx_id_t		trx_id)		/*!< in: DB_TRX_ID of mrec */
+{
+	const row_log_t*log	= dup->index->online_log;
+	dberr_t		error;
+	const dtuple_t*	row	= row_log_table_apply_convert_mrec(
+		mrec, dup->index, offsets, log, heap, trx_id, &error);
+
+	ut_ad(error == DB_SUCCESS || !row);
+	/* Handling of duplicate key error requires storing
+	of offending key in a record buffer. */
+	ut_ad(error != DB_DUPLICATE_KEY);
+
+	if (error != DB_SUCCESS)
+		return(error);
+
+	if (row) {
+		error = row_log_table_apply_insert_low(
+			thr, row, trx_id, offsets_heap, heap, dup);
+		if (error != DB_SUCCESS) {
+			/* Report the erroneous row using the new
+			version of the table. */
+			innobase_row_to_mysql(dup->table, log->table, row);
+		}
+	}
+	return(error);
+}
+
+/******************************************************//**
+Deletes a record from a table that is being rebuilt: the clustered index record at pcur, then its non-FTS secondary index entries.
+@return DB_SUCCESS or error code; NOTE(review): error is not tested inside the secondary index loop, so only the last delete's status is returned */
+static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result))
+dberr_t
+row_log_table_apply_delete_low(
+/*===========================*/
+	btr_pcur_t*		pcur,		/*!< in/out: B-tree cursor,
+						will be trashed */
+	const ulint*		offsets,	/*!< in: offsets on pcur */
+	const row_ext_t*	save_ext,	/*!< in: saved external field
+						info, or NULL */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	mtr_t*			mtr)		/*!< in/out: mini-transaction,
+						will be committed */
+{
+	dberr_t		error;
+	row_ext_t*	ext;
+	dtuple_t*	row;
+	dict_index_t*	index	= btr_pcur_get_btr_cur(pcur)->index;
+
+	ut_ad(dict_index_is_clust(index));
+
+#ifdef ROW_LOG_APPLY_PRINT
+	if (row_log_apply_print) {
+		fprintf(stderr, "table apply delete "
+			IB_ID_FMT " " IB_ID_FMT "\n",
+			index->table->id, index->id);
+		rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets);
+	}
+#endif /* ROW_LOG_APPLY_PRINT */
+	if (dict_table_get_next_index(index)) {
+		/* Build a row template for purging secondary index entries. */
+		row = row_build(
+			ROW_COPY_DATA, index, btr_pcur_get_rec(pcur),
+			offsets, NULL, NULL, NULL,
+			save_ext ? NULL : &ext, heap);
+		if (!save_ext) {
+			save_ext = ext;
+		}
+	} else {
+		row = NULL;
+	}
+
+	btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
+				   BTR_CREATE_FLAG, RB_NONE, mtr);
+	mtr_commit(mtr);
+
+	if (error != DB_SUCCESS) {
+		return(error);
+	}
+
+	while ((index = dict_table_get_next_index(index)) != NULL) {
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		const dtuple_t*	entry = row_build_index_entry(
+			row, save_ext, index, heap);
+		mtr_start(mtr);
+		btr_pcur_open(index, entry, PAGE_CUR_LE,
+			      BTR_MODIFY_TREE, pcur, mtr);
+#ifdef UNIV_DEBUG
+		switch (btr_pcur_get_btr_cur(pcur)->flag) {
+		case BTR_CUR_DELETE_REF:
+		case BTR_CUR_DEL_MARK_IBUF:
+		case BTR_CUR_DELETE_IBUF:
+		case BTR_CUR_INSERT_TO_IBUF:
+			/* We did not request buffering. */
+			break;
+		case BTR_CUR_HASH:
+		case BTR_CUR_HASH_FAIL:
+		case BTR_CUR_BINARY:
+			goto flag_ok;
+		}
+		ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+		if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
+		    || btr_pcur_get_low_match(pcur) < index->n_uniq) {
+			/* All secondary index entries should be
+			found, because new_table is being modified by
+			this thread only, and all indexes should be
+			updated in sync. */
+			mtr_commit(mtr);
+			return(DB_INDEX_CORRUPT);
+		}
+
+		btr_cur_pessimistic_delete(&error, FALSE,
+					   btr_pcur_get_btr_cur(pcur),
+					   BTR_CREATE_FLAG, RB_NONE, mtr);
+		mtr_commit(mtr);
+	}
+
+	return(error);
+}
+
+/******************************************************//**
+Replays a delete operation on a table that was rebuilt; the record is removed only if its DB_TRX_ID equals the logged one.
+@return DB_SUCCESS or error code; DB_SUCCESS also when no matching record is found */
+static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result))
+dberr_t
+row_log_table_apply_delete(
+/*=======================*/
+	que_thr_t*		thr,		/*!< in: query graph (not referenced in the body) */
+	ulint			trx_id_col,	/*!< in: position of
+						DB_TRX_ID in the new
+						clustered index */
+	const mrec_t*		mrec,		/*!< in: merge record */
+	const ulint*		moffsets,	/*!< in: offsets of mrec */
+	mem_heap_t*		offsets_heap,	/*!< in/out: memory heap
+						that can be emptied */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	dict_table_t*		new_table,	/*!< in: rebuilt table */
+	const row_ext_t*	save_ext)	/*!< in: saved external field
+						info, or NULL */
+{
+	dict_index_t*	index = dict_table_get_first_index(new_table);
+	dtuple_t*	old_pk;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	ulint*		offsets;
+
+	ut_ad(rec_offs_n_fields(moffsets)
+	      == dict_index_get_n_unique(index) + 1);
+	ut_ad(!rec_offs_any_extern(moffsets));
+
+	/* Convert the row to a search tuple of n_uniq + 1 fields, compared on the first n_uniq. */
+	old_pk = dtuple_create(heap, index->n_uniq + 1);
+	dict_index_copy_types(old_pk, index, old_pk->n_fields);
+	dtuple_set_n_fields_cmp(old_pk, index->n_uniq);
+
+	for (ulint i = 0; i <= index->n_uniq; i++) {
+		ulint		len;
+		const void*	field;
+		field = rec_get_nth_field(mrec, moffsets, i, &len);
+		ut_ad(len != UNIV_SQL_NULL);
+		dfield_set_data(dtuple_get_nth_field(old_pk, i),
+				field, len);
+	}
+
+	mtr_start(&mtr);
+	btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+		      BTR_MODIFY_TREE, &pcur, &mtr);
+#ifdef UNIV_DEBUG
+	switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+	case BTR_CUR_DELETE_REF:
+	case BTR_CUR_DEL_MARK_IBUF:
+	case BTR_CUR_DELETE_IBUF:
+	case BTR_CUR_INSERT_TO_IBUF:
+		/* We did not request buffering. */
+		break;
+	case BTR_CUR_HASH:
+	case BTR_CUR_HASH_FAIL:
+	case BTR_CUR_BINARY:
+		goto flag_ok;
+	}
+	ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+	if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+	    || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+all_done:
+		mtr_commit(&mtr);
+		/* The record was not found (or its DB_TRX_ID differs). All done. */
+		return(DB_SUCCESS);
+	}
+
+	offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL,
+				  ULINT_UNDEFINED, &offsets_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+	ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+	/* Only remove the record if DB_TRX_ID matches what was
+	buffered. */
+
+	{
+		ulint		len;
+		const void*	mrec_trx_id
+			= rec_get_nth_field(mrec, moffsets, trx_id_col, &len);
+		ut_ad(len == DATA_TRX_ID_LEN);
+		const void*	rec_trx_id
+			= rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+					    trx_id_col, &len);
+		ut_ad(len == DATA_TRX_ID_LEN);
+		if (memcmp(mrec_trx_id, rec_trx_id, DATA_TRX_ID_LEN)) {
+			goto all_done;
+		}
+	}
+
+	return(row_log_table_apply_delete_low(&pcur, offsets, save_ext,
+					      heap, &mtr));
+}
+
+/******************************************************//**
+Replays an update operation on a table that was rebuilt; if old_pk is not found, the row is inserted instead.
+@return DB_SUCCESS or error code; if the conversion of mrec yields no row, its error value is returned as is */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_update(
+/*=======================*/
+	que_thr_t*		thr,		/*!< in: query graph */
+	ulint			trx_id_col,	/*!< in: position of
+						DB_TRX_ID in the
+						old clustered index */
+	ulint			new_trx_id_col,	/*!< in: position of
+						DB_TRX_ID in the new
+						clustered index */
+	const mrec_t*		mrec,		/*!< in: new value */
+	const ulint*		offsets,	/*!< in: offsets of mrec */
+	mem_heap_t*		offsets_heap,	/*!< in/out: memory heap
+						that can be emptied */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	row_merge_dup_t*	dup,		/*!< in/out: for reporting
+						duplicate key errors */
+	trx_id_t		trx_id,		/*!< in: DB_TRX_ID of mrec */
+	const dtuple_t*		old_pk)		/*!< in: PRIMARY KEY and
+						DB_TRX_ID,DB_ROLL_PTR
+						of the old value,
+						or PRIMARY KEY if same_pk */
+{
+	const row_log_t*log	= dup->index->online_log;
+	const dtuple_t*	row;
+	dict_index_t*	index	= dict_table_get_first_index(log->table);
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	dberr_t		error;
+
+	ut_ad(dtuple_get_n_fields_cmp(old_pk)
+	      == dict_index_get_n_unique(index));
+	ut_ad(dtuple_get_n_fields(old_pk)
+	      == dict_index_get_n_unique(index)
+	      + (dup->index->online_log->same_pk ? 0 : 2));
+
+	row = row_log_table_apply_convert_mrec(
+		mrec, dup->index, offsets, log, heap, trx_id, &error);
+
+	ut_ad(error == DB_SUCCESS || !row);
+	/* Handling of duplicate key error requires storing
+	of offending key in a record buffer. */
+	ut_ad(error != DB_DUPLICATE_KEY);
+
+	if (!row) {
+		return(error);
+	}
+
+	mtr_start(&mtr);
+	btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+		      BTR_MODIFY_TREE, &pcur, &mtr);
+#ifdef UNIV_DEBUG
+	switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+	case BTR_CUR_DELETE_REF:
+	case BTR_CUR_DEL_MARK_IBUF:
+	case BTR_CUR_DELETE_IBUF:
+	case BTR_CUR_INSERT_TO_IBUF:
+		ut_ad(0);/* We did not request buffering. */
+	case BTR_CUR_HASH:
+	case BTR_CUR_HASH_FAIL:
+	case BTR_CUR_BINARY:
+		break;
+	}
+#endif /* UNIV_DEBUG */
+
+	if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+	    || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+		mtr_commit(&mtr);
+insert:
+		ut_ad(mtr.state == MTR_COMMITTED);
+		/* The row was not found. Insert it. */
+		error = row_log_table_apply_insert_low(
+			thr, row, trx_id, offsets_heap, heap, dup);
+		if (error != DB_SUCCESS) {
+err_exit:
+			/* Report the erroneous row using the new
+			version of the table. */
+			innobase_row_to_mysql(dup->table, log->table, row);
+		}
+
+		return(error);
+	}
+
+	/* Update the record: compute the difference between the existing record and the new entry. */
+	ulint*		cur_offsets	= rec_get_offsets(
+		btr_pcur_get_rec(&pcur),
+		index, NULL, ULINT_UNDEFINED, &offsets_heap);
+
+	dtuple_t*	entry	= row_build_index_entry(
+		row, NULL, index, heap);
+	const upd_t*	update	= row_upd_build_difference_binary(
+		index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
+		false, NULL, heap);
+
+	error = DB_SUCCESS;
+
+	if (!update->n_fields) {
+		/* No field differs: nothing to do. */
+		goto func_exit;
+	}
+
+	if (rec_offs_any_extern(cur_offsets)) {
+		/* If the record contains any externally stored
+		columns, perform the update by delete and insert,
+		because we will not write any undo log that would
+		allow purge to free any orphaned externally stored
+		columns. */
+delete_insert:
+		error = row_log_table_apply_delete_low(
+			&pcur, cur_offsets, NULL, heap, &mtr);
+		ut_ad(mtr.state == MTR_COMMITTED);
+
+		if (error != DB_SUCCESS) {
+			goto err_exit;
+		}
+
+		goto insert;
+	}
+
+	if (upd_get_nth_field(update, 0)->field_no < new_trx_id_col) {
+		if (dup->index->online_log->same_pk) {
+			/* The ROW_T_UPDATE log record should only be
+			written when the PRIMARY KEY fields of the
+			record did not change in the old table.  We
+			can only get a change of PRIMARY KEY columns
+			in the rebuilt table if the PRIMARY KEY was
+			redefined (!same_pk). */
+			ut_ad(0);
+			error = DB_CORRUPTION;
+			goto func_exit;
+		}
+
+		/* The PRIMARY KEY columns have changed.
+		Delete the record with the old PRIMARY KEY value,
+		provided that it carries the same
+		DB_TRX_ID,DB_ROLL_PTR. Then, insert the new row. */
+		ulint		len;
+		const byte*	cur_trx_roll	= rec_get_nth_field(
+			mrec, offsets, trx_id_col, &len);
+		ut_ad(len == DATA_TRX_ID_LEN);
+		const dfield_t*	new_trx_roll	= dtuple_get_nth_field(
+			old_pk, new_trx_id_col);
+		/* We assume that DB_TRX_ID,DB_ROLL_PTR are stored
+		in one contiguous block. */
+		ut_ad(rec_get_nth_field(mrec, offsets, trx_id_col + 1, &len)
+		      == cur_trx_roll + DATA_TRX_ID_LEN);
+		ut_ad(len == DATA_ROLL_PTR_LEN);
+		ut_ad(new_trx_roll->len == DATA_TRX_ID_LEN);
+		ut_ad(dtuple_get_nth_field(old_pk, new_trx_id_col + 1)
+		      -> len == DATA_ROLL_PTR_LEN);
+		ut_ad(static_cast<const byte*>(
+			      dtuple_get_nth_field(old_pk, new_trx_id_col + 1)
+			      ->data)
+		      == static_cast<const byte*>(new_trx_roll->data)
+		      + DATA_TRX_ID_LEN);
+
+		if (!memcmp(cur_trx_roll, new_trx_roll->data,
+			    DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+			/* The old row exists. Remove it. */
+			goto delete_insert;
+		}
+
+		/* Unless we called row_log_table_apply_delete_low(),
+		this will likely cause a duplicate key error. */
+		mtr_commit(&mtr);
+		goto insert;
+	}
+
+	dtuple_t*	old_row;
+	row_ext_t*	old_ext;
+
+	if (dict_table_get_next_index(index)) {
+		/* Construct the row corresponding to the old value of
+		the record. */
+		old_row = row_build(
+			ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur),
+			cur_offsets, NULL, NULL, NULL, &old_ext, heap);
+		ut_ad(old_row);
+#ifdef ROW_LOG_APPLY_PRINT
+		if (row_log_apply_print) {
+			fprintf(stderr, "table apply update "
+				IB_ID_FMT " " IB_ID_FMT "\n",
+				index->table->id, index->id);
+			dtuple_print(stderr, old_row);
+			dtuple_print(stderr, row);
+		}
+#endif /* ROW_LOG_APPLY_PRINT */
+	} else {
+		old_row = NULL;
+		old_ext = NULL;
+	}
+
+	big_rec_t*	big_rec;
+
+	error = btr_cur_pessimistic_update(
+		BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+		| BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG
+		| BTR_KEEP_POS_FLAG,
+		btr_pcur_get_btr_cur(&pcur),
+		&cur_offsets, &offsets_heap, heap, &big_rec,
+		update, 0, thr, 0, &mtr);
+
+	if (big_rec) {
+		if (error == DB_SUCCESS) {
+			error = btr_store_big_rec_extern_fields(
+				index, btr_pcur_get_block(&pcur),
+				btr_pcur_get_rec(&pcur), cur_offsets,
+				big_rec, &mtr, BTR_STORE_UPDATE);
+		}
+
+		dtuple_big_rec_free(big_rec);
+	}
+
+	while ((index = dict_table_get_next_index(index)) != NULL) {
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		if (!row_upd_changes_ord_field_binary(
+			    index, update, thr, old_row, NULL)) {
+			continue;
+		}
+
+		mtr_commit(&mtr);
+
+		entry = row_build_index_entry(old_row, old_ext, index, heap);
+		if (!entry) {
+			ut_ad(0);
+			return(DB_CORRUPTION);
+		}
+
+		mtr_start(&mtr);
+
+		if (ROW_FOUND != row_search_index_entry(
+			    index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+			ut_ad(0);
+			error = DB_CORRUPTION;
+			break;
+		}
+
+		btr_cur_pessimistic_delete(
+			&error, FALSE, btr_pcur_get_btr_cur(&pcur),
+			BTR_CREATE_FLAG, RB_NONE, &mtr);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		mtr_commit(&mtr);
+
+		entry = row_build_index_entry(row, NULL, index, heap);
+		error = row_ins_sec_index_entry_low(
+			BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+			| BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG,
+			BTR_MODIFY_TREE, index, offsets_heap, heap,
+			entry, trx_id, thr);
+
+		mtr_start(&mtr);
+	}
+
+func_exit:
+	mtr_commit(&mtr);
+	if (error != DB_SUCCESS) {
+		goto err_exit;
+	}
+
+	return(error);
+}
+
+/******************************************************//**
+Applies an operation to a table that was rebuilt.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static __attribute__((nonnull, warn_unused_result))
+const mrec_t*
+row_log_table_apply_op(
+/*===================*/
+	que_thr_t*		thr,		/*!< in: query graph */
+	ulint			trx_id_col,	/*!< in: position of
+						DB_TRX_ID in old index */
+	ulint			new_trx_id_col,	/*!< in: position of
+						DB_TRX_ID in new index */
+	row_merge_dup_t*	dup,		/*!< in/out: for reporting
+						duplicate key errors */
+	dberr_t*		error,		/*!< out: DB_SUCCESS
+						or error code */
+	mem_heap_t*		offsets_heap,	/*!< in/out: memory heap
+						that can be emptied */
+	mem_heap_t*		heap,		/*!< in/out: memory heap */
+	const mrec_t*		mrec,		/*!< in: merge record */
+	const mrec_t*		mrec_end,	/*!< in: end of buffer */
+	ulint*			offsets)	/*!< in/out: work area
+						for parsing mrec */
+{
+	row_log_t*	log	= dup->index->online_log;
+	dict_index_t*	new_index = dict_table_get_first_index(log->table);
+	ulint		extra_size;
+	const mrec_t*	next_mrec;
+	dtuple_t*	old_pk;
+	row_ext_t*	ext;
+	ulint		ext_size;
+
+	ut_ad(dict_index_is_clust(dup->index));
+	ut_ad(dup->index->table != log->table);
+	ut_ad(log->head.total <= log->tail.total);
+
+	*error = DB_SUCCESS;
+
+	/* 3 = 1 (op type) + 1 (ext_size) + at least 1 byte payload */
+	if (mrec + 3 >= mrec_end) {
+		return(NULL);
+	}
+
+	const mrec_t* const mrec_start = mrec;
+
+	switch (*mrec++) {
+	default:
+		ut_ad(0);
+		*error = DB_CORRUPTION;
+		return(NULL);
+	case ROW_T_INSERT:
+		extra_size = *mrec++;
+
+		if (extra_size >= 0x80) {
+			/* Read another byte of extra_size. */
+
+			extra_size = (extra_size & 0x7f) << 8;
+			extra_size |= *mrec++;
+		}
+
+		mrec += extra_size;
+
+		if (mrec > mrec_end) {
+			return(NULL);
+		}
+
+		rec_offs_set_n_fields(offsets, dup->index->n_fields);
+		rec_init_offsets_temp(mrec, dup->index, offsets);
+
+		next_mrec = mrec + rec_offs_data_size(offsets);
+
+		if (next_mrec > mrec_end) {
+			return(NULL);
+		} else {
+			log->head.total += next_mrec - mrec_start;
+
+			ulint		len;
+			const byte*	db_trx_id
+				= rec_get_nth_field(
+					mrec, offsets, trx_id_col, &len);
+			ut_ad(len == DATA_TRX_ID_LEN);
+			*error = row_log_table_apply_insert(
+				thr, mrec, offsets, offsets_heap,
+				heap, dup, trx_read_trx_id(db_trx_id));
+		}
+		break;
+
+	case ROW_T_DELETE:
+		/* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
+		if (mrec + 4 >= mrec_end) {
+			return(NULL);
+		}
+
+		extra_size = *mrec++;
+		ext_size = mach_read_from_2(mrec);
+		mrec += 2;
+		ut_ad(mrec < mrec_end);
+
+		/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
+		For fixed-length PRIMARY key columns, it is 0. */
+		mrec += extra_size;
+
+		rec_offs_set_n_fields(offsets, new_index->n_uniq + 1);
+		rec_init_offsets_temp(mrec, new_index, offsets);
+		next_mrec = mrec + rec_offs_data_size(offsets) + ext_size;
+		if (next_mrec > mrec_end) {
+			return(NULL);
+		}
+
+		log->head.total += next_mrec - mrec_start;
+
+		/* If there are external fields, retrieve those logged
+		prefix info and reconstruct the row_ext_t */
+		if (ext_size) {
+			/* We use memcpy to avoid unaligned
+			access on some non-x86 platforms.*/
+			ext = static_cast<row_ext_t*>(
+				mem_heap_dup(heap,
+					     mrec + rec_offs_data_size(offsets),
+					     ext_size));
+
+			byte*	ext_start = reinterpret_cast<byte*>(ext);
+
+			ulint	ext_len = sizeof(*ext)
+				+ (ext->n_ext - 1) * sizeof ext->len;
+
+			ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len);
+			ext_len += ext->n_ext * sizeof(*ext->ext);
+
+			ext->buf = static_cast<byte*>(ext_start + ext_len);
+		} else {
+			ext = NULL;
+		}
+
+		*error = row_log_table_apply_delete(
+			thr, new_trx_id_col,
+			mrec, offsets, offsets_heap, heap,
+			log->table, ext);
+		break;
+
+	case ROW_T_UPDATE:
+		/* Logically, the log entry consists of the
+		(PRIMARY KEY,DB_TRX_ID) of the old value (converted
+		to the new primary key definition) followed by
+		the new value in the old table definition. If the
+		definition of the columns belonging to PRIMARY KEY
+		is not changed, the log will only contain
+		DB_TRX_ID,new_row. */
+
+		if (dup->index->online_log->same_pk) {
+			ut_ad(new_index->n_uniq == dup->index->n_uniq);
+
+			extra_size = *mrec++;
+
+			if (extra_size >= 0x80) {
+				/* Read another byte of extra_size. */
+
+				extra_size = (extra_size & 0x7f) << 8;
+				extra_size |= *mrec++;
+			}
+
+			mrec += extra_size;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			rec_offs_set_n_fields(offsets, dup->index->n_fields);
+			rec_init_offsets_temp(mrec, dup->index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+
+			if (next_mrec > mrec_end) {
+				return(NULL);
+			}
+
+			old_pk = dtuple_create(heap, new_index->n_uniq);
+			dict_index_copy_types(
+				old_pk, new_index, old_pk->n_fields);
+
+			/* Copy the PRIMARY KEY fields from mrec to old_pk. */
+			for (ulint i = 0; i < new_index->n_uniq; i++) {
+				const void*	field;
+				ulint		len;
+				dfield_t*	dfield;
+
+				ut_ad(!rec_offs_nth_extern(offsets, i));
+
+				field = rec_get_nth_field(
+					mrec, offsets, i, &len);
+				ut_ad(len != UNIV_SQL_NULL);
+
+				dfield = dtuple_get_nth_field(old_pk, i);
+				dfield_set_data(dfield, field, len);
+			}
+		} else {
+			/* We assume extra_size < 0x100
+			for the PRIMARY KEY prefix. */
+			mrec += *mrec + 1;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			/* Get offsets for PRIMARY KEY,
+			DB_TRX_ID, DB_ROLL_PTR. */
+			rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
+			rec_init_offsets_temp(mrec, new_index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+			if (next_mrec + 2 > mrec_end) {
+				return(NULL);
+			}
+
+			/* Copy the PRIMARY KEY fields and
+			DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
+			old_pk = dtuple_create(heap, new_index->n_uniq + 2);
+			dict_index_copy_types(old_pk, new_index,
+					      old_pk->n_fields);
+
+			for (ulint i = 0;
+			     i < dict_index_get_n_unique(new_index) + 2;
+			     i++) {
+				const void*	field;
+				ulint		len;
+				dfield_t*	dfield;
+
+				ut_ad(!rec_offs_nth_extern(offsets, i));
+
+				field = rec_get_nth_field(
+					mrec, offsets, i, &len);
+				ut_ad(len != UNIV_SQL_NULL);
+
+				dfield = dtuple_get_nth_field(old_pk, i);
+				dfield_set_data(dfield, field, len);
+			}
+
+			mrec = next_mrec;
+
+			/* Fetch the new value of the row as it was
+			in the old table definition. */
+			extra_size = *mrec++;
+
+			if (extra_size >= 0x80) {
+				/* Read another byte of extra_size. */
+
+				extra_size = (extra_size & 0x7f) << 8;
+				extra_size |= *mrec++;
+			}
+
+			mrec += extra_size;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			rec_offs_set_n_fields(offsets, dup->index->n_fields);
+			rec_init_offsets_temp(mrec, dup->index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+
+			if (next_mrec > mrec_end) {
+				return(NULL);
+			}
+		}
+
+		ut_ad(next_mrec <= mrec_end);
+		log->head.total += next_mrec - mrec_start;
+		dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
+
+		{
+			ulint		len;
+			const byte*	db_trx_id
+				= rec_get_nth_field(
+					mrec, offsets, trx_id_col, &len);
+			ut_ad(len == DATA_TRX_ID_LEN);
+			*error = row_log_table_apply_update(
+				thr, trx_id_col, new_trx_id_col,
+				mrec, offsets, offsets_heap,
+				heap, dup, trx_read_trx_id(db_trx_id), old_pk);
+		}
+
+		break;
+	}
+
+	ut_ad(log->head.total <= log->tail.total);
+	mem_heap_empty(offsets_heap);
+	mem_heap_empty(heap);
+	return(next_mrec);
+}
+
+/******************************************************//**
+Applies the operations buffered in the online log of a clustered index
+to the table that was rebuilt (index->online_log->table).
+The log is consumed block by block. Blocks that were written out to the
+temporary file are read back into head.block and applied after releasing
+index->lock; the last, in-memory block (tail.block) is applied while
+index->lock is held in exclusive mode. A record that straddles two blocks
+is reassembled in head.buf before it is applied.
+The caller must hold index->lock in exclusive mode; if the lock was
+released along the way, it is reacquired before returning.
+@return DB_SUCCESS, or error code on failure */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_ops(
+/*====================*/
+	que_thr_t*	thr,	/*!< in: query graph */
+	row_merge_dup_t*dup)	/*!< in/out: for reporting duplicate key
+				errors */
+{
+	dberr_t		error;
+	const mrec_t*	mrec		= NULL;
+	const mrec_t*	next_mrec;
+	const mrec_t*	mrec_end	= NULL; /* silence bogus warning */
+	const mrec_t*	next_mrec_end;
+	mem_heap_t*	heap;
+	mem_heap_t*	offsets_heap;
+	ulint*		offsets;
+	bool		has_index_lock;
+	dict_index_t*	index		= const_cast<dict_index_t*>(
+		dup->index);
+	dict_table_t*	new_table	= index->online_log->table;
+	dict_index_t*	new_index	= dict_table_get_first_index(
+		new_table);
+	/* The offsets array must be big enough for a full record of the
+	old clustered index as well as for the (PRIMARY KEY, DB_TRX_ID,
+	DB_ROLL_PTR) prefix of the new clustered index. */
+	const ulint	i		= 1 + REC_OFFS_HEADER_SIZE
+		+ ut_max(dict_index_get_n_fields(index),
+			 dict_index_get_n_unique(new_index) + 2);
+	const ulint	trx_id_col	= dict_col_get_clust_pos(
+		dict_table_get_sys_col(index->table, DATA_TRX_ID), index);
+	const ulint	new_trx_id_col	= dict_col_get_clust_pos(
+		dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
+	trx_t*		trx		= thr_get_trx(thr);
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+	ut_ad(trx->mysql_thd);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!dict_index_is_online_ddl(new_index));
+	ut_ad(trx_id_col > 0);
+	ut_ad(trx_id_col != ULINT_UNDEFINED);
+	ut_ad(new_trx_id_col > 0);
+	ut_ad(new_trx_id_col != ULINT_UNDEFINED);
+
+	UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
+
+	/* Store the allocated element count and the field count of the
+	old index in the first two elements of the offsets array. */
+	offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
+	offsets[0] = i;
+	offsets[1] = dict_index_get_n_fields(index);
+
+	heap = mem_heap_create(UNIV_PAGE_SIZE);
+	offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
+	has_index_lock = true;
+
+next_block:
+	ut_ad(has_index_lock);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(index->online_log->head.bytes == 0);
+
+	if (trx_is_interrupted(trx)) {
+		goto interrupted;
+	}
+
+	if (dict_index_is_corrupted(index)) {
+		error = DB_INDEX_CORRUPT;
+		goto func_exit;
+	}
+
+	ut_ad(dict_index_is_online_ddl(index));
+
+	error = index->online_log->error;
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(index->online_log->head.blocks
+			  > index->online_log->tail.blocks)) {
+unexpected_eof:
+		fprintf(stderr, "InnoDB: unexpected end of temporary file"
+			" for table %s\n", index->table_name);
+corruption:
+		error = DB_CORRUPTION;
+		goto func_exit;
+	}
+
+	if (index->online_log->head.blocks
+	    == index->online_log->tail.blocks) {
+		if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+			/* Truncate the file in order to save space.
+			This is only an optimization, so a failure is
+			reported but does not abort the operation. */
+			if (ftruncate(index->online_log->fd, 0) == -1) {
+				perror("ftruncate");
+			}
+#endif /* HAVE_FTRUNCATE */
+			index->online_log->head.blocks
+				= index->online_log->tail.blocks = 0;
+		}
+
+		next_mrec = index->online_log->tail.block;
+		next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+		if (next_mrec_end == next_mrec) {
+			/* End of log reached. */
+all_done:
+			ut_ad(has_index_lock);
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->tail.blocks == 0);
+			index->online_log->head.bytes = 0;
+			index->online_log->tail.bytes = 0;
+			error = DB_SUCCESS;
+			goto func_exit;
+		}
+	} else {
+		os_offset_t	ofs;
+		ibool		success;
+
+		ofs = (os_offset_t) index->online_log->head.blocks
+			* srv_sort_buf_size;
+
+		/* Release index->lock while reading and applying a block
+		that has already been written to the temporary file. */
+		ut_ad(has_index_lock);
+		has_index_lock = false;
+		rw_lock_x_unlock(dict_index_get_lock(index));
+
+		log_free_check();
+
+		ut_ad(dict_index_is_online_ddl(index));
+
+		success = os_file_read_no_error_handling(
+			OS_FILE_FROM_FD(index->online_log->fd),
+			index->online_log->head.block, ofs,
+			srv_sort_buf_size);
+
+		if (!success) {
+			fprintf(stderr, "InnoDB: unable to read temporary file"
+				" for table %s\n", index->table_name);
+			goto corruption;
+		}
+
+#ifdef POSIX_FADV_DONTNEED
+		/* Each block is read exactly once.  Free up the file cache. */
+		posix_fadvise(index->online_log->fd,
+			      ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+#if 0 //def FALLOC_FL_PUNCH_HOLE
+		/* Try to deallocate the space for the file on disk.
+		This should work on ext4 on Linux 2.6.39 and later,
+		and be ignored when the operation is unsupported.
+		NOTE(review): this disabled call passes srv_buf_size,
+		while the read above uses srv_sort_buf_size; check the
+		length argument before enabling this code. */
+		fallocate(index->online_log->fd,
+			  FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+			  ofs, srv_buf_size);
+#endif /* FALLOC_FL_PUNCH_HOLE */
+
+		next_mrec = index->online_log->head.block;
+		next_mrec_end = next_mrec + srv_sort_buf_size;
+	}
+
+	/* This read is not protected by index->online_log->mutex for
+	performance reasons. We will eventually notice any error that
+	was flagged by a DML thread. */
+	error = index->online_log->error;
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	if (mrec) {
+		/* A partial record was read from the previous block.
+		Copy the temporary buffer full, as we do not know the
+		length of the record. Parse subsequent records from
+		the bigger buffer index->online_log->head.block
+		or index->online_log->tail.block.
+		Here (&head.buf)[1] denotes the end of the head.buf array. */
+
+		ut_ad(mrec == index->online_log->head.buf);
+		ut_ad(mrec_end > mrec);
+		ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+		memcpy((mrec_t*) mrec_end, next_mrec,
+		       (&index->online_log->head.buf)[1] - mrec_end);
+		mrec = row_log_table_apply_op(
+			thr, trx_id_col, new_trx_id_col,
+			dup, &error, offsets_heap, heap,
+			index->online_log->head.buf,
+			(&index->online_log->head.buf)[1], offsets);
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		} else if (UNIV_UNLIKELY(mrec == NULL)) {
+			/* The record was not reassembled properly. */
+			goto corruption;
+		}
+		/* The record was previously found out to be
+		truncated. Now that the parse buffer was extended,
+		it should proceed beyond the old end of the buffer. */
+		ut_a(mrec > mrec_end);
+
+		/* The bytes parsed beyond the old end of the buffer
+		were taken from the start of the new block; skip them. */
+		index->online_log->head.bytes = mrec - mrec_end;
+		next_mrec += index->online_log->head.bytes;
+	}
+
+	ut_ad(next_mrec <= next_mrec_end);
+	/* The following loop must not be parsing the temporary
+	buffer, but head.block or tail.block. */
+
+	/* mrec!=NULL means that the next record starts from the
+	middle of the block */
+	ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+	if (next_mrec_end == index->online_log->head.block
+	    + srv_sort_buf_size) {
+		/* If tail.bytes == 0, next_mrec_end can also be at
+		the end of tail.block. */
+		if (index->online_log->tail.bytes == 0) {
+			ut_ad(next_mrec == next_mrec_end);
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->head.bytes == 0);
+		} else {
+			ut_ad(next_mrec == index->online_log->head.block
+			      + index->online_log->head.bytes);
+			ut_ad(index->online_log->tail.blocks
+			      > index->online_log->head.blocks);
+		}
+	} else if (next_mrec_end == index->online_log->tail.block
+		   + index->online_log->tail.bytes) {
+		ut_ad(next_mrec == index->online_log->tail.block
+		      + index->online_log->head.bytes);
+		ut_ad(index->online_log->tail.blocks == 0);
+		ut_ad(index->online_log->head.blocks == 0);
+		ut_ad(index->online_log->head.bytes
+		      <= index->online_log->tail.bytes);
+	} else {
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
+
+	mrec_end = next_mrec_end;
+
+	while (!trx_is_interrupted(trx)) {
+		mrec = next_mrec;
+		ut_ad(mrec < mrec_end);
+
+		if (!has_index_lock) {
+			/* We are applying operations from a different
+			block than the one that is being written to.
+			We do not hold index->lock in order to
+			allow other threads to concurrently buffer
+			modifications. */
+			ut_ad(mrec >= index->online_log->head.block);
+			ut_ad(mrec_end == index->online_log->head.block
+			      + srv_sort_buf_size);
+			ut_ad(index->online_log->head.bytes
+			      < srv_sort_buf_size);
+
+			/* Take the opportunity to do a redo log
+			checkpoint if needed. */
+			log_free_check();
+		} else {
+			/* We are applying operations from the last block.
+			Do not allow other threads to buffer anything,
+			so that we can finally catch up and synchronize. */
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(mrec_end == index->online_log->tail.block
+			      + index->online_log->tail.bytes);
+			ut_ad(mrec >= index->online_log->tail.block);
+		}
+
+		/* This read is not protected by index->online_log->mutex
+		for performance reasons. We will eventually notice any
+		error that was flagged by a DML thread. */
+		error = index->online_log->error;
+
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		}
+
+		next_mrec = row_log_table_apply_op(
+			thr, trx_id_col, new_trx_id_col,
+			dup, &error, offsets_heap, heap,
+			mrec, mrec_end, offsets);
+
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		} else if (next_mrec == next_mrec_end) {
+			/* The record happened to end on a block boundary.
+			Do we have more blocks left? */
+			if (has_index_lock) {
+				/* The index will be locked while
+				applying the last block. */
+				goto all_done;
+			}
+
+			mrec = NULL;
+process_next_block:
+			rw_lock_x_lock(dict_index_get_lock(index));
+			has_index_lock = true;
+
+			index->online_log->head.bytes = 0;
+			index->online_log->head.blocks++;
+			goto next_block;
+		} else if (next_mrec != NULL) {
+			ut_ad(next_mrec < next_mrec_end);
+			index->online_log->head.bytes += next_mrec - mrec;
+		} else if (has_index_lock) {
+			/* When mrec is within tail.block, it should
+			be a complete record, because we are holding
+			index->lock and thus excluding the writer. */
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(mrec_end == index->online_log->tail.block
+			      + index->online_log->tail.bytes);
+			ut_ad(0);
+			goto unexpected_eof;
+		} else {
+			/* The record is truncated at the end of the
+			block. Save its beginning in head.buf and let
+			mrec, mrec_end delimit the saved bytes there,
+			so that the rest can be appended from the
+			next block. */
+			memcpy(index->online_log->head.buf, mrec,
+			       mrec_end - mrec);
+			mrec_end += index->online_log->head.buf - mrec;
+			mrec = index->online_log->head.buf;
+			goto process_next_block;
+		}
+	}
+
+interrupted:
+	error = DB_INTERRUPTED;
+func_exit:
+	/* Return with index->lock held, as it was on entry. */
+	if (!has_index_lock) {
+		rw_lock_x_lock(dict_index_get_lock(index));
+	}
+
+	mem_heap_free(offsets_heap);
+	mem_heap_free(heap);
+	ut_free(offsets);
+	return(error);
+}
+
+/******************************************************//**
+Apply the row_log_table log to a table upon completing rebuild.
+The clustered index of the old table is latched in exclusive mode
+around the call to row_log_table_apply_ops().
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_table_apply(
+/*================*/
+	que_thr_t*	thr,	/*!< in: query graph */
+	dict_table_t*	old_table,
+				/*!< in: old table */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+{
+	thr_get_trx(thr)->error_key_num = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	dict_index_t* const	primary
+		= dict_table_get_first_index(old_table);
+	/* Result reported when there is no online log to apply. */
+	dberr_t			err = DB_ERROR;
+
+	rw_lock_x_lock(dict_index_get_lock(primary));
+
+	if (primary->online_log != NULL) {
+		row_merge_dup_t	dup = {
+			primary, table,
+			primary->online_log->col_map, 0
+		};
+
+		err = row_log_table_apply_ops(thr, &dup);
+
+		/* A successful apply must have consumed the whole log. */
+		ut_ad(err != DB_SUCCESS
+		      || primary->online_log->head.total
+		      == primary->online_log->tail.total);
+	} else {
+		/* This function should not be called unless
+		rebuilding a table online. Build in some fault
+		tolerance: fail the debug assertion, but return
+		DB_ERROR in a non-debug build. */
+		ut_ad(dict_index_get_online_status(primary)
+		      == ONLINE_INDEX_COMPLETE);
+		ut_ad(0);
+	}
+
+	rw_lock_x_unlock(dict_index_get_lock(primary));
+	return(err);
+}
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+One large allocation holds two blocks of srv_sort_buf_size bytes
+(head.block and tail.block) followed by the row_log_t itself.
+@retval true if success, false if not */
+UNIV_INTERN
+bool
+row_log_allocate(
+/*=============*/
+	dict_index_t*	index,	/*!< in/out: index */
+	dict_table_t*	table,	/*!< in/out: new table being rebuilt,
+				or NULL when creating a secondary index */
+	bool		same_pk,/*!< in: whether the definition of the
+				PRIMARY KEY has remained the same */
+	const dtuple_t*	add_cols,
+				/*!< in: default values of
+				added columns, or NULL */
+	const ulint*	col_map)/*!< in: mapping of old column
+				numbers to new ones, or NULL if !table */
+{
+	DBUG_ENTER("row_log_allocate");
+
+	ut_ad(!dict_index_is_online_ddl(index));
+	ut_ad(dict_index_is_clust(index) == !!table);
+	ut_ad(!table || index->table != table);
+	ut_ad(same_pk || table);
+	ut_ad(!table || col_map);
+	ut_ad(!add_cols || col_map);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* os_mem_alloc_large() receives the size by pointer; the value
+	left in alloc_size is what gets passed to os_mem_free_large(). */
+	ulint	alloc_size = 2 * srv_sort_buf_size + sizeof(row_log_t);
+	byte*	block = static_cast<byte*>(
+		os_mem_alloc_large(&alloc_size, FALSE));
+
+	if (block == NULL) {
+		DBUG_RETURN(false);
+	}
+
+	/* The log descriptor is placed right after the two blocks. */
+	row_log_t*	log = reinterpret_cast<row_log_t*>(
+		block + 2 * srv_sort_buf_size);
+
+	log->size = alloc_size;
+	log->fd = row_merge_file_create_low();
+
+	if (log->fd < 0) {
+		/* No temporary file: give the memory back. */
+		os_mem_free_large(block, alloc_size);
+		DBUG_RETURN(false);
+	}
+
+	mutex_create(index_online_log_key, &log->mutex,
+		     SYNC_INDEX_ONLINE_LOG);
+
+	/* Parameters of the online operation */
+	log->blobs = NULL;
+	log->table = table;
+	log->same_pk = same_pk;
+	log->add_cols = add_cols;
+	log->col_map = col_map;
+	log->error = DB_SUCCESS;
+	log->max_trx = 0;
+
+	/* Read position (head) */
+	log->head.block = block;
+	log->head.blocks = 0;
+	log->head.bytes = 0;
+	log->head.total = 0;
+
+	/* Write position (tail) */
+	log->tail.block = block + srv_sort_buf_size;
+	log->tail.blocks = 0;
+	log->tail.bytes = 0;
+	log->tail.total = 0;
+
+	dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
+	index->online_log = log;
+
+	/* While we might be holding an exclusive data dictionary lock
+	here, in row_log_abort_sec() we will not always be holding it. Use
+	atomic operations in both cases. */
+	MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX);
+
+	DBUG_RETURN(true);
+}
+
+/******************************************************//**
+Free the row log for an index that was being created online.
+Decrements MONITOR_ONLINE_CREATE_INDEX, deletes log->blobs, destroys
+the temporary file and the mutex, and finally releases the memory that
+starts at log->head.block. In row_log_allocate() the row_log_t itself
+is placed inside that same allocation, so the memory must be released
+only after the last access to *log. The caller's pointer is reset to 0. */
+UNIV_INTERN
+void
+row_log_free(
+/*=========*/
+	row_log_t*&	log)	/*!< in,own: row log; set to 0 on return */
+{
+	MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
+
+	delete log->blobs;
+	row_merge_file_destroy_low(log->fd);
+	mutex_free(&log->mutex);
+	os_mem_free_large(log->head.block, log->size); /* frees *log too */
+	log = 0;
+}
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+The caller must hold index->lock either in exclusive mode, or in shared
+mode together with index->online_log->mutex.
+@return latest transaction ID, or 0 if nothing was logged */
+UNIV_INTERN
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, must be locked */
+{
+	row_log_t*	log = index->online_log;
+
+	ut_ad(ONLINE_INDEX_CREATION == dict_index_get_online_status(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+	       && mutex_own(&log->mutex))
+	      || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	const trx_id_t	max_trx = log->max_trx;
+
+	return(max_trx);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created.
+The entry is looked up at the leaf level with PAGE_CUR_LE, and the
+operation is applied only if it has not taken effect yet:
+ROW_OP_DELETE removes a record that matches on all fields, and
+ROW_OP_INSERT inserts the entry unless an identical record exists.
+A conflicting record in a unique index is reported through
+row_merge_dup_report(), unless the entry contains a NULL value.
+An optimistic operation is tried first, then the pessimistic one.
+On success with a nonzero trx_id, the PAGE_MAX_TRX_ID of the page
+is updated.
+NOTE: *error is assigned only when a B-tree modification is attempted;
+the paths that jump straight to func_exit leave it untouched, so the
+caller is expected to have initialized it. */
+static __attribute__((nonnull))
+void
+row_log_apply_op_low(
+/*=================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	row_merge_dup_t*dup,		/*!< in/out: for reporting
+					duplicate key errors */
+	dberr_t*	error,		/*!< out: DB_SUCCESS or error code */
+	mem_heap_t*	offsets_heap,	/*!< in/out: memory heap for
+					allocating offsets; can be emptied */
+	bool		has_index_lock, /*!< in: true if holding index->lock
+					in exclusive mode */
+	enum row_op	op,		/*!< in: operation being applied */
+	trx_id_t	trx_id,		/*!< in: transaction identifier */
+	const dtuple_t*	entry)		/*!< in: row */
+{
+	mtr_t		mtr;
+	btr_cur_t	cursor;
+	ulint*		offsets = NULL;
+
+	ut_ad(!dict_index_is_clust(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
+	      == has_index_lock);
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!dict_index_is_corrupted(index));
+	ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
+
+	mtr_start(&mtr);
+
+	/* We perform the pessimistic variant of the operations if we
+	already hold index->lock exclusively. First, search the
+	record. The operation may already have been performed,
+	depending on when the row in the clustered index was
+	scanned. */
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				    has_index_lock
+				    ? BTR_MODIFY_TREE
+				    : BTR_MODIFY_LEAF,
+				    &cursor, 0, __FILE__, __LINE__,
+				    &mtr);
+
+	ut_ad(dict_index_get_n_unique(index) > 0);
+	/* This test is somewhat similar to row_ins_must_modify_rec(),
+	but not identical for unique secondary indexes. */
+	if (cursor.low_match >= dict_index_get_n_unique(index)
+	    && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) {
+		/* We have a matching record. It is an exact match
+		(exists) only if all fields of the index match. */
+		bool	exists	= (cursor.low_match
+				   == dict_index_get_n_fields(index));
+#ifdef UNIV_DEBUG
+		rec_t*	rec	= btr_cur_get_rec(&cursor);
+		ut_ad(page_rec_is_user_rec(rec));
+		ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+#endif /* UNIV_DEBUG */
+
+		ut_ad(exists || dict_index_is_unique(index));
+
+		switch (op) {
+		case ROW_OP_DELETE:
+			if (!exists) {
+				/* The record was already deleted. */
+				goto func_exit;
+			}
+
+			if (btr_cur_optimistic_delete(
+				    &cursor, BTR_CREATE_FLAG, &mtr)) {
+				*error = DB_SUCCESS;
+				break;
+			}
+
+			if (!has_index_lock) {
+				/* This needs a pessimistic operation.
+				Lock the index tree exclusively. */
+				mtr_commit(&mtr);
+				mtr_start(&mtr);
+				btr_cur_search_to_nth_level(
+					index, 0, entry, PAGE_CUR_LE,
+					BTR_MODIFY_TREE, &cursor, 0,
+					__FILE__, __LINE__, &mtr);
+
+				/* No other thread than the current one
+				is allowed to modify the index tree.
+				Thus, the record should still exist. */
+				ut_ad(cursor.low_match
+				      >= dict_index_get_n_fields(index));
+				ut_ad(page_rec_is_user_rec(
+					      btr_cur_get_rec(&cursor)));
+			}
+
+			/* As there are no externally stored fields in
+			a secondary index record, the parameter
+			rb_ctx = RB_NONE will be ignored. */
+
+			btr_cur_pessimistic_delete(
+				error, FALSE, &cursor,
+				BTR_CREATE_FLAG, RB_NONE, &mtr);
+			break;
+		case ROW_OP_INSERT:
+			if (exists) {
+				/* The record already exists. There
+				is nothing to be inserted. */
+				goto func_exit;
+			}
+
+			if (dtuple_contains_null(entry)) {
+				/* The UNIQUE KEY columns match, but
+				there is a NULL value in the key, and
+				NULL!=NULL. The jump target is inside
+				the switch of the other branch below. */
+				goto insert_the_rec;
+			}
+
+			/* Duplicate key error */
+			ut_ad(dict_index_is_unique(index));
+			row_merge_dup_report(dup, entry->fields);
+			goto func_exit;
+		}
+	} else {
+		switch (op) {
+			rec_t*		rec;
+			big_rec_t*	big_rec;
+		case ROW_OP_DELETE:
+			/* The record does not exist. */
+			goto func_exit;
+		case ROW_OP_INSERT:
+			if (dict_index_is_unique(index)
+			    && (cursor.up_match
+				>= dict_index_get_n_unique(index)
+				|| cursor.low_match
+				>= dict_index_get_n_unique(index))
+			    && (!index->n_nullable
+				|| !dtuple_contains_null(entry))) {
+				/* Duplicate key */
+				row_merge_dup_report(dup, entry->fields);
+				goto func_exit;
+			}
+insert_the_rec:
+			/* Insert the record. As we are inserting into
+			a secondary index, there cannot be externally
+			stored columns (!big_rec). */
+			*error = btr_cur_optimistic_insert(
+				BTR_NO_UNDO_LOG_FLAG
+				| BTR_NO_LOCKING_FLAG
+				| BTR_CREATE_FLAG,
+				&cursor, &offsets, &offsets_heap,
+				const_cast<dtuple_t*>(entry),
+				&rec, &big_rec, 0, NULL, &mtr);
+			ut_ad(!big_rec);
+			if (*error != DB_FAIL) {
+				break;
+			}
+
+			if (!has_index_lock) {
+				/* This needs a pessimistic operation.
+				Lock the index tree exclusively. */
+				mtr_commit(&mtr);
+				mtr_start(&mtr);
+				btr_cur_search_to_nth_level(
+					index, 0, entry, PAGE_CUR_LE,
+					BTR_MODIFY_TREE, &cursor, 0,
+					__FILE__, __LINE__, &mtr);
+			}
+
+			/* We already determined that the
+			record did not exist. No other thread
+			than the current one is allowed to
+			modify the index tree. Thus, the
+			record should still not exist. */
+
+			*error = btr_cur_pessimistic_insert(
+				BTR_NO_UNDO_LOG_FLAG
+				| BTR_NO_LOCKING_FLAG
+				| BTR_CREATE_FLAG,
+				&cursor, &offsets, &offsets_heap,
+				const_cast<dtuple_t*>(entry),
+				&rec, &big_rec,
+				0, NULL, &mtr);
+			ut_ad(!big_rec);
+			break;
+		}
+		mem_heap_empty(offsets_heap);
+	}
+
+	if (*error == DB_SUCCESS && trx_id) {
+		page_update_max_trx_id(btr_cur_get_block(&cursor),
+				       btr_cur_get_page_zip(&cursor),
+				       trx_id, &mtr);
+	}
+
+func_exit:
+	mtr_commit(&mtr);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created.
+Parses one log record starting at mrec and passes it to
+row_log_apply_op_low(). As parsed here, a record consists of
+1 byte of operation type (ROW_OP_INSERT or ROW_OP_DELETE),
+DATA_TRX_ID_LEN bytes of transaction identifier (ROW_OP_INSERT only),
+1 or 2 bytes of extra_size (a first byte >= 0x80 means that its low
+7 bits are the high byte of a 2-byte value), extra_size bytes that
+precede the record origin, and the data bytes of the record.
+If the record does not fit between mrec and mrec_end, NULL is
+returned with *error == DB_SUCCESS. On corruption, NULL is returned
+with *error set to DB_CORRUPTION or DB_INDEX_CORRUPT.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static __attribute__((nonnull, warn_unused_result))
+const mrec_t*
+row_log_apply_op(
+/*=============*/
+	dict_index_t*	index,		/*!< in/out: index */
+	row_merge_dup_t*dup,		/*!< in/out: for reporting
+					duplicate key errors */
+	dberr_t*	error,		/*!< out: DB_SUCCESS or error code */
+	mem_heap_t*	offsets_heap,	/*!< in/out: memory heap for
+					allocating offsets; can be emptied */
+	mem_heap_t*	heap,		/*!< in/out: memory heap for
+					allocating data tuples */
+	bool		has_index_lock, /*!< in: true if holding index->lock
+					in exclusive mode */
+	const mrec_t*	mrec,		/*!< in: merge record */
+	const mrec_t*	mrec_end,	/*!< in: end of buffer */
+	ulint*		offsets)	/*!< in/out: work area for
+					rec_init_offsets_temp() */
+
+{
+	enum row_op	op;
+	ulint		extra_size;
+	ulint		data_size;
+	ulint		n_ext;
+	dtuple_t*	entry;
+	trx_id_t	trx_id;
+
+	/* Online index creation is only used for secondary indexes. */
+	ut_ad(!dict_index_is_clust(index));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
+	      == has_index_lock);
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (dict_index_is_corrupted(index)) {
+		*error = DB_INDEX_CORRUPT;
+		return(NULL);
+	}
+
+	*error = DB_SUCCESS;
+
+	if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) {
+		return(NULL);
+	}
+
+	switch (*mrec) {
+	case ROW_OP_INSERT:
+		if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) {
+			return(NULL);
+		}
+
+		op = static_cast<enum row_op>(*mrec++);
+		trx_id = trx_read_trx_id(mrec);
+		mrec += DATA_TRX_ID_LEN;
+		break;
+	case ROW_OP_DELETE:
+		op = static_cast<enum row_op>(*mrec++);
+		trx_id = 0; /* no transaction identifier is logged */
+		break;
+	default:
+corrupted:
+		ut_ad(0);
+		*error = DB_CORRUPTION;
+		return(NULL);
+	}
+
+	extra_size = *mrec++;
+
+	ut_ad(mrec < mrec_end);
+
+	if (extra_size >= 0x80) {
+		/* Read another byte of extra_size. */
+
+		extra_size = (extra_size & 0x7f) << 8;
+		extra_size |= *mrec++;
+	}
+
+	mrec += extra_size; /* advance to the origin of the record */
+
+	if (mrec > mrec_end) {
+		return(NULL);
+	}
+
+	rec_init_offsets_temp(mrec, index, offsets);
+
+	if (rec_offs_any_extern(offsets)) {
+		/* There should never be any externally stored fields
+		in a secondary index, which is what online index
+		creation is used for. Therefore, the log file must be
+		corrupted. */
+		goto corrupted;
+	}
+
+	data_size = rec_offs_data_size(offsets);
+
+	mrec += data_size; /* now points to the end of this record */
+
+	if (mrec > mrec_end) {
+		return(NULL);
+	}
+
+	entry = row_rec_to_index_entry_low(
+		mrec - data_size, index, offsets, &n_ext, heap);
+	/* Online index creation is only implemented for secondary
+	indexes, which never contain off-page columns. */
+	ut_ad(n_ext == 0);
+#ifdef ROW_LOG_APPLY_PRINT
+	if (row_log_apply_print) {
+		fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ",
+			index->id, trx_id,
+			unsigned (op), unsigned (has_index_lock));
+		for (const byte* m = mrec - data_size; m < mrec; m++) {
+			fprintf(stderr, "%02x", *m);
+		}
+		putc('\n', stderr);
+	}
+#endif /* ROW_LOG_APPLY_PRINT */
+	row_log_apply_op_low(index, dup, error, offsets_heap,
+			     has_index_lock, op, trx_id, entry);
+	return(mrec);
+}
+
+/******************************************************//**
+Applies operations to a secondary index that was being created.
+@return DB_SUCCESS, or error code on failure */
+static __attribute__((nonnull))
+dberr_t
+row_log_apply_ops(
+/*==============*/
+	trx_t*		trx,	/*!< in: transaction (for checking if
+				the operation was interrupted) */
+	dict_index_t*	index,	/*!< in/out: index */
+	row_merge_dup_t*dup)	/*!< in/out: for reporting duplicate key
+				errors */
+{
+	dberr_t		error;
+	const mrec_t*	mrec	= NULL;
+	const mrec_t*	next_mrec;
+	const mrec_t*	mrec_end= NULL; /* silence bogus warning */
+	const mrec_t*	next_mrec_end;
+	mem_heap_t*	offsets_heap;
+	mem_heap_t*	heap;
+	ulint*		offsets;
+	bool		has_index_lock;
+	const ulint	i	= 1 + REC_OFFS_HEADER_SIZE
+		+ dict_index_get_n_fields(index);
+
+	ut_ad(dict_index_is_online_ddl(index));
+	ut_ad(*index->name == TEMP_INDEX_PREFIX);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(index->online_log);
+	UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
+
+	offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
+	offsets[0] = i;
+	offsets[1] = dict_index_get_n_fields(index);
+
+	offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
+	heap = mem_heap_create(UNIV_PAGE_SIZE);
+	has_index_lock = true;
+
+next_block:
+	ut_ad(has_index_lock);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(index->online_log->head.bytes == 0);
+
+	if (trx_is_interrupted(trx)) {
+		goto interrupted;
+	}
+
+	if (dict_index_is_corrupted(index)) {
+		error = DB_INDEX_CORRUPT;
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(index->online_log->head.blocks
+			  > index->online_log->tail.blocks)) {
+unexpected_eof:
+		fprintf(stderr, "InnoDB: unexpected end of temporary file"
+			" for index %s\n", index->name + 1);
+corruption:
+		error = DB_CORRUPTION;
+		goto func_exit;
+	}
+
+	if (index->online_log->head.blocks
+	    == index->online_log->tail.blocks) {
+		if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+			/* Truncate the file in order to save space. */
+			ftruncate(index->online_log->fd, 0);
+#endif /* HAVE_FTRUNCATE */
+			index->online_log->head.blocks
+				= index->online_log->tail.blocks = 0;
+		}
+
+		next_mrec = index->online_log->tail.block;
+		next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+		if (next_mrec_end == next_mrec) {
+			/* End of log reached. */
+all_done:
+			ut_ad(has_index_lock);
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->tail.blocks == 0);
+			error = DB_SUCCESS;
+			goto func_exit;
+		}
+	} else {
+		os_offset_t	ofs;
+		ibool		success;
+
+		ofs = (os_offset_t) index->online_log->head.blocks
+			* srv_sort_buf_size;
+
+		ut_ad(has_index_lock);
+		has_index_lock = false;
+		rw_lock_x_unlock(dict_index_get_lock(index));
+
+		log_free_check();
+
+		success = os_file_read_no_error_handling(
+			OS_FILE_FROM_FD(index->online_log->fd),
+			index->online_log->head.block, ofs,
+			srv_sort_buf_size);
+
+		if (!success) {
+			fprintf(stderr, "InnoDB: unable to read temporary file"
+				" for index %s\n", index->name + 1);
+			goto corruption;
+		}
+
+#ifdef POSIX_FADV_DONTNEED
+		/* Each block is read exactly once.  Free up the file cache. */
+		posix_fadvise(index->online_log->fd,
+			      ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+#if 0 //def FALLOC_FL_PUNCH_HOLE
+		/* Try to deallocate the space for the file on disk.
+		This should work on ext4 on Linux 2.6.39 and later,
+		and be ignored when the operation is unsupported. */
+		fallocate(index->online_log->fd,
+			  FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+			  ofs, srv_buf_size);
+#endif /* FALLOC_FL_PUNCH_HOLE */
+
+		next_mrec = index->online_log->head.block;
+		next_mrec_end = next_mrec + srv_sort_buf_size;
+	}
+
+	if (mrec) {
+		/* A partial record was read from the previous block.
+		Copy the temporary buffer full, as we do not know the
+		length of the record. Parse subsequent records from
+		the bigger buffer index->online_log->head.block
+		or index->online_log->tail.block. */
+
+		ut_ad(mrec == index->online_log->head.buf);
+		ut_ad(mrec_end > mrec);
+		ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+		memcpy((mrec_t*) mrec_end, next_mrec,
+		       (&index->online_log->head.buf)[1] - mrec_end);
+		mrec = row_log_apply_op(
+			index, dup, &error, offsets_heap, heap,
+			has_index_lock, index->online_log->head.buf,
+			(&index->online_log->head.buf)[1], offsets);
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		} else if (UNIV_UNLIKELY(mrec == NULL)) {
+			/* The record was not reassembled properly. */
+			goto corruption;
+		}
+		/* The record was previously found out to be
+		truncated. Now that the parse buffer was extended,
+		it should proceed beyond the old end of the buffer. */
+		ut_a(mrec > mrec_end);
+
+		index->online_log->head.bytes = mrec - mrec_end;
+		next_mrec += index->online_log->head.bytes;
+	}
+
+	ut_ad(next_mrec <= next_mrec_end);
+	/* The following loop must not be parsing the temporary
+	buffer, but head.block or tail.block. */
+
+	/* mrec!=NULL means that the next record starts from the
+	middle of the block */
+	ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+	if (next_mrec_end == index->online_log->head.block
+	    + srv_sort_buf_size) {
+		/* If tail.bytes == 0, next_mrec_end can also be at
+		the end of tail.block. */
+		if (index->online_log->tail.bytes == 0) {
+			ut_ad(next_mrec == next_mrec_end);
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->head.bytes == 0);
+		} else {
+			ut_ad(next_mrec == index->online_log->head.block
+			      + index->online_log->head.bytes);
+			ut_ad(index->online_log->tail.blocks
+			      > index->online_log->head.blocks);
+		}
+	} else if (next_mrec_end == index->online_log->tail.block
+		   + index->online_log->tail.bytes) {
+		ut_ad(next_mrec == index->online_log->tail.block
+		      + index->online_log->head.bytes);
+		ut_ad(index->online_log->tail.blocks == 0);
+		ut_ad(index->online_log->head.blocks == 0);
+		ut_ad(index->online_log->head.bytes
+		      <= index->online_log->tail.bytes);
+	} else {
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
+
+	mrec_end = next_mrec_end;
+
+	while (!trx_is_interrupted(trx)) {
+		mrec = next_mrec;
+		ut_ad(mrec < mrec_end);
+
+		if (!has_index_lock) {
+			/* We are applying operations from a different
+			block than the one that is being written to.
+			We do not hold index->lock in order to
+			allow other threads to concurrently buffer
+			modifications. */
+			ut_ad(mrec >= index->online_log->head.block);
+			ut_ad(mrec_end == index->online_log->head.block
+			      + srv_sort_buf_size);
+			ut_ad(index->online_log->head.bytes
+			      < srv_sort_buf_size);
+
+			/* Take the opportunity to do a redo log
+			checkpoint if needed. */
+			log_free_check();
+		} else {
+			/* We are applying operations from the last block.
+			Do not allow other threads to buffer anything,
+			so that we can finally catch up and synchronize. */
+			ut_ad(index->online_log->head.blocks == 0);
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(mrec_end == index->online_log->tail.block
+			      + index->online_log->tail.bytes);
+			ut_ad(mrec >= index->online_log->tail.block);
+		}
+
+		next_mrec = row_log_apply_op(
+			index, dup, &error, offsets_heap, heap,
+			has_index_lock, mrec, mrec_end, offsets);
+
+		if (error != DB_SUCCESS) {
+			goto func_exit;
+		} else if (next_mrec == next_mrec_end) {
+			/* The record happened to end on a block boundary.
+			Do we have more blocks left? */
+			if (has_index_lock) {
+				/* The index will be locked while
+				applying the last block. */
+				goto all_done;
+			}
+
+			mrec = NULL;
+process_next_block:
+			rw_lock_x_lock(dict_index_get_lock(index));
+			has_index_lock = true;
+
+			index->online_log->head.bytes = 0;
+			index->online_log->head.blocks++;
+			goto next_block;
+		} else if (next_mrec != NULL) {
+			ut_ad(next_mrec < next_mrec_end);
+			index->online_log->head.bytes += next_mrec - mrec;
+		} else if (has_index_lock) {
+			/* When mrec is within tail.block, it should
+			be a complete record, because we are holding
+			index->lock and thus excluding the writer. */
+			ut_ad(index->online_log->tail.blocks == 0);
+			ut_ad(mrec_end == index->online_log->tail.block
+			      + index->online_log->tail.bytes);
+			ut_ad(0);
+			goto unexpected_eof;
+		} else {
+			memcpy(index->online_log->head.buf, mrec,
+			       mrec_end - mrec);
+			mrec_end += index->online_log->head.buf - mrec;
+			mrec = index->online_log->head.buf;
+			goto process_next_block;
+		}
+	}
+
+interrupted:
+	error = DB_INTERRUPTED;
+func_exit:
+	if (!has_index_lock) {
+		rw_lock_x_lock(dict_index_get_lock(index));
+	}
+
+	switch (error) {
+	case DB_SUCCESS:
+		break;
+	case DB_INDEX_CORRUPT:
+		if (((os_offset_t) index->online_log->tail.blocks + 1)
+		    * srv_sort_buf_size >= srv_online_max_size) {
+			/* The log file grew too big. */
+			error = DB_ONLINE_LOG_TOO_BIG;
+		}
+		/* fall through */
+	default:
+		/* We set the flag directly instead of invoking
+		dict_set_corrupted_index_cache_only(index) here,
+		because the index is not "public" yet. */
+		index->type |= DICT_CORRUPT;
+	}
+
+	mem_heap_free(heap);
+	mem_heap_free(offsets_heap);
+	ut_free(offsets);
+	return(error);
+}
+
+/******************************************************//**
+Apply the row log to the index upon completing index creation.
+@return DB_SUCCESS, DB_DUPLICATE_KEY if duplicates were found, or error code */
+UNIV_INTERN
+dberr_t
+row_log_apply(
+/*==========*/
+	trx_t*		trx,	/*!< in: transaction (for checking if
+				the operation was interrupted) */
+	dict_index_t*	index,	/*!< in/out: secondary index */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+{
+	dberr_t		error;
+	row_log_t*	log;
+	row_merge_dup_t	dup = { index, table, NULL, 0 };
+	DBUG_ENTER("row_log_apply");
+
+	ut_ad(dict_index_is_online_ddl(index));
+	ut_ad(!dict_index_is_clust(index));
+
+	log_free_check();	/* redo log checkpoint, if needed */
+
+	rw_lock_x_lock(dict_index_get_lock(index));
+	/* row_log_apply_ops() may release the lock but returns holding it. */
+	if (!dict_table_is_corrupted(index->table)) {
+		error = row_log_apply_ops(trx, index, &dup);
+	} else {
+		error = DB_SUCCESS;	/* the log is not applied */
+	}
+
+	if (error != DB_SUCCESS || dup.n_dup) {
+		ut_a(!dict_table_is_discarded(index->table));
+		/* We set the flag directly instead of invoking
+		dict_set_corrupted_index_cache_only(index) here,
+		because the index is not "public" yet. */
+		index->type |= DICT_CORRUPT;
+		index->table->drop_aborted = TRUE;
+
+		if (error == DB_SUCCESS) {
+			error = DB_DUPLICATE_KEY;	/* dup.n_dup != 0 */
+		}
+
+		dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	} else {
+		dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
+	}
+
+	log = index->online_log;
+	index->online_log = NULL;
+	/* We could remove the TEMP_INDEX_PREFIX and update the data
+	dictionary to say that this index is complete, if we had
+	access to the .frm file here.  If the server crashes before
+	all requested indexes have been created, this completed index
+	will be dropped. */
+	rw_lock_x_unlock(dict_index_get_lock(index));
+
+	row_log_free(log);	/* after releasing index->lock */
+
+	DBUG_RETURN(error);
+}
diff --git a/storage/xtradb/row/row0merge.c b/storage/xtradb/row/row0merge.c
deleted file mode 100644
index 752e941af7f..00000000000
--- a/storage/xtradb/row/row0merge.c
+++ /dev/null
@@ -1,2885 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file row/row0merge.c
-New index creation routines using a merge sort
-
-Created 12/4/2005 Jan Lindstrom
-Completed by Sunny Bains and Marko Makela
-*******************************************************/
-
-#include "row0merge.h"
-#include "row0ext.h"
-#include "row0row.h"
-#include "row0upd.h"
-#include "row0ins.h"
-#include "row0sel.h"
-#include "dict0dict.h"
-#include "dict0mem.h"
-#include "dict0boot.h"
-#include "dict0crea.h"
-#include "dict0load.h"
-#include "btr0btr.h"
-#include "mach0data.h"
-#include "trx0rseg.h"
-#include "trx0trx.h"
-#include "trx0roll.h"
-#include "trx0undo.h"
-#include "trx0purge.h"
-#include "trx0rec.h"
-#include "que0que.h"
-#include "rem0cmp.h"
-#include "read0read.h"
-#include "os0file.h"
-#include "lock0lock.h"
-#include "data0data.h"
-#include "data0type.h"
-#include "que0que.h"
-#include "pars0pars.h"
-#include "mem0mem.h"
-#include "log0log.h"
-#include "ut0sort.h"
-#include "handler0alter.h"
-#include "ha_prototypes.h"
-
-/* Ignore posix_fadvise() on those platforms where it does not exist */
-#if defined __WIN__
-# define posix_fadvise(fd, offset, len, advice) /* nothing */
-#endif /* __WIN__ */
-
-#ifdef __WIN__
-/* error LNK2001: unresolved external symbol _debug_sync_C_callback_ptr */
-# define DEBUG_SYNC_C(dummy) ((void) 0)
-#else
-# include "m_string.h" /* for my_sys.h */
-# include "my_sys.h" /* DEBUG_SYNC_C */
-#endif
-
-#ifdef UNIV_DEBUG
-/** Set these in order ot enable debug printout. */
-/* @{ */
-/** Log the outcome of each row_merge_cmp() call, comparing records. */
-static ibool	row_merge_print_cmp;
-/** Log each record read from temporary file. */
-static ibool	row_merge_print_read;
-/** Log each record write to temporary file. */
-static ibool	row_merge_print_write;
-/** Log each row_merge_blocks() call, merging two blocks of records to
-a bigger one. */
-static ibool	row_merge_print_block;
-/** Log each block read from temporary file. */
-static ibool	row_merge_print_block_read;
-/** Log each block read from temporary file. */
-static ibool	row_merge_print_block_write;
-/* @} */
-#endif /* UNIV_DEBUG */
-
-/** @brief Block size for I/O operations in merge sort.
-
-The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
-rounded to a power of 2.
-
-When not creating a PRIMARY KEY that contains column prefixes, this
-can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
-ut_ad(data_size < sizeof(row_merge_block_t)).
-1MB is the default merge-sort block size for innodb */
-typedef byte* row_merge_block_t;
-
-/** @brief Secondary buffer for I/O operations of merge records.
-
-This buffer is used for writing or reading a record that spans two
-row_merge_block_t.  Thus, it must be able to hold one merge record,
-whose maximum size is the same as the minimum size of
-row_merge_block_t. */
-typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
-
-/** @brief Merge record in row_merge_block_t.
-
-The format is the same as a record in ROW_FORMAT=COMPACT with the
-exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
-typedef byte	mrec_t;
-
-/** Buffer for sorting in main memory. */
-struct row_merge_buf_struct {
-	mem_heap_t*	heap;		/*!< memory heap where allocated */
-	dict_index_t*	index;		/*!< the index the tuples belong to */
-	ulint		total_size;	/*!< total amount of data bytes */
-	ulint		n_tuples;	/*!< number of data tuples */
-	ulint		max_tuples;	/*!< maximum number of data tuples */
-	const dfield_t**tuples;		/*!< array of pointers to
-					arrays of fields that form
-					the data tuples */
-	const dfield_t**tmp_tuples;	/*!< temporary copy of tuples,
-					for sorting */
-};
-
-/** Buffer for sorting in main memory. */
-typedef struct row_merge_buf_struct row_merge_buf_t;
-
-/** Information about temporary files used in merge sort */
-struct merge_file_struct {
-	int		fd;		/*!< file descriptor */
-	ulint		offset;		/*!< file offset (end of file) */
-	ib_uint64_t	n_rec;		/*!< number of records in the file */
-};
-
-/** Information about temporary files used in merge sort */
-typedef struct merge_file_struct merge_file_t;
-
-#ifdef UNIV_DEBUG
-/******************************************************//**
-Display a merge tuple. */
-static
-void
-row_merge_tuple_print(
-/*==================*/
-	FILE*		f,	/*!< in: output stream */
-	const dfield_t*	entry,	/*!< in: tuple to print */
-	ulint		n_fields)/*!< in: number of fields in the tuple */
-{
-	ulint	j;
-
-	for (j = 0; j < n_fields; j++) {
-		const dfield_t*	field = &entry[j];
-
-		if (dfield_is_null(field)) {
-			fputs("\n NULL;", f);
-		} else {
-			ulint	field_len	= dfield_get_len(field);
-			ulint	len		= ut_min(field_len, 20);
-			if (dfield_is_ext(field)) {
-				fputs("\nE", f);
-			} else {
-				fputs("\n ", f);
-			}
-			ut_print_buf(f, dfield_get_data(field), len);
-			if (len != field_len) {
-				fprintf(f, " (total %lu bytes)", field_len);
-			}
-		}
-	}
-	putc('\n', f);
-}
-#endif /* UNIV_DEBUG */
-
-/******************************************************//**
-Allocate a sort buffer.
-@return	own: sort buffer */
-static
-row_merge_buf_t*
-row_merge_buf_create_low(
-/*=====================*/
-	mem_heap_t*	heap,		/*!< in: heap where allocated */
-	dict_index_t*	index,		/*!< in: secondary index */
-	ulint		max_tuples,	/*!< in: maximum number of data tuples */
-	ulint		buf_size)	/*!< in: size of the buffer, in bytes */
-{
-	row_merge_buf_t*	buf;
-
-	ut_ad(max_tuples > 0);
-	ut_ad(max_tuples < buf_size);
-
-	buf = mem_heap_zalloc(heap, buf_size);
-	buf->heap = heap;
-	buf->index = index;
-	buf->max_tuples = max_tuples;
-	buf->tuples = mem_heap_alloc(heap,
-				     2 * max_tuples * sizeof *buf->tuples);
-	buf->tmp_tuples = buf->tuples + max_tuples;
-
-	return(buf);
-}
-
-/******************************************************//**
-Allocate a sort buffer.
-@return	own: sort buffer */
-static
-row_merge_buf_t*
-row_merge_buf_create(
-/*=================*/
-	dict_index_t*	index,		/*!< in: secondary index */
-	ulint		block_size)	/*!< in: merge block buffer size */
-{
-	row_merge_buf_t*	buf;
-	ulint			max_tuples;
-	ulint			buf_size;
-	mem_heap_t*		heap;
-
-	max_tuples = block_size / ut_max(1, dict_index_get_min_size(index));
-
-	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
-
-	heap = mem_heap_create(buf_size + block_size);
-
-	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
-
-	return(buf);
-}
-
-/******************************************************//**
-Empty a sort buffer.
-@return	sort buffer */
-static
-row_merge_buf_t*
-row_merge_buf_empty(
-/*================*/
-	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
-{
-	ulint		buf_size;
-	ulint		max_tuples	= buf->max_tuples;
-	mem_heap_t*	heap		= buf->heap;
-	dict_index_t*	index		= buf->index;
-
-	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
-
-	mem_heap_empty(heap);
-
-	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
-}
-
-/******************************************************//**
-Deallocate a sort buffer. */
-static
-void
-row_merge_buf_free(
-/*===============*/
-	row_merge_buf_t*	buf)	/*!< in,own: sort buffer, to be freed */
-{
-	mem_heap_free(buf->heap);
-}
-
-/******************************************************//**
-Insert a data tuple into a sort buffer.
-@return	TRUE if added, FALSE if out of space */
-static
-ibool
-row_merge_buf_add(
-/*==============*/
-	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
-	const dtuple_t*		row,	/*!< in: row in clustered index */
-	const row_ext_t*	ext,	/*!< in: cache of externally stored
-					column prefixes, or NULL */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ulint			i;
-	ulint			n_fields;
-	ulint			data_size;
-	ulint			extra_size;
-	const dict_index_t*	index;
-	dfield_t*		entry;
-	dfield_t*		field;
-	const dict_field_t*	ifield;
-
-	if (buf->n_tuples >= buf->max_tuples) {
-		return(FALSE);
-	}
-
-	UNIV_PREFETCH_R(row->fields);
-
-	index = buf->index;
-
-	n_fields = dict_index_get_n_fields(index);
-
-	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
-	buf->tuples[buf->n_tuples] = entry;
-	field = entry;
-
-	data_size = 0;
-	extra_size = UT_BITS_IN_BYTES(index->n_nullable);
-
-	ifield = dict_index_get_nth_field(index, 0);
-
-	for (i = 0; i < n_fields; i++, field++, ifield++) {
-		const dict_col_t*	col;
-		ulint			col_no;
-		ulint			fixed_len;
-		const dfield_t*		row_field;
-		ulint			len;
-
-		col = ifield->col;
-		col_no = dict_col_get_no(col);
-		row_field = dtuple_get_nth_field(row, col_no);
-		dfield_copy(field, row_field);
-		len = dfield_get_len(field);
-
-		if (dfield_is_null(field)) {
-			ut_ad(!(col->prtype & DATA_NOT_NULL));
-			continue;
-		} else if (UNIV_LIKELY(!ext)) {
-		} else if (dict_index_is_clust(index)) {
-			/* Flag externally stored fields. */
-			const byte*	buf = row_ext_lookup(ext, col_no,
-							     &len);
-			if (UNIV_LIKELY_NULL(buf)) {
-				ut_a(buf != field_ref_zero);
-				if (i < dict_index_get_n_unique(index)) {
-					dfield_set_data(field, buf, len);
-				} else {
-					dfield_set_ext(field);
-					len = dfield_get_len(field);
-				}
-			}
-		} else {
-			const byte*	buf = row_ext_lookup(ext, col_no,
-							     &len);
-			if (UNIV_LIKELY_NULL(buf)) {
-				ut_a(buf != field_ref_zero);
-				dfield_set_data(field, buf, len);
-			}
-		}
-
-		/* If a column prefix index, take only the prefix */
-
-		if (ifield->prefix_len) {
-			len = dtype_get_at_most_n_mbchars(
-				col->prtype,
-				col->mbminmaxlen,
-				ifield->prefix_len,
-				len, dfield_get_data(field));
-			dfield_set_len(field, len);
-		}
-
-		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
-
-		fixed_len = ifield->fixed_len;
-		if (fixed_len && !dict_table_is_comp(index->table)
-		    && DATA_MBMINLEN(col->mbminmaxlen)
-		    != DATA_MBMAXLEN(col->mbminmaxlen)) {
-			/* CHAR in ROW_FORMAT=REDUNDANT is always
-			fixed-length, but in the temporary file it is
-			variable-length for variable-length character
-			sets. */
-			fixed_len = 0;
-		}
-
-		if (fixed_len) {
-#ifdef UNIV_DEBUG
-			ulint	mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
-			ulint	mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
-
-			/* len should be between size calcualted base on
-			mbmaxlen and mbminlen */
-			ut_ad(len <= fixed_len);
-			ut_ad(!mbmaxlen || len >= mbminlen
-			      * (fixed_len / mbmaxlen));
-
-			ut_ad(!dfield_is_ext(field));
-#endif /* UNIV_DEBUG */
-		} else if (dfield_is_ext(field)) {
-			extra_size += 2;
-		} else if (len < 128
-			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
-			extra_size++;
-		} else {
-			/* For variable-length columns, we look up the
-			maximum length from the column itself.  If this
-			is a prefix index column shorter than 256 bytes,
-			this will waste one byte. */
-			extra_size += 2;
-		}
-		data_size += len;
-	}
-
-#ifdef UNIV_DEBUG
-	{
-		ulint	size;
-		ulint	extra;
-
-		size = rec_get_converted_size_temp(
-			index, entry, n_fields, &extra);
-
-		ut_ad(data_size + extra_size == size);
-		ut_ad(extra_size == extra);
-	}
-#endif /* UNIV_DEBUG */
-
-	/* Add to the total size of the record in row_merge_block_t
-	the encoded length of extra_size and the extra bytes (extra_size).
-	See row_merge_buf_write() for the variable-length encoding
-	of extra_size. */
-	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
-
-	/* The following assertion may fail if row_merge_block_t is
-	declared very small and a PRIMARY KEY is being created with
-	many prefix columns.  In that case, the record may exceed the
-	page_zip_rec_needs_ext() limit.  However, no further columns
-	will be moved to external storage until the record is inserted
-	to the clustered index B-tree. */
-	ut_ad(data_size < block_size);
-
-	/* Reserve one byte for the end marker of row_merge_block_t. */
-	if (buf->total_size + data_size >= block_size - 1) {
-		return(FALSE);
-	}
-
-	buf->total_size += data_size;
-	buf->n_tuples++;
-
-	field = entry;
-
-	/* Copy the data fields. */
-
-	do {
-		dfield_dup(field++, buf->heap);
-	} while (--n_fields);
-
-	return(TRUE);
-}
-
-/** Structure for reporting duplicate records. */
-struct row_merge_dup_struct {
-	const dict_index_t*	index;		/*!< index being sorted */
-	struct TABLE*		table;		/*!< MySQL table object */
-	ulint			n_dup;		/*!< number of duplicates */
-};
-
-/** Structure for reporting duplicate records. */
-typedef struct row_merge_dup_struct row_merge_dup_t;
-
-/*************************************************************//**
-Report a duplicate key. */
-static
-void
-row_merge_dup_report(
-/*=================*/
-	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
-	const dfield_t*		entry)	/*!< in: duplicate index entry */
-{
-	mrec_buf_t* 		buf;
-	const dtuple_t*		tuple;
-	dtuple_t		tuple_store;
-	const rec_t*		rec;
-	const dict_index_t*	index	= dup->index;
-	ulint			n_fields= dict_index_get_n_fields(index);
-	mem_heap_t*		heap;
-	ulint*			offsets;
-	ulint			n_ext;
-
-	if (dup->n_dup++) {
-		/* Only report the first duplicate record,
-		but count all duplicate records. */
-		return;
-	}
-
-	/* Convert the tuple to a record and then to MySQL format. */
-	heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
-			       * sizeof *offsets
-			       + sizeof *buf);
-
-	buf = mem_heap_alloc(heap, sizeof *buf);
-
-	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
-	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
-
-	rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
-	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
-
-	innobase_rec_to_mysql(dup->table, rec, index, offsets);
-
-	mem_heap_free(heap);
-}
-
-/*************************************************************//**
-Compare two tuples.
-@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
-static
-int
-row_merge_tuple_cmp(
-/*================*/
-	ulint			n_field,/*!< in: number of fields */
-	const dfield_t*		a,	/*!< in: first tuple to be compared */
-	const dfield_t*		b,	/*!< in: second tuple to be compared */
-	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
-{
-	int		cmp;
-	const dfield_t*	field	= a;
-
-	/* Compare the fields of the tuples until a difference is
-	found or we run out of fields to compare.  If !cmp at the
-	end, the tuples are equal. */
-	do {
-		cmp = cmp_dfield_dfield(a++, b++);
-	} while (!cmp && --n_field);
-
-	if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
-		/* Report a duplicate value error if the tuples are
-		logically equal.  NULL columns are logically inequal,
-		although they are equal in the sorting order.  Find
-		out if any of the fields are NULL. */
-		for (b = field; b != a; b++) {
-			if (dfield_is_null(b)) {
-
-				goto func_exit;
-			}
-		}
-
-		row_merge_dup_report(dup, field);
-	}
-
-func_exit:
-	return(cmp);
-}
-
-/** Wrapper for row_merge_tuple_sort() to inject some more context to
-UT_SORT_FUNCTION_BODY().
-@param a	array of tuples that being sorted
-@param b	aux (work area), same size as tuples[]
-@param c	lower bound of the sorting area, inclusive
-@param d	upper bound of the sorting area, inclusive */
-#define row_merge_tuple_sort_ctx(a,b,c,d) \
-	row_merge_tuple_sort(n_field, dup, a, b, c, d)
-/** Wrapper for row_merge_tuple_cmp() to inject some more context to
-UT_SORT_FUNCTION_BODY().
-@param a	first tuple to be compared
-@param b	second tuple to be compared
-@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
-#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
-
-/**********************************************************************//**
-Merge sort the tuple buffer in main memory. */
-static
-void
-row_merge_tuple_sort(
-/*=================*/
-	ulint			n_field,/*!< in: number of fields */
-	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
-	const dfield_t**	tuples,	/*!< in/out: tuples */
-	const dfield_t**	aux,	/*!< in/out: work area */
-	ulint			low,	/*!< in: lower bound of the
-					sorting area, inclusive */
-	ulint			high)	/*!< in: upper bound of the
-					sorting area, exclusive */
-{
-	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
-			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
-}
-
-/******************************************************//**
-Sort a buffer. */
-static
-void
-row_merge_buf_sort(
-/*===============*/
-	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
-	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
-{
-	row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
-			     buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
-}
-
-/******************************************************//**
-Write a buffer to a block. */
-static
-void
-row_merge_buf_write(
-/*================*/
-	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
-#ifdef UNIV_DEBUG
-	const merge_file_t*	of,	/*!< in: output file */
-#endif /* UNIV_DEBUG */
-	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
-#ifndef UNIV_DEBUG
-# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
-#endif /* !UNIV_DEBUG */
-{
-	const dict_index_t*	index	= buf->index;
-	ulint			n_fields= dict_index_get_n_fields(index);
-	byte*			b	= &(*block)[0];
-
-	ulint		i;
-
-	for (i = 0; i < buf->n_tuples; i++) {
-		ulint		size;
-		ulint		extra_size;
-		const dfield_t*	entry		= buf->tuples[i];
-
-		size = rec_get_converted_size_temp(
-			index, entry, n_fields, &extra_size);
-		ut_ad(size >= extra_size);
-
-		/* Encode extra_size + 1 */
-		if (extra_size + 1 < 0x80) {
-			*b++ = (byte) (extra_size + 1);
-		} else {
-			ut_ad((extra_size + 1) < 0x8000);
-			*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
-			*b++ = (byte) (extra_size + 1);
-		}
-
-		ut_ad(b + size < block[1]);
-
-		rec_convert_dtuple_to_temp(b + extra_size, index,
-					   entry, n_fields);
-
-		b += size;
-
-#ifdef UNIV_DEBUG
-		if (row_merge_print_write) {
-			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
-				(void*) b, of->fd, (ulong) of->offset,
-				(ulong) i);
-			row_merge_tuple_print(stderr, entry, n_fields);
-		}
-#endif /* UNIV_DEBUG */
-	}
-
-	/* Write an "end-of-chunk" marker. */
-	ut_a(b < block[1]);
-	ut_a(b == block[0] + buf->total_size);
-	*b++ = 0;
-#ifdef UNIV_DEBUG_VALGRIND
-	/* The rest of the block is uninitialized.  Initialize it
-	to avoid bogus warnings. */
-	memset(b, 0xff, block[1] - b);
-#endif /* UNIV_DEBUG_VALGRIND */
-#ifdef UNIV_DEBUG
-	if (row_merge_print_write) {
-		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
-			(void*) b, of->fd, (ulong) of->offset);
-	}
-#endif /* UNIV_DEBUG */
-}
-
-/******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets()
-and mrec_buf_t[3].
-@return	memory heap */
-static
-mem_heap_t*
-row_merge_heap_create(
-/*==================*/
-	const dict_index_t*	index,		/*!< in: record descriptor */
-	mrec_buf_t**		buf,		/*!< out: 3 buffers */
-	ulint**			offsets1,	/*!< out: offsets */
-	ulint**			offsets2)	/*!< out: offsets */
-{
-	ulint		i	= 1 + REC_OFFS_HEADER_SIZE
-		+ dict_index_get_n_fields(index);
-	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof **offsets1
-						  + 3 * sizeof **buf);
-
-	*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
-	*offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
-	*offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
-
-	(*offsets1)[0] = (*offsets2)[0] = i;
-	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
-
-	return(heap);
-}
-
-/**********************************************************************//**
-Search an index object by name and column names.  If several indexes match,
-return the index with the max id.
-@return	matching index, NULL if not found */
-static
-dict_index_t*
-row_merge_dict_table_get_index(
-/*===========================*/
-	dict_table_t*		table,		/*!< in: table */
-	const merge_index_def_t*index_def)	/*!< in: index definition */
-{
-	ulint		i;
-	dict_index_t*	index;
-	const char**	column_names;
-
-	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
-
-	for (i = 0; i < index_def->n_fields; ++i) {
-		column_names[i] = index_def->fields[i].field_name;
-	}
-
-	index = dict_table_get_index_by_max_id(
-		table, index_def->name, column_names, index_def->n_fields);
-
-	mem_free((void*) column_names);
-
-	return(index);
-}
-
-/********************************************************************//**
-Read a merge block from the file system.
-@return	TRUE if request was successful, FALSE if fail */
-static
-ibool
-row_merge_read(
-/*===========*/
-	int			fd,	/*!< in: file descriptor */
-	ulint			offset,	/*!< in: offset where to read
-					in number of row_merge_block_t
-					elements */
-	row_merge_block_t	buf,	/*!< out: data */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ib_uint64_t	ofs = ((ib_uint64_t) offset) * block_size;
-	ibool		success;
-
-	DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE););
-
-#ifdef UNIV_DEBUG
-	if (row_merge_print_block_read) {
-		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
-			fd, (ulong) offset);
-	}
-#endif /* UNIV_DEBUG */
-
-	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
-						 (ulint) (ofs & 0xFFFFFFFF),
-						 (ulint) (ofs >> 32),
-						 block_size);
-#ifdef POSIX_FADV_DONTNEED
-	/* Each block is read exactly once.  Free up the file cache. */
-	posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
-#endif /* POSIX_FADV_DONTNEED */
-
-	if (UNIV_UNLIKELY(!success)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: failed to read merge block at %llu\n", ofs);
-	}
-
-	return(UNIV_LIKELY(success));
-}
-
-/********************************************************************//**
-Write a merge block to the file system.
-@return	TRUE if request was successful, FALSE if fail */
-static
-ibool
-row_merge_write(
-/*============*/
-	int		fd,	/*!< in: file descriptor */
-	ulint		offset,	/*!< in: offset where to write,
-				in number of row_merge_block_t elements */
-	const void*	buf,	/*!< in: data */
-	ulint		block_size)
-				/*!< in: merge block buffer size */
-{
-	ib_uint64_t	ofs = block_size * (ib_uint64_t) offset;
-	ibool		ret;
-
-	ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
-			    (ulint) (ofs & 0xFFFFFFFF),
-			    (ulint) (ofs >> 32),
-			    block_size);
-
-	DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE););
-
-#ifdef UNIV_DEBUG
-	if (row_merge_print_block_write) {
-		fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
-			fd, (ulong) offset);
-	}
-#endif /* UNIV_DEBUG */
-
-#ifdef POSIX_FADV_DONTNEED
-	/* The block will be needed on the next merge pass,
-	but it can be evicted from the file cache meanwhile. */
-	posix_fadvise(fd, ofs, block_size, POSIX_FADV_DONTNEED);
-#endif /* POSIX_FADV_DONTNEED */
-
-	return(UNIV_LIKELY(ret));
-}
-
-/********************************************************************//**
-Read a merge record.
-@return	pointer to next record, or NULL on I/O error or end of list */
-static __attribute__((nonnull))
-const byte*
-row_merge_read_rec(
-/*===============*/
-	row_merge_block_t*	block,	/*!< in/out: file buffer */
-	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
-	const byte*		b,	/*!< in: pointer to record */
-	const dict_index_t*	index,	/*!< in: index of the record */
-	int			fd,	/*!< in: file descriptor */
-	ulint*			foffs,	/*!< in/out: file offset */
-	const mrec_t**		mrec,	/*!< out: pointer to merge record,
-					or NULL on end of list
-					(non-NULL on I/O error) */
-	ulint*			offsets,/*!< out: offsets of mrec */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ulint	extra_size;
-	ulint	data_size;
-	ulint	avail_size;
-
-	ut_ad(block);
-	ut_ad(buf);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
-	ut_ad(index);
-	ut_ad(foffs);
-	ut_ad(mrec);
-	ut_ad(offsets);
-
-	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
-	      + dict_index_get_n_fields(index));
-
-	extra_size = *b++;
-
-	if (UNIV_UNLIKELY(!extra_size)) {
-		/* End of list */
-		*mrec = NULL;
-#ifdef UNIV_DEBUG
-		if (row_merge_print_read) {
-			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
-				(const void*) b, (const void*) block,
-				fd, (ulong) *foffs);
-		}
-#endif /* UNIV_DEBUG */
-		return(NULL);
-	}
-
-	if (extra_size >= 0x80) {
-		/* Read another byte of extra_size. */
-
-		if (UNIV_UNLIKELY(b >= block[1])) {
-			if (!row_merge_read(fd, ++(*foffs), block[0],
-					    block_size)) {
-err_exit:
-				/* Signal I/O error. */
-				*mrec = b;
-				return(NULL);
-			}
-
-			/* Wrap around to the beginning of the buffer. */
-			b = block[0];
-		}
-
-		extra_size = (extra_size & 0x7f) << 8;
-		extra_size |= *b++;
-	}
-
-	/* Normalize extra_size.  Above, value 0 signals "end of list". */
-	extra_size--;
-
-	/* Read the extra bytes. */
-
-	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
-		/* The record spans two blocks.  Copy the entire record
-		to the auxiliary buffer and handle this as a special
-		case. */
-
-		avail_size = block[1] - b;
-
-		memcpy(*buf, b, avail_size);
-
-		if (!row_merge_read(fd, ++(*foffs), block[0],
-				    block_size)) {
-
-			goto err_exit;
-		}
-
-		/* Wrap around to the beginning of the buffer. */
-		b = block[0];
-
-		/* Copy the record. */
-		memcpy(*buf + avail_size, b, extra_size - avail_size);
-		b += extra_size - avail_size;
-
-		*mrec = *buf + extra_size;
-
-		rec_init_offsets_temp(*mrec, index, offsets);
-
-		data_size = rec_offs_data_size(offsets);
-
-		/* These overflows should be impossible given that
-		records are much smaller than either buffer, and
-		the record starts near the beginning of each buffer. */
-		ut_a(extra_size + data_size < block_size);
-		ut_a(b + data_size < block[1]);
-
-		/* Copy the data bytes. */
-		memcpy(*buf + extra_size, b, data_size);
-		b += data_size;
-
-		goto func_exit;
-	}
-
-	*mrec = b + extra_size;
-
-	rec_init_offsets_temp(*mrec, index, offsets);
-
-	data_size = rec_offs_data_size(offsets);
-	ut_ad(extra_size + data_size < block_size);
-
-	b += extra_size + data_size;
-
-	if (UNIV_LIKELY(b < block[1])) {
-		/* The record fits entirely in the block.
-		This is the normal case. */
-		goto func_exit;
-	}
-
-	/* The record spans two blocks.  Copy it to buf. */
-
-	b -= extra_size + data_size;
-	avail_size = block[1] - b;
-	memcpy(*buf, b, avail_size);
-	*mrec = *buf + extra_size;
-#ifdef UNIV_DEBUG
-	/* We cannot invoke rec_offs_make_valid() here, because there
-	are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
-	Similarly, rec_offs_validate() would fail, because it invokes
-	rec_get_status(). */
-	offsets[2] = (ulint) *mrec;
-	offsets[3] = (ulint) index;
-#endif /* UNIV_DEBUG */
-
-	if (!row_merge_read(fd, ++(*foffs), block[0],
-			    block_size)) {
-
-		goto err_exit;
-	}
-
-	/* Wrap around to the beginning of the buffer. */
-	b = block[0];
-
-	/* Copy the rest of the record. */
-	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
-	b += extra_size + data_size - avail_size;
-
-func_exit:
-#ifdef UNIV_DEBUG
-	if (row_merge_print_read) {
-		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
-			(const void*) b, (const void*) block,
-			fd, (ulong) *foffs);
-		rec_print_comp(stderr, *mrec, offsets);
-		putc('\n', stderr);
-	}
-#endif /* UNIV_DEBUG */
-
-	return(b);
-}
-
-/********************************************************************//**
-Write a merge record. */
-static
-void
-row_merge_write_rec_low(
-/*====================*/
-	byte*		b,	/*!< out: buffer */
-	ulint		e,	/*!< in: encoded extra_size */
-#ifdef UNIV_DEBUG
-	ulint		size,	/*!< in: total size to write */
-	int		fd,	/*!< in: file descriptor */
-	ulint		foffs,	/*!< in: file offset */
-#endif /* UNIV_DEBUG */
-	const mrec_t*	mrec,	/*!< in: record to write */
-	const ulint*	offsets)/*!< in: offsets of mrec */
-#ifndef UNIV_DEBUG
-# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
-	row_merge_write_rec_low(b, e, mrec, offsets)
-#endif /* !UNIV_DEBUG */
-{
-#ifdef UNIV_DEBUG
-	const byte* const end = b + size;
-	ut_ad(e == rec_offs_extra_size(offsets) + 1);
-
-	if (row_merge_print_write) {
-		fprintf(stderr, "row_merge_write %p,%d,%lu ",
-			(void*) b, fd, (ulong) foffs);
-		rec_print_comp(stderr, mrec, offsets);
-		putc('\n', stderr);
-	}
-#endif /* UNIV_DEBUG */
-
-	if (e < 0x80) {
-		*b++ = (byte) e;
-	} else {
-		*b++ = (byte) (0x80 | (e >> 8));
-		*b++ = (byte) e;
-	}
-
-	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
-	ut_ad(b + rec_offs_size(offsets) == end);
-}
-
-/********************************************************************//**
-Write a merge record.
-@return	pointer to end of block, or NULL on error */
-static
-byte*
-row_merge_write_rec(
-/*================*/
-	row_merge_block_t*	block,	/*!< in/out: file buffer */
-	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
-	byte*			b,	/*!< in: pointer to end of block */
-	int			fd,	/*!< in: file descriptor */
-	ulint*			foffs,	/*!< in/out: file offset */
-	const mrec_t*		mrec,	/*!< in: record to write */
-	const ulint*		offsets,/*!< in: offsets of mrec */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ulint	extra_size;
-	ulint	size;
-	ulint	avail_size;
-
-	ut_ad(block);
-	ut_ad(buf);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
-	ut_ad(mrec);
-	ut_ad(foffs);
-	ut_ad(mrec < block[0] || mrec > block[1]);
-	ut_ad(mrec < buf[0] || mrec > buf[1]);
-
-	/* Normalize extra_size.  Value 0 signals "end of list". */
-	extra_size = rec_offs_extra_size(offsets) + 1;
-
-	size = extra_size + (extra_size >= 0x80)
-		+ rec_offs_data_size(offsets);
-
-	if (UNIV_UNLIKELY(b + size >= block[1])) {
-		/* The record spans two blocks.
-		Copy it to the temporary buffer first. */
-		avail_size = block[1] - b;
-
-		row_merge_write_rec_low(buf[0],
-					extra_size, size, fd, *foffs,
-					mrec, offsets);
-
-		/* Copy the head of the temporary buffer, write
-		the completed block, and copy the tail of the
-		record to the head of the new block. */
-		memcpy(b, buf[0], avail_size);
-
-		if (!row_merge_write(fd, (*foffs)++, block[0],
-				     block_size)) {
-			return(NULL);
-		}
-
-		UNIV_MEM_INVALID(block[0], block_size);
-
-		/* Copy the rest. */
-		b = block[0];
-		memcpy(b, buf[0] + avail_size, size - avail_size);
-		b += size - avail_size;
-	} else {
-		row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
-					mrec, offsets);
-		b += size;
-	}
-
-	return(b);
-}
-
-/********************************************************************//**
-Write an end-of-list marker.
-@return	pointer to end of block, or NULL on error */
-static
-byte*
-row_merge_write_eof(
-/*================*/
-	row_merge_block_t*	block,	/*!< in/out: file buffer */
-	byte*			b,	/*!< in: pointer to end of block */
-	int			fd,	/*!< in: file descriptor */
-	ulint*			foffs,	/*!< in/out: file offset */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ut_ad(block);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
-	ut_ad(foffs);
-#ifdef UNIV_DEBUG
-	if (row_merge_print_write) {
-		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
-			(void*) b, (void*) block, fd, (ulong) *foffs);
-	}
-#endif /* UNIV_DEBUG */
-
-	*b++ = 0;
-	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
-	UNIV_MEM_ASSERT_W(block[0], block_size);
-#ifdef UNIV_DEBUG_VALGRIND
-	/* The rest of the block is uninitialized.  Initialize it
-	to avoid bogus warnings. */
-	memset(b, 0xff, block[1] - b);
-#endif /* UNIV_DEBUG_VALGRIND */
-
-	if (!row_merge_write(fd, (*foffs)++, block[0],
-			     block_size)) {
-		return(NULL);
-	}
-
-	UNIV_MEM_INVALID(block[0], block_size);
-	return(block[0]);
-}
-
-/*************************************************************//**
-Compare two merge records.
-@return	1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
-static
-int
-row_merge_cmp(
-/*==========*/
-	const mrec_t*		mrec1,		/*!< in: first merge
-						record to be compared */
-	const mrec_t*		mrec2,		/*!< in: second merge
-						record to be compared */
-	const ulint*		offsets1,	/*!< in: first record offsets */
-	const ulint*		offsets2,	/*!< in: second record offsets */
-	const dict_index_t*	index,		/*!< in: index */
-	ibool*			null_eq)	/*!< out: set to TRUE if
-						found matching null values */
-{
-	int	cmp;
-
-	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
-				 null_eq);
-
-#ifdef UNIV_DEBUG
-	if (row_merge_print_cmp) {
-		fputs("row_merge_cmp1 ", stderr);
-		rec_print_comp(stderr, mrec1, offsets1);
-		fputs("\nrow_merge_cmp2 ", stderr);
-		rec_print_comp(stderr, mrec2, offsets2);
-		fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
-	}
-#endif /* UNIV_DEBUG */
-
-	return(cmp);
-}
-
-/********************************************************************//**
-Reads clustered index of the table and create temporary files
-containing the index entries for the indexes to be built.
-@return	DB_SUCCESS or error */
-static __attribute__((nonnull))
-ulint
-row_merge_read_clustered_index(
-/*===========================*/
-	trx_t*			trx,	/*!< in: transaction */
-	struct TABLE*		table,	/*!< in/out: MySQL table object,
-					for reporting erroneous records */
-	const dict_table_t*	old_table,/*!< in: table where rows are
-					read from */
-	const dict_table_t*	new_table,/*!< in: table where indexes are
-					created; identical to old_table
-					unless creating a PRIMARY KEY */
-	dict_index_t**		index,	/*!< in: indexes to be created */
-	merge_file_t*		files,	/*!< in: temporary files */
-	ulint			n_index,/*!< in: number of indexes to create */
-	row_merge_block_t*	block,	/*!< in/out: file buffer */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	dict_index_t*		clust_index;	/* Clustered index */
-	mem_heap_t*		row_heap;	/* Heap memory to create
-						clustered index records */
-	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
-	btr_pcur_t		pcur;		/* Persistent cursor on the
-						clustered index */
-	mtr_t			mtr;		/* Mini transaction */
-	ulint			err = DB_SUCCESS;/* Return code */
-	ulint			i;
-	ulint			n_nonnull = 0;	/* number of columns
-						changed to NOT NULL */
-	ulint*			nonnull = NULL;	/* NOT NULL columns */
-
-	trx->op_info = "reading clustered index";
-
-	ut_ad(trx);
-	ut_ad(old_table);
-	ut_ad(new_table);
-	ut_ad(index);
-	ut_ad(files);
-
-	/* Create and initialize memory for record buffers */
-
-	merge_buf = mem_alloc(n_index * sizeof *merge_buf);
-
-	for (i = 0; i < n_index; i++) {
-		merge_buf[i] = row_merge_buf_create(index[i], block_size);
-	}
-
-	mtr_start(&mtr);
-
-	/* Find the clustered index and create a persistent cursor
-	based on that. */
-
-	clust_index = dict_table_get_first_index(old_table);
-
-	btr_pcur_open_at_index_side(
-		TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
-
-	if (UNIV_UNLIKELY(old_table != new_table)) {
-		ulint	n_cols = dict_table_get_n_cols(old_table);
-
-		/* A primary key will be created.  Identify the
-		columns that were flagged NOT NULL in the new table,
-		so that we can quickly check that the records in the
-		(old) clustered index do not violate the added NOT
-		NULL constraints. */
-
-		ut_a(n_cols == dict_table_get_n_cols(new_table));
-
-		nonnull = mem_alloc(n_cols * sizeof *nonnull);
-
-		for (i = 0; i < n_cols; i++) {
-			if (dict_table_get_nth_col(old_table, i)->prtype
-			    & DATA_NOT_NULL) {
-
-				continue;
-			}
-
-			if (dict_table_get_nth_col(new_table, i)->prtype
-			    & DATA_NOT_NULL) {
-
-				nonnull[n_nonnull++] = i;
-			}
-		}
-
-		if (!n_nonnull) {
-			mem_free(nonnull);
-			nonnull = NULL;
-		}
-	}
-
-	row_heap = mem_heap_create(sizeof(mrec_buf_t));
-
-	/* Scan the clustered index. */
-	for (;;) {
-		const rec_t*	rec;
-		ulint*		offsets;
-		dtuple_t*	row		= NULL;
-		row_ext_t*	ext;
-		ibool		has_next	= TRUE;
-
-		btr_pcur_move_to_next_on_page(&pcur);
-
-		/* When switching pages, commit the mini-transaction
-		in order to release the latch on the old page. */
-
-		if (btr_pcur_is_after_last_on_page(&pcur)) {
-			if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
-				i = 0;
-				err = DB_INTERRUPTED;
-				goto err_exit;
-			}
-
-			/* Store the cursor position on the last user
-			record on the page. */
-			btr_pcur_move_to_prev_on_page(&pcur);
-			/* Leaf pages must never be empty, unless
-			this is the only page in the index tree. */
-			ut_ad(btr_pcur_is_on_user_rec(&pcur)
-			      || buf_block_get_page_no(
-				      btr_pcur_get_block(&pcur))
-			      == clust_index->page);
-
-			btr_pcur_store_position(&pcur, &mtr);
-			mtr_commit(&mtr);
-			mtr_start(&mtr);
-			/* Restore position on the record, or its
-			predecessor if the record was purged
-			meanwhile. */
-			btr_pcur_restore_position(BTR_SEARCH_LEAF,
-						  &pcur, &mtr);
-			/* Move to the successor of the original record. */
-			has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-		}
-
-		if (UNIV_LIKELY(has_next)) {
-			rec = btr_pcur_get_rec(&pcur);
-
-			SRV_CORRUPT_TABLE_CHECK(rec,
-			{
-				err = DB_CORRUPTION;
-				goto err_exit;
-			});
-
-			offsets = rec_get_offsets(rec, clust_index, NULL,
-						  ULINT_UNDEFINED, &row_heap);
-
-			/* Skip delete marked records. */
-			if (rec_get_deleted_flag(
-				    rec, dict_table_is_comp(old_table))) {
-				continue;
-			}
-
-			srv_n_rows_inserted++;
-
-			/* Build a row based on the clustered index. */
-
-			row = row_build(ROW_COPY_POINTERS, clust_index,
-					rec, offsets,
-					new_table, &ext, row_heap);
-
-			if (UNIV_LIKELY_NULL(nonnull)) {
-				for (i = 0; i < n_nonnull; i++) {
-					dfield_t*	field
-						= &row->fields[nonnull[i]];
-					dtype_t*	field_type
-						= dfield_get_type(field);
-
-					ut_a(!(field_type->prtype
-					       & DATA_NOT_NULL));
-
-					if (dfield_is_null(field)) {
-						err = DB_PRIMARY_KEY_IS_NULL;
-						i = 0;
-						goto err_exit;
-					}
-
-					field_type->prtype |= DATA_NOT_NULL;
-				}
-			}
-		}
-
-		/* Build all entries for all the indexes to be created
-		in a single scan of the clustered index. */
-
-		for (i = 0; i < n_index; i++) {
-			row_merge_buf_t*	buf	= merge_buf[i];
-			merge_file_t*		file	= &files[i];
-			const dict_index_t*	index	= buf->index;
-
-			if (UNIV_LIKELY
-			    (row && row_merge_buf_add(buf, row, ext,
-						      block_size))) {
-				file->n_rec++;
-				continue;
-			}
-
-			/* The buffer must be sufficiently large
-			to hold at least one record. */
-			ut_ad(buf->n_tuples || !has_next);
-
-			/* We have enough data tuples to form a block.
-			Sort them and write to disk. */
-
-			if (buf->n_tuples) {
-				if (dict_index_is_unique(index)) {
-					row_merge_dup_t	dup;
-					dup.index = buf->index;
-					dup.table = table;
-					dup.n_dup = 0;
-
-					row_merge_buf_sort(buf, &dup);
-
-					if (dup.n_dup) {
-						err = DB_DUPLICATE_KEY;
-err_exit:
-						trx->error_key_num = i;
-						goto func_exit;
-					}
-				} else {
-					row_merge_buf_sort(buf, NULL);
-				}
-			}
-
-			row_merge_buf_write(buf, file, block);
-
-			if (!row_merge_write(file->fd, file->offset++,
-					     block[0], block_size)) {
-				err = DB_OUT_OF_FILE_SPACE;
-				goto err_exit;
-			}
-
-			UNIV_MEM_INVALID(block[0], block_size);
-			merge_buf[i] = row_merge_buf_empty(buf);
-
-			if (UNIV_LIKELY(row != NULL)) {
-				/* Try writing the record again, now
-				that the buffer has been written out
-				and emptied. */
-
-				if (UNIV_UNLIKELY
-				    (!row_merge_buf_add(buf, row, ext,
-							block_size))) {
-					/* An empty buffer should have enough
-					room for at least one record. */
-					ut_error;
-				}
-
-				file->n_rec++;
-			}
-		}
-
-		mem_heap_empty(row_heap);
-
-		if (UNIV_UNLIKELY(!has_next)) {
-			goto func_exit;
-		}
-	}
-
-func_exit:
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-	mem_heap_free(row_heap);
-
-	if (UNIV_LIKELY_NULL(nonnull)) {
-		mem_free(nonnull);
-	}
-
-	for (i = 0; i < n_index; i++) {
-		row_merge_buf_free(merge_buf[i]);
-	}
-
-	mem_free(merge_buf);
-
-	trx->op_info = "";
-
-	return(err);
-}
-
-/** Write a record via buffer 2 and read the next record to buffer N.
-@param N	number of the buffer (0 or 1)
-@param AT_END	statement to execute at end of input */
-#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
-	do {								\
-		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
-					 of->fd, &of->offset,		\
-					 mrec##N, offsets##N,		\
-					 block_size);			\
-		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
-			goto corrupt;					\
-		}							\
-		b##N = row_merge_read_rec(&block[N], &buf[N],		\
-					  b##N, index,			\
-					  file->fd, foffs##N,		\
-					  &mrec##N, offsets##N,		\
-					  block_size);			\
-		if (UNIV_UNLIKELY(!b##N)) {				\
-			if (mrec##N) {					\
-				goto corrupt;				\
-			}						\
-			AT_END;						\
-		}							\
-	} while (0)
-
-/*************************************************************//**
-Merge two blocks of records on disk and write a bigger block.
-@return	DB_SUCCESS or error code */
-static
-ulint
-row_merge_blocks(
-/*=============*/
-	const dict_index_t*	index,	/*!< in: index being created */
-	const merge_file_t*	file,	/*!< in: file containing
-					index entries */
-	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
-	ulint*			foffs0,	/*!< in/out: offset of first
-					source list in the file */
-	ulint*			foffs1,	/*!< in/out: offset of second
-					source list in the file */
-	merge_file_t*		of,	/*!< in/out: output file */
-	struct TABLE*		table,	/*!< in/out: MySQL table, for
-					reporting erroneous key value
-					if applicable */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
-
-	mrec_buf_t*	buf;	/*!< buffer for handling
-				split mrec in block[] */
-	const byte*	b0;	/*!< pointer to block[0] */
-	const byte*	b1;	/*!< pointer to block[1] */
-	byte*		b2;	/*!< pointer to block[2] */
-	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] or buf[0] */
-	const mrec_t*	mrec1;	/*!< merge rec, points to block[1] or buf[1] */
-	ulint*		offsets0;/* offsets of mrec0 */
-	ulint*		offsets1;/* offsets of mrec1 */
-
-#ifdef UNIV_DEBUG
-	if (row_merge_print_block) {
-		fprintf(stderr,
-			"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
-			" = fd=%d ofs=%lu\n",
-			file->fd, (ulong) *foffs0,
-			file->fd, (ulong) *foffs1,
-			of->fd, (ulong) of->offset);
-	}
-#endif /* UNIV_DEBUG */
-
-	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
-
-	/* Write a record and read the next record.  Split the output
-	file in two halves, which can be merged on the following pass. */
-
-	if (!row_merge_read(file->fd, *foffs0, block[0],
-			    block_size)
-	    || !row_merge_read(file->fd, *foffs1, block[1],
-			       block_size)) {
-corrupt:
-		mem_heap_free(heap);
-		return(DB_CORRUPTION);
-	}
-
-	b0 = block[0];
-	b1 = block[1];
-	b2 = block[2];
-
-	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
-				foffs0, &mrec0, offsets0, block_size);
-	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
-				foffs1, &mrec1, offsets1, block_size);
-	if (UNIV_UNLIKELY(!b0 && mrec0)
-	    || UNIV_UNLIKELY(!b1 && mrec1)) {
-
-		goto corrupt;
-	}
-
-	while (mrec0 && mrec1) {
-		ibool	null_eq = FALSE;
-		switch (row_merge_cmp(mrec0, mrec1,
-				      offsets0, offsets1, index,
-				      &null_eq)) {
-		case 0:
-			if (UNIV_UNLIKELY
-			    (dict_index_is_unique(index) && !null_eq)) {
-				innobase_rec_to_mysql(table, mrec0,
-						      index, offsets0);
-				mem_heap_free(heap);
-				return(DB_DUPLICATE_KEY);
-			}
-			/* fall through */
-		case -1:
-			ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
-			break;
-		case 1:
-			ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
-			break;
-		default:
-			ut_error;
-		}
-
-	}
-
-merged:
-	if (mrec0) {
-		/* append all mrec0 to output */
-		for (;;) {
-			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
-		}
-	}
-done0:
-	if (mrec1) {
-		/* append all mrec1 to output */
-		for (;;) {
-			ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
-		}
-	}
-done1:
-
-	mem_heap_free(heap);
-	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset,
-				 block_size);
-	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
-}
-
-/*************************************************************//**
-Copy a block of index entries.
-@return	TRUE on success, FALSE on failure */
-static __attribute__((nonnull))
-ibool
-row_merge_blocks_copy(
-/*==================*/
-	const dict_index_t*	index,	/*!< in: index being created */
-	const merge_file_t*	file,	/*!< in: input file */
-	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
-	ulint*			foffs0,	/*!< in/out: input file offset */
-	merge_file_t*		of,	/*!< in/out: output file */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
-
-	mrec_buf_t*	buf;	/*!< buffer for handling
-				split mrec in block[] */
-	const byte*	b0;	/*!< pointer to block[0] */
-	byte*		b2;	/*!< pointer to block[2] */
-	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] */
-	ulint*		offsets0;/* offsets of mrec0 */
-	ulint*		offsets1;/* dummy offsets */
-
-#ifdef UNIV_DEBUG
-	if (row_merge_print_block) {
-		fprintf(stderr,
-			"row_merge_blocks_copy fd=%d ofs=%lu"
-			" = fd=%d ofs=%lu\n",
-			file->fd, (ulong) foffs0,
-			of->fd, (ulong) of->offset);
-	}
-#endif /* UNIV_DEBUG */
-
-	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
-
-	/* Write a record and read the next record.  Split the output
-	file in two halves, which can be merged on the following pass. */
-
-	if (!row_merge_read(file->fd, *foffs0, block[0], block_size)) {
-corrupt:
-		mem_heap_free(heap);
-		return(FALSE);
-	}
-
-	b0 = block[0];
-	b2 = block[2];
-
-	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
-				foffs0, &mrec0, offsets0, block_size);
-	if (UNIV_UNLIKELY(!b0 && mrec0)) {
-
-		goto corrupt;
-	}
-
-	if (mrec0) {
-		/* append all mrec0 to output */
-		for (;;) {
-			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
-		}
-	}
-done0:
-
-	/* The file offset points to the beginning of the last page
-	that has been read.  Update it to point to the next block. */
-	(*foffs0)++;
-
-	mem_heap_free(heap);
-	return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset,
-				   block_size) != NULL);
-}
-
-/*************************************************************//**
-Merge disk files.
-@return	DB_SUCCESS or error code */
-static __attribute__((nonnull))
-ulint
-row_merge(
-/*======*/
-	trx_t*			trx,	/*!< in: transaction */
-	const dict_index_t*	index,	/*!< in: index being created */
-	merge_file_t*		file,	/*!< in/out: file containing
-					index entries */
-	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
-	int*			tmpfd,	/*!< in/out: temporary file handle */
-	struct TABLE*		table,	/*!< in/out: MySQL table, for
-					reporting erroneous key value
-					if applicable */
-	ulint*			num_run,/*!< in/out: Number of runs remain
-					to be merged */
-	ulint*			run_offset, /*!< in/out: Array contains the
-					first offset number for each merge
-					run */
-	ulint			block_size) /*!< in: merge block buffer size */
-{
-	ulint		foffs0;	/*!< first input offset */
-	ulint		foffs1;	/*!< second input offset */
-	ulint		error;	/*!< error code */
-	merge_file_t	of;	/*!< output file */
-	const ulint	ihalf	= run_offset[*num_run / 2];
-				/*!< half the input file */
-	ulint		n_run	= 0;
-				/*!< num of runs generated from this merge */
-
-	UNIV_MEM_ASSERT_W(block[0], 3 * block_size);
-
-	ut_ad(ihalf < file->offset);
-
-	of.fd = *tmpfd;
-	of.offset = 0;
-	of.n_rec = 0;
-
-#ifdef POSIX_FADV_SEQUENTIAL
-	/* The input file will be read sequentially, starting from the
-	beginning and the middle.  In Linux, the POSIX_FADV_SEQUENTIAL
-	affects the entire file.  Each block will be read exactly once. */
-	posix_fadvise(file->fd, 0, 0,
-		      POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
-#endif /* POSIX_FADV_SEQUENTIAL */
-
-	/* Merge blocks to the output file. */
-	foffs0 = 0;
-	foffs1 = ihalf;
-
-	UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);
-
-	for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
-
-		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
-			return(DB_INTERRUPTED);
-		}
-
-		/* Remember the offset number for this run */
-		run_offset[n_run++] = of.offset;
-
-		error = row_merge_blocks(index, file, block,
-					 &foffs0, &foffs1, &of, table,
-					 block_size);
-
-		if (error != DB_SUCCESS) {
-			return(error);
-		}
-
-	}
-
-	/* Copy the last blocks, if there are any. */
-
-	while (foffs0 < ihalf) {
-		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
-			return(DB_INTERRUPTED);
-		}
-
-		/* Remember the offset number for this run */
-		run_offset[n_run++] = of.offset;
-
-		if (!row_merge_blocks_copy(index, file, block, &foffs0, &of,
-					   block_size)) {
-			return(DB_CORRUPTION);
-		}
-	}
-
-	ut_ad(foffs0 == ihalf);
-
-	while (foffs1 < file->offset) {
-		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
-			return(DB_INTERRUPTED);
-		}
-
-		/* Remember the offset number for this run */
-		run_offset[n_run++] = of.offset;
-
-		if (!row_merge_blocks_copy(index, file, block, &foffs1, &of,
-					   block_size)) {
-			return(DB_CORRUPTION);
-		}
-	}
-
-	ut_ad(foffs1 == file->offset);
-
-	if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
-		return(DB_CORRUPTION);
-	}
-
-	ut_ad(n_run <= *num_run);
-
-	*num_run = n_run;
-
-	/* Each run can contain one or more offsets. As merge goes on,
-	the number of runs (to merge) will reduce until we have one
-	single run. So the number of runs will always be smaller than
-	the number of offsets in file */
-	ut_ad((*num_run) <= file->offset);
-
-	/* The number of offsets in output file is always equal or
-	smaller than input file */
-	ut_ad(of.offset <= file->offset);
-
-	/* Swap file descriptors for the next pass. */
-	*tmpfd = file->fd;
-	*file = of;
-
-	UNIV_MEM_INVALID(block[0], 3 * block_size);
-
-	return(DB_SUCCESS);
-}
-
-/*************************************************************//**
-Merge disk files.
-@return	DB_SUCCESS or error code */
-static
-ulint
-row_merge_sort(
-/*===========*/
-	trx_t*			trx,	/*!< in: transaction */
-	const dict_index_t*	index,	/*!< in: index being created */
-	merge_file_t*		file,	/*!< in/out: file containing
-					index entries */
-	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
-	int*			tmpfd,	/*!< in/out: temporary file handle */
-	struct TABLE*		table,	/*!< in/out: MySQL table, for
-					reporting erroneous key value
-					if applicable */
-	ulint			block_size)
-					/*!< in: merge block buffer size */
-{
-	ulint	half = file->offset / 2;
-	ulint	num_runs;
-	ulint*	run_offset;
-	ulint	error = DB_SUCCESS;
-
-	/* Record the number of merge runs we need to perform */
-	num_runs = file->offset;
-
-	/* If num_runs are less than 1, nothing to merge */
-	if (num_runs <= 1) {
-		return(error);
-	}
-
-	/* "run_offset" records each run's first offset number */
-	run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint));
-
-	/* This tells row_merge() where to start for the first round
-	of merge. */
-	run_offset[half] = half;
-
-	/* The file should always contain at least one byte (the end
-	of file marker).  Thus, it must be at least one block. */
-	ut_ad(file->offset > 0);
-
-	/* Merge the runs until we have one big run */
-	do {
-		error = row_merge(trx, index, file, block, tmpfd,
-				  table, &num_runs, run_offset, block_size);
-
-		UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset);
-
-		if (error != DB_SUCCESS) {
-			break;
-		}
-	} while (num_runs > 1);
-
-	mem_free(run_offset);
-
-	return(error);
-}
-
-/*************************************************************//**
-Copy externally stored columns to the data tuple. */
-static
-void
-row_merge_copy_blobs(
-/*=================*/
-	const mrec_t*	mrec,	/*!< in: merge record */
-	const ulint*	offsets,/*!< in: offsets of mrec */
-	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
-	dtuple_t*	tuple,	/*!< in/out: data tuple */
-	mem_heap_t*	heap)	/*!< in/out: memory heap */
-{
-	ulint	i;
-	ulint	n_fields = dtuple_get_n_fields(tuple);
-
-	for (i = 0; i < n_fields; i++) {
-		ulint		len;
-		const void*	data;
-		dfield_t*	field = dtuple_get_nth_field(tuple, i);
-
-		if (!dfield_is_ext(field)) {
-			continue;
-		}
-
-		ut_ad(!dfield_is_null(field));
-
-		/* The table is locked during index creation.
-		Therefore, externally stored columns cannot possibly
-		be freed between the time the BLOB pointers are read
-		(row_merge_read_clustered_index()) and dereferenced
-		(below). */
-		data = btr_rec_copy_externally_stored_field(
-			mrec, offsets, zip_size, i, &len, heap);
-		/* Because we have locked the table, any records
-		written by incomplete transactions must have been
-		rolled back already. There must not be any incomplete
-		BLOB columns. */
-		ut_a(data);
-
-		dfield_set_data(field, data, len);
-	}
-}
-
-/********************************************************************//**
-Read sorted file containing index data tuples and insert these data
-tuples to the index
-@return	DB_SUCCESS or error number */
-static
-ulint
-row_merge_insert_index_tuples(
-/*==========================*/
-	trx_t*			trx,	/*!< in: transaction */
-	dict_index_t*		index,	/*!< in: index */
-	dict_table_t*		table,	/*!< in: new table */
-	ulint			zip_size,/*!< in: compressed page size of
-					 the old table, or 0 if uncompressed */
-	int			fd,	/*!< in: file descriptor */
-	row_merge_block_t*	block,	/*!< in/out: file buffer */
-	ulint			block_size)
-					/*! in: merge block buffer size */
-{
-	const byte*		b;
-	que_thr_t*		thr;
-	ins_node_t*		node;
-	mem_heap_t*		tuple_heap;
-	mem_heap_t*		graph_heap;
-	ulint			error = DB_SUCCESS;
-	ulint			foffs = 0;
-	ulint*			offsets;
-
-	ut_ad(trx);
-	ut_ad(index);
-	ut_ad(table);
-
-	/* We use the insert query graph as the dummy graph
-	needed in the row module call */
-
-	trx->op_info = "inserting index entries";
-
-	graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
-	node = ins_node_create(INS_DIRECT, table, graph_heap);
-
-	thr = pars_complete_graph_for_exec(node, trx, graph_heap);
-
-	que_thr_move_to_run_state_for_mysql(thr, trx);
-
-	tuple_heap = mem_heap_create(1000);
-
-	{
-		ulint i	= 1 + REC_OFFS_HEADER_SIZE
-			+ dict_index_get_n_fields(index);
-		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
-		offsets[0] = i;
-		offsets[1] = dict_index_get_n_fields(index);
-	}
-
-	b = *block;
-
-	if (!row_merge_read(fd, foffs, block[0], block_size)) {
-		error = DB_CORRUPTION;
-	} else {
-		mrec_buf_t*	buf = mem_heap_alloc(graph_heap, sizeof *buf);
-
-		for (;;) {
-			const mrec_t*	mrec;
-			dtuple_t*	dtuple;
-			ulint		n_ext;
-
-			b = row_merge_read_rec(block, buf, b, index,
-					       fd, &foffs, &mrec, offsets,
-					       block_size);
-			if (UNIV_UNLIKELY(!b)) {
-				/* End of list, or I/O error */
-				if (mrec) {
-					error = DB_CORRUPTION;
-				}
-				break;
-			}
-
-			dtuple = row_rec_to_index_entry_low(
-				mrec, index, offsets, &n_ext, tuple_heap);
-
-			if (UNIV_UNLIKELY(n_ext)) {
-				row_merge_copy_blobs(mrec, offsets, zip_size,
-						     dtuple, tuple_heap);
-			}
-
-			node->row = dtuple;
-			node->table = table;
-			node->trx_id = trx->id;
-
-			ut_ad(dtuple_validate(dtuple));
-
-			do {
-				thr->run_node = thr;
-				thr->prev_node = thr->common.parent;
-
-				error = row_ins_index_entry(index, dtuple,
-							    0, FALSE, thr);
-
-				if (UNIV_LIKELY(error == DB_SUCCESS)) {
-
-					goto next_rec;
-				}
-
-				thr->lock_state = QUE_THR_LOCK_ROW;
-				trx->error_state = error;
-				que_thr_stop_for_mysql(thr);
-				thr->lock_state = QUE_THR_LOCK_NOLOCK;
-			} while (row_mysql_handle_errors(&error, trx,
-							 thr, NULL));
-
-			goto err_exit;
-next_rec:
-			mem_heap_empty(tuple_heap);
-		}
-	}
-
-	que_thr_stop_for_mysql_no_error(thr, trx);
-err_exit:
-	que_graph_free(thr->graph);
-
-	trx->op_info = "";
-
-	mem_heap_free(tuple_heap);
-
-	return(error);
-}
-
-/*********************************************************************//**
-Sets an exclusive lock on a table, for the duration of creating indexes.
-@return	error code or DB_SUCCESS */
-UNIV_INTERN
-ulint
-row_merge_lock_table(
-/*=================*/
-	trx_t*		trx,		/*!< in/out: transaction */
-	dict_table_t*	table,		/*!< in: table to lock */
-	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
-{
-	mem_heap_t*	heap;
-	que_thr_t*	thr;
-	ulint		err;
-	sel_node_t*	node;
-
-	ut_ad(trx);
-	ut_ad(mode == LOCK_X || mode == LOCK_S);
-
-	heap = mem_heap_create(512);
-
-	trx->op_info = "setting table lock for creating or dropping index";
-
-	node = sel_node_create(heap);
-	thr = pars_complete_graph_for_exec(node, trx, heap);
-	thr->graph->state = QUE_FORK_ACTIVE;
-
-	/* We use the select query graph as the dummy graph needed
-	in the lock module call */
-
-	thr = que_fork_get_first_thr(que_node_get_parent(thr));
-	que_thr_move_to_run_state_for_mysql(thr, trx);
-
-run_again:
-	thr->run_node = thr;
-	thr->prev_node = thr->common.parent;
-
-	err = lock_table(0, table, mode, thr);
-
-	trx->error_state = err;
-
-	if (UNIV_LIKELY(err == DB_SUCCESS)) {
-		que_thr_stop_for_mysql_no_error(thr, trx);
-	} else {
-		que_thr_stop_for_mysql(thr);
-
-		if (err != DB_QUE_THR_SUSPENDED) {
-			ibool	was_lock_wait;
-
-			was_lock_wait = row_mysql_handle_errors(
-				&err, trx, thr, NULL);
-
-			if (was_lock_wait) {
-				goto run_again;
-			}
-		} else {
-			que_thr_t*	run_thr;
-			que_node_t*	parent;
-
-			parent = que_node_get_parent(thr);
-			run_thr = que_fork_start_command(parent);
-
-			ut_a(run_thr == thr);
-
-			/* There was a lock wait but the thread was not
-			in a ready to run or running state. */
-			trx->error_state = DB_LOCK_WAIT;
-
-			goto run_again;
-		}
-	}
-
-	que_graph_free(thr->graph);
-	trx->op_info = "";
-
-	return(err);
-}
-
-/*********************************************************************//**
-Drop an index from the InnoDB system tables.  The data dictionary must
-have been locked exclusively by the caller, because the transaction
-will not be committed. */
-UNIV_INTERN
-void
-row_merge_drop_index(
-/*=================*/
-	dict_index_t*	index,	/*!< in: index to be removed */
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx)	/*!< in: transaction handle */
-{
-	ulint		err;
-	pars_info_t*	info = pars_info_create();
-
-	/* We use the private SQL parser of Innobase to generate the
-	query graphs needed in deleting the dictionary data from system
-	tables in Innobase. Deleting a row from SYS_INDEXES table also
-	frees the file segments of the B-tree associated with the index. */
-
-	static const char sql[] =
-		"PROCEDURE DROP_INDEX_PROC () IS\n"
-		"BEGIN\n"
-		/* Rename the index, so that it will be dropped by
-		row_merge_drop_temp_indexes() at crash recovery
-		if the server crashes before this trx is committed. */
-		"UPDATE SYS_INDEXES SET NAME=CONCAT('"
-		TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n"
-		"COMMIT WORK;\n"
-		/* Drop the statistics of the index. */
-		"DELETE FROM SYS_STATS WHERE INDEX_ID = :indexid;\n"
-		/* Drop the field definitions of the index. */
-		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
-		/* Drop the index definition and the B-tree. */
-		"DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
-		"END;\n";
-
-	ut_ad(index && table && trx);
-
-	pars_info_add_ull_literal(info, "indexid", index->id);
-
-	trx_start_if_not_started(trx);
-	trx->op_info = "dropping index";
-
-	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-
-	err = que_eval_sql(info, sql, FALSE, trx);
-
-
-	if (err != DB_SUCCESS) {
-		/* Even though we ensure that DDL transactions are WAIT
-		and DEADLOCK free, we could encounter other errors e.g.,
-		DB_TOO_MANY_TRANSACTIONS. */
-		trx->error_state = DB_SUCCESS;
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Error: row_merge_drop_index failed "
-			"with error code: %lu.\n", (ulint) err);
-	}
-
-	/* Replace this index with another equivalent index for all
-	foreign key constraints on this table where this index is used */
-
-	dict_table_replace_index_in_foreign_list(table, index, trx);
-	dict_index_remove_from_cache(table, index);
-
-	trx->op_info = "";
-}
-
-/*********************************************************************//**
-Drop those indexes which were created before an error occurred when
-building an index.  The data dictionary must have been locked
-exclusively by the caller, because the transaction will not be
-committed. */
-UNIV_INTERN
-void
-row_merge_drop_indexes(
-/*===================*/
-	trx_t*		trx,		/*!< in: transaction */
-	dict_table_t*	table,		/*!< in: table containing the indexes */
-	dict_index_t**	index,		/*!< in: indexes to drop */
-	ulint		num_created)	/*!< in: number of elements in index[] */
-{
-	ulint	key_num;
-
-	for (key_num = 0; key_num < num_created; key_num++) {
-		row_merge_drop_index(index[key_num], table, trx);
-	}
-}
-
-/*********************************************************************//**
-Drop all partially created indexes during crash recovery. */
-UNIV_INTERN
-void
-row_merge_drop_temp_indexes(void)
-/*=============================*/
-{
-	trx_t*		trx;
-	btr_pcur_t	pcur;
-	mtr_t		mtr;
-
-	/* Load the table definitions that contain partially defined
-	indexes, so that the data dictionary information can be checked
-	when accessing the tablename.ibd files. */
-	trx = trx_allocate_for_background();
-	trx->op_info = "dropping partially created indexes";
-	row_mysql_lock_data_dictionary(trx);
-
-	mtr_start(&mtr);
-
-	btr_pcur_open_at_index_side(
-		TRUE,
-		dict_table_get_first_index(dict_sys->sys_indexes),
-		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
-
-	for (;;) {
-		const rec_t*	rec;
-		const byte*	field;
-		ulint		len;
-		table_id_t	table_id;
-		dict_table_t*	table;
-
-		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-
-		if (!btr_pcur_is_on_user_rec(&pcur)) {
-			break;
-		}
-
-		rec = btr_pcur_get_rec(&pcur);
-		field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
-					      &len);
-		if (len == UNIV_SQL_NULL || len == 0
-		    || (char) *field != TEMP_INDEX_PREFIX) {
-			continue;
-		}
-
-		/* This is a temporary index. */
-
-		field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
-		if (len != 8) {
-			/* Corrupted TABLE_ID */
-			continue;
-		}
-
-		table_id = mach_read_from_8(field);
-
-		btr_pcur_store_position(&pcur, &mtr);
-		btr_pcur_commit_specify_mtr(&pcur, &mtr);
-
-		table = dict_table_get_on_id_low(table_id);
-
-		if (table) {
-			dict_index_t*	index;
-			dict_index_t*	next_index;
-
-			for (index = dict_table_get_first_index(table);
-			     index; index = next_index) {
-
-				next_index = dict_table_get_next_index(index);
-
-				if (*index->name == TEMP_INDEX_PREFIX) {
-					row_merge_drop_index(index, table, trx);
-					trx_commit_for_mysql(trx);
-				}
-			}
-		}
-
-		mtr_start(&mtr);
-		btr_pcur_restore_position(BTR_SEARCH_LEAF,
-					  &pcur, &mtr);
-	}
-
-	btr_pcur_close(&pcur);
-	mtr_commit(&mtr);
-	row_mysql_unlock_data_dictionary(trx);
-	trx_free_for_background(trx);
-}
-
-/*********************************************************************//**
-Creates temperary merge files, and if UNIV_PFS_IO defined, register
-the file descriptor with Performance Schema.
-@return file descriptor, or -1 on failure */
-UNIV_INLINE
-int
-row_merge_file_create_low(void)
-/*===========================*/
-{
-	int	fd;
-#ifdef UNIV_PFS_IO
-	/* This temp file open does not go through normal
-	file APIs, add instrumentation to register with
-	performance schema */
-	struct PSI_file_locker*	locker = NULL;
-	PSI_file_locker_state	state;
-	register_pfs_file_open_begin(&state, locker, innodb_file_temp_key,
-				     PSI_FILE_OPEN,
-				     "Innodb Merge Temp File",
-				     __FILE__, __LINE__);
-#endif
-	fd = innobase_mysql_tmpfile();
-#ifdef UNIV_PFS_IO
-        register_pfs_file_open_end(locker, fd);
-#endif
-	if (fd < 0) {
-		fprintf(stderr,
-			"InnoDB: Error: Cannot create temporary merge file\n");
-		return(-1);
-	}
-	return(fd);
-}
-
-/*********************************************************************//**
-Create a merge file.
-@return file descriptor, or -1 on failure */
-static __attribute__((nonnull, warn_unused_result))
-int
-row_merge_file_create(
-/*==================*/
-	merge_file_t*	merge_file)	/*!< out: merge file structure */
-{
-	merge_file->fd = row_merge_file_create_low();
-	merge_file->offset = 0;
-	merge_file->n_rec = 0;
-	return(merge_file->fd);
-}
-
-/*********************************************************************//**
-Destroy a merge file. And de-register the file from Performance Schema
-if UNIV_PFS_IO is defined. */
-UNIV_INLINE
-void
-row_merge_file_destroy_low(
-/*=======================*/
-	int		fd)	/*!< in: merge file descriptor */
-{
-#ifdef UNIV_PFS_IO
-	struct PSI_file_locker*	locker = NULL;
-	PSI_file_locker_state	state;
-	register_pfs_file_io_begin(&state, locker,
-				   fd, 0, PSI_FILE_CLOSE,
-				   __FILE__, __LINE__);
-#endif
-	close(fd);
-#ifdef UNIV_PFS_IO
-	register_pfs_file_io_end(locker, 0);
-#endif
-}
-/*********************************************************************//**
-Destroy a merge file. */
-static
-void
-row_merge_file_destroy(
-/*===================*/
-	merge_file_t*	merge_file)	/*!< out: merge file structure */
-{
-	if (merge_file->fd != -1) {
-		row_merge_file_destroy_low(merge_file->fd);
-		merge_file->fd = -1;
-	}
-}
-
-/*********************************************************************//**
-Determine the precise type of a column that is added to a tem
-if a column must be constrained NOT NULL.
-@return	col->prtype, possibly ORed with DATA_NOT_NULL */
-UNIV_INLINE
-ulint
-row_merge_col_prtype(
-/*=================*/
-	const dict_col_t*	col,		/*!< in: column */
-	const char*		col_name,	/*!< in: name of the column */
-	const merge_index_def_t*index_def)	/*!< in: the index definition
-						of the primary key */
-{
-	ulint	prtype = col->prtype;
-	ulint	i;
-
-	ut_ad(index_def->ind_type & DICT_CLUSTERED);
-
-	if (prtype & DATA_NOT_NULL) {
-
-		return(prtype);
-	}
-
-	/* All columns that are included
-	in the PRIMARY KEY must be NOT NULL. */
-
-	for (i = 0; i < index_def->n_fields; i++) {
-		if (!strcmp(col_name, index_def->fields[i].field_name)) {
-			return(prtype | DATA_NOT_NULL);
-		}
-	}
-
-	return(prtype);
-}
-
-/*********************************************************************//**
-Create a temporary table for creating a primary key, using the definition
-of an existing table.
-@return	table, or NULL on error */
-UNIV_INTERN
-dict_table_t*
-row_merge_create_temporary_table(
-/*=============================*/
-	const char*		table_name,	/*!< in: new table name */
-	const merge_index_def_t*index_def,	/*!< in: the index definition
-						of the primary key */
-	const dict_table_t*	table,		/*!< in: old table definition */
-	trx_t*			trx)		/*!< in/out: transaction
-						(sets error_state) */
-{
-	ulint		i;
-	dict_table_t*	new_table = NULL;
-	ulint		n_cols = dict_table_get_n_user_cols(table);
-	ulint		error;
-	mem_heap_t*	heap = mem_heap_create(1000);
-
-	ut_ad(table_name);
-	ut_ad(index_def);
-	ut_ad(table);
-	ut_ad(mutex_own(&dict_sys->mutex));
-
-	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
-
-	for (i = 0; i < n_cols; i++) {
-		const dict_col_t*	col;
-		const char*		col_name;
-
-		col = dict_table_get_nth_col(table, i);
-		col_name = dict_table_get_col_name(table, i);
-
-		dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
-				       row_merge_col_prtype(col, col_name,
-							    index_def),
-				       col->len);
-	}
-
-	error = row_create_table_for_mysql(new_table, trx);
-	mem_heap_free(heap);
-
-	if (error != DB_SUCCESS) {
-		trx->error_state = error;
-		new_table = NULL;
-	}
-
-	return(new_table);
-}
-
-/*********************************************************************//**
-Rename the temporary indexes in the dictionary to permanent ones.  The
-data dictionary must have been locked exclusively by the caller,
-because the transaction will not be committed.
-@return	DB_SUCCESS if all OK */
-UNIV_INTERN
-ulint
-row_merge_rename_indexes(
-/*=====================*/
-	trx_t*		trx,		/*!< in/out: transaction */
-	dict_table_t*	table)		/*!< in/out: table with new indexes */
-{
-	ulint		err = DB_SUCCESS;
-	pars_info_t*	info = pars_info_create();
-
-	/* We use the private SQL parser of Innobase to generate the
-	query graphs needed in renaming indexes. */
-
-	static const char sql[] =
-		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
-		"BEGIN\n"
-		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
-		"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
-		TEMP_INDEX_PREFIX_STR "';\n"
-		"END;\n";
-
-	ut_ad(table);
-	ut_ad(trx);
-	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-
-	trx->op_info = "renaming indexes";
-
-	pars_info_add_ull_literal(info, "tableid", table->id);
-
-	err = que_eval_sql(info, sql, FALSE, trx);
-
-	if (err == DB_SUCCESS) {
-		dict_index_t*	index = dict_table_get_first_index(table);
-		do {
-			if (*index->name == TEMP_INDEX_PREFIX) {
-				index->name++;
-			}
-			index = dict_table_get_next_index(index);
-		} while (index);
-	} else {
-		/* Even though we ensure that DDL transactions are WAIT
-		and DEADLOCK free, we could encounter other errors e.g.,
-		DB_TOO_MANY_TRANSACTIONS. */
-		trx->error_state = DB_SUCCESS;
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Error: row_merge_rename_indexes "
-			"failed with error code: %lu.\n", (ulint) err);
-	}
-
-	trx->op_info = "";
-
-	return(err);
-}
-
-/*********************************************************************//**
-Rename the tables in the data dictionary.  The data dictionary must
-have been locked exclusively by the caller, because the transaction
-will not be committed.
-@return	error code or DB_SUCCESS */
-UNIV_INTERN
-ulint
-row_merge_rename_tables(
-/*====================*/
-	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
-					tmp_name */
-	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
-					old_table->name */
-	const char*	tmp_name,	/*!< in: new name for old_table */
-	trx_t*		trx)		/*!< in: transaction handle */
-{
-	ulint		err	= DB_ERROR;
-	pars_info_t*	info;
-	char		old_name[MAX_FULL_NAME_LEN + 1];
-
-	ut_ad(old_table != new_table);
-	ut_ad(mutex_own(&dict_sys->mutex));
-
-	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-
-	/* store the old/current name to an automatic variable */
-	if (strlen(old_table->name) + 1 <= sizeof(old_name)) {
-		memcpy(old_name, old_table->name, strlen(old_table->name) + 1);
-	} else {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: too long table name: '%s', "
-			"max length is %d\n", old_table->name,
-			MAX_FULL_NAME_LEN);
-		ut_error;
-	}
-
-	trx->op_info = "renaming tables";
-
-	/* We use the private SQL parser of Innobase to generate the query
-	graphs needed in updating the dictionary data in system tables. */
-
-	info = pars_info_create();
-
-	pars_info_add_str_literal(info, "new_name", new_table->name);
-	pars_info_add_str_literal(info, "old_name", old_name);
-	pars_info_add_str_literal(info, "tmp_name", tmp_name);
-
-	err = que_eval_sql(info,
-			   "PROCEDURE RENAME_TABLES () IS\n"
-			   "BEGIN\n"
-			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
-			   " WHERE NAME = :old_name;\n"
-			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
-			   " WHERE NAME = :new_name;\n"
-			   "END;\n", FALSE, trx);
-
-	if (err != DB_SUCCESS) {
-
-		goto err_exit;
-	}
-
-	/* Generate the redo logs for file operations */
-	fil_mtr_rename_log(old_table->space, old_name,
-			   new_table->space, new_table->name, tmp_name);
-
-	/* What if the redo logs are flushed to disk here?  This is
-	tested with following crash point */
-	DBUG_EXECUTE_IF("bug14669848_precommit", log_buffer_flush_to_disk();
-			DBUG_SUICIDE(););
-
-	/* File operations cannot be rolled back.  So, before proceeding
-	with file operations, commit the dictionary changes.*/
-	trx_commit_for_mysql(trx);
-
-	/* If server crashes here, the dictionary in InnoDB and MySQL
-	will differ.  The .ibd files and the .frm files must be swapped
-	manually by the administrator. No loss of data. */
-	DBUG_EXECUTE_IF("bug14669848", DBUG_SUICIDE(););
-
-	/* Ensure that the redo logs are flushed to disk.  The config
-	innodb_flush_log_at_trx_commit must not affect this. */
-	log_buffer_flush_to_disk();
-
-	/* The following calls will also rename the .ibd data files if
-	the tables are stored in a single-table tablespace */
-
-	if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)) {
-
-		err = DB_ERROR;
-		goto err_exit;
-	}
-
-	DEBUG_SYNC_C("row_merge_rename_tables_between_renames");
-
-	if (!dict_table_rename_in_cache(new_table, old_name, FALSE)) {
-
-		err = DB_ERROR;
-		goto err_exit;
-	}
-
-	err = dict_load_foreigns(old_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE);
-
-	if (err != DB_SUCCESS) {
-err_exit:
-		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
-		trx->error_state = DB_SUCCESS;
-	}
-
-	trx->op_info = "";
-
-	return(err);
-}
-
-/*********************************************************************//**
-Create and execute a query graph for creating an index.
-@return	DB_SUCCESS or error code */
-static
-ulint
-row_merge_create_index_graph(
-/*=========================*/
-	trx_t*		trx,		/*!< in: trx */
-	dict_table_t*	table,		/*!< in: table */
-	dict_index_t*	index)		/*!< in: index */
-{
-	ind_node_t*	node;		/*!< Index creation node */
-	mem_heap_t*	heap;		/*!< Memory heap */
-	que_thr_t*	thr;		/*!< Query thread */
-	ulint		err;
-
-	ut_ad(trx);
-	ut_ad(table);
-	ut_ad(index);
-
-	heap = mem_heap_create(512);
-
-	index->table = table;
-	node = ind_create_graph_create(index, heap);
-	thr = pars_complete_graph_for_exec(node, trx, heap);
-
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
-
-	que_run_threads(thr);
-
-	err = trx->error_state;
-
-	que_graph_free((que_t*) que_node_get_parent(thr));
-
-	return(err);
-}
-
-/*********************************************************************//**
-Create the index and load in to the dictionary.
-@return	index, or NULL on error */
-UNIV_INTERN
-dict_index_t*
-row_merge_create_index(
-/*===================*/
-	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
-	dict_table_t*		table,	/*!< in: the index is on this table */
-	const merge_index_def_t*index_def)
-					/*!< in: the index definition */
-{
-	dict_index_t*	index;
-	ulint		err;
-	ulint		n_fields = index_def->n_fields;
-	ulint		i;
-
-	/* Create the index prototype, using the passed in def, this is not
-	a persistent operation. We pass 0 as the space id, and determine at
-	a lower level the space id where to store the table. */
-
-	index = dict_mem_index_create(table->name, index_def->name,
-				      0, index_def->ind_type, n_fields);
-
-	ut_a(index);
-
-	for (i = 0; i < n_fields; i++) {
-		merge_index_field_t*	ifield = &index_def->fields[i];
-
-		dict_mem_index_add_field(index, ifield->field_name,
-					 ifield->prefix_len);
-	}
-
-	/* Add the index to SYS_INDEXES, using the index prototype. */
-	err = row_merge_create_index_graph(trx, table, index);
-
-	if (err == DB_SUCCESS) {
-
-		index = row_merge_dict_table_get_index(
-			table, index_def);
-
-		ut_a(index);
-
-		/* Note the id of the transaction that created this
-		index, we use it to restrict readers from accessing
-		this index, to ensure read consistency. */
-		index->trx_id = trx->id;
-	} else {
-		index = NULL;
-	}
-
-	return(index);
-}
-
-/*********************************************************************//**
-Check if a transaction can use an index. */
-UNIV_INTERN
-ibool
-row_merge_is_index_usable(
-/*======================*/
-	const trx_t*		trx,	/*!< in: transaction */
-	const dict_index_t*	index)	/*!< in: index to check */
-{
-	return(!dict_index_is_corrupted(index)
-	       && (!trx->read_view
-	           || read_view_sees_trx_id(trx->read_view, index->trx_id)));
-}
-
-/*********************************************************************//**
-Drop the old table.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-ulint
-row_merge_drop_table(
-/*=================*/
-	trx_t*		trx,		/*!< in: transaction */
-	dict_table_t*	table)		/*!< in: table to drop */
-{
-	/* There must be no open transactions on the table. */
-	ut_a(table->n_mysql_handles_opened == 0);
-
-	return(row_drop_table_for_mysql(table->name, trx, FALSE));
-}
-
-/*********************************************************************//**
-Build indexes on a table by reading a clustered index,
-creating a temporary file containing index entries, merge sorting
-these index entries and inserting sorted index entries to indexes.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-ulint
-row_merge_build_indexes(
-/*====================*/
-	trx_t*		trx,		/*!< in: transaction */
-	dict_table_t*	old_table,	/*!< in: table where rows are
-					read from */
-	dict_table_t*	new_table,	/*!< in: table where indexes are
-					created; identical to old_table
-					unless creating a PRIMARY KEY */
-	dict_index_t**	indexes,	/*!< in: indexes to be created */
-	ulint		n_indexes,	/*!< in: size of indexes[] */
-	struct TABLE*	table)		/*!< in/out: MySQL table, for
-					reporting erroneous key value
-					if applicable */
-{
-	merge_file_t*		merge_files;
-	/* Some code uses block[1] as the synonym for block + block_size.  So
-	we initialize block[3] to the address boundary of block[2], even
-	though space for 3 only buffers is allocated. */
-	row_merge_block_t	block[4];
-	ulint			block_size;
-	ulint			i;
-	ulint			error;
-	int			tmpfd = -1;
-	ulint			merge_sort_block_size;
-	void*			block_mem;
-
-	ut_ad(trx);
-	ut_ad(old_table);
-	ut_ad(new_table);
-	ut_ad(indexes);
-	ut_ad(n_indexes);
-
-	merge_sort_block_size = thd_merge_sort_block_size(trx->mysql_thd);
-
-	trx_start_if_not_started(trx);
-
-	/* Allocate memory for merge file data structure and initialize
-	fields */
-
-	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
-	block_size = 3 * merge_sort_block_size;
-	block_mem = os_mem_alloc_large(&block_size, FALSE);
-
-	for (i = 0; i < UT_ARR_SIZE(block); i++) {
-		block[i] = (row_merge_block_t ) ((byte *) block_mem +
-			i * merge_sort_block_size);
-	}
-
-	/* Initialize all the merge file descriptors, so that we
-	don't call row_merge_file_destroy() on uninitialized
-	merge file descriptor */
-
-	for (i = 0; i < n_indexes; i++) {
-		merge_files[i].fd = -1;
-	}
-
-	for (i = 0; i < n_indexes; i++) {
-
-		if (row_merge_file_create(&merge_files[i]) < 0)
-		{
-			error = DB_OUT_OF_MEMORY;
-			goto func_exit;
-		}
-	}
-
-	tmpfd = row_merge_file_create_low();
-
-	if (tmpfd < 0)
-	{
-		error = DB_OUT_OF_MEMORY;
-		goto func_exit;
-	}
-
-	/* Reset the MySQL row buffer that is used when reporting
-	duplicate keys. */
-	innobase_rec_reset(table);
-
-	/* Read clustered index of the table and create files for
-	secondary index entries for merge sort */
-
-	error = row_merge_read_clustered_index(
-		trx, table, old_table, new_table, indexes,
-		merge_files, n_indexes, block, merge_sort_block_size);
-
-	if (error != DB_SUCCESS) {
-
-		goto func_exit;
-	}
-
-	/* Now we have files containing index entries ready for
-	sorting and inserting. */
-
-	for (i = 0; i < n_indexes; i++) {
-		error = row_merge_sort(trx, indexes[i], &merge_files[i],
-				       block, &tmpfd, table,
-				       merge_sort_block_size);
-
-		if (error == DB_SUCCESS) {
-			error = row_merge_insert_index_tuples(
-				trx, indexes[i], new_table,
-				dict_table_zip_size(old_table),
-				merge_files[i].fd, block,
-				merge_sort_block_size);
-		}
-
-		/* Close the temporary file to free up space. */
-		row_merge_file_destroy(&merge_files[i]);
-
-		if (error != DB_SUCCESS) {
-			trx->error_key_num = i;
-			goto func_exit;
-		}
-	}
-
-func_exit:
-	row_merge_file_destroy_low(tmpfd);
-
-	for (i = 0; i < n_indexes; i++) {
-		row_merge_file_destroy(&merge_files[i]);
-	}
-
-	mem_free(merge_files);
-	os_mem_free_large(block_mem, block_size);
-
-	return(error);
-}
diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc
new file mode 100644
index 00000000000..08a3ecb5732
--- /dev/null
+++ b/storage/xtradb/row/row0merge.cc
@@ -0,0 +1,3666 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.cc
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0log.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0crea.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "ut0sort.h"
+#include "row0ftsort.h"
+#include "row0import.h"
+#include "handler0alter.h"
+#include "ha_prototypes.h"
+
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+/** Log each record read from temporary file. */
+static ibool	row_merge_print_read;
+/** Log each record write to temporary file. */
+static ibool	row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool	row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool	row_merge_print_block_read;
+/** Log each block written to temporary file. */
+static ibool	row_merge_print_block_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/* Whether to disable file system cache */
+UNIV_INTERN char	srv_disable_sort_file_cache;
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple. */
+static __attribute__((nonnull))
+void
+row_merge_tuple_print(
+/*==================*/
+	FILE*		f,	/*!< in: output stream */
+	const mtuple_t*	entry,	/*!< in: tuple to print */
+	ulint		n_fields)/*!< in: number of fields in the tuple */
+{
+	ulint	j;
+
+	for (j = 0; j < n_fields; j++) {
+		const dfield_t*	field = &entry->fields[j];
+
+		if (dfield_is_null(field)) {
+			fputs("\n NULL;", f);
+		} else {
+			ulint	field_len	= dfield_get_len(field);
+			ulint	len		= ut_min(field_len, 20);
+			if (dfield_is_ext(field)) {
+				fputs("\nE", f);
+			} else {
+				fputs("\n ", f);
+			}
+			ut_print_buf(f, dfield_get_data(field), len);
+			if (len != field_len) {
+				fprintf(f, " (total %lu bytes)", field_len);
+			}
+		}
+	}
+	putc('\n', f);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Encode an index record. */
+static __attribute__((nonnull))
+void
+row_merge_buf_encode(
+/*=================*/
+	byte**			b,		/*!< in/out: pointer to
+						current end of output buffer */
+	const dict_index_t*	index,		/*!< in: index */
+	const mtuple_t*		entry,		/*!< in: index fields
+						of the record to encode */
+	ulint			n_fields)	/*!< in: number of fields
+						in the entry */
+{
+	ulint	size;
+	ulint	extra_size;
+
+	size = rec_get_converted_size_temp(
+		index, entry->fields, n_fields, &extra_size);
+	ut_ad(size >= extra_size);
+
+	/* Encode extra_size + 1 */
+	if (extra_size + 1 < 0x80) {
+		*(*b)++ = (byte) (extra_size + 1);
+	} else {
+		ut_ad((extra_size + 1) < 0x8000);
+		*(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+		*(*b)++ = (byte) (extra_size + 1);
+	}
+
+	rec_convert_dtuple_to_temp(*b + extra_size, index,
+				   entry->fields, n_fields);
+
+	*b += size;
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return	own: sort buffer */
+static __attribute__((malloc, nonnull))
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+	mem_heap_t*	heap,		/*!< in: heap where allocated */
+	dict_index_t*	index,		/*!< in: secondary index */
+	ulint		max_tuples,	/*!< in: maximum number of
+					data tuples */
+	ulint		buf_size)	/*!< in: size of the buffer,
+					in bytes */
+{
+	row_merge_buf_t*	buf;
+
+	ut_ad(max_tuples > 0);
+
+	ut_ad(max_tuples <= srv_sort_buf_size);
+
+	buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+	buf->heap = heap;
+	buf->index = index;
+	buf->max_tuples = max_tuples;
+	buf->tuples = static_cast<mtuple_t*>(
+		ut_malloc(2 * max_tuples * sizeof *buf->tuples));
+	buf->tmp_tuples = buf->tuples + max_tuples;
+
+	return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return	own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+{
+	row_merge_buf_t*	buf;
+	ulint			max_tuples;
+	ulint			buf_size;
+	mem_heap_t*		heap;
+
+	max_tuples = srv_sort_buf_size
+		/ ut_max(1, dict_index_get_min_size(index));
+
+	buf_size = (sizeof *buf);
+
+	heap = mem_heap_create(buf_size);
+
+	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+	return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return	sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+{
+	ulint		buf_size	= sizeof *buf;
+	ulint		max_tuples	= buf->max_tuples;
+	mem_heap_t*	heap		= buf->heap;
+	dict_index_t*	index		= buf->index;
+	mtuple_t*	tuples		= buf->tuples;
+
+	mem_heap_empty(heap);
+
+	buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+	buf->heap = heap;
+	buf->index = index;
+	buf->max_tuples = max_tuples;
+	buf->tuples = tuples;
+	buf->tmp_tuples = buf->tuples + max_tuples;
+
+	return(buf);
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+{
+	ut_free(buf->tuples);
+	mem_heap_free(buf->heap);
+}
+
+/******************************************************//**
+Insert a data tuple into a sort buffer.
+For an ordinary index the fields of the row are copied into the next
+free slot of buf->tuples, the encoded record size is accounted in
+buf->total_size, and the field data is duplicated into buf->heap.
+For a full-text index (DICT_FTS) nothing is stored in buf->tuples:
+each non-NULL indexed column is copied and queued on
+psort_info[bucket].fts_doc_list, where
+bucket = *doc_id % fts_sort_pll_degree.
+@return	number of rows added, 0 if out of space (also 0 if the FTS
+Doc ID read from the row turns out to be zero) */
+static
+ulint
+row_merge_buf_add(
+/*==============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	dict_index_t*		fts_index,/*!< in: fts index to be created */
+	const dict_table_t*	old_table,/*!< in: original table
+					(not referenced in this function) */
+	fts_psort_t*		psort_info, /*!< in: parallel sort info */
+	const dtuple_t*		row,	/*!< in: table row */
+	const row_ext_t*	ext,	/*!< in: cache of externally stored
+					column prefixes, or NULL */
+	doc_id_t*		doc_id)	/*!< in/out: Doc ID if we are
+					creating FTS index */
+{
+	ulint			i;
+	const dict_index_t*	index;
+	mtuple_t*		entry;
+	dfield_t*		field;
+	const dict_field_t*	ifield;
+	ulint			n_fields;
+	ulint			data_size;
+	ulint			extra_size;
+	ulint			bucket = 0;
+	doc_id_t		write_doc_id;
+	ulint			n_row_added = 0;
+	DBUG_ENTER("row_merge_buf_add");
+
+	if (buf->n_tuples >= buf->max_tuples) {
+		DBUG_RETURN(0);
+	}
+
+	DBUG_EXECUTE_IF(
+		"ib_row_merge_buf_add_two",
+		if (buf->n_tuples >= 2) DBUG_RETURN(0););
+
+	UNIV_PREFETCH_R(row->fields);
+
+	/* If we are building FTS index, buf->index points to
+	the 'fts_sort_idx', and real FTS index is stored in
+	fts_index */
+	index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
+
+	n_fields = dict_index_get_n_fields(index);
+
+	entry = &buf->tuples[buf->n_tuples];
+	field = entry->fields = static_cast<dfield_t*>(
+		mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
+
+	data_size = 0;
+	extra_size = UT_BITS_IN_BYTES(index->n_nullable);
+
+	ifield = dict_index_get_nth_field(index, 0);
+
+	for (i = 0; i < n_fields; i++, field++, ifield++) {
+		ulint			len;
+		const dict_col_t*	col;
+		ulint			col_no;
+		ulint			fixed_len;
+		const dfield_t*		row_field;
+
+		col = ifield->col;
+		col_no = dict_col_get_no(col);
+
+		/* Process the Doc ID column */
+		if (*doc_id > 0
+		    && col_no == index->table->fts->doc_col) {
+			fts_write_doc_id((byte*) &write_doc_id, *doc_id);
+
+			/* Note: field->data now points to a value on the
+			stack: &write_doc_id after dfield_set_data(). Because
+			there is only one doc_id per row, it shouldn't matter.
+			We allocate a new buffer before we leave the function
+			later below. */
+
+			dfield_set_data(
+				field, &write_doc_id, sizeof(write_doc_id));
+
+			field->type.mtype = ifield->col->mtype;
+			field->type.prtype = ifield->col->prtype;
+			field->type.mbminmaxlen = DATA_MBMINMAXLEN(0, 0);
+			field->type.len = ifield->col->len;
+		} else {
+			row_field = dtuple_get_nth_field(row, col_no);
+
+			dfield_copy(field, row_field);
+
+			/* Tokenize and process data for FTS */
+			if (index->type & DICT_FTS) {
+				fts_doc_item_t*	doc_item;
+				byte*		value;
+
+				/* Fetch the Doc ID if it already exists
+				in the row and was not supplied by the
+				caller. Even if the value column is
+				NULL, we still need to get the Doc
+				ID so as to maintain the correct max
+				Doc ID */
+				if (*doc_id == 0) {
+					const dfield_t*	doc_field;
+					doc_field = dtuple_get_nth_field(
+						row,
+						index->table->fts->doc_col);
+					*doc_id = (doc_id_t) mach_read_from_8(
+						static_cast<byte*>(
+						dfield_get_data(doc_field)));
+
+					if (*doc_id == 0) {
+						ib_logf(IB_LOG_LEVEL_WARN,
+							"FTS Doc ID is zero. "
+							"Record Skipped");
+						DBUG_RETURN(0);
+					}
+				}
+
+				if (dfield_is_null(field)) {
+					n_row_added = 1;
+					continue;
+				}
+
+				doc_item = static_cast<fts_doc_item_t*>(
+					mem_heap_alloc(
+						buf->heap,
+						sizeof(*doc_item)));
+
+				value = static_cast<byte*>(
+					ut_malloc(field->len));
+				memcpy(value, field->data, field->len);
+				field->data = value;
+
+				doc_item->field = field;
+				doc_item->doc_id = *doc_id;
+
+				bucket = *doc_id % fts_sort_pll_degree;
+
+				UT_LIST_ADD_LAST(
+					doc_list,
+					psort_info[bucket].fts_doc_list,
+					doc_item);
+				n_row_added = 1;
+				continue;
+			}
+		}
+
+		len = dfield_get_len(field);
+
+		if (dfield_is_null(field)) {
+			ut_ad(!(col->prtype & DATA_NOT_NULL));
+			continue;
+		} else if (!ext) {	/* no prefix cache to consult */
+		} else if (dict_index_is_clust(index)) {
+			/* Flag externally stored fields. */
+			const byte*	buf = row_ext_lookup(ext, col_no,
+							     &len);
+			if (UNIV_LIKELY_NULL(buf)) {
+				ut_a(buf != field_ref_zero);
+				if (i < dict_index_get_n_unique(index)) {
+					dfield_set_data(field, buf, len);
+				} else {
+					dfield_set_ext(field);
+					len = dfield_get_len(field);
+				}
+			}
+		} else {
+			const byte*	buf = row_ext_lookup(ext, col_no,
+							     &len);
+			if (UNIV_LIKELY_NULL(buf)) {
+				ut_a(buf != field_ref_zero);
+				dfield_set_data(field, buf, len);
+			}
+		}
+
+		/* If a column prefix index, take only the prefix */
+
+		if (ifield->prefix_len) {
+			len = dtype_get_at_most_n_mbchars(
+				col->prtype,
+				col->mbminmaxlen,
+				ifield->prefix_len,
+				len,
+				static_cast<char*>(dfield_get_data(field)));
+			dfield_set_len(field, len);
+		}
+
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+		fixed_len = ifield->fixed_len;
+		if (fixed_len && !dict_table_is_comp(index->table)
+		    && DATA_MBMINLEN(col->mbminmaxlen)
+		    != DATA_MBMAXLEN(col->mbminmaxlen)) {
+			/* CHAR in ROW_FORMAT=REDUNDANT is always
+			fixed-length, but in the temporary file it is
+			variable-length for variable-length character
+			sets. */
+			fixed_len = 0;
+		}
+
+		if (fixed_len) {
+#ifdef UNIV_DEBUG
+			ulint	mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
+			ulint	mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
+
+			/* len should be between the sizes calculated based
+			on mbmaxlen and mbminlen */
+			ut_ad(len <= fixed_len);
+			ut_ad(!mbmaxlen || len >= mbminlen
+			      * (fixed_len / mbmaxlen));
+
+			ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+		} else if (dfield_is_ext(field)) {
+			extra_size += 2;
+		} else if (len < 128
+			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			extra_size++;
+		} else {
+			/* For variable-length columns, we look up the
+			maximum length from the column itself.  If this
+			is a prefix index column shorter than 256 bytes,
+			this will waste one byte. */
+			extra_size += 2;
+		}
+		data_size += len;
+	}
+
+	/* If this is FTS index, we already populated the sort buffer, return
+	here */
+	if (index->type & DICT_FTS) {
+		DBUG_RETURN(n_row_added);
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		ulint	size;
+		ulint	extra;
+
+		size = rec_get_converted_size_temp(
+			index, entry->fields, n_fields, &extra);
+
+		ut_ad(data_size + extra_size == size);
+		ut_ad(extra_size == extra);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Add to the total size of the record in row_merge_block_t
+	the encoded length of extra_size and the extra bytes (extra_size).
+	See row_merge_buf_write() for the variable-length encoding
+	of extra_size. */
+	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+	ut_ad(data_size < srv_sort_buf_size);
+
+	/* Reserve one byte for the end marker of row_merge_block_t. */
+	if (buf->total_size + data_size >= srv_sort_buf_size - 1) {
+		DBUG_RETURN(0);
+	}
+
+	buf->total_size += data_size;
+	buf->n_tuples++;
+	n_row_added++;
+
+	field = entry->fields;
+
+	/* Copy the data fields into buf->heap. */
+
+	do {
+		dfield_dup(field++, buf->heap);
+	} while (--n_fields);
+
+	DBUG_RETURN(n_row_added);
+}
+
+/*************************************************************//**
+Report a duplicate key.  Every call is counted in dup->n_dup, but
+only the first one is converted for MySQL. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+{
+	const bool	is_first = (dup->n_dup == 0);
+
+	dup->n_dup++;
+
+	if (!is_first) {
+		/* Already reported once; only keep counting. */
+		return;
+	}
+
+	innobase_fields_to_mysql(dup->table, dup->index, entry);
+}
+
+/*************************************************************//**
+Compare two tuples.
+The first n_uniq fields decide the order.  If they all compare equal
+and dup is given, a duplicate is reported unless one of those fields
+of a is NULL.  The remaining n_field - n_uniq fields are then used as
+a tie-breaker.
+@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
+static __attribute__((warn_unused_result))
+int
+row_merge_tuple_cmp(
+/*================*/
+	ulint			n_uniq,	/*!< in: number of unique fields */
+	ulint			n_field,/*!< in: number of fields */
+	const mtuple_t&		a,	/*!< in: first tuple to be compared */
+	const mtuple_t&		b,	/*!< in: second tuple to be compared */
+	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates,
+					NULL if non-unique index */
+{
+	const dfield_t*	lhs	= a.fields;
+	const dfield_t*	rhs	= b.fields;
+	int		result;
+
+	ut_ad(n_uniq > 0);
+	ut_ad(n_uniq <= n_field);
+
+	/* Walk the unique prefix; the first differing field decides. */
+	for (ulint left = n_uniq;;) {
+		result = cmp_dfield_dfield(lhs++, rhs++);
+
+		if (result) {
+			return(result);
+		}
+
+		if (!--left) {
+			break;
+		}
+	}
+
+	if (dup) {
+		/* The unique prefixes sort equal.  They are a logical
+		duplicate only if none of the fields is NULL, because
+		NULL columns are logically inequal even though they
+		are equal in the sorting order. */
+		bool	has_null = false;
+
+		for (const dfield_t* f = a.fields; f != lhs; f++) {
+			if (dfield_is_null(f)) {
+				has_null = true;
+				break;
+			}
+		}
+
+		if (!has_null) {
+			row_merge_dup_report(dup, a.fields);
+		}
+	}
+
+	/* Compare the remaining fields as well, so that we get the
+	same (internal) order as in the B-tree. */
+	for (ulint rest = n_field - n_uniq; rest; rest--) {
+		result = cmp_dfield_dfield(lhs++, rhs++);
+
+		if (result) {
+			return(result);
+		}
+	}
+
+	/* This should never be reached, except in a secondary index
+	when creating a secondary index and a PRIMARY KEY, and there
+	is a duplicate in the PRIMARY KEY that has not been detected
+	yet. Internally, an index must never contain duplicates. */
+	return(result);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().  n_uniq, n_field and dup are taken from the
+scope in which the macro is expanded.
+@param tuples	array of tuples that are being sorted
+@param aux	work area, same size as tuples[]
+@param low	lower bound of the sorting area, inclusive
+@param high	upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(tuples, aux, low, high)		\
+	row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().  n_uniq, n_field and dup are taken from the
+scope in which the macro is expanded.
+@param a	first tuple to be compared
+@param b	second tuple to be compared
+@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
+#define row_merge_tuple_cmp_ctx(a,b)			\
+	row_merge_tuple_cmp(n_uniq, n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory.
+The body is generated by UT_SORT_FUNCTION_BODY(), which is handed
+row_merge_tuple_sort_ctx() as the sort function and
+row_merge_tuple_cmp_ctx() as the comparison; both macros forward the
+n_uniq, n_field and dup parameters of this function. */
+static __attribute__((nonnull(4,5)))
+void
+row_merge_tuple_sort(
+/*=================*/
+	ulint			n_uniq,	/*!< in: number of unique fields */
+	ulint			n_field,/*!< in: number of fields */
+	row_merge_dup_t*	dup,	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	mtuple_t*		tuples,	/*!< in/out: tuples */
+	mtuple_t*		aux,	/*!< in/out: work area */
+	ulint			low,	/*!< in: lower bound of the
+					sorting area, inclusive */
+	ulint			high)	/*!< in: upper bound of the
+					sorting area, exclusive */
+{
+	ut_ad(n_field > 0);
+	ut_ad(n_uniq <= n_field);
+
+	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer: merge sort all buf->n_tuples entries of buf->tuples,
+using buf->tmp_tuples as the work area. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+{
+	const dict_index_t*	index = buf->index;
+	const ulint		n_unique = dict_index_get_n_unique(index);
+	const ulint		n_fields = dict_index_get_n_fields(index);
+
+	row_merge_tuple_sort(n_unique, n_fields, dup,
+			     buf->tuples, buf->tmp_tuples,
+			     0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block: encode every tuple of the buffer one after
+another into the block and terminate the list with a zero byte. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+	const merge_file_t*	of UNIV_UNUSED,
+					/*!< in: output file */
+	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
+{
+	const dict_index_t* const	index = buf->index;
+	const ulint			n_fields
+		= dict_index_get_n_fields(index);
+	byte*				out = &block[0];
+	ulint				i = 0;
+
+	while (i < buf->n_tuples) {
+		const mtuple_t*	entry = buf->tuples + i;
+
+		row_merge_buf_encode(&out, index, entry, n_fields);
+		ut_ad(out < &block[srv_sort_buf_size]);
+#ifdef UNIV_DEBUG
+		if (row_merge_print_write) {
+			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+				(void*) out, of->fd, (ulong) of->offset,
+				(ulong) i);
+			row_merge_tuple_print(stderr, entry, n_fields);
+		}
+#endif /* UNIV_DEBUG */
+		i++;
+	}
+
+	/* The encoded tuples must match the accounted total size and
+	leave room for the "end-of-chunk" marker. */
+	ut_a(out < &block[srv_sort_buf_size]);
+	ut_a(out == &block[0] + buf->total_size);
+	*out = 0;
+	out++;
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized.  Initialize it
+	to avoid bogus warnings. */
+	memset(out, 0xff, &block[srv_sort_buf_size] - out);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+			(void*) out, of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].  Both offsets arrays get their first element set
+to the array size and their second element to the number of index
+fields.
+@return	memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	mrec_buf_t**		buf,		/*!< out: 3 buffers */
+	ulint**			offsets1,	/*!< out: offsets */
+	ulint**			offsets2)	/*!< out: offsets */
+{
+	const ulint	n_fields = dict_index_get_n_fields(index);
+	const ulint	n_offs = 1 + REC_OFFS_HEADER_SIZE + n_fields;
+	const ulint	offs_bytes = n_offs * sizeof(ulint);
+	const ulint	bufs_bytes = 3 * sizeof(mrec_buf_t);
+
+	mem_heap_t*	heap = mem_heap_create(2 * offs_bytes + bufs_bytes);
+
+	/* Allocate in the order: buffers, offsets1, offsets2. */
+	mrec_buf_t*	bufs = static_cast<mrec_buf_t*>(
+		mem_heap_alloc(heap, bufs_bytes));
+	ulint*		offs_a = static_cast<ulint*>(
+		mem_heap_alloc(heap, offs_bytes));
+	ulint*		offs_b = static_cast<ulint*>(
+		mem_heap_alloc(heap, offs_bytes));
+
+	offs_b[0] = n_offs;
+	offs_a[0] = n_offs;
+	offs_b[1] = n_fields;
+	offs_a[1] = n_fields;
+
+	*buf = bufs;
+	*offsets1 = offs_a;
+	*offsets2 = offs_b;
+
+	return(heap);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+One block of srv_sort_buf_size bytes is read from byte position
+offset * srv_sort_buf_size.  A failed read is reported on stderr.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf)	/*!< out: data */
+{
+	os_offset_t	ofs = ((os_offset_t) offset) * srv_sort_buf_size;
+	ibool		success;
+
+	DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE););
+
+	/* Trace the read once (this block used to be duplicated, which
+	printed every read twice in debug builds). */
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_read) {
+		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+						 ofs, srv_sort_buf_size);
+#ifdef POSIX_FADV_DONTNEED
+	/* Each block is read exactly once.  Free up the file cache. */
+	posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+	if (UNIV_UNLIKELY(!success)) {
+		ut_print_timestamp(stderr);
+		/* Keep whitespace around UINT64PF: in C++11 a macro that
+		directly follows a string literal is parsed as a
+		user-defined literal suffix. */
+		fprintf(stderr,
+			"  InnoDB: failed to read merge block at "
+			UINT64PF "\n",
+			ofs);
+	}
+
+	return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write a merge block to the file system.  One block of
+srv_sort_buf_size bytes is written at byte position
+offset * srv_sort_buf_size.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
+	const void*	buf)	/*!< in: data */
+{
+	const size_t		block_len = srv_sort_buf_size;
+	const os_offset_t	byte_pos = block_len * (os_offset_t) offset;
+
+	DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE););
+
+	const ibool	ok = os_file_write(
+		"(merge)", OS_FILE_FROM_FD(fd), buf, byte_pos, block_len);
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_write) {
+		fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+#ifdef POSIX_FADV_DONTNEED
+	/* The block will be needed on the next merge pass,
+	but it can be evicted from the file cache meanwhile. */
+	posix_fadvise(fd, byte_pos, block_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+	return(UNIV_LIKELY(ok));
+}
+
+/********************************************************************//**
+Read a merge record.
+A record starts with its encoded extra_size: a single zero byte marks
+the end of the list; a first byte of 0x80 or more means that the low
+7 bits are the high part of a two-byte value; the stored value is the
+real extra_size plus one.  The extra bytes and the data bytes follow.
+When a record continues past the end of the block, it is assembled in
+*buf and the next block is fetched with row_merge_read(), advancing
+*foffs.
+@return	pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	ulint*			offsets)/*!< out: offsets of mrec */
+{
+	ulint	extra_size;
+	ulint	data_size;
+	ulint	avail_size;
+
+	ut_ad(block);
+	ut_ad(buf);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
+	ut_ad(index);
+	ut_ad(foffs);
+	ut_ad(mrec);
+	ut_ad(offsets);
+
+	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
+	      + dict_index_get_n_fields(index));
+
+	extra_size = *b++;
+
+	if (UNIV_UNLIKELY(!extra_size)) {
+		/* End of list */
+		*mrec = NULL;
+#ifdef UNIV_DEBUG
+		if (row_merge_print_read) {
+			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
+				(const void*) b, (const void*) block,
+				fd, (ulong) *foffs);
+		}
+#endif /* UNIV_DEBUG */
+		return(NULL);
+	}
+
+	if (extra_size >= 0x80) {
+		/* Read another byte of extra_size. */
+
+		if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
+			if (!row_merge_read(fd, ++(*foffs), block)) {
+err_exit:
+				/* Signal I/O error: return NULL with a
+				non-NULL *mrec.  The later read failures
+				in this function also jump here. */
+				*mrec = b;
+				return(NULL);
+			}
+
+			/* Wrap around to the beginning of the buffer. */
+			b = &block[0];
+		}
+
+		extra_size = (extra_size & 0x7f) << 8;
+		extra_size |= *b++;
+	}
+
+	/* Normalize extra_size.  Above, value 0 signals "end of list". */
+	extra_size--;
+
+	/* Read the extra bytes. */
+
+	if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
+		/* The record spans two blocks.  Copy the entire record
+		to the auxiliary buffer and handle this as a special
+		case. */
+
+		avail_size = &block[srv_sort_buf_size] - b;
+		ut_ad(avail_size < sizeof *buf);
+		memcpy(*buf, b, avail_size);
+
+		if (!row_merge_read(fd, ++(*foffs), block)) {
+
+			goto err_exit;
+		}
+
+		/* Wrap around to the beginning of the buffer. */
+		b = &block[0];
+
+		/* Copy the rest of the extra bytes. */
+		memcpy(*buf + avail_size, b, extra_size - avail_size);
+		b += extra_size - avail_size;
+
+		*mrec = *buf + extra_size;
+
+		rec_init_offsets_temp(*mrec, index, offsets);
+
+		data_size = rec_offs_data_size(offsets);
+
+		/* These overflows should be impossible given that
+		records are much smaller than either buffer, and
+		the record starts near the beginning of each buffer. */
+		ut_a(extra_size + data_size < sizeof *buf);
+		ut_a(b + data_size < &block[srv_sort_buf_size]);
+
+		/* Copy the data bytes. */
+		memcpy(*buf + extra_size, b, data_size);
+		b += data_size;
+
+		goto func_exit;
+	}
+
+	*mrec = b + extra_size;
+
+	rec_init_offsets_temp(*mrec, index, offsets);
+
+	data_size = rec_offs_data_size(offsets);
+	ut_ad(extra_size + data_size < sizeof *buf);
+
+	b += extra_size + data_size;
+
+	if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
+		/* The record fits entirely in the block.
+		This is the normal case. */
+		goto func_exit;
+	}
+
+	/* The record spans two blocks.  Copy it to buf. */
+
+	b -= extra_size + data_size;
+	avail_size = &block[srv_sort_buf_size] - b;
+	memcpy(*buf, b, avail_size);
+	*mrec = *buf + extra_size;
+#ifdef UNIV_DEBUG
+	/* We cannot invoke rec_offs_make_valid() here, because there
+	are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
+	Similarly, rec_offs_validate() would fail, because it invokes
+	rec_get_status(). */
+	offsets[2] = (ulint) *mrec;
+	offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+	if (!row_merge_read(fd, ++(*foffs), block)) {
+
+		goto err_exit;
+	}
+
+	/* Wrap around to the beginning of the buffer. */
+	b = &block[0];
+
+	/* Copy the rest of the record. */
+	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+	b += extra_size + data_size - avail_size;
+
+func_exit:
+#ifdef UNIV_DEBUG
+	if (row_merge_print_read) {
+		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
+			(const void*) b, (const void*) block,
+			fd, (ulong) *foffs);
+		rec_print_comp(stderr, *mrec, offsets);
+		putc('\n', stderr);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(b);
+}
+
+/********************************************************************//**
+Write a merge record.
+The encoded extra_size e is written first, as one byte if it is below
+0x80 and otherwise as two bytes with the 0x80 flag set in the first
+one.  The record itself (extra bytes followed by data bytes) is copied
+right after it.
+The size, fd and foffs parameters exist only in UNIV_DEBUG builds;
+otherwise a function-like macro of the same name drops them from the
+call. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+	byte*		b,	/*!< out: buffer */
+	ulint		e,	/*!< in: encoded extra_size */
+#ifdef UNIV_DEBUG
+	ulint		size,	/*!< in: total size to write */
+	int		fd,	/*!< in: file descriptor */
+	ulint		foffs,	/*!< in: file offset */
+#endif /* UNIV_DEBUG */
+	const mrec_t*	mrec,	/*!< in: record to write */
+	const ulint*	offsets)/*!< in: offsets of mrec */
+#ifndef UNIV_DEBUG
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
+	row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* !UNIV_DEBUG */
+{
+#ifdef UNIV_DEBUG
+	const byte* const end = b + size;
+	ut_ad(e == rec_offs_extra_size(offsets) + 1);
+
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_write %p,%d,%lu ",
+			(void*) b, fd, (ulong) foffs);
+		rec_print_comp(stderr, mrec, offsets);
+		putc('\n', stderr);
+	}
+#endif /* UNIV_DEBUG */
+
+	if (e < 0x80) {
+		*b++ = (byte) e;
+	} else {
+		*b++ = (byte) (0x80 | (e >> 8));
+		*b++ = (byte) e;
+	}
+
+	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+	ut_ad(b + rec_offs_size(offsets) == end);
+}
+
+/********************************************************************//**
+Write a merge record.  A record that does not fit in the remainder of
+the block is first encoded in buf[0]; the head is copied to the block,
+the block is written out, and the tail starts the new block.
+@return	pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	byte*			b,	/*!< in: pointer to end of block */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t*		mrec,	/*!< in: record to write */
+	const ulint*		offsets)/*!< in: offsets of mrec */
+{
+	ut_ad(block);
+	ut_ad(buf);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
+	ut_ad(mrec);
+	ut_ad(foffs);
+	ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
+	ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+	/* The stored extra_size is offset by one, because the value 0
+	signals "end of list". */
+	const ulint	enc_extra = rec_offs_extra_size(offsets) + 1;
+
+	/* Total encoded length: one or two length bytes, the extra
+	bytes and the data bytes. */
+	ulint		total = enc_extra + (enc_extra >= 0x80);
+
+	total += rec_offs_data_size(offsets);
+
+	if (UNIV_LIKELY(b + total < &block[srv_sort_buf_size])) {
+		/* The record fits in the current block. */
+		row_merge_write_rec_low(b, enc_extra, total, fd, *foffs,
+					mrec, offsets);
+
+		return(b + total);
+	}
+
+	/* The record spans two blocks.
+	Encode it in the temporary buffer first. */
+	const ulint	head_len = &block[srv_sort_buf_size] - b;
+
+	row_merge_write_rec_low(buf[0],
+				enc_extra, total, fd, *foffs,
+				mrec, offsets);
+
+	/* Fill up the current block with the head of the record
+	and write the completed block. */
+	memcpy(b, buf[0], head_len);
+
+	const ulint	write_at = (*foffs)++;
+
+	if (!row_merge_write(fd, write_at, block)) {
+		return(NULL);
+	}
+
+	UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+
+	/* The tail of the record starts the new block. */
+	const ulint	tail_len = total - head_len;
+
+	b = &block[0];
+	memcpy(b, buf[0] + head_len, tail_len);
+
+	return(b + tail_len);
+}
+
+/********************************************************************//**
+Write an end-of-list marker (a zero byte) and write the block out,
+advancing the file offset.
+@return	pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	byte*			b,	/*!< in: pointer to end of block */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs)	/*!< in/out: file offset */
+{
+	ut_ad(block);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
+	ut_ad(foffs);
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+			(void*) b, (void*) block, fd, (ulong) *foffs);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* A zero extra_size byte terminates the list. */
+	*b = 0;
+	b++;
+	UNIV_MEM_ASSERT_RW(&block[0], b - &block[0]);
+	UNIV_MEM_ASSERT_W(&block[0], srv_sort_buf_size);
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized.  Initialize it
+	to avoid bogus warnings. */
+	memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	const ulint	write_at = (*foffs)++;
+
+	if (row_merge_write(fd, write_at, block)) {
+		UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+		return(&block[0]);
+	}
+
+	return(NULL);
+}
+
+/********************************************************************//**
+Reads clustered index of the table and create temporary files
+containing the index entries for the indexes to be built.
+@return	DB_SUCCESS or error */
+static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result))
+dberr_t
+row_merge_read_clustered_index(
+/*===========================*/
+	trx_t*			trx,	/*!< in: transaction */
+	struct TABLE*		table,	/*!< in/out: MySQL table object,
+					for reporting erroneous records */
+	const dict_table_t*	old_table,/*!< in: table where rows are
+					read from */
+	const dict_table_t*	new_table,/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	bool			online,	/*!< in: true if creating indexes
+					online */
+	dict_index_t**		index,	/*!< in: indexes to be created */
+	dict_index_t*		fts_sort_idx,
+					/*!< in: full-text index to be created,
+					or NULL */
+	fts_psort_t*		psort_info,
+					/*!< in: parallel sort info for
+					fts_sort_idx creation, or NULL */
+	merge_file_t*		files,	/*!< in: temporary files */
+	const ulint*		key_numbers,
+					/*!< in: MySQL key numbers to create */
+	ulint			n_index,/*!< in: number of indexes to create */
+	const dtuple_t*		add_cols,
+					/*!< in: default values of
+					added columns, or NULL */
+	const ulint*		col_map,/*!< in: mapping of old column
+					numbers to new ones, or NULL
+					if old_table == new_table */
+	ulint			add_autoinc,
+					/*!< in: number of added
+					AUTO_INCREMENT column, or
+					ULINT_UNDEFINED if none is added */
+	ib_sequence_t&		sequence,/*!< in/out: autoinc sequence */
+	row_merge_block_t*	block)	/*!< in/out: file buffer */
+{
+	dict_index_t*		clust_index;	/* Clustered index */
+	mem_heap_t*		row_heap;	/* Heap memory to create
+						clustered index tuples */
+	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
+	btr_pcur_t		pcur;		/* Cursor on the clustered
+						index */
+	mtr_t			mtr;		/* Mini transaction */
+	dberr_t			err = DB_SUCCESS;/* Return code */
+	ulint			n_nonnull = 0;	/* number of columns
+						changed to NOT NULL */
+	ulint*			nonnull = NULL;	/* NOT NULL columns */
+	dict_index_t*		fts_index = NULL;/* FTS index */
+	doc_id_t		doc_id = 0;
+	doc_id_t		max_doc_id = 0;
+	ibool			add_doc_id = FALSE;
+	os_event_t		fts_parallel_sort_event = NULL;
+	ibool			fts_pll_sort = FALSE;
+	ib_int64_t		sig_count = 0;
+	DBUG_ENTER("row_merge_read_clustered_index");
+
+	ut_ad((old_table == new_table) == !col_map);
+	ut_ad(!add_cols || col_map);
+
+	trx->op_info = "reading clustered index";
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
+#endif
+
+	/* Create and initialize memory for record buffers */
+
+	merge_buf = static_cast<row_merge_buf_t**>(
+		mem_alloc(n_index * sizeof *merge_buf));
+
+	for (ulint i = 0; i < n_index; i++) {
+		if (index[i]->type & DICT_FTS) {
+
+			/* We are building a FT index, make sure
+			we have the temporary 'fts_sort_idx' */
+			ut_a(fts_sort_idx);
+
+			fts_index = index[i];
+
+			merge_buf[i] = row_merge_buf_create(fts_sort_idx);
+
+			add_doc_id = DICT_TF2_FLAG_IS_SET(
+				new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+			/* If Doc ID does not exist in the table itself,
+			fetch the first FTS Doc ID */
+			if (add_doc_id) {
+				fts_get_next_doc_id(
+					(dict_table_t*) new_table,
+					&doc_id);
+				ut_ad(doc_id > 0);
+			}
+
+			fts_pll_sort = TRUE;
+			row_fts_start_psort(psort_info);
+			fts_parallel_sort_event =
+				 psort_info[0].psort_common->sort_event;
+		} else {
+			merge_buf[i] = row_merge_buf_create(index[i]);
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/* Find the clustered index and create a persistent cursor
+	based on that. */
+
+	clust_index = dict_table_get_first_index(old_table);
+
+	btr_pcur_open_at_index_side(
+		true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+	if (old_table != new_table) {
+		/* The table is being rebuilt.  Identify the columns
+		that were flagged NOT NULL in the new table, so that
+		we can quickly check that the records in the old table
+		do not violate the added NOT NULL constraints. */
+
+		nonnull = static_cast<ulint*>(
+			mem_alloc(dict_table_get_n_cols(new_table)
+				  * sizeof *nonnull));
+
+		for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) {
+			if (dict_table_get_nth_col(old_table, i)->prtype
+			    & DATA_NOT_NULL) {
+				continue;
+			}
+
+			const ulint j = col_map[i];
+
+			if (j == ULINT_UNDEFINED) {
+				/* The column was dropped. */
+				continue;
+			}
+
+			if (dict_table_get_nth_col(new_table, j)->prtype
+			    & DATA_NOT_NULL) {
+				nonnull[n_nonnull++] = j;
+			}
+		}
+
+		if (!n_nonnull) {
+			mem_free(nonnull);
+			nonnull = NULL;
+		}
+	}
+
+	row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+	/* Scan the clustered index. */
+	for (;;) {
+		const rec_t*	rec;
+		ulint*		offsets;
+		const dtuple_t*	row;
+		row_ext_t*	ext;
+		page_cur_t*	cur	= btr_pcur_get_page_cur(&pcur);
+
+		page_cur_move_to_next(cur);
+
+		if (page_cur_is_after_last(cur)) {
+			if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+				err = DB_INTERRUPTED;
+				trx->error_key_num = 0;
+				goto func_exit;
+			}
+
+			if (online && old_table != new_table) {
+				err = row_log_table_get_error(clust_index);
+				if (err != DB_SUCCESS) {
+					trx->error_key_num = 0;
+					goto func_exit;
+				}
+			}
+#ifdef DBUG_OFF
+# define dbug_run_purge	false
+#else /* DBUG_OFF */
+			bool	dbug_run_purge = false;
+#endif /* DBUG_OFF */
+			DBUG_EXECUTE_IF(
+				"ib_purge_on_create_index_page_switch",
+				dbug_run_purge = true;);
+
+			if (dbug_run_purge
+			    || rw_lock_get_waiters(
+				    dict_index_get_lock(clust_index))) {
+				/* There are waiters on the clustered
+				index tree lock, likely the purge
+				thread. Store and restore the cursor
+				position, and yield so that scanning a
+				large table will not starve other
+				threads. */
+
+				/* Store the cursor position on the last user
+				record on the page. */
+				btr_pcur_move_to_prev_on_page(&pcur);
+				/* Leaf pages must never be empty, unless
+				this is the only page in the index tree. */
+				ut_ad(btr_pcur_is_on_user_rec(&pcur)
+				      || buf_block_get_page_no(
+					      btr_pcur_get_block(&pcur))
+				      == clust_index->page);
+
+				btr_pcur_store_position(&pcur, &mtr);
+				mtr_commit(&mtr);
+
+				if (dbug_run_purge) {
+					/* This is for testing
+					purposes only (see
+					DBUG_EXECUTE_IF above).  We
+					signal the purge thread and
+					hope that the purge batch will
+					complete before we execute
+					btr_pcur_restore_position(). */
+					trx_purge_run();
+					os_thread_sleep(1000000);
+				}
+
+				/* Give the waiters a chance to proceed. */
+				os_thread_yield();
+
+				mtr_start(&mtr);
+				/* Restore position on the record, or its
+				predecessor if the record was purged
+				meanwhile. */
+				btr_pcur_restore_position(
+					BTR_SEARCH_LEAF, &pcur, &mtr);
+				/* Move to the successor of the
+				original record. */
+				if (!btr_pcur_move_to_next_user_rec(
+					    &pcur, &mtr)) {
+end_of_index:
+					row = NULL;
+					mtr_commit(&mtr);
+					mem_heap_free(row_heap);
+					if (nonnull) {
+						mem_free(nonnull);
+					}
+					goto write_buffers;
+				}
+			} else {
+				ulint		next_page_no;
+				buf_block_t*	block;
+
+				next_page_no = btr_page_get_next(
+					page_cur_get_page(cur), &mtr);
+
+				if (next_page_no == FIL_NULL) {
+					goto end_of_index;
+				}
+
+				block = page_cur_get_block(cur);
+				block = btr_block_get(
+					buf_block_get_space(block),
+					buf_block_get_zip_size(block),
+					next_page_no, BTR_SEARCH_LEAF,
+					clust_index, &mtr);
+
+				btr_leaf_page_release(page_cur_get_block(cur),
+						      BTR_SEARCH_LEAF, &mtr);
+				page_cur_set_before_first(block, cur);
+				page_cur_move_to_next(cur);
+
+				ut_ad(!page_cur_is_after_last(cur));
+			}
+		}
+
+		rec = page_cur_get_rec(cur);
+
+		SRV_CORRUPT_TABLE_CHECK(rec,
+		{
+			err = DB_CORRUPTION;
+			goto func_exit;
+		});
+
+		offsets = rec_get_offsets(rec, clust_index, NULL,
+					  ULINT_UNDEFINED, &row_heap);
+
+		if (online) {
+			/* Perform a REPEATABLE READ.
+
+			When rebuilding the table online,
+			row_log_table_apply() must not see a newer
+			state of the table when applying the log.
+			This is mainly to prevent false duplicate key
+			errors, because the log will identify records
+			by the PRIMARY KEY, and also to prevent unsafe
+			BLOB access.
+
+			When creating a secondary index online, this
+			table scan must not see records that have only
+			been inserted to the clustered index, but have
+			not been written to the online_log of
+			index[]. If we performed READ UNCOMMITTED, it
+			could happen that the ADD INDEX reaches
+			ONLINE_INDEX_COMPLETE state between the time
+			the DML thread has updated the clustered index
+			but has not yet accessed secondary index. */
+			ut_ad(trx->read_view);
+
+			if (!read_view_sees_trx_id(
+				    trx->read_view,
+				    row_get_rec_trx_id(
+					    rec, clust_index, offsets))) {
+				rec_t*	old_vers;
+
+				row_vers_build_for_consistent_read(
+					rec, &mtr, clust_index, &offsets,
+					trx->read_view, &row_heap,
+					row_heap, &old_vers);
+
+				rec = old_vers;
+
+				if (!rec) {
+					continue;
+				}
+			}
+
+			if (rec_get_deleted_flag(
+				    rec,
+				    dict_table_is_comp(old_table))) {
+				/* This record was deleted in the latest
+				committed version, or it was deleted and
+				then reinserted-by-update before purge
+				kicked in. Skip it. */
+				continue;
+			}
+
+			ut_ad(!rec_offs_any_null_extern(rec, offsets));
+		} else if (rec_get_deleted_flag(
+				   rec, dict_table_is_comp(old_table))) {
+			/* Skip delete-marked records.
+
+			Skipping delete-marked records will make the
+			created indexes unuseable for transactions
+			whose read views were created before the index
+			creation completed, but preserving the history
+			would make it tricky to detect duplicate
+			keys. */
+			continue;
+		}
+
+		/* When !online, we are holding a lock on old_table, preventing
+		any inserts that could have written a record 'stub' before
+		writing out off-page columns. */
+		ut_ad(!rec_offs_any_null_extern(rec, offsets));
+
+		/* Build a row based on the clustered index. */
+
+		row = row_build(ROW_COPY_POINTERS, clust_index,
+				rec, offsets, new_table,
+				add_cols, col_map, &ext, row_heap);
+		ut_ad(row);
+
+		for (ulint i = 0; i < n_nonnull; i++) {
+			const dfield_t*	field	= &row->fields[nonnull[i]];
+
+			ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL);
+
+			if (dfield_is_null(field)) {
+				err = DB_INVALID_NULL;
+				trx->error_key_num = 0;
+				goto func_exit;
+			}
+		}
+
+		/* Get the next Doc ID */
+		if (add_doc_id) {
+			doc_id++;
+		} else {
+			doc_id = 0;
+		}
+
+		if (add_autoinc != ULINT_UNDEFINED) {
+
+			ut_ad(add_autoinc
+			      < dict_table_get_n_user_cols(new_table));
+
+			const dfield_t*	dfield;
+
+			dfield = dtuple_get_nth_field(row, add_autoinc);
+			if (dfield_is_null(dfield)) {
+				goto write_buffers;
+			}
+
+			const dtype_t*  dtype = dfield_get_type(dfield);
+			byte*	b = static_cast<byte*>(dfield_get_data(dfield));
+
+			if (sequence.eof()) {
+				err = DB_ERROR;
+				trx->error_key_num = 0;
+
+				ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+					ER_AUTOINC_READ_FAILED, "[NULL]");
+
+				goto func_exit;
+			}
+
+			ulonglong	value = sequence++;
+
+			switch (dtype_get_mtype(dtype)) {
+			case DATA_INT: {
+				ibool	usign;
+				ulint	len = dfield_get_len(dfield);
+
+				usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+				mach_write_ulonglong(b, value, len, usign);
+
+				break;
+				}
+
+			case DATA_FLOAT:
+				mach_float_write(
+					b, static_cast<float>(value));
+				break;
+
+			case DATA_DOUBLE:
+				mach_double_write(
+					b, static_cast<double>(value));
+				break;
+
+			default:
+				ut_ad(0);
+			}
+		}
+
+write_buffers:
+		/* Build all entries for all the indexes to be created
+		in a single scan of the clustered index. */
+
+		for (ulint i = 0; i < n_index; i++) {
+			row_merge_buf_t*	buf	= merge_buf[i];
+			merge_file_t*		file	= &files[i];
+			ulint			rows_added = 0;
+
+			if (UNIV_LIKELY
+			    (row && (rows_added = row_merge_buf_add(
+					buf, fts_index, old_table,
+					psort_info, row, ext, &doc_id)))) {
+
+				/* If we are creating FTS index,
+				a single row can generate more
+				records for tokenized word */
+				file->n_rec += rows_added;
+				if (doc_id > max_doc_id) {
+					max_doc_id = doc_id;
+				}
+
+				continue;
+			}
+
+			if ((buf->index->type & DICT_FTS)
+			    && (!row || !doc_id)) {
+				continue;
+			}
+
+			/* The buffer must be sufficiently large
+			to hold at least one record. It may only
+			be empty when we reach the end of the
+			clustered index. row_merge_buf_add()
+			must not have been called in this loop. */
+			ut_ad(buf->n_tuples || row == NULL);
+
+			/* We have enough data tuples to form a block.
+			Sort them and write to disk. */
+
+			if (buf->n_tuples) {
+				if (dict_index_is_unique(buf->index)) {
+					row_merge_dup_t	dup = {
+						buf->index, table, col_map, 0};
+
+					row_merge_buf_sort(buf, &dup);
+
+					if (dup.n_dup) {
+						err = DB_DUPLICATE_KEY;
+						trx->error_key_num
+							= key_numbers[i];
+						break;
+					}
+				} else {
+					row_merge_buf_sort(buf, NULL);
+				}
+			} else if (online && new_table == old_table) {
+				/* Note the newest transaction that
+				modified this index when the scan was
+				completed. We prevent older readers
+				from accessing this index, to ensure
+				read consistency. */
+
+				trx_id_t	max_trx_id;
+
+				ut_a(row == NULL);
+				rw_lock_x_lock(
+					dict_index_get_lock(buf->index));
+				ut_a(dict_index_get_online_status(buf->index)
+				     == ONLINE_INDEX_CREATION);
+
+				max_trx_id = row_log_get_max_trx(buf->index);
+
+				if (max_trx_id > buf->index->trx_id) {
+					buf->index->trx_id = max_trx_id;
+				}
+
+				rw_lock_x_unlock(
+					dict_index_get_lock(buf->index));
+			}
+
+			row_merge_buf_write(buf, file, block);
+
+			if (!row_merge_write(file->fd, file->offset++,
+					     block)) {
+				err = DB_OUT_OF_FILE_SPACE;
+				trx->error_key_num = i;
+				break;
+			}
+
+			UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+			merge_buf[i] = row_merge_buf_empty(buf);
+
+			if (UNIV_LIKELY(row != NULL)) {
+				/* Try writing the record again, now
+				that the buffer has been written out
+				and emptied. */
+
+				if (UNIV_UNLIKELY
+				    (!(rows_added = row_merge_buf_add(
+						buf, fts_index, old_table,
+						psort_info, row, ext,
+						&doc_id)))) {
+					/* An empty buffer should have enough
+					room for at least one record. */
+					ut_error;
+				}
+
+				file->n_rec += rows_added;
+			}
+		}
+
+		if (row == NULL) {
+			goto all_done;
+		}
+
+		if (err != DB_SUCCESS) {
+			goto func_exit;
+		}
+
+		mem_heap_empty(row_heap);
+	}
+
+func_exit:
+	mtr_commit(&mtr);
+	mem_heap_free(row_heap);
+
+	if (nonnull) {
+		mem_free(nonnull);
+	}
+
+all_done:
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
+#endif
+	if (fts_pll_sort) {
+		bool	all_exit = false;
+		ulint	trial_count = 0;
+		const ulint max_trial_count = 10000;
+
+		/* Tell all children that parent has done scanning */
+		for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+			psort_info[i].state = FTS_PARENT_COMPLETE;
+		}
+wait_again:
+		/* Now wait all children to report back to be completed */
+		os_event_wait_time_low(fts_parallel_sort_event,
+				       1000000, sig_count);
+
+		for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+			if (psort_info[i].child_status != FTS_CHILD_COMPLETE
+			    && psort_info[i].child_status != FTS_CHILD_EXITING) {
+				sig_count = os_event_reset(
+					fts_parallel_sort_event);
+				goto wait_again;
+			}
+		}
+
+		/* Now all children should complete, wait a bit until
+		they all finish setting the event, before we free everything.
+		This has a 10 second timeout */
+		do {
+			all_exit = true;
+
+			for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+				if (psort_info[j].child_status
+				    != FTS_CHILD_EXITING) {
+					all_exit = false;
+					os_thread_sleep(1000);
+					break;
+				}
+			}
+			trial_count++;
+		} while (!all_exit && trial_count < max_trial_count);
+
+		if (!all_exit) {
+			ut_ad(0);
+			ib_logf(IB_LOG_LEVEL_FATAL,
+				"Not all child sort threads exited"
+				" when creating FTS index '%s'",
+				fts_sort_idx->name);
+		}
+	}
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
+#endif
+	for (ulint i = 0; i < n_index; i++) {
+		row_merge_buf_free(merge_buf[i]);
+	}
+
+	row_fts_free_pll_merge_buf(psort_info);
+
+	mem_free(merge_buf);
+
+	btr_pcur_close(&pcur);
+
+	/* Update the next Doc ID we used. Table should be locked, so
+	no concurrent DML */
+	if (max_doc_id) {
+		fts_update_next_doc_id(
+			0, new_table, old_table->name, max_doc_id);
+	}
+
+	trx->op_info = "";
+
+	DBUG_RETURN(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+The macro expands in place and relies on the enclosing scope to provide
+block, buf, file, of, b2, b<N>, mrec<N>, offsets<N> and foffs<N>, as well
+as a label named "corrupt". Control jumps to "corrupt" when the write
+fails, when more records have been written to the output than
+file->n_rec, or when the read returns NULL while mrec<N> is non-NULL.
+When the read returns NULL and mrec<N> is NULL, AT_END is executed.
+@param N	number of the buffer (0 or 1)
+@param INDEX	record descriptor
+@param AT_END	statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END)			\
+	do {								\
+		b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \
+					 &buf[2], b2,			\
+					 of->fd, &of->offset,		\
+					 mrec##N, offsets##N);		\
+		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
+			goto corrupt;					\
+		}							\
+		b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\
+					  &buf[N], b##N, INDEX,		\
+					  file->fd, foffs##N,		\
+					  &mrec##N, offsets##N);	\
+		if (UNIV_UNLIKELY(!b##N)) {				\
+			if (mrec##N) {					\
+				goto corrupt;				\
+			}						\
+			AT_END;						\
+		}							\
+	} while (0)
+
+/*************************************************************//**
+Merge two blocks of records on disk and write a bigger block.
+One sorted list is read starting at *foffs0 and another starting at
+*foffs1 of the input file. The records are written in index order to
+the output file, followed by an end-of-list marker.
+@return	DB_SUCCESS, DB_DUPLICATE_KEY if two records compare equal, or
+DB_CORRUPTION on a read or write failure or if more records were
+written than file->n_rec */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_blocks(
+/*=============*/
+	const row_merge_dup_t*	dup,	/*!< in: descriptor of
+					index being created */
+	const merge_file_t*	file,	/*!< in: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	ulint*			foffs0,	/*!< in/out: offset of first
+					source list in the file */
+	ulint*			foffs1,	/*!< in/out: offset of second
+					source list in the file */
+	merge_file_t*		of)	/*!< in/out: output file */
+{
+	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
+
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
+	const byte*	b0;	/*!< pointer to block[0] */
+	const byte*	b1;	/*!< pointer to block[srv_sort_buf_size] */
+	byte*		b2;	/*!< pointer to block[2 * srv_sort_buf_size] */
+	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] or buf[0] */
+	const mrec_t*	mrec1;	/*!< merge rec, points to
+				block[srv_sort_buf_size] or buf[1] */
+	ulint*		offsets0;/* offsets of mrec0 */
+	ulint*		offsets1;/* offsets of mrec1 */
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		fprintf(stderr,
+			"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) *foffs0,
+			file->fd, (ulong) *foffs1,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1);
+
+	/* Read the first block of each of the two input lists. */
+
+	if (!row_merge_read(file->fd, *foffs0, &block[0])
+	    || !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size])) {
+corrupt:
+		mem_heap_free(heap);
+		return(DB_CORRUPTION);
+	}
+
+	b0 = &block[0];
+	b1 = &block[srv_sort_buf_size];
+	b2 = &block[2 * srv_sort_buf_size];
+
+	b0 = row_merge_read_rec(
+		&block[0], &buf[0], b0, dup->index,
+		file->fd, foffs0, &mrec0, offsets0);
+	/* The second input list uses the second record buffer, buf[1],
+	exactly like ROW_MERGE_WRITE_GET_NEXT(1, ...) does. Only block[]
+	is addressed in units of srv_sort_buf_size; indexing buf[] with
+	srv_sort_buf_size would point far outside the record buffers. */
+	b1 = row_merge_read_rec(
+		&block[srv_sort_buf_size],
+		&buf[1], b1, dup->index,
+		file->fd, foffs1, &mrec1, offsets1);
+	if (UNIV_UNLIKELY(!b0 && mrec0)
+	    || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+		goto corrupt;
+	}
+
+	/* Write a record and read the next record, always taking the
+	smaller of the two current records. */
+	while (mrec0 && mrec1) {
+		switch (cmp_rec_rec_simple(
+				mrec0, mrec1, offsets0, offsets1,
+				dup->index, dup->table)) {
+		case 0:
+			mem_heap_free(heap);
+			return(DB_DUPLICATE_KEY);
+		case -1:
+			ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged);
+			break;
+		case 1:
+			ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged);
+			break;
+		default:
+			ut_error;
+		}
+	}
+
+merged:
+	if (mrec0) {
+		/* append all mrec0 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0);
+		}
+	}
+done0:
+	if (mrec1) {
+		/* append all mrec1 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1);
+		}
+	}
+done1:
+
+	mem_heap_free(heap);
+	b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size],
+				 b2, of->fd, &of->offset);
+	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/*************************************************************//**
+Copy a block of index entries.
+All records of the list starting at *foffs0 in the input file are
+appended to the output file, followed by an end-of-list marker.
+On success paths *foffs0 is advanced past the last block that was read.
+@return	TRUE on success, FALSE on failure */
+static __attribute__((nonnull, warn_unused_result))
+ibool
+row_merge_blocks_copy(
+/*==================*/
+	const dict_index_t*	index,	/*!< in: index being created */
+	const merge_file_t*	file,	/*!< in: input file */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	ulint*			foffs0,	/*!< in/out: input file offset */
+	merge_file_t*		of)	/*!< in/out: output file */
+{
+	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
+
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
+	const byte*	b0;	/*!< pointer to block[0] */
+	byte*		b2;	/*!< pointer to block[2 * srv_sort_buf_size] */
+	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] */
+	ulint*		offsets0;/* offsets of mrec0 */
+	ulint*		offsets1;/* dummy offsets */
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		/* Print the offset itself, not the address of the
+		variable that holds it. */
+		fprintf(stderr,
+			"row_merge_blocks_copy fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) *foffs0,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+	/* Read the first block of the input list. */
+
+	if (!row_merge_read(file->fd, *foffs0, &block[0])) {
+corrupt:
+		mem_heap_free(heap);
+		return(FALSE);
+	}
+
+	b0 = &block[0];
+
+	b2 = &block[2 * srv_sort_buf_size];
+
+	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index,
+				file->fd, foffs0, &mrec0, offsets0);
+	if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+		goto corrupt;
+	}
+
+	if (mrec0) {
+		/* append all mrec0 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0);
+		}
+	}
+done0:
+
+	/* The file offset points to the beginning of the last page
+	that has been read.  Update it to point to the next block. */
+	(*foffs0)++;
+
+	mem_heap_free(heap);
+	return(row_merge_write_eof(&block[2 * srv_sort_buf_size],
+				   b2, of->fd, &of->offset)
+	       != NULL);
+}
+
+/*************************************************************//**
+Merge disk files.
+Performs one merge pass over the input file. Lists starting below
+ihalf = run_offset[*num_run / 2] are merged pairwise with lists starting
+at or after ihalf; lists left without a partner are copied unchanged.
+The output is written to the file *tmpfd. On success, *file describes
+the output file, *tmpfd holds the former input file descriptor,
+*num_run is the number of runs written and run_offset[] holds the first
+output offset of each of them. On failure the function returns early
+and *file, *tmpfd and *num_run are left unmodified.
+@return	DB_SUCCESS, DB_INTERRUPTED if the transaction was interrupted,
+DB_CORRUPTION on a copy failure or record count mismatch, or the error
+returned by row_merge_blocks() */
+static __attribute__((nonnull))
+dberr_t
+row_merge(
+/*======*/
+	trx_t*			trx,	/*!< in: transaction */
+	const row_merge_dup_t*	dup,	/*!< in: descriptor of
+					index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd,	/*!< in/out: temporary file handle */
+	ulint*			num_run,/*!< in/out: Number of runs remain
+					to be merged */
+	ulint*			run_offset) /*!< in/out: Array contains the
+					first offset number for each merge
+					run */
+{
+	ulint		foffs0;	/*!< first input offset */
+	ulint		foffs1;	/*!< second input offset */
+	dberr_t		error;	/*!< error code */
+	merge_file_t	of;	/*!< output file */
+	const ulint	ihalf	= run_offset[*num_run / 2];
+				/*!< half the input file */
+	ulint		n_run	= 0;
+				/*!< num of runs generated from this merge */
+
+	UNIV_MEM_ASSERT_W(&block[0], 3 * srv_sort_buf_size);
+
+	ut_ad(ihalf < file->offset);
+
+	of.fd = *tmpfd;
+	of.offset = 0;
+	of.n_rec = 0;
+
+#ifdef POSIX_FADV_SEQUENTIAL
+	/* The input file will be read sequentially, starting from the
+	beginning and the middle.  In Linux, the POSIX_FADV_SEQUENTIAL
+	affects the entire file.  Each block will be read exactly once. */
+	posix_fadvise(file->fd, 0, 0,
+		      POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
+	/* Merge blocks to the output file. */
+	foffs0 = 0;
+	foffs1 = ihalf;
+
+	UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);
+
+	for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+
+		if (trx_is_interrupted(trx)) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_run++] = of.offset;
+
+		error = row_merge_blocks(dup, file, block,
+					 &foffs0, &foffs1, &of);
+
+		if (error != DB_SUCCESS) {
+			return(error);
+		}
+
+	}
+
+	/* Copy the last blocks, if there are any. */
+
+	while (foffs0 < ihalf) {
+		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_run++] = of.offset;
+
+		if (!row_merge_blocks_copy(dup->index, file, block,
+					   &foffs0, &of)) {
+			return(DB_CORRUPTION);
+		}
+	}
+
+	ut_ad(foffs0 == ihalf);
+
+	while (foffs1 < file->offset) {
+		if (trx_is_interrupted(trx)) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_run++] = of.offset;
+
+		if (!row_merge_blocks_copy(dup->index, file, block,
+					   &foffs1, &of)) {
+			return(DB_CORRUPTION);
+		}
+	}
+
+	ut_ad(foffs1 == file->offset);
+
+	if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+		return(DB_CORRUPTION);
+	}
+
+	ut_ad(n_run <= *num_run);
+
+	*num_run = n_run;
+
+	/* Each run can contain one or more offsets. As merge goes on,
+	the number of runs (to merge) will reduce until we have one
+	single run. So the number of runs will always be smaller than
+	the number of offsets in file */
+	ut_ad((*num_run) <= file->offset);
+
+	/* The number of offsets in output file is always equal or
+	smaller than input file */
+	ut_ad(of.offset <= file->offset);
+
+	/* Swap file descriptors for the next pass. */
+	*tmpfd = file->fd;
+	*file = of;
+
+	UNIV_MEM_INVALID(&block[0], 3 * srv_sort_buf_size);
+
+	return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Merge disk files.
+Invokes row_merge() repeatedly until at most one run remains or an
+error is reported. A file whose offset is at most 1 is left untouched.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,	/*!< in: transaction */
+	const row_merge_dup_t*	dup,	/*!< in: descriptor of
+					index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd)	/*!< in/out: temporary file handle */
+{
+	DBUG_ENTER("row_merge_sort");
+
+	/* The initial number of runs is the number of blocks
+	written to the file. */
+	ulint	n_runs = file->offset;
+
+	if (n_runs <= 1) {
+		/* Zero or one run: there is nothing to merge. */
+		DBUG_RETURN(DB_SUCCESS);
+	}
+
+	/* The file should always contain at least one byte (the end
+	of file marker).  Thus, it must be at least one block. */
+	ut_ad(file->offset > 0);
+
+	/* first_offs[] holds the first offset number of each run. */
+	ulint*	first_offs = static_cast<ulint*>(
+		mem_alloc(n_runs * sizeof *first_offs));
+
+	/* Seed the middle slot, which is where row_merge() looks up
+	the start of the second half in the first round. */
+	const ulint	middle = n_runs / 2;
+	first_offs[middle] = middle;
+
+	dberr_t	err;
+
+	/* Keep merging until a single run is left. */
+	for (;;) {
+		err = row_merge(trx, dup, file, block, tmpfd,
+				&n_runs, first_offs);
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		UNIV_MEM_ASSERT_RW(first_offs, n_runs * sizeof *first_offs);
+
+		if (n_runs <= 1) {
+			break;
+		}
+	}
+
+	mem_free(first_offs);
+
+	DBUG_RETURN(err);
+}
+
+/*************************************************************//**
+Copy externally stored columns to the data tuple.
+Every field of the tuple that is flagged as externally stored gets its
+data replaced by a copy, allocated from heap, of the column value
+fetched through the merge record. */
+static __attribute__((nonnull))
+void
+row_merge_copy_blobs(
+/*=================*/
+	const mrec_t*	mrec,	/*!< in: merge record */
+	const ulint*	offsets,/*!< in: offsets of mrec */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	mem_heap_t*	heap)	/*!< in/out: memory heap */
+{
+	ut_ad(rec_offs_any_extern(offsets));
+
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+
+	for (ulint field_no = 0; field_no < n_fields; field_no++) {
+		dfield_t*	dfield = dtuple_get_nth_field(tuple, field_no);
+
+		if (dfield_is_ext(dfield)) {
+			ut_ad(!dfield_is_null(dfield));
+
+			/* During the creation of a PRIMARY KEY, the
+			table is X-locked, and records marked for
+			deletion are not copied. Hence the externally
+			stored columns cannot be freed between reading
+			the BLOB pointers in
+			row_merge_read_clustered_index() and
+			dereferencing them here. */
+			ulint		blob_len;
+			const void*	blob = btr_rec_copy_externally_stored_field(
+				mrec, offsets, zip_size, field_no,
+				&blob_len, heap);
+
+			/* The table is locked, so records written by
+			incomplete transactions must already have been
+			rolled back; an incomplete BLOB column is not
+			expected. */
+			ut_a(blob);
+
+			dfield_set_data(dfield, blob, blob_len);
+		}
+	}
+}
+
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index.
+Each merge record is converted to a data tuple and inserted with
+btr_cur_optimistic_insert(); if that returns DB_FAIL, the insert is
+retried with btr_cur_pessimistic_insert(). Both calls are passed
+BTR_NO_UNDO_LOG_FLAG, BTR_NO_LOCKING_FLAG, BTR_KEEP_SYS_FLAG and
+BTR_CREATE_FLAG. For an index that is not clustered, trx_id is passed
+to page_update_max_trx_id() for the page of the insert. Each record is
+processed in mini-transactions of its own, and the per-record heaps are
+emptied after every successful insert.
+@return	DB_SUCCESS or error number; DB_CORRUPTION if reading the
+file fails */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+/*==========================*/
+	trx_id_t		trx_id,	/*!< in: transaction identifier */
+	dict_index_t*		index,	/*!< in: index */
+	const dict_table_t*	old_table,/*!< in: old table */
+	int			fd,	/*!< in: file descriptor */
+	row_merge_block_t*	block)	/*!< in/out: file buffer */
+{
+	const byte*		b;
+	mem_heap_t*		heap;
+	mem_heap_t*		tuple_heap;
+	mem_heap_t*		ins_heap;
+	dberr_t			error = DB_SUCCESS;
+	ulint			foffs = 0;
+	ulint*			offsets;
+	mrec_buf_t*		buf;
+	DBUG_ENTER("row_merge_insert_index_tuples");
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(!(index->type & DICT_FTS));
+	ut_ad(trx_id);
+
+	tuple_heap = mem_heap_create(1000);
+
+	{
+		ulint i	= 1 + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+		ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(heap, i * sizeof *offsets));
+		offsets[0] = i;
+		offsets[1] = dict_index_get_n_fields(index);
+	}
+
+	b = block;
+
+	if (!row_merge_read(fd, foffs, block)) {
+		error = DB_CORRUPTION;
+	} else {
+		buf = static_cast<mrec_buf_t*>(
+			mem_heap_alloc(heap, sizeof *buf));
+
+		for (;;) {
+			const mrec_t*	mrec;
+			dtuple_t*	dtuple;
+			ulint		n_ext;
+			big_rec_t*	big_rec;
+			rec_t*		rec;
+			btr_cur_t	cursor;
+			mtr_t		mtr;
+
+			b = row_merge_read_rec(block, buf, b, index,
+					       fd, &foffs, &mrec, offsets);
+			if (UNIV_UNLIKELY(!b)) {
+				/* End of list, or I/O error. A non-NULL
+				mrec distinguishes the error case from the
+				regular end of input. */
+				if (mrec) {
+					error = DB_CORRUPTION;
+				}
+				break;
+			}
+
+			dict_index_t*	old_index
+				= dict_table_get_first_index(old_table);
+
+			if (dict_index_is_clust(index)
+			    && dict_index_is_online_ddl(old_index)) {
+				error = row_log_table_get_error(old_index);
+				if (error != DB_SUCCESS) {
+					break;
+				}
+			}
+
+			dtuple = row_rec_to_index_entry_low(
+				mrec, index, offsets, &n_ext, tuple_heap);
+
+			if (!n_ext) {
+				/* There are no externally stored columns. */
+			} else {
+				ut_ad(dict_index_is_clust(index));
+				/* Off-page columns can be fetched safely
+				when concurrent modifications to the table
+				are disabled. (Purge can process delete-marked
+				records, but row_merge_read_clustered_index()
+				would have skipped them.)
+
+				When concurrent modifications are enabled,
+				row_merge_read_clustered_index() will
+				only see rows from transactions that were
+				committed before the ALTER TABLE started
+				(REPEATABLE READ).
+
+				Any modifications after the
+				row_merge_read_clustered_index() scan
+				will go through row_log_table_apply().
+				Any modifications to off-page columns
+				will be tracked by
+				row_log_table_blob_alloc() and
+				row_log_table_blob_free(). */
+				row_merge_copy_blobs(
+					mrec, offsets,
+					dict_table_zip_size(old_table),
+					dtuple, tuple_heap);
+			}
+
+			ut_ad(dtuple_validate(dtuple));
+			log_free_check();
+
+			mtr_start(&mtr);
+			/* Insert after the last user record. After the
+			cursor has been opened, it is repositioned on the
+			predecessor of the record it was opened on. */
+			btr_cur_open_at_index_side(
+				false, index, BTR_MODIFY_LEAF,
+				&cursor, 0, &mtr);
+			page_cur_position(
+				page_rec_get_prev(btr_cur_get_rec(&cursor)),
+				btr_cur_get_block(&cursor),
+				btr_cur_get_page_cur(&cursor));
+			cursor.flag = BTR_CUR_BINARY;
+#ifdef UNIV_DEBUG
+			/* Check that the records are inserted in order. */
+			rec = btr_cur_get_rec(&cursor);
+
+			if (!page_rec_is_infimum(rec)) {
+				ulint*	rec_offsets = rec_get_offsets(
+					rec, index, offsets,
+					ULINT_UNDEFINED, &tuple_heap);
+				ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets)
+				      > 0);
+			}
+#endif /* UNIV_DEBUG */
+			ulint*	ins_offsets = NULL;
+
+			error = btr_cur_optimistic_insert(
+				BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+				| BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG,
+				&cursor, &ins_offsets, &ins_heap,
+				dtuple, &rec, &big_rec, 0, NULL, &mtr);
+
+			if (error == DB_FAIL) {
+				ut_ad(!big_rec);
+				mtr_commit(&mtr);
+				mtr_start(&mtr);
+				btr_cur_open_at_index_side(
+					false, index, BTR_MODIFY_TREE,
+					&cursor, 0, &mtr);
+				page_cur_position(
+					page_rec_get_prev(btr_cur_get_rec(
+								  &cursor)),
+					btr_cur_get_block(&cursor),
+					btr_cur_get_page_cur(&cursor));
+
+				error = btr_cur_pessimistic_insert(
+					BTR_NO_UNDO_LOG_FLAG
+					| BTR_NO_LOCKING_FLAG
+					| BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG,
+					&cursor, &ins_offsets, &ins_heap,
+					dtuple, &rec, &big_rec, 0, NULL, &mtr);
+			}
+
+			if (!dict_index_is_clust(index)) {
+				page_update_max_trx_id(
+					btr_cur_get_block(&cursor),
+					btr_cur_get_page_zip(&cursor),
+					trx_id, &mtr);
+			}
+
+			mtr_commit(&mtr);
+
+			if (UNIV_LIKELY_NULL(big_rec)) {
+				/* If the system crashes at this
+				point, the clustered index record will
+				contain a null BLOB pointer. This
+				should not matter, because the copied
+				table will be dropped on crash
+				recovery anyway. */
+
+				ut_ad(dict_index_is_clust(index));
+				ut_ad(error == DB_SUCCESS);
+				error = row_ins_index_entry_big_rec(
+					dtuple, big_rec,
+					ins_offsets, &ins_heap,
+					index, NULL, __FILE__, __LINE__);
+				dtuple_convert_back_big_rec(
+					index, dtuple, big_rec);
+			}
+
+			if (error != DB_SUCCESS) {
+				goto err_exit;
+			}
+
+			mem_heap_empty(tuple_heap);
+			mem_heap_empty(ins_heap);
+		}
+	}
+
+err_exit:
+	mem_heap_free(tuple_heap);
+	mem_heap_free(ins_heap);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Sets a table lock of the requested mode (LOCK_X or LOCK_S) on a table,
+for the duration of creating indexes.
+A dummy select query graph is built on a private heap only to obtain the
+query thread needed by lock_table(). The lock request is repeated
+for as long as row_mysql_handle_errors() reports a lock wait or
+lock_table() returns DB_QUE_THR_SUSPENDED. The query graph is freed
+before returning, and trx->op_info is reset to the empty string.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_lock_table(
+/*=================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
+{
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	dberr_t		err;
+	sel_node_t*	node;
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+	heap = mem_heap_create(512);
+
+	trx->op_info = "setting table lock for creating or dropping index";
+
+	node = sel_node_create(heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+	thr->graph->state = QUE_FORK_ACTIVE;
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = static_cast<que_thr_t*>(
+		que_fork_get_first_thr(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	err = lock_table(0, table, mode, thr);
+
+	trx->error_state = err;
+
+	if (UNIV_LIKELY(err == DB_SUCCESS)) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
+	} else {
+		que_thr_stop_for_mysql(thr);
+
+		if (err != DB_QUE_THR_SUSPENDED) {
+			bool	was_lock_wait;
+
+			was_lock_wait = row_mysql_handle_errors(
+				&err, trx, thr, NULL);
+
+			if (was_lock_wait) {
+				goto run_again;
+			}
+		} else {
+			que_thr_t*	run_thr;
+			que_node_t*	parent;
+
+			parent = que_node_get_parent(thr);
+
+			run_thr = que_fork_start_command(
+				static_cast<que_fork_t*>(parent));
+
+			ut_a(run_thr == thr);
+
+			/* There was a lock wait but the thread was not
+			in a ready to run or running state. */
+			trx->error_state = DB_LOCK_WAIT;
+
+			goto run_again;
+		}
+	}
+
+	que_graph_free(thr->graph);
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Drop an index that was created before an error occurred.
+The rows of the index are deleted from SYS_FIELDS and SYS_INDEXES.
+A failure is only reported on stderr; trx->error_state is reset to
+DB_SUCCESS in that case.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+static
+void
+row_merge_drop_index_dict(
+/*======================*/
+	trx_t*		trx,	/*!< in/out: dictionary transaction */
+	index_id_t	index_id)/*!< in: index identifier */
+{
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Stored procedure that removes the dictionary records
+	of the index identified by :indexid. */
+	static const char	drop_index_proc[] =
+		"PROCEDURE DROP_INDEX_PROC () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
+		"DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
+		"END;\n";
+
+	pars_info_t*	bound_args = pars_info_create();
+
+	pars_info_add_ull_literal(bound_args, "indexid", index_id);
+
+	trx->op_info = "dropping index from dictionary";
+
+	const dberr_t	err = que_eval_sql(
+		bound_args, drop_index_proc, FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		/* Even though we ensure that DDL transactions are WAIT
+		and DEADLOCK free, we could encounter other errors e.g.,
+		DB_TOO_MANY_CONCURRENT_TRXS. */
+		trx->error_state = DB_SUCCESS;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict "
+			"failed with error code: %u.\n", (unsigned) err);
+	}
+
+	trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop the TEMP_INDEX_PREFIX_STR-named indexes of a table (those created
+before an error occurred). The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+	trx_t*		trx,	/*!< in/out: dictionary transaction */
+	table_id_t	table_id)/*!< in: table identifier */
+{
+	static const char sql[] =
+		"PROCEDURE DROP_INDEXES_PROC () IS\n"
+		"ixid CHAR;\n"
+		"found INT;\n"
+
+		"DECLARE CURSOR index_cur IS\n"
+		" SELECT ID FROM SYS_INDEXES\n"
+		" WHERE TABLE_ID=:tableid AND\n"
+		" SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+		"FOR UPDATE;\n"
+
+		"BEGIN\n"
+		"found := 1;\n"
+		"OPEN index_cur;\n"
+		"WHILE found = 1 LOOP\n"
+		"  FETCH index_cur INTO ixid;\n"
+		"  IF (SQL % NOTFOUND) THEN\n"
+		"    found := 0;\n"
+		"  ELSE\n"
+		"    DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+		"    DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE index_cur;\n"
+
+		"END;\n";
+	dberr_t		error;
+	pars_info_t*	info;
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* NOTE(review): this function takes no "locked" parameter and no
+	table object; the remark about table->n_ref_count > 1 when
+	locked=TRUE appears to be copied from row_merge_drop_indexes().
+	In that case, all code that has an open handle to the table
+	should be waiting for the next statement or a meta-data lock.
+	A concurrent purge will be prevented by dict_operation_lock. */
+
+	info = pars_info_create();
+	pars_info_add_ull_literal(info, "tableid", table_id);
+	trx->op_info = "dropping indexes";
+	error = que_eval_sql(info, sql, FALSE, trx);
+
+	if (error != DB_SUCCESS) {
+		/* DDL transactions are WAIT and DEADLOCK free, but other
+		errors (e.g. DB_TOO_MANY_CONCURRENT_TRXS) are possible.
+		Clear the error state and only log the failure. */
+		trx->error_state = DB_SUCCESS;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict "
+			"failed with error code: %u.\n", (unsigned) error);
+	}
+
+	trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+	trx_t*		trx,	/*!< in/out: dictionary transaction */
+	dict_table_t*	table,	/*!< in/out: table containing the indexes */
+	ibool		locked)	/*!< in: TRUE=table locked,
+				FALSE=may need to do a lazy drop */
+{
+	dict_index_t*	index;
+	dict_index_t*	next_index;
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	index = dict_table_get_first_index(table);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE);
+
+	/* the caller should have an open handle to the table */
+	ut_ad(table->n_ref_count >= 1);
+
+	/* It is possible that table->n_ref_count > 1 when
+	locked=TRUE. In this case, all code that has an open
+	handle to the table should be waiting for the next statement
+	to execute, or waiting for a meta-data lock.
+
+	A concurrent purge will be prevented by dict_operation_lock. */
+
+	if (!locked && table->n_ref_count > 1) {
+		/* We will have to drop the indexes later, when the
+		table is guaranteed to be no longer in use.  Mark the
+		indexes as incomplete and corrupted, so that other
+		threads will stop using them.  Let dict_table_close()
+		or crash recovery or the next invocation of
+		prepare_inplace_alter_table() take care of dropping
+		the indexes. */
+
+		while ((index = dict_table_get_next_index(index)) != NULL) {
+			ut_ad(!dict_index_is_clust(index));
+
+			switch (dict_index_get_online_status(index)) {
+			case ONLINE_INDEX_ABORTED_DROPPED:
+				continue;
+			case ONLINE_INDEX_COMPLETE:
+				if (*index->name != TEMP_INDEX_PREFIX) {
+					/* Do nothing to already
+					published indexes. */
+				} else if (index->type & DICT_FTS) {
+					/* Drop a completed FULLTEXT
+					index, due to a timeout during
+					MDL upgrade for
+					commit_inplace_alter_table().
+					Because only concurrent reads
+					are allowed (and they are not
+					seeing this index yet) we
+					are safe to drop the index. */
+					dict_index_t* prev = UT_LIST_GET_PREV(
+						indexes, index);
+					/* At least there should be
+					the clustered index before
+					this one. */
+					ut_ad(prev);
+					ut_a(table->fts);
+					fts_drop_index(table, index, trx);
+					/* Since
+					INNOBASE_SHARE::idx_trans_tbl
+					is shared between all open
+					ha_innobase handles to this
+					table, no thread should be
+					accessing this dict_index_t
+					object. Also, we should be
+					holding LOCK=SHARED MDL on the
+					table even after the MDL
+					upgrade timeout. */
+
+					/* We can remove a DICT_FTS
+					index from the cache, because
+					we do not allow ADD FULLTEXT INDEX
+					with LOCK=NONE. If we allowed that,
+					we should exclude FTS entries from
+					prebuilt->ins_node->entry_list
+					in ins_node_create_entry_list(). */
+					dict_index_remove_from_cache(
+						table, index);
+					index = prev;
+				} else {
+					rw_lock_x_lock(
+						dict_index_get_lock(index));
+					dict_index_set_online_status(
+						index, ONLINE_INDEX_ABORTED);
+					index->type |= DICT_CORRUPT;
+					table->drop_aborted = TRUE;
+					goto drop_aborted;
+				}
+				continue;
+			case ONLINE_INDEX_CREATION:
+				rw_lock_x_lock(dict_index_get_lock(index));
+				ut_ad(*index->name == TEMP_INDEX_PREFIX);
+				row_log_abort_sec(index);
+			drop_aborted:
+				rw_lock_x_unlock(dict_index_get_lock(index));
+
+				DEBUG_SYNC_C("merge_drop_index_after_abort");
+				/* covered by dict_sys->mutex */
+				MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
+				/* fall through */
+			case ONLINE_INDEX_ABORTED:
+				/* Drop the index tree from the
+				data dictionary and free it from
+				the tablespace, but keep the object
+				in the data dictionary cache. */
+				row_merge_drop_index_dict(trx, index->id);
+				rw_lock_x_lock(dict_index_get_lock(index));
+				dict_index_set_online_status(
+					index, ONLINE_INDEX_ABORTED_DROPPED);
+				rw_lock_x_unlock(dict_index_get_lock(index));
+				table->drop_aborted = TRUE;
+				continue;
+			}
+			ut_error;
+		}
+
+		return;
+	}
+
+	row_merge_drop_indexes_dict(trx, table->id);
+
+	/* Invalidate all row_prebuilt_t::ins_graph that are referring
+	to this table. That is, force row_get_prebuilt_insert_row() to
+	rebuild prebuilt->ins_node->entry_list. */
+	ut_ad(table->def_trx_id <= trx->id);
+	table->def_trx_id = trx->id;
+
+	next_index = dict_table_get_next_index(index);
+
+	while ((index = next_index) != NULL) {
+		/* read the next pointer before freeing the index */
+		next_index = dict_table_get_next_index(index);
+
+		ut_ad(!dict_index_is_clust(index));
+
+		if (*index->name == TEMP_INDEX_PREFIX) {
+			/* If it is FTS index, drop from table->fts
+			and also drop its auxiliary tables */
+			if (index->type & DICT_FTS) {
+				ut_a(table->fts);
+				fts_drop_index(table, index, trx);
+			}
+
+			switch (dict_index_get_online_status(index)) {
+			case ONLINE_INDEX_CREATION:
+				/* This state should only be possible
+				when prepare_inplace_alter_table() fails
+				after invoking row_merge_create_index().
+				In inplace_alter_table(),
+				row_merge_build_indexes()
+				should never leave the index in this state.
+				It would invoke row_log_abort_sec() on
+				failure. */
+			case ONLINE_INDEX_COMPLETE:
+				/* In these cases, we are able to drop
+				the index straight. The DROP INDEX was
+				never deferred. */
+				break;
+			case ONLINE_INDEX_ABORTED:
+			case ONLINE_INDEX_ABORTED_DROPPED:
+				/* covered by dict_sys->mutex */
+				MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
+			}
+
+			dict_index_remove_from_cache(table, index);
+		}
+	}
+
+	table->drop_aborted = FALSE;
+	ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+	static const char sql[] =
+		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+		"ixid CHAR;\n"
+		"found INT;\n"
+
+		"DECLARE CURSOR index_cur IS\n"
+		" SELECT ID FROM SYS_INDEXES\n"
+		" WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+		"FOR UPDATE;\n"
+
+		"BEGIN\n"
+		"found := 1;\n"
+		"OPEN index_cur;\n"
+		"WHILE found = 1 LOOP\n"
+		"  FETCH index_cur INTO ixid;\n"
+		"  IF (SQL % NOTFOUND) THEN\n"
+		"    found := 0;\n"
+		"  ELSE\n"
+		"    DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+		"    DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE index_cur;\n"
+		"END;\n";
+	trx_t*	trx;
+	dberr_t	error;
+
+	/* NOTE(review): nothing is loaded into the dictionary cache here;
+	the procedure only deletes the SYS_INDEXES and SYS_FIELDS records
+	of indexes whose name starts with TEMP_INDEX_PREFIX_STR. */
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping partially created indexes";
+	row_mysql_lock_data_dictionary(trx);
+	/* Ensure that this transaction will be rolled back and locks
+	will be released, if the server gets killed before the commit
+	gets written to the redo log. */
+	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+	trx->op_info = "dropping indexes";
+	error = que_eval_sql(NULL, sql, FALSE, trx);
+
+	if (error != DB_SUCCESS) {
+		/* DDL transactions are WAIT and DEADLOCK free, but other
+		errors (e.g. DB_TOO_MANY_CONCURRENT_TRXS) are possible.
+		Clear the error state and only log the failure. */
+		trx->error_state = DB_SUCCESS;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes "
+			"failed with error code: %u.\n", (unsigned) error);
+	}
+
+	trx_commit_for_mysql(trx);
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_background(trx);
+}
+
+/*********************************************************************//**
+Create a temporary merge file and, if UNIV_PFS_IO is defined, register
+the file descriptor with Performance Schema.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create_low(void)
+/*===========================*/
+{
+	int	fd;
+#ifdef UNIV_PFS_IO
+	/* This temp file open does not go through the normal
+	file APIs, so add instrumentation to register it with
+	performance schema */
+	struct PSI_file_locker*	locker = NULL;
+	PSI_file_locker_state	state;
+	register_pfs_file_open_begin(&state, locker, innodb_file_temp_key,
+				     PSI_FILE_OPEN,
+				     "Innodb Merge Temp File",
+				     __FILE__, __LINE__);
+#endif
+	fd = innobase_mysql_tmpfile();
+#ifdef UNIV_PFS_IO
+	register_pfs_file_open_end(locker, fd);
+#endif
+
+	if (fd < 0) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot create temporary merge file");
+		return (-1);
+	}
+	return(fd);
+}
+
+/*********************************************************************//**
+Create a merge file and initialize its descriptor, offset and n_rec.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create(
+/*==================*/
+	merge_file_t*	merge_file)	/*!< out: merge file structure */
+{
+	merge_file->fd = row_merge_file_create_low();
+	merge_file->offset = 0;
+	merge_file->n_rec = 0;
+
+	if (merge_file->fd >= 0) {
+		if (srv_disable_sort_file_cache) {
+			os_file_set_nocache(merge_file->fd,
+				"row0merge.cc", "sort");
+		}
+	}
+	return(merge_file->fd);
+}
+
+/*********************************************************************//**
+Close a merge file descriptor if it is valid (fd >= 0). The close is
+instrumented for Performance Schema if UNIV_PFS_IO is defined. */
+UNIV_INTERN
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	int		fd)	/*!< in: merge file descriptor */
+{
+#ifdef UNIV_PFS_IO
+	struct PSI_file_locker*	locker = NULL;
+	PSI_file_locker_state	state;
+	register_pfs_file_io_begin(&state, locker,
+				   fd, 0, PSI_FILE_CLOSE,
+				   __FILE__, __LINE__);
+#endif
+	if (fd >= 0) {
+		close(fd);
+	}
+#ifdef UNIV_PFS_IO
+	register_pfs_file_io_end(locker, 0);
+#endif
+}
+/*********************************************************************//**
+Close a merge file if it is open (fd != -1) and reset its fd to -1. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+{
+	ut_ad(!srv_read_only_mode);
+
+	if (merge_file->fd != -1) {
+		row_merge_file_destroy_low(merge_file->fd);
+		merge_file->fd = -1;
+	}
+}
+
+/*********************************************************************//**
+Rename a created index in SYS_INDEXES by stripping the first character
+(presumably TEMP_INDEX_PREFIX) of its name. The data dictionary must be
+locked exclusively by the caller; the transaction will not be committed.
+@return	DB_SUCCESS, or the error code of que_eval_sql() */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+{
+	dberr_t		err = DB_SUCCESS;
+	pars_info_t*	info = pars_info_create();
+
+	/* We use the private SQL parser of Innobase to generate the
+	query graphs needed in renaming indexes. */
+
+	static const char rename_index[] =
+		"PROCEDURE RENAME_INDEX_PROC () IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+		"WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+		"END;\n";
+
+	ut_ad(trx);
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+	trx->op_info = "renaming index to add";
+
+	pars_info_add_ull_literal(info, "tableid", table_id);
+	pars_info_add_ull_literal(info, "indexid", index_id);
+
+	err = que_eval_sql(info, rename_index, FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		/* DDL transactions are WAIT and DEADLOCK free, but other
+		errors (e.g. DB_TOO_MANY_CONCURRENT_TRXS) are possible.
+		Clear the error state, log the failure, return the code. */
+		trx->error_state = DB_SUCCESS;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: row_merge_rename_index_to_add "
+			 "failed with error code: %u.\n", (unsigned) err);
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Mark an index as one to be dropped by prepending TEMP_INDEX_PREFIX_STR
+to its name in SYS_INDEXES. The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be committed.
+@return	DB_SUCCESS, or the error code of que_eval_sql() */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+{
+	dberr_t		err;
+	pars_info_t*	info = pars_info_create();
+
+	ut_ad(!srv_read_only_mode);
+
+	/* We use the private SQL parser of Innobase to generate the
+	query graphs needed in renaming indexes. */
+
+	static const char rename_index[] =
+		"PROCEDURE RENAME_INDEX_PROC () IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_INDEXES SET NAME=CONCAT('"
+		TEMP_INDEX_PREFIX_STR "',NAME)\n"
+		"WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+		"END;\n";
+
+	ut_ad(trx);
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+	trx->op_info = "renaming index to drop";
+
+	pars_info_add_ull_literal(info, "tableid", table_id);
+	pars_info_add_ull_literal(info, "indexid", index_id);
+
+	err = que_eval_sql(info, rename_index, FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		/* DDL transactions are WAIT and DEADLOCK free, but other
+		errors (e.g. DB_TOO_MANY_CONCURRENT_TRXS) are possible.
+		Clear the error state, log the failure, return the code. */
+		trx->error_state = DB_SUCCESS;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: row_merge_rename_index_to_drop "
+			 "failed with error code: %u.\n", (unsigned) err);
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Provide a new pathname for a table that is being renamed. The table must
+not be in the system tablespace (TRX_SYS_SPACE; asserted in debug builds).
+The caller is responsible for freeing the memory of the return value.
+@return	new pathname of the tablespace file */
+UNIV_INTERN
+char*
+row_make_new_pathname(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table to be renamed */
+	const char*	new_name)	/*!< in: new name */
+{
+	char*	new_path;
+	char*	old_path;
+
+	ut_ad(table->space != TRX_SYS_SPACE);
+
+	old_path = fil_space_get_first_path(table->space);
+	ut_a(old_path);
+
+	new_path = os_file_make_new_pathname(old_path, new_name);
+
+	mem_free(old_path);
+
+	return(new_path);
+}
+
+/*********************************************************************//**
+Rename the tables in the data dictionary.  The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_rename_tables_dict(
+/*=========================*/
+	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
+					tmp_name */
+	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
+					old_table->name */
+	const char*	tmp_name,	/*!< in: new name for old_table */
+	trx_t*		trx)		/*!< in/out: dictionary transaction */
+{
+	dberr_t		err	= DB_ERROR;
+	pars_info_t*	info;
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(old_table != new_table);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE
+	      || trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+	trx->op_info = "renaming tables";
+
+	/* We use the private SQL parser of Innobase to generate the query
+	graphs needed in updating the dictionary data in system tables. */
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "new_name", new_table->name);
+	pars_info_add_str_literal(info, "old_name", old_table->name);
+	pars_info_add_str_literal(info, "tmp_name", tmp_name);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE RENAME_TABLES () IS\n"
+			   "BEGIN\n"
+			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
+			   " WHERE NAME = :old_name;\n"
+			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
+			   " WHERE NAME = :new_name;\n"
+			   "END;\n", FALSE, trx);
+
+	/* Update SYS_TABLESPACES and SYS_DATAFILES if the old table is in
+	a non-system tablespace (space > 0) and its .ibd file is not missing. */
+	if (err == DB_SUCCESS
+	    && old_table->space != TRX_SYS_SPACE
+	    && !old_table->ibd_file_missing) {
+		/* Make pathname to update SYS_DATAFILES. */
+		char* tmp_path = row_make_new_pathname(old_table, tmp_name);
+
+		info = pars_info_create();
+
+		pars_info_add_str_literal(info, "tmp_name", tmp_name);
+		pars_info_add_str_literal(info, "tmp_path", tmp_path);
+		pars_info_add_int4_literal(info, "old_space",
+					   (lint) old_table->space);
+
+		err = que_eval_sql(info,
+				   "PROCEDURE RENAME_OLD_SPACE () IS\n"
+				   "BEGIN\n"
+				   "UPDATE SYS_TABLESPACES"
+				   " SET NAME = :tmp_name\n"
+				   " WHERE SPACE = :old_space;\n"
+				   "UPDATE SYS_DATAFILES"
+				   " SET PATH = :tmp_path\n"
+				   " WHERE SPACE = :old_space;\n"
+				   "END;\n", FALSE, trx);
+
+		mem_free(tmp_path);
+	}
+
+	/* Update SYS_TABLESPACES and SYS_DATAFILES if the new
+	table is in a non-system tablespace where space > 0. */
+	if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) {
+		/* Make pathname to update SYS_DATAFILES. */
+		char* old_path = row_make_new_pathname(
+			new_table, old_table->name);
+
+		info = pars_info_create();
+
+		pars_info_add_str_literal(info, "old_name", old_table->name);
+		pars_info_add_str_literal(info, "old_path", old_path);
+		pars_info_add_int4_literal(info, "new_space",
+					   (lint) new_table->space);
+
+		err = que_eval_sql(info,
+				   "PROCEDURE RENAME_NEW_SPACE () IS\n"
+				   "BEGIN\n"
+				   "UPDATE SYS_TABLESPACES"
+				   " SET NAME = :old_name\n"
+				   " WHERE SPACE = :new_space;\n"
+				   "UPDATE SYS_DATAFILES"
+				   " SET PATH = :old_path\n"
+				   " WHERE SPACE = :new_space;\n"
+				   "END;\n", FALSE, trx);
+
+		mem_free(old_path);
+	}
+
+	if (err == DB_SUCCESS && dict_table_is_discarded(new_table)) {
+		err = row_import_update_discarded_flag(
+			trx, new_table->id, true, true);
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Create and execute a query graph for creating an index.
+@return	DB_SUCCESS or error code (taken from trx->error_state) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_create_index_graph(
+/*=========================*/
+	trx_t*		trx,		/*!< in: trx */
+	dict_table_t*	table,		/*!< in: table */
+	dict_index_t*	index)		/*!< in/out: index; sets index->table */
+{
+	ind_node_t*	node;		/*!< Index creation node */
+	mem_heap_t*	heap;		/*!< Memory heap */
+	que_thr_t*	thr;		/*!< Query thread */
+	dberr_t		err;
+
+	ut_ad(trx);
+	ut_ad(table);
+	ut_ad(index);
+
+	heap = mem_heap_create(512);
+
+	index->table = table;
+	node = ind_create_graph_create(index, heap, false);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	return(err);
+}
+
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return	index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
+	dict_table_t*		table,	/*!< in: the index is on this table */
+	const index_def_t*	index_def)
+					/*!< in: the index definition */
+{
+	dict_index_t*	index;
+	dberr_t		err;
+	ulint		n_fields = index_def->n_fields;
+	ulint		i;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* Create the index prototype, using the passed in def, this is not
+	a persistent operation. We pass 0 as the space id, and determine at
+	a lower level the space id where to store the table. */
+
+	index = dict_mem_index_create(table->name, index_def->name,
+				      0, index_def->ind_type, n_fields);
+
+	ut_a(index);
+
+	for (i = 0; i < n_fields; i++) {
+		index_field_t*	ifield = &index_def->fields[i];
+
+		dict_mem_index_add_field(
+			index, dict_table_get_col_name(table, ifield->col_no),
+			ifield->prefix_len);
+	}
+
+	/* Add the index to SYS_INDEXES, using the index prototype. */
+	err = row_merge_create_index_graph(trx, table, index);
+
+	if (err == DB_SUCCESS) {
+
+		index = dict_table_get_index_on_name(table, index_def->name);
+
+		ut_a(index);
+
+		/* Note the id of the transaction that created this
+		index, we use it to restrict readers from accessing
+		this index, to ensure read consistency. */
+		ut_ad(index->trx_id == trx->id);
+	} else {
+		index = NULL;
+	}
+
+	return(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index. @return TRUE if it is usable */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to check */
+{
+	if (!dict_index_is_clust(index)
+	    && dict_index_is_online_ddl(index)) {
+		/* Secondary indexes that are being created are not usable. */
+		return(FALSE);
+	}
+
+	return(!dict_index_is_corrupted(index)
+	       && (dict_table_is_temporary(index->table)
+		   || !trx->read_view
+		   || read_view_sees_trx_id(trx->read_view, index->trx_id)));
+}
+
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_drop_table(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table)		/*!< in: table to drop */
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* There must be no open handles to the table (n_ref_count == 0). */
+	ut_a(table->n_ref_count == 0);
+
+	return(row_drop_table_for_mysql(table->name, trx, false, false));
+}
+
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_build_indexes(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	old_table,	/*!< in: table where rows are
+					read from */
+	dict_table_t*	new_table,	/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	bool		online,		/*!< in: true if creating indexes
+					online */
+	dict_index_t**	indexes,	/*!< in: indexes to be created */
+	const ulint*	key_numbers,	/*!< in: MySQL key numbers */
+	ulint		n_indexes,	/*!< in: size of indexes[] */
+	struct TABLE*	table,		/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+	const dtuple_t*	add_cols,	/*!< in: default values of
+					added columns, or NULL */
+	const ulint*	col_map,	/*!< in: mapping of old column
+					numbers to new ones, or NULL
+					if old_table == new_table */
+	ulint		add_autoinc,	/*!< in: number of added
+					AUTO_INCREMENT column, or
+					ULINT_UNDEFINED if none is added */
+	ib_sequence_t&	sequence)	/*!< in: autoinc instance if
+					add_autoinc != ULINT_UNDEFINED */
+{
+	merge_file_t*		merge_files;
+	row_merge_block_t*	block;
+	ulint			block_size;
+	ulint			i;
+	ulint			j;
+	dberr_t			error;
+	int			tmpfd = -1;
+	dict_index_t*		fts_sort_idx = NULL;
+	fts_psort_t*		psort_info = NULL;
+	fts_psort_t*		merge_info = NULL;
+	ib_int64_t		sig_count = 0;
+	DBUG_ENTER("row_merge_build_indexes");
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad((old_table == new_table) == !col_map);
+	ut_ad(!add_cols || col_map);
+
+	/* Allocate the sort block of size 3 * srv_sort_buf_size; the merge
+	file data structures are allocated and initialized further below */
+
+	block_size = 3 * srv_sort_buf_size;
+	block = static_cast<row_merge_block_t*>(
+		os_mem_alloc_large(&block_size, FALSE));
+
+	if (block == NULL) {
+		DBUG_RETURN(DB_OUT_OF_MEMORY);
+	}
+
+	trx_start_if_not_started_xa(trx);
+
+	merge_files = static_cast<merge_file_t*>(
+		mem_alloc(n_indexes * sizeof *merge_files));
+
+	/* Initialize all the merge file descriptors, so that we
+	don't call row_merge_file_destroy() on uninitialized
+	merge file descriptor */
+
+	for (i = 0; i < n_indexes; i++) {
+		merge_files[i].fd = -1;
+	}
+
+	for (i = 0; i < n_indexes; i++) {
+		if (row_merge_file_create(&merge_files[i]) < 0) {
+			error = DB_OUT_OF_MEMORY;
+			goto func_exit;
+		}
+
+		if (indexes[i]->type & DICT_FTS) {
+			ibool	opt_doc_id_size = FALSE;
+
+			/* To build FTS index, we would need to extract
+			doc's word, Doc ID, and word's position, so
+			we need to build a "fts sort index" indexing
+			on above three 'fields' */
+			fts_sort_idx = row_merge_create_fts_sort_index(
+				indexes[i], old_table, &opt_doc_id_size);
+
+			row_merge_dup_t* dup = static_cast<row_merge_dup_t*>(
+				ut_malloc(sizeof *dup));
+			dup->index = fts_sort_idx;
+			dup->table = table;
+			dup->col_map = col_map;
+			dup->n_dup = 0;
+
+			row_fts_psort_info_init(
+				trx, dup, new_table, opt_doc_id_size,
+				&psort_info, &merge_info);
+		}
+	}
+
+	tmpfd = row_merge_file_create_low();
+
+	if (tmpfd < 0) {
+		error = DB_OUT_OF_MEMORY;
+		goto func_exit;
+	}
+
+	/* Reset the MySQL row buffer that is used when reporting
+	duplicate keys. */
+	innobase_rec_reset(table);
+
+	/* Read clustered index of the table and create files for
+	secondary index entries for merge sort */
+
+	error = row_merge_read_clustered_index(
+		trx, table, old_table, new_table, online, indexes,
+		fts_sort_idx, psort_info, merge_files, key_numbers,
+		n_indexes, add_cols, col_map,
+		add_autoinc, sequence, block);
+
+	if (error != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	DEBUG_SYNC_C("row_merge_after_scan");
+
+	/* Now we have files containing index entries ready for
+	sorting and inserting. */
+
+	for (i = 0; i < n_indexes; i++) {
+		dict_index_t*	sort_idx = indexes[i];
+
+		if (indexes[i]->type & DICT_FTS) {
+			os_event_t	fts_parallel_merge_event;
+
+			sort_idx = fts_sort_idx;
+
+			fts_parallel_merge_event
+				= merge_info[0].psort_common->merge_event;
+
+			if (FTS_PLL_MERGE) {
+				ulint	trial_count = 0;
+				bool	all_exit = false;
+
+				os_event_reset(fts_parallel_merge_event);
+				row_fts_start_parallel_merge(merge_info);
+wait_again:
+				os_event_wait_time_low(
+					fts_parallel_merge_event, 1000000,
+					sig_count);
+
+				for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+					if (merge_info[j].child_status
+					    != FTS_CHILD_COMPLETE
+					    && merge_info[j].child_status
+					    != FTS_CHILD_EXITING) {
+						sig_count = os_event_reset(
+						fts_parallel_merge_event);
+
+						goto wait_again;
+					}
+				}
+
+				/* Now all children should have completed; wait
+				a bit until they all finish using the event */
+				while (!all_exit && trial_count < 10000) {
+					all_exit = true;
+
+					for (j = 0; j < FTS_NUM_AUX_INDEX;
+					     j++) {
+						if (merge_info[j].child_status
+						    != FTS_CHILD_EXITING) {
+							all_exit = false;
+							os_thread_sleep(1000);
+							break;
+						}
+					}
+					trial_count++;
+				}
+
+				if (!all_exit) {
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Not all child merge threads"
+						" exited when creating FTS"
+						" index '%s'",
+						indexes[i]->name);
+				}
+			} else {
+				/* This cannot report duplicates; an
+				assertion would fail in that case. */
+				error = row_fts_merge_insert(
+					sort_idx, new_table,
+					psort_info, 0);
+			}
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+			DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
+#endif
+		} else {
+			row_merge_dup_t	dup = {
+				sort_idx, table, col_map, 0};
+
+			error = row_merge_sort(
+				trx, &dup, &merge_files[i],
+				block, &tmpfd);
+
+			if (error == DB_SUCCESS) {
+				error = row_merge_insert_index_tuples(
+					trx->id, sort_idx, old_table,
+					merge_files[i].fd, block);
+			}
+		}
+
+		/* Close the temporary file to free up space. */
+		row_merge_file_destroy(&merge_files[i]);
+
+		if (indexes[i]->type & DICT_FTS) {
+			row_fts_psort_info_destroy(psort_info, merge_info);
+		} else if (error != DB_SUCCESS || !online) {
+			/* Do not apply any online log. */
+		} else if (old_table != new_table) {
+			ut_ad(!sort_idx->online_log);
+			ut_ad(sort_idx->online_status
+			      == ONLINE_INDEX_COMPLETE);
+		} else {
+			DEBUG_SYNC_C("row_log_apply_before");
+			error = row_log_apply(trx, sort_idx, table);
+			DEBUG_SYNC_C("row_log_apply_after");
+		}
+
+		if (error != DB_SUCCESS) {
+			trx->error_key_num = key_numbers[i];
+			goto func_exit;
+		}
+
+		if (indexes[i]->type & DICT_FTS && fts_enable_diag_print) {
+			char*	name = (char*) indexes[i]->name;
+
+			if (*name == TEMP_INDEX_PREFIX)  {
+				name++;
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Finished building "
+				"full-text index %s\n", name);
+		}
+	}
+
+func_exit:
+	DBUG_EXECUTE_IF(
+		"ib_build_indexes_too_many_concurrent_trxs",
+		error = DB_TOO_MANY_CONCURRENT_TRXS;
+		trx->error_state = error;);
+
+	row_merge_file_destroy_low(tmpfd);
+
+	for (i = 0; i < n_indexes; i++) {
+		row_merge_file_destroy(&merge_files[i]);
+	}
+
+	if (fts_sort_idx) {
+		dict_mem_index_free(fts_sort_idx);
+	}
+
+	mem_free(merge_files);
+	os_mem_free_large(block, block_size);
+
+	DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+	if (online && old_table == new_table && error != DB_SUCCESS) {
+		/* On error, flag all online secondary index creation
+		as aborted. */
+		for (i = 0; i < n_indexes; i++) {
+			ut_ad(!(indexes[i]->type & DICT_FTS));
+			ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX);
+			ut_ad(!dict_index_is_clust(indexes[i]));
+
+			/* Completed indexes should be dropped as
+			well, and indexes whose creation was aborted
+			should be dropped from the persistent
+			storage. However, at this point we can only
+			set some flags in the not-yet-published
+			indexes. These indexes will be dropped later
+			in row_merge_drop_indexes(), called by
+			rollback_inplace_alter_table(). */
+
+			switch (dict_index_get_online_status(indexes[i])) {
+			case ONLINE_INDEX_COMPLETE:
+				break;
+			case ONLINE_INDEX_CREATION:
+				rw_lock_x_lock(
+					dict_index_get_lock(indexes[i]));
+				row_log_abort_sec(indexes[i]);
+				indexes[i]->type |= DICT_CORRUPT;
+				rw_lock_x_unlock(
+					dict_index_get_lock(indexes[i]));
+				new_table->drop_aborted = TRUE;
+				/* fall through */
+			case ONLINE_INDEX_ABORTED_DROPPED:
+			case ONLINE_INDEX_ABORTED:
+				MONITOR_MUTEX_INC(
+					&dict_sys->mutex,
+					MONITOR_BACKGROUND_DROP_INDEX);
+			}
+		}
+	}
+
+	DBUG_RETURN(error);
+}
diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.cc
index fc97ace1f32..7748bd747d9 100644
--- a/storage/xtradb/row/row0mysql.c
+++ b/storage/xtradb/row/row0mysql.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0mysql.c
+@file row/row0mysql.cc
 Interface between Innobase row operations and MySQL.
 Contains also create table and other data dictionary operations.
 
@@ -41,7 +41,10 @@ Created 9/17/2000 Heikki Tuuri
 #include "dict0dict.h"
 #include "dict0crea.h"
 #include "dict0load.h"
+#include "dict0priv.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
 #include "trx0roll.h"
 #include "trx0purge.h"
 #include "trx0rec.h"
@@ -52,7 +55,10 @@ Created 9/17/2000 Heikki Tuuri
 #include "btr0sea.h"
 #include "fil0fil.h"
 #include "ibuf0ibuf.h"
-#include "ha_prototypes.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "srv0start.h"
+#include "row0import.h"
 #include "m_string.h"
 #include "my_sys.h"
 #include "ha_prototypes.h"
@@ -61,21 +67,27 @@ Created 9/17/2000 Heikki Tuuri
 UNIV_INTERN ibool	row_rollback_on_timeout	= FALSE;
 
 /** Chain node of the list of tables to drop in the background. */
-typedef struct row_mysql_drop_struct	row_mysql_drop_t;
-
-/** Chain node of the list of tables to drop in the background. */
-struct row_mysql_drop_struct{
+struct row_mysql_drop_t{
 	char*				table_name;	/*!< table name */
 	UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
 							/*!< list chain node */
 };
 
+#ifdef UNIV_PFS_MUTEX
+/* Key to register drop list mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	row_drop_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
 /** @brief List of tables we should drop in background.
 
 ALTER TABLE in MySQL requires that the table handler can drop the
 table in background when there are no queries to it any
-more.  Protected by kernel_mutex. */
+more.  Protected by row_drop_list_mutex. */
 static UT_LIST_BASE_NODE_T(row_mysql_drop_t)	row_mysql_drop_list;
+
+/** Mutex protecting the background table drop list. */
+static ib_mutex_t row_drop_list_mutex;
+
 /** Flag: has row_mysql_drop_list been initialized? */
 static ibool	row_mysql_drop_list_inited	= FALSE;
 
@@ -85,7 +97,9 @@ static const char S_innodb_monitor[] = "innodb_monitor";
 static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
 static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
 static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+#ifdef UNIV_MEM_DEBUG
 static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+#endif /* UNIV_MEM_DEBUG */
 /* @} */
 
 /** Evaluates to true if str1 equals str2_onstack, used for comparing
@@ -316,7 +330,7 @@ row_mysql_pad_col(
 /**************************************************************//**
 Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
 The counterpart of this function is row_sel_field_store_in_mysql_format() in
-row0sel.c.
+row0sel.cc.
 @return	up to which byte we used buf in the conversion */
 UNIV_INTERN
 byte*
@@ -327,7 +341,10 @@ row_mysql_store_col_in_innobase_format(
 					this function is called! */
 	byte*		buf,		/*!< in/out: buffer for a converted
 					integer value; this must be at least
-					col_len long then! */
+					col_len long then! NOTE that dfield
+					may also get a pointer to 'buf',
+					therefore do not discard this as long
+					as dfield is used! */
 	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
 					a MySQL row, FALSE if from a MySQL
 					key value;
@@ -467,7 +484,7 @@ row_mysql_store_col_in_innobase_format(
 		be stored as "$%&a " (5 bytes).	 The string ".abc "
 		will be stored as "$%&abc" (6 bytes).
 
-		The space padding will be restored in row0sel.c, function
+		The space padding will be restored in row0sel.cc, function
 		row_sel_field_store_in_mysql_format(). */
 
 		ulint		n_chars;
@@ -493,7 +510,7 @@ row_mysql_store_col_in_innobase_format(
 /**************************************************************//**
 Convert a row in the MySQL format to a row in the Innobase format. Note that
 the function to convert a MySQL format key value to an InnoDB dtuple is
-row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */
 static
 void
 row_mysql_convert_row_to_innobase(
@@ -544,25 +561,34 @@ row_mysql_convert_row_to_innobase(
 next_column:
 		;
 	}
+
+	/* If there is an FTS doc id column and it is not user supplied
+	(i.e. it is generated by the server) then assign it a new doc id. */
+	if (prebuilt->table->fts) {
+
+		ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED);
+
+		fts_create_doc_id(prebuilt->table, row, prebuilt->heap);
+	}
 }
 
 /****************************************************************//**
 Handles user errors and lock waits detected by the database engine.
-@return TRUE if it was a lock wait and we should continue running the
+@return true if it was a lock wait and we should continue running the
 query thread and in that case the thr is ALREADY in the running state. */
 UNIV_INTERN
-ibool
+bool
 row_mysql_handle_errors(
 /*====================*/
-	ulint*		new_err,/*!< out: possible new error encountered in
+	dberr_t*	new_err,/*!< out: possible new error encountered in
 				lock wait, or if no new error, the value
 				of trx->error_state at the entry of this
 				function */
 	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t*	thr,	/*!< in: query thread */
-	trx_savept_t*	savept)	/*!< in: savepoint or NULL */
+	que_thr_t*	thr,	/*!< in: query thread, or NULL */
+	trx_savept_t*	savept)	/*!< in: savepoint, or NULL */
 {
-	ulint	err;
+	dberr_t	err;
 
 handle_new_error:
 	err = trx->error_state;
@@ -574,7 +600,7 @@ handle_new_error:
 	switch (err) {
 	case DB_LOCK_WAIT_TIMEOUT:
 		if (row_rollback_on_timeout) {
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			break;
 		}
 		/* fall through */
@@ -587,17 +613,20 @@ handle_new_error:
 	case DB_CANNOT_ADD_CONSTRAINT:
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 	case DB_OUT_OF_FILE_SPACE:
+	case DB_READ_ONLY:
+	case DB_FTS_INVALID_DOCID:
 	case DB_INTERRUPTED:
+	case DB_DICT_CHANGED:
 		if (savept) {
-			/* Roll back the latest, possibly incomplete
-			insertion or update */
+			/* Roll back the latest, possibly incomplete insertion
+			or update */
 
-			trx_general_rollback_for_mysql(trx, savept);
+			trx_rollback_to_savepoint(trx, savept);
 		}
 		/* MySQL will roll back the latest SQL statement */
 		break;
 	case DB_LOCK_WAIT:
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
 		if (trx->error_state != DB_SUCCESS) {
 			que_thr_stop_for_mysql(thr);
@@ -607,14 +636,14 @@ handle_new_error:
 
 		*new_err = err;
 
-		return(TRUE);
+		return(true);
 
 	case DB_DEADLOCK:
 	case DB_LOCK_TABLE_FULL:
 		/* Roll back the whole transaction; this resolution was added
 		to version 3.23.43 */
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		break;
 
 	case DB_MUST_GET_MORE_FILE_SPACE:
@@ -624,6 +653,7 @@ handle_new_error:
 		      " a new data file to\n"
 		      "InnoDB: my.cnf and restart the database.\n", stderr);
 
+		ut_ad(0);
 		exit(1);
 
 	case DB_CORRUPTION:
@@ -662,7 +692,7 @@ handle_new_error:
 
 	trx->error_state = DB_SUCCESS;
 
-	return(FALSE);
+	return(false);
 }
 
 /********************************************************************//**
@@ -725,7 +755,8 @@ row_create_prebuilt(
 	calls */
 	heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE);
 
-	prebuilt = mem_heap_zalloc(heap, sizeof(*prebuilt));
+	prebuilt = static_cast<row_prebuilt_t*>(
+		mem_heap_zalloc(heap, sizeof(*prebuilt)));
 
 	prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
 	prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
@@ -739,9 +770,7 @@ row_create_prebuilt(
 	btr_pcur_reset(&prebuilt->clust_pcur);
 
 	prebuilt->select_lock_type = LOCK_NONE;
-	prebuilt->stored_select_lock_type = 99999999;
-	UNIV_MEM_INVALID(&prebuilt->stored_select_lock_type,
-			 sizeof prebuilt->stored_select_lock_type);
+	prebuilt->stored_select_lock_type = LOCK_NONE_UNSET;
 
 	prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields);
 
@@ -751,15 +780,18 @@ row_create_prebuilt(
 
 	prebuilt->clust_ref = ref;
 
-	prebuilt->autoinc_error = 0;
+	prebuilt->autoinc_error = DB_SUCCESS;
 	prebuilt->autoinc_offset = 0;
 
-	/* Default to 1, we will set the actual value later in 
+	/* Default to 1, we will set the actual value later in
 	ha_innobase::get_auto_increment(). */
 	prebuilt->autoinc_increment = 1;
 
 	prebuilt->autoinc_last_value = 0;
 
+	/* During UPDATE and DELETE we need the doc id. */
+	prebuilt->fts_doc_id = 0;
+
 	prebuilt->mysql_row_len = mysql_row_len;
 
 	return(prebuilt);
@@ -824,28 +856,40 @@ row_prebuilt_free(
 		mem_heap_free(prebuilt->old_vers_heap);
 	}
 
-	for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
-		if (prebuilt->fetch_cache[i] != NULL) {
+	if (prebuilt->fetch_cache[0] != NULL) {
+		byte*	base = prebuilt->fetch_cache[0] - 4;
+		byte*	ptr = base;
 
-			if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
-				     (prebuilt->fetch_cache[i]) - 4))
-			    || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
-					(prebuilt->fetch_cache[i])
-					+ prebuilt->mysql_row_len))) {
-				fputs("InnoDB: Error: trying to free"
-				      " a corrupt fetch buffer.\n", stderr);
+		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+			byte*	row;
+			ulint	magic1;
+			ulint	magic2;
+
+			magic1 = mach_read_from_4(ptr);
+			ptr += 4;
+
+			row = ptr;
+			ptr += prebuilt->mysql_row_len;
+
+			magic2 = mach_read_from_4(ptr);
+			ptr += 4;
 
-				mem_analyze_corruption(
-					prebuilt->fetch_cache[i]);
+			if (ROW_PREBUILT_FETCH_MAGIC_N != magic1
+			    || row != prebuilt->fetch_cache[i]
+			    || ROW_PREBUILT_FETCH_MAGIC_N != magic2) {
 
+				fputs("InnoDB: Error: trying to free"
+					" a corrupt fetch buffer.\n", stderr);
+
+				mem_analyze_corruption(base);
 				ut_error;
 			}
-
-			mem_free((prebuilt->fetch_cache[i]) - 4);
 		}
+
+		mem_free(base);
 	}
 
-	dict_table_decrement_handle_count(prebuilt->table, dict_locked);
+	dict_table_close(prebuilt->table, dict_locked, TRUE);
 
 	mem_heap_free(prebuilt->heap);
 }
@@ -912,40 +956,62 @@ row_get_prebuilt_insert_row(
 	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
 					handle */
 {
-	ins_node_t*	node;
-	dtuple_t*	row;
-	dict_table_t*	table	= prebuilt->table;
+	dict_table_t*		table	= prebuilt->table;
 
 	ut_ad(prebuilt && table && prebuilt->trx);
 
-	if (prebuilt->ins_node == NULL) {
+	if (prebuilt->ins_node != 0) {
 
-		/* Not called before for this handle: create an insert node
-		and query graph to the prebuilt struct */
+		/* Check if indexes have been dropped or added and we
+		may need to rebuild the row insert template. */
 
-		node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+		if (prebuilt->trx_id == table->def_trx_id
+		    && UT_LIST_GET_LEN(prebuilt->ins_node->entry_list)
+		    == UT_LIST_GET_LEN(table->indexes)) {
 
-		prebuilt->ins_node = node;
-
-		if (prebuilt->ins_upd_rec_buff == NULL) {
-			prebuilt->ins_upd_rec_buff = mem_heap_alloc(
-				prebuilt->heap, prebuilt->mysql_row_len);
+			return(prebuilt->ins_node->row);
 		}
 
-		row = dtuple_create(prebuilt->heap,
-				    dict_table_get_n_cols(table));
+		ut_ad(prebuilt->trx_id < table->def_trx_id);
+
+		que_graph_free_recursive(prebuilt->ins_graph);
+
+		prebuilt->ins_graph = 0;
+	}
+
+	/* Create an insert node and query graph to the prebuilt struct */
+
+	ins_node_t*		node;
 
-		dict_table_copy_types(row, table);
+	node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
 
-		ins_node_set_new_row(node, row);
+	prebuilt->ins_node = node;
 
-		prebuilt->ins_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
-		prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+	if (prebuilt->ins_upd_rec_buff == 0) {
+		prebuilt->ins_upd_rec_buff = static_cast<byte*>(
+			mem_heap_alloc(
+				prebuilt->heap,
+				prebuilt->mysql_row_len));
 	}
 
+	dtuple_t*	row;
+
+	row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table));
+
+	dict_table_copy_types(row, table);
+
+	ins_node_set_new_row(node, row);
+
+	prebuilt->ins_graph = static_cast<que_fork_t*>(
+		que_node_get_parent(
+			pars_complete_graph_for_exec(
+				node,
+				prebuilt->trx, prebuilt->heap)));
+
+	prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+	prebuilt->trx_id = table->def_trx_id;
+
 	return(prebuilt->ins_node->row);
 }
 
@@ -958,41 +1024,41 @@ row_update_statistics_if_needed(
 /*============================*/
 	dict_table_t*	table)	/*!< in: table */
 {
-	ulint	counter;
+	ib_uint64_t	counter;
+	ib_uint64_t	n_rows;
+
+	if (!table->stat_initialized) {
+		DBUG_EXECUTE_IF(
+			"test_upd_stats_if_needed_not_inited",
+			fprintf(stderr, "test_upd_stats_if_needed_not_inited "
+				"was executed\n");
+		);
+		return;
+	}
 
-	counter = table->stat_modified_counter;
+	counter = table->stat_modified_counter++;
+	n_rows = dict_table_get_n_rows(table);
 
-	table->stat_modified_counter = counter + 1;
+	if (dict_stats_is_persistent_enabled(table)) {
+		if (counter > n_rows / 10 /* 10% */
+		    && dict_stats_auto_recalc_is_enabled(table)) {
 
-	if (!srv_stats_auto_update)
+			dict_stats_recalc_pool_add(table);
+			table->stat_modified_counter = 0;
+		}
 		return;
-
-	if (DICT_TABLE_CHANGED_TOO_MUCH(table)) {
-
-		dict_update_statistics(
-			table,
-			FALSE, /* update even if stats are initialized */
-			TRUE,
-			TRUE /* only update if stats changed too much */);
 	}
-}
 
-/*********************************************************************//**
-Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
-function should be called at the the end of an SQL statement, by the
-connection thread that owns the transaction (trx->mysql_thd). */
-UNIV_INTERN
-void
-row_unlock_table_autoinc_for_mysql(
-/*===============================*/
-	trx_t*	trx)	/*!< in/out: transaction */
-{
-	if (lock_trx_holds_autoinc_locks(trx)) {
-		mutex_enter(&kernel_mutex);
+	/* Calculate new statistics if 1 / 16 of table has been modified
+	since the last time a statistics batch was run.
+	We calculate statistics at most every 16th round, since we may have
+	a counter table which is very small and updated very often. */
 
-		lock_release_autoinc_locks(trx);
+	if (counter > 16 + n_rows / 16 /* 6.25% */) {
 
-		mutex_exit(&kernel_mutex);
+		ut_ad(!mutex_own(&dict_sys->mutex));
+		/* this will reset table->stat_modified_counter to 0 */
+		dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
 	}
 }
 
@@ -1004,7 +1070,7 @@ It is not compatible with another AUTO_INC or exclusive lock on the
 table.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_lock_table_autoinc_for_mysql(
 /*=============================*/
 	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in the MySQL
@@ -1014,14 +1080,14 @@ row_lock_table_autoinc_for_mysql(
 	ins_node_t*		node	= prebuilt->ins_node;
 	const dict_table_t*	table	= prebuilt->table;
 	que_thr_t*		thr;
-	ulint			err;
+	dberr_t			err;
 	ibool			was_lock_wait;
 
 	ut_ad(trx);
 
 	/* If we already hold an AUTOINC lock on the table then do nothing.
         Note: We peek at the value of the current owner without acquiring
-	the kernel mutex. **/
+	the lock mutex. **/
 	if (trx == table->autoinc_trx) {
 
 		return(DB_SUCCESS);
@@ -1029,10 +1095,8 @@ row_lock_table_autoinc_for_mysql(
 
 	trx->op_info = "setting auto-inc lock";
 
-	if (node == NULL) {
-		row_get_prebuilt_insert_row(prebuilt);
-		node = prebuilt->ins_node;
-	}
+	row_get_prebuilt_insert_row(prebuilt);
+	node = prebuilt->ins_node;
 
 	/* We use the insert query graph as the dummy graph needed
 	in the lock module call */
@@ -1048,7 +1112,7 @@ run_again:
 	/* It may be that the current session has not yet started
 	its transaction, or it has been committed: */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
 
@@ -1065,21 +1129,21 @@ run_again:
 
 		trx->op_info = "";
 
-		return((int) err);
+		return(err);
 	}
 
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
 	trx->op_info = "";
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
 Sets a table lock on the table mentioned in prebuilt.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_lock_table_for_mysql(
 /*=====================*/
 	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct in the MySQL
@@ -1093,7 +1157,7 @@ row_lock_table_for_mysql(
 {
 	trx_t*		trx		= prebuilt->trx;
 	que_thr_t*	thr;
-	ulint		err;
+	dberr_t		err;
 	ibool		was_lock_wait;
 
 	ut_ad(trx);
@@ -1119,13 +1183,18 @@ run_again:
 	/* It may be that the current session has not yet started
 	its transaction, or it has been committed: */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	if (table) {
-		err = lock_table(0, table, mode, thr);
+		err = lock_table(
+			0, table,
+			static_cast<enum lock_mode>(mode), thr);
 	} else {
-		err = lock_table(0, prebuilt->table,
-				 prebuilt->select_lock_type, thr);
+		err = lock_table(
+			0, prebuilt->table,
+			static_cast<enum lock_mode>(
+				prebuilt->select_lock_type),
+			thr);
 	}
 
 	trx->error_state = err;
@@ -1141,21 +1210,21 @@ run_again:
 
 		trx->op_info = "";
 
-		return((int) err);
+		return(err);
 	}
 
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
 	trx->op_info = "";
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
 Does an insert for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_insert_for_mysql(
 /*=================*/
 	byte*		mysql_rec,	/*!< in: row in the MySQL format */
@@ -1164,31 +1233,31 @@ row_insert_for_mysql(
 {
 	trx_savept_t	savept;
 	que_thr_t*	thr;
-	ulint		err;
+	dberr_t		err;
 	ibool		was_lock_wait;
 	trx_t*		trx		= prebuilt->trx;
 	ins_node_t*	node		= prebuilt->ins_node;
+	dict_table_t*	table		= prebuilt->table;
 
 	ut_ad(trx);
 
-	if (prebuilt->table->ibd_file_missing) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB: Error:\n"
-			"InnoDB: MySQL is trying to use a table handle"
-			" but the .ibd file for\n"
-			"InnoDB: table %s does not exist.\n"
-			"InnoDB: Have you deleted the .ibd file"
-			" from the database directory under\n"
-			"InnoDB: the MySQL datadir, or have you"
-			" used DISCARD TABLESPACE?\n"
-			"InnoDB: Look from\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
-			"InnoDB: how you can resolve the problem.\n",
+	if (dict_table_is_discarded(prebuilt->table)) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"The table %s doesn't have a corresponding "
+			"tablespace, it was discarded.",
 			prebuilt->table->name);
-		return(DB_ERROR);
-	}
 
-	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+		return(DB_TABLESPACE_DELETED);
+
+	} else if (prebuilt->table->ibd_file_missing) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			".ibd file is missing for table %s",
+			prebuilt->table->name);
+
+		return(DB_TABLESPACE_NOT_FOUND);
+
+	} else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to free a corrupt\n"
 			"InnoDB: table handle. Magic n %lu, table name ",
@@ -1199,9 +1268,7 @@ row_insert_for_mysql(
 		mem_analyze_corruption(prebuilt);
 
 		ut_error;
-	}
-
-	if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+	} else if (srv_created_new_raw || srv_force_recovery) {
 		fputs("InnoDB: A new raw disk partition was initialized or\n"
 		      "InnoDB: innodb_force_recovery is on: we do not allow\n"
 		      "InnoDB: database modifications by the user. Shut down\n"
@@ -1217,12 +1284,10 @@ row_insert_for_mysql(
 
 	row_mysql_delay_if_needed();
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	if (node == NULL) {
-		row_get_prebuilt_insert_row(prebuilt);
-		node = prebuilt->ins_node;
-	}
+	row_get_prebuilt_insert_row(prebuilt);
+	node = prebuilt->ins_node;
 
 	row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
 
@@ -1230,13 +1295,6 @@ row_insert_for_mysql(
 
 	thr = que_fork_get_first_thr(prebuilt->ins_graph);
 
-	if (!prebuilt->mysql_has_locked && !(prebuilt->table->flags & (DICT_TF2_TEMPORARY << DICT_TF2_SHIFT))) {
-		fprintf(stderr, "InnoDB: Error: row_insert_for_mysql is called without ha_innobase::external_lock()\n");
-		if (trx->mysql_thd != NULL) {
-			innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
-		}
-	}
-
 	if (prebuilt->sql_stat_start) {
 		node->state = INS_NODE_SET_IX_LOCK;
 		prebuilt->sql_stat_start = FALSE;
@@ -1255,41 +1313,99 @@ run_again:
 	err = trx->error_state;
 
 	if (err != DB_SUCCESS) {
+error_exit:
 		que_thr_stop_for_mysql(thr);
 
-		/* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+		/* FIXME: What's this ? */
+		thr->lock_state = QUE_THR_LOCK_ROW;
 
-		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
-							&savept);
-		thr->lock_state= QUE_THR_LOCK_NOLOCK;
+		was_lock_wait = row_mysql_handle_errors(
+			&err, trx, thr, &savept);
+
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
 
 		if (was_lock_wait) {
+			ut_ad(node->state == INS_NODE_INSERT_ENTRIES
+			      || node->state == INS_NODE_ALLOC_ROW_ID);
 			goto run_again;
 		}
 
 		trx->op_info = "";
 
-		return((int) err);
+		return(err);
+	}
+
+	if (dict_table_has_fts_index(table)) {
+		doc_id_t        doc_id;
+
+		/* Extract the doc id from the hidden FTS column */
+		doc_id = fts_get_doc_id_from_row(table, node->row);
+
+		if (doc_id <= 0) {
+			fprintf(stderr,
+				"InnoDB: FTS Doc ID must be large than 0 \n");
+			err = DB_FTS_INVALID_DOCID;
+			trx->error_state = DB_FTS_INVALID_DOCID;
+			goto error_exit;
+		}
+
+		if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			doc_id_t	next_doc_id
+				= table->fts->cache->next_doc_id;
+
+			if (doc_id < next_doc_id) {
+				fprintf(stderr,
+					"InnoDB: FTS Doc ID must be large than"
+					" "UINT64PF" for table",
+					next_doc_id - 1);
+				ut_print_name(stderr, trx, TRUE, table->name);
+				putc('\n', stderr);
+
+				err = DB_FTS_INVALID_DOCID;
+				trx->error_state = DB_FTS_INVALID_DOCID;
+				goto error_exit;
+			}
+
+			/* The difference between Doc IDs is restricted to a
+			4-byte integer. See fts_get_encoded_len() */
+
+			if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) {
+				fprintf(stderr,
+					"InnoDB: Doc ID "UINT64PF" is too"
+					" big. Its difference with largest"
+					" used Doc ID "UINT64PF" cannot"
+					" exceed or equal to %d\n",
+					doc_id, next_doc_id - 1,
+					FTS_DOC_ID_MAX_STEP);
+				err = DB_FTS_INVALID_DOCID;
+				trx->error_state = DB_FTS_INVALID_DOCID;
+				goto error_exit;
+			}
+		}
+
+		/* Pass NULL for the columns affected, since an INSERT affects
+		all FTS indexes. */
+		fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
 	}
 
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
 	if (UNIV_LIKELY(!(trx->fake_changes))) {
 
-		prebuilt->table->stat_n_rows++;
+		srv_stats.n_rows_inserted.add((size_t)trx->id, 1);
 
-		if (prebuilt->table->stat_n_rows == 0) {
-			/* Avoid wrap-over */
-			prebuilt->table->stat_n_rows--;
-		}
+		/* Not protected by dict_table_stats_lock() for performance
+		reasons, we would rather get garbage in stat_n_rows (which is
+		just an estimate anyway) than protecting the following code
+		with a latch. */
+		dict_table_n_rows_inc(table);
 
-		srv_n_rows_inserted++;
-		row_update_statistics_if_needed(prebuilt->table);
+		row_update_statistics_if_needed(table);
 	}
 
 	trx->op_info = "";
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -1309,10 +1425,11 @@ row_prebuild_sel_graph(
 
 		node = sel_node_create(prebuilt->heap);
 
-		prebuilt->sel_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
+		prebuilt->sel_graph = static_cast<que_fork_t*>(
+			que_node_get_parent(
+				pars_complete_graph_for_exec(
+					static_cast<sel_node_t*>(node),
+					prebuilt->trx, prebuilt->heap)));
 
 		prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
 	}
@@ -1380,21 +1497,115 @@ row_get_prebuilt_update_vector(
 
 		prebuilt->upd_node = node;
 
-		prebuilt->upd_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
+		prebuilt->upd_graph = static_cast<que_fork_t*>(
+			que_node_get_parent(
+				pars_complete_graph_for_exec(
+					static_cast<upd_node_t*>(node),
+					prebuilt->trx, prebuilt->heap)));
+
 		prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
 	}
 
 	return(prebuilt->upd_node->update);
 }
 
+/*********************************************************************//**
+Handle an FTS update: add FTS_DELETE (old doc id) + FTS_INSERT (new doc id). */
+static
+void
+row_fts_do_update(
+/*==============*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table,		/*!< in: table with FTS index */
+	doc_id_t	old_doc_id,	/*!< in: old document id */
+	doc_id_t	new_doc_id)	/*!< in: new document id */
+{
+	if (trx->fts_next_doc_id) {	/* no ops are added when this is 0 */
+		fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+		fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL);
+	}
+}
+
+/*********************************************************************//**
+Handles FTS matters for an update or a delete (table must have an FTS index).
+@return DB_SUCCESS, or DB_FTS_INVALID_DOCID if the new Doc ID reads as 0 */
+static
+dberr_t
+row_fts_update_or_delete(
+/*=====================*/
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
+					handle */
+{
+	trx_t*		trx = prebuilt->trx;
+	dict_table_t*	table = prebuilt->table;
+	upd_node_t*	node = prebuilt->upd_node;
+	doc_id_t	old_doc_id = prebuilt->fts_doc_id;
+
+	ut_a(dict_table_has_fts_index(prebuilt->table));
+
+	/* Deletes are simple; get them out of the way first. */
+	if (node->is_delete) {
+		/* A delete affects all FTS indexes, so we pass NULL */
+		fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+	} else {
+		doc_id_t	new_doc_id;
+		/* The new Doc ID is read out of trx->fts_next_doc_id */
+		new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id);
+
+		if (new_doc_id == 0) {
+			fprintf(stderr, " InnoDB FTS: Doc ID cannot be 0 \n");
+			return(DB_FTS_INVALID_DOCID);
+		}
+
+		row_fts_do_update(trx, table, old_doc_id, new_doc_id);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Initialize the Doc ID system for the FTS tables on table's referenced list */
+static
+void
+init_fts_doc_id_for_ref(
+/*====================*/
+	dict_table_t*	table,		/*!< in: table */
+	ulint*		depth)		/*!< in/out: recursive call depth */
+{
+	dict_foreign_t* foreign;
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	table->fk_max_recusive_level = 0;
+
+	(*depth)++;	/* shared counter: bumped on every call, never decremented */
+
+	/* Limit on tables involved in cascading delete/update */
+	if (*depth > FK_MAX_CASCADE_DEL) {
+		return;
+	}
+
+	/* Walk this table's referenced list, recursing into each foreign table;
+	the walk stops at the first entry whose foreign_table is NULL */
+	while (foreign && foreign->foreign_table) {
+		if (foreign->foreign_table->fts) {
+			fts_init_doc_id(foreign->foreign_table);
+		}
+		/* Recurse only into other tables that are themselves referenced */
+		if (UT_LIST_GET_LEN(foreign->foreign_table->referenced_list)
+		    > 0 && foreign->foreign_table != table) {
+			init_fts_doc_id_for_ref(foreign->foreign_table, depth);
+		}
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+}
+
 /*********************************************************************//**
 Does an update or delete of a row for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_update_for_mysql(
 /*=================*/
 	byte*		mysql_rec,	/*!< in: the row to be updated, in
@@ -1403,7 +1614,7 @@ row_update_for_mysql(
 					handle */
 {
 	trx_savept_t	savept;
-	ulint		err;
+	dberr_t		err;
 	que_thr_t*	thr;
 	ibool		was_lock_wait;
 	dict_index_t*	clust_index;
@@ -1411,6 +1622,7 @@ row_update_for_mysql(
 	upd_node_t*	node;
 	dict_table_t*	table		= prebuilt->table;
 	trx_t*		trx		= prebuilt->trx;
+	ulint		fk_depth	= 0;
 
 	ut_ad(prebuilt && trx);
 	UT_NOT_USED(mysql_rec);
@@ -1463,7 +1675,19 @@ row_update_for_mysql(
 
 	row_mysql_delay_if_needed();
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
+
+	if (dict_table_is_referenced_by_foreign_key(table)) {
+		/* Share-lock the data dictionary to prevent any
+		table dictionary (foreign key constraint) change.
+		This is similar to row_ins_check_foreign_constraint(),
+		whose check is protected by the dictionary lock as well.
+		In the future, this can be removed once foreign
+		key MDL is implemented. */
+		row_mysql_freeze_data_dictionary(trx);
+		init_fts_doc_id_for_ref(table, &fk_depth);
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
 
 	node = prebuilt->upd_node;
 
@@ -1514,10 +1738,13 @@ run_again:
 			trx->error_state = DB_SUCCESS;
 			trx->op_info = "";
 
-			return((int) err);
+			return(err);
 		}
 
 		thr->lock_state= QUE_THR_LOCK_ROW;
+
+		DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error");
+
 		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
 							&savept);
 		thr->lock_state= QUE_THR_LOCK_NOLOCK;
@@ -1528,24 +1755,36 @@ run_again:
 
 		trx->op_info = "";
 
-		return((int) err);
+		return(err);
 	}
 
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
 	if (UNIV_UNLIKELY(trx->fake_changes)) {
+
 		trx->op_info = "";
-		return((int) err);
+		return(err);
 	}
 
-	if (node->is_delete) {
-		if (prebuilt->table->stat_n_rows > 0) {
-			prebuilt->table->stat_n_rows--;
+	if (dict_table_has_fts_index(table)
+	    && trx->fts_next_doc_id != UINT64_UNDEFINED) {
+		err = row_fts_update_or_delete(prebuilt);
+		if (err != DB_SUCCESS) {
+			trx->op_info = "";
+			return(err);
 		}
+	}
 
-		srv_n_rows_deleted++;
+	if (node->is_delete) {
+		/* Not protected by dict_table_stats_lock() for performance
+		reasons, we would rather get garbage in stat_n_rows (which is
+		just an estimate anyway) than protecting the following code
+		with a latch. */
+		dict_table_n_rows_dec(prebuilt->table);
+
+		srv_stats.n_rows_deleted.add((size_t)trx->id, 1);
 	} else {
-		srv_n_rows_updated++;
+		srv_stats.n_rows_updated.add((size_t)trx->id, 1);
 	}
 
 	/* We update table statistics only if it is a DELETE or UPDATE
@@ -1557,7 +1796,7 @@ run_again:
 
 	trx->op_info = "";
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -1571,7 +1810,7 @@ prebuilt->clust_pcur.  Thus, this implements a 'mini-rollback' that
 releases the latest clustered index record lock we set.
 @return error code or DB_SUCCESS */
 UNIV_INTERN
-int
+void
 row_unlock_for_mysql(
 /*=================*/
 	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct in MySQL
@@ -1597,8 +1836,7 @@ row_unlock_for_mysql(
 			"InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n"
 			"InnoDB: this session is not using"
 			" READ COMMITTED isolation level.\n");
-
-		return(DB_SUCCESS);
+		return;
 	}
 
 	trx->op_info = "unlock_row";
@@ -1634,7 +1872,7 @@ row_unlock_for_mysql(
 			index = btr_pcur_get_btr_cur(clust_pcur)->index;
 		}
 
-		if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+		if (!dict_index_is_clust(index)) {
 			/* This is not a clustered index record.  We
 			do not know how to unlock the record. */
 			goto no_unlock;
@@ -1666,19 +1904,23 @@ row_unlock_for_mysql(
 			/* We did not update the record: unlock it */
 
 			rec = btr_pcur_get_rec(pcur);
-			index = btr_pcur_get_btr_cur(pcur)->index;
 
-			lock_rec_unlock(trx, btr_pcur_get_block(pcur),
-					rec, prebuilt->select_lock_type);
+			lock_rec_unlock(
+				trx,
+				btr_pcur_get_block(pcur),
+				rec,
+				static_cast<enum lock_mode>(
+					prebuilt->select_lock_type));
 
 			if (prebuilt->new_rec_locks >= 2) {
 				rec = btr_pcur_get_rec(clust_pcur);
-				index = btr_pcur_get_btr_cur(clust_pcur)->index;
 
-				lock_rec_unlock(trx,
-						btr_pcur_get_block(clust_pcur),
-						rec,
-						prebuilt->select_lock_type);
+				lock_rec_unlock(
+					trx,
+					btr_pcur_get_block(clust_pcur),
+					rec,
+					static_cast<enum lock_mode>(
+						prebuilt->select_lock_type));
 			}
 		}
 no_unlock:
@@ -1686,15 +1928,13 @@ no_unlock:
 	}
 
 	trx->op_info = "";
-
-	return(DB_SUCCESS);
 }
 
 /**********************************************************************//**
 Does a cascaded delete or set null in a foreign key operation.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_update_cascade_for_mysql(
 /*=========================*/
 	que_thr_t*	thr,	/*!< in: query thread */
@@ -1702,7 +1942,7 @@ row_update_cascade_for_mysql(
 				or set null operation */
 	dict_table_t*	table)	/*!< in: table where we do the operation */
 {
-	ulint	err;
+	dberr_t	err;
 	trx_t*	trx;
 
 	trx = thr_get_trx(thr);
@@ -1713,12 +1953,14 @@ row_update_cascade_for_mysql(
 	thr->fk_cascade_depth++;
 
 	if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
-		return (DB_FOREIGN_EXCEED_MAX_CASCADE);
+		return(DB_FOREIGN_EXCEED_MAX_CASCADE);
 	}
 run_again:
 	thr->run_node = node;
 	thr->prev_node = node;
 
+	DEBUG_SYNC_C("foreign_constraint_update_cascade");
+
 	row_upd_step(thr);
 
 	/* The recursive call for cascading update/delete happens
@@ -1738,7 +1980,7 @@ run_again:
 
 		que_thr_stop_for_mysql(thr);
 
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
 		/* Note that a lock wait may also end in a lock wait timeout,
 		or this transaction is picked as a victim in selective
@@ -1765,13 +2007,15 @@ run_again:
 	}
 
 	if (node->is_delete) {
-		if (table->stat_n_rows > 0) {
-			table->stat_n_rows--;
-		}
+		/* Not protected by dict_table_stats_lock() for performance
+		reasons; we would rather get garbage in stat_n_rows (which is
+		just an estimate anyway) than protect the following code
+		with a latch. */
+		dict_table_n_rows_dec(table);
 
-		srv_n_rows_deleted++;
+		srv_stats.n_rows_deleted.add((size_t)trx->id, 1);
 	} else {
-		srv_n_rows_updated++;
+		srv_stats.n_rows_updated.add((size_t)trx->id, 1);
 	}
 
 	row_update_statistics_if_needed(table);
@@ -1822,6 +2066,8 @@ row_mysql_unfreeze_data_dictionary(
 /*===============================*/
 	trx_t*	trx)	/*!< in/out: transaction */
 {
+	ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+
 	ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
 
 	rw_lock_s_unlock(&dict_operation_lock);
@@ -1860,6 +2106,8 @@ row_mysql_unlock_data_dictionary(
 /*=============================*/
 	trx_t*	trx)	/*!< in/out: transaction */
 {
+	ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+
 	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
 
 	/* Serialize data dictionary operations with dictionary mutex:
@@ -1880,19 +2128,21 @@ InnoDB will try to invoke mem_validate(). On failure the transaction will
 be rolled back and the 'table' object will be freed.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_create_table_for_mysql(
 /*=======================*/
 	dict_table_t*	table,	/*!< in, own: table definition
-				(will be freed) */
-	trx_t*		trx)	/*!< in: transaction handle */
+				(will be freed, or on DB_SUCCESS
+				added to the data dictionary cache) */
+	trx_t*		trx,	/*!< in/out: transaction */
+	bool		commit)	/*!< in: if true, commit the transaction */
 {
 	tab_node_t*	node;
 	mem_heap_t*	heap;
 	que_thr_t*	thr;
 	const char*	table_name;
 	ulint		table_name_len;
-	ulint		err;
+	dberr_t		err;
 
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
@@ -1900,6 +2150,11 @@ row_create_table_for_mysql(
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
 
+	DBUG_EXECUTE_IF(
+		"ib_create_table_fail_at_start_of_row_create_table_for_mysql",
+		goto err_exit;
+	);
+
 	if (srv_created_new_raw) {
 		fputs("InnoDB: A new raw disk partition was initialized:\n"
 		      "InnoDB: we do not allow database modifications"
@@ -1908,7 +2163,10 @@ row_create_table_for_mysql(
 		      " is replaced with raw.\n", stderr);
 err_exit:
 		dict_mem_table_free(table);
-		trx_commit_for_mysql(trx);
+
+		if (commit) {
+			trx_commit_for_mysql(trx);
+		}
 
 		return(DB_ERROR);
 	}
@@ -1926,15 +2184,13 @@ err_exit:
 		goto err_exit;
 	}
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* The table name is prefixed with the database name and a '/'.
 	Certain table names starting with 'innodb_' have their special
 	meaning regardless of the database name.  Thus, we need to
 	ignore the database name prefix in the comparisons. */
-	table_name = strchr(table->name, '/');
-	ut_a(table_name);
-	table_name++;
+	table_name = dict_remove_db_name(table->name);
 	table_name_len = strlen(table_name) + 1;
 
 	if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
@@ -1947,23 +2203,24 @@ err_exit:
 		/* The lock timeout monitor thread also takes care
 		of InnoDB monitor prints */
 
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_lock_monitor)) {
 
 		srv_print_innodb_monitor = TRUE;
 		srv_print_innodb_lock_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_tablespace_monitor)) {
 
 		srv_print_innodb_tablespace_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_table_monitor)) {
 
 		srv_print_innodb_table_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
+#ifdef UNIV_MEM_DEBUG
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_mem_validate)) {
 		/* We define here a debugging feature intended for
@@ -1976,34 +2233,65 @@ err_exit:
 		      "quiet because allocation from a mem heap"
 		      " is not protected\n"
 		      "by any semaphore.\n", stderr);
-#ifdef UNIV_MEM_DEBUG
 		ut_a(mem_validate());
 		fputs("Memory validated\n", stderr);
-#else /* UNIV_MEM_DEBUG */
-		fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
-		      stderr);
 #endif /* UNIV_MEM_DEBUG */
 	}
 
 	heap = mem_heap_create(512);
 
-	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+	switch (trx_get_dict_operation(trx)) {
+	case TRX_DICT_OP_NONE:
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+	case TRX_DICT_OP_TABLE:
+		break;
+	case TRX_DICT_OP_INDEX:
+		/* If the transaction was previously flagged as
+		TRX_DICT_OP_INDEX, we should be creating auxiliary
+		tables for full-text indexes. */
+		ut_ad(strstr(table->name, "/FTS_") != NULL);
+	}
 
-	node = tab_create_graph_create(table, heap);
+	node = tab_create_graph_create(table, heap, commit);
 
 	thr = pars_complete_graph_for_exec(node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
 	que_run_threads(thr);
 
 	err = trx->error_state;
 
+	if (table->space != TRX_SYS_SPACE) {
+		ut_a(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE));
+
+		/* Update SYS_TABLESPACES and SYS_DATAFILES if a new
+		tablespace was created. */
+		if (err == DB_SUCCESS) {
+			char*	path;
+			path = fil_space_get_first_path(table->space);
+
+			err = dict_create_add_tablespace_to_dictionary(
+				table->space, table->name,
+				fil_space_get_flags(table->space),
+				path, trx, commit);
+
+			mem_free(path);
+		}
+
+		if (err != DB_SUCCESS) {
+			/* We must delete the link file. */
+			fil_delete_link_file(table->name);
+		}
+	}
+
 	switch (err) {
 	case DB_SUCCESS:
 		break;
 	case DB_OUT_OF_FILE_SPACE:
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Warning: cannot create table ",
@@ -2011,20 +2299,37 @@ err_exit:
 		ut_print_name(stderr, trx, TRUE, table->name);
 		fputs(" because tablespace full\n", stderr);
 
-		if (dict_table_get_low(table->name, DICT_ERR_IGNORE_NONE)) {
+		if (dict_table_open_on_name(table->name, TRUE, FALSE,
+					    DICT_ERR_IGNORE_NONE)) {
+
+			/* Make things easy for the drop table code. */
+
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			dict_table_close(table, TRUE, FALSE);
 
 			row_drop_table_for_mysql(table->name, trx, FALSE);
-			trx_commit_for_mysql(trx);
+
+			if (commit) {
+				trx_commit_for_mysql(trx);
+			}
 		} else {
 			dict_mem_table_free(table);
 		}
+
 		break;
 
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 		/* We already have .ibd file here. it should be deleted. */
 
-		if (table->space && !fil_delete_tablespace(table->space,
-							   FALSE)) {
+		if (table->space
+		    && fil_delete_tablespace(
+			    table->space,
+			    BUF_REMOVE_FLUSH_NO_WRITE)
+		    != DB_SUCCESS) {
+
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
 				"  InnoDB: Error: not able to"
@@ -2036,12 +2341,10 @@ err_exit:
 		/* fall through */
 
 	case DB_DUPLICATE_KEY:
+	case DB_TABLESPACE_EXISTS:
 	default:
-		/* We may also get err == DB_ERROR if the .ibd file for the
-		table already exists */
-
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		dict_mem_table_free(table);
 		break;
 	}
@@ -2050,7 +2353,7 @@ err_exit:
 
 	trx->op_info = "";
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -2059,7 +2362,7 @@ to create an index results in dropping the whole table! This is no problem
 currently as all indexes must be created at the same time as the table.
 @return	error number or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_create_index_for_mysql(
 /*=======================*/
 	dict_index_t*	index,		/*!< in, own: index definition
@@ -2075,11 +2378,13 @@ row_create_index_for_mysql(
 	ind_node_t*	node;
 	mem_heap_t*	heap;
 	que_thr_t*	thr;
-	ulint		err;
+	dberr_t		err;
 	ulint		i;
 	ulint		len;
 	char*		table_name;
+	char*		index_name;
 	dict_table_t*	table;
+	ibool		is_fts;
 
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
@@ -2092,10 +2397,14 @@ row_create_index_for_mysql(
 	table later, after the index object is freed (inside
 	que_run_threads()) and thus index->table_name is not available. */
 	table_name = mem_strdup(index->table_name);
+	index_name = mem_strdup(index->name);
+
+	is_fts = (index->type == DICT_FTS);
 
-	table = dict_table_get_low(table_name, DICT_ERR_IGNORE_NONE);
+	table = dict_table_open_on_name(table_name, TRUE, TRUE,
+					DICT_ERR_IGNORE_NONE);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	for (i = 0; i < index->n_def; i++) {
 		/* Check that prefix_len and actual length
@@ -2107,6 +2416,11 @@ row_create_index_for_mysql(
 			len = ut_max(len, field_lengths[i]);
 		}
 
+		DBUG_EXECUTE_IF(
+			"ib_create_table_fail_at_create_index",
+			len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1;
+		);
+
 		/* Column or prefix length exceeds maximum column length */
 		if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) {
 			err = DB_TOO_BIG_INDEX_COL;
@@ -2121,27 +2435,40 @@ row_create_index_for_mysql(
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 
 	/* Note that the space id where we store the index is inherited from
-	the table in dict_build_index_def_step() in dict0crea.c. */
+	the table in dict_build_index_def_step() in dict0crea.cc. */
 
-	node = ind_create_graph_create(index, heap);
+	node = ind_create_graph_create(index, heap, true);
 
 	thr = pars_complete_graph_for_exec(node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
 	que_run_threads(thr);
 
 	err = trx->error_state;
 
 	que_graph_free((que_t*) que_node_get_parent(thr));
 
+	/* Create the index specific FTS auxiliary tables. */
+	if (err == DB_SUCCESS && is_fts) {
+		dict_index_t*	idx;
+
+		idx = dict_table_get_index_on_name(table, index_name);
+
+		ut_ad(idx);
+		err = fts_create_index_tables(trx, idx);
+	}
+
 error_handling:
+	dict_table_close(table, TRUE, FALSE);
 
 	if (err != DB_SUCCESS) {
 		/* We have special error handling here */
 
 		trx->error_state = DB_SUCCESS;
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		row_drop_table_for_mysql(table_name, trx, FALSE);
 
@@ -2153,73 +2480,9 @@ error_handling:
 	trx->op_info = "";
 
 	mem_free(table_name);
+	mem_free(index_name);
 
-	return((int) err);
-}
-
-/*********************************************************************//**
-*/
-UNIV_INTERN
-int
-row_insert_stats_for_mysql(
-/*=======================*/
-	dict_index_t*	index,
-	trx_t*		trx)
-{
-	ind_node_t*	node;
-	mem_heap_t*	heap;
-	que_thr_t*	thr;
-	ulint		err;
-
-	//ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
-
-	trx->op_info = "try to insert rows to SYS_STATS";
-
-	trx_start_if_not_started(trx);
-	trx->error_state = DB_SUCCESS;
-
-	heap = mem_heap_create(512);
-
-	node = ind_insert_stats_graph_create(index, heap);
-
-	thr = pars_complete_graph_for_exec(node, trx, heap);
-
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
-	que_run_threads(thr);
-
-	err = trx->error_state;
-
-	que_graph_free((que_t*) que_node_get_parent(thr));
-
-	trx->op_info = "";
-
-	return((int) err);
-}
-
-/*********************************************************************//**
-*/
-UNIV_INTERN
-int
-row_delete_stats_for_mysql(
-/*=============================*/
-	dict_index_t*	index,
-	trx_t*		trx)
-{
-	pars_info_t*	info	= pars_info_create();
-
-	trx->op_info = "delete rows from SYS_STATS";
-
-	trx_start_if_not_started(trx);
-	trx->error_state = DB_SUCCESS;
-
-	pars_info_add_ull_literal(info, "indexid", index->id);
-
-	return((int) que_eval_sql(info,
-				  "PROCEDURE DELETE_STATISTICS_PROC () IS\n"
-				  "BEGIN\n"
-				  "DELETE FROM SYS_STATS WHERE INDEX_ID = :indexid;\n"
-				  "END;\n"
-				  , TRUE, trx));
+	return(err);
 }
 
 /*********************************************************************//**
@@ -2232,7 +2495,7 @@ fields than mentioned in the constraint. Check also that foreign key
 constraints which reference this table are ok.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_table_add_foreign_constraints(
 /*==============================*/
 	trx_t*		trx,		/*!< in: transaction */
@@ -2249,7 +2512,7 @@ row_table_add_foreign_constraints(
 					code DB_CANNOT_ADD_CONSTRAINT if
 					any foreign keys are found. */
 {
-	ulint	err;
+	dberr_t	err;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 #ifdef UNIV_SYNC_DEBUG
@@ -2259,15 +2522,21 @@ row_table_add_foreign_constraints(
 
 	trx->op_info = "adding foreign keys";
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 
 	err = dict_create_foreign_constraints(trx, sql_string, sql_length,
 					      name, reject_fks);
+
+	DBUG_EXECUTE_IF("ib_table_add_foreign_fail",
+			err = DB_DUPLICATE_KEY;);
+
+	DEBUG_SYNC_C("table_add_foreign_constraints");
+
 	if (err == DB_SUCCESS) {
 		/* Check that also referencing constraints are ok */
-		err = dict_load_foreigns(name, FALSE, TRUE,
+		err = dict_load_foreigns(name, NULL, false, true,
 					 DICT_ERR_IGNORE_NONE);
 	}
 
@@ -2276,7 +2545,7 @@ row_table_add_foreign_constraints(
 
 		trx->error_state = DB_SUCCESS;
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		row_drop_table_for_mysql(name, trx, FALSE);
 
@@ -2285,7 +2554,7 @@ row_table_add_foreign_constraints(
 		trx->error_state = DB_SUCCESS;
 	}
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -2294,15 +2563,15 @@ in ALTER TABLE to the fact that the table handler does not remove the
 table before all handles to it has been removed. Furhermore, the MySQL's
 call to drop table must be non-blocking. Therefore we do the drop table
 as a background operation, which is taken care of by the master thread
-in srv0srv.c.
+in srv0srv.cc.
 @return	error code or DB_SUCCESS */
 static
-int
+dberr_t
 row_drop_table_for_mysql_in_background(
 /*===================================*/
 	const char*	name)	/*!< in: table name */
 {
-	ulint	error;
+	dberr_t	error;
 	trx_t*	trx;
 
 	trx = trx_allocate_for_background();
@@ -2331,11 +2600,11 @@ row_drop_table_for_mysql_in_background(
 
 	trx_free_for_background(trx);
 
-	return((int) error);
+	return(error);
 }
 
 /*********************************************************************//**
-The master thread in srv0srv.c calls this regularly to drop tables which
+The master thread in srv0srv.cc calls this regularly to drop tables which
 we must drop in background after queries to them have ended. Such lazy
 dropping of tables is needed in ALTER TABLE on Unix.
 @return	how many tables dropped + remaining tables in list */
@@ -2349,19 +2618,15 @@ row_drop_tables_for_mysql_in_background(void)
 	ulint			n_tables;
 	ulint			n_tables_dropped = 0;
 loop:
-	mutex_enter(&kernel_mutex);
-
-	if (!row_mysql_drop_list_inited) {
+	mutex_enter(&row_drop_list_mutex);
 
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
 
 	drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
 
 	n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	if (drop == NULL) {
 		/* All tables dropped */
@@ -2369,9 +2634,8 @@ loop:
 		return(n_tables + n_tables_dropped);
 	}
 
-	mutex_enter(&(dict_sys->mutex));
-	table = dict_table_get_low(drop->table_name, DICT_ERR_IGNORE_NONE);
-	mutex_exit(&(dict_sys->mutex));
+	table = dict_table_open_on_name(drop->table_name, FALSE, FALSE,
+					DICT_ERR_IGNORE_NONE);
 
 	if (table == NULL) {
 		/* If for some reason the table has already been dropped
@@ -2380,6 +2644,10 @@ loop:
 		goto already_dropped;
 	}
 
+	ut_a(!table->can_be_evicted);
+
+	dict_table_close(table, FALSE, FALSE);
+
 	if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
 		    drop->table_name)) {
 		/* If the DROP fails for some table, we return, and let the
@@ -2391,10 +2659,12 @@ loop:
 	n_tables_dropped++;
 
 already_dropped:
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&row_drop_list_mutex);
 
 	UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
 
+	MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE);
+
 	ut_print_timestamp(stderr);
 	fputs("  InnoDB: Dropped table ", stderr);
 	ut_print_name(stderr, NULL, TRUE, drop->table_name);
@@ -2404,29 +2674,31 @@ already_dropped:
 
 	mem_free(drop);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	goto loop;
 }
 
 /*********************************************************************//**
-Get the background drop list length. NOTE: the caller must own the kernel
-mutex!
+Get the background drop list length. NOTE: this function acquires the
+drop list mutex itself; the caller does not need to own it.
 @return	how many tables in list */
 UNIV_INTERN
 ulint
 row_get_background_drop_list_len_low(void)
 /*======================================*/
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ulint	len;
 
-	if (!row_mysql_drop_list_inited) {
+	mutex_enter(&row_drop_list_mutex);
 
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
+
+	len = UT_LIST_GET_LEN(row_mysql_drop_list);
 
-	return(UT_LIST_GET_LEN(row_mysql_drop_list));
+	mutex_exit(&row_drop_list_mutex);
+
+	return(len);
 }
 
 /*********************************************************************//**
@@ -2444,413 +2716,466 @@ row_add_table_to_background_drop_list(
 {
 	row_mysql_drop_t*	drop;
 
-	mutex_enter(&kernel_mutex);
-
-	if (!row_mysql_drop_list_inited) {
+	mutex_enter(&row_drop_list_mutex);
 
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
 
 	/* Look if the table already is in the drop list */
-	drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	     drop != NULL;
+	     drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) {
 
-	while (drop != NULL) {
 		if (strcmp(drop->table_name, name) == 0) {
 			/* Already in the list */
 
-			mutex_exit(&kernel_mutex);
+			mutex_exit(&row_drop_list_mutex);
 
 			return(FALSE);
 		}
-
-		drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
 	}
 
-	drop = mem_alloc(sizeof(row_mysql_drop_t));
+	drop = static_cast<row_mysql_drop_t*>(
+		mem_alloc(sizeof(row_mysql_drop_t)));
 
 	drop->table_name = mem_strdup(name);
 
 	UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
 
+	MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE);
+
 	/*	fputs("InnoDB: Adding table ", stderr);
 	ut_print_name(stderr, trx, TRUE, drop->table_name);
 	fputs(" to background drop list\n", stderr); */
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	return(TRUE);
 }
 
 /*********************************************************************//**
-Discards the tablespace of a table which stored in an .ibd file. Discarding
-means that this function deletes the .ibd file and assigns a new table id for
-the table. Also the flag table->ibd_file_missing is set TRUE.
+Reassigns the table identifier of a table.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
-row_discard_tablespace_for_mysql(
-/*=============================*/
-	const char*	name,	/*!< in: table name */
-	trx_t*		trx)	/*!< in: transaction handle */
+dberr_t
+row_mysql_table_id_reassign(
+/*========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx,	/*!< in/out: transaction */
+	table_id_t*	new_id)	/*!< out: new table id */
 {
-	dict_foreign_t*	foreign;
-	table_id_t	new_id;
-	dict_table_t*	table;
-	ibool		success;
-	ulint		err;
-	pars_info_t*	info = NULL;
+	dberr_t		err;
+	pars_info_t*	info	= pars_info_create();
 
-	/* How do we prevent crashes caused by ongoing operations on
-	the table? Old operations could try to access non-existent
-	pages.
+	dict_hdr_get_new_id(new_id, NULL, NULL);
 
-	1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
-	MySQL table lock on the table before we can do DISCARD
-	TABLESPACE. Then there are no running queries on the table.
+	/* Remove all locks except the table-level S and X locks. */
+	lock_remove_all_on_table(table, FALSE);
 
-	2) Purge and rollback: we assign a new table id for the
-	table. Since purge and rollback look for the table based on
-	the table id, they see the table as 'dropped' and discard
-	their operations.
+	pars_info_add_ull_literal(info, "old_id", table->id);
+	pars_info_add_ull_literal(info, "new_id", *new_id);
+
+	err = que_eval_sql(
+		info,
+		"PROCEDURE RENUMBER_TABLE_PROC () IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_TABLES SET ID = :new_id\n"
+		" WHERE ID = :old_id;\n"
+		"UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+		" WHERE TABLE_ID = :old_id;\n"
+		"UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+		" WHERE TABLE_ID = :old_id;\n"
+		"END;\n", FALSE, trx);
 
-	3) Insert buffer: we remove all entries for the tablespace in
-	the insert buffer tree; as long as the tablespace mem object
-	does not exist, ongoing insert buffer page merges are
-	discarded in buf0rea.c. If we recreate the tablespace mem
-	object with IMPORT TABLESPACE later, then the tablespace will
-	have the same id, but the tablespace_version field in the mem
-	object is different, and ongoing old insert buffer page merges
-	get discarded.
+	return(err);
+}
 
-	4) Linear readahead and random readahead: we use the same
-	method as in 3) to discard ongoing operations.
+/*********************************************************************//**
+Set up the prerequisites for DISCARD TABLESPACE. It will start the transaction,
+acquire the data dictionary lock in X mode and open the table.
+@return table instance or 0 if not found. */
+static
+dict_table_t*
+row_discard_tablespace_begin(
+/*=========================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	trx->op_info = "discarding tablespace";
 
-	5) FOREIGN KEY operations: if
-	table->n_foreign_key_checks_running > 0, we do not allow the
-	discard. We also reserve the data dictionary latch. */
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 
-	trx->op_info = "discarding tablespace";
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations */
+	this is to avoid deadlocks during data dictionary operations */
 
 	row_mysql_lock_data_dictionary(trx);
 
-	table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
-
-	if (!table) {
-		err = DB_TABLE_NOT_FOUND;
-
-		goto funct_exit;
-	}
+	dict_table_t*	table;
 
-	if (table->space == 0) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: table ", stderr);
-		ut_print_name(stderr, trx, TRUE, name);
-		fputs("\n"
-		      "InnoDB: is in the system tablespace 0"
-		      " which cannot be discarded\n", stderr);
-		err = DB_ERROR;
+	table = dict_table_open_on_name(
+		name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
 
-		goto funct_exit;
+	if (table) {
+		dict_stats_wait_bg_to_stop_using_table(table, trx);
+		ut_a(table->space != TRX_SYS_SPACE);
+		ut_a(table->n_foreign_key_checks_running == 0);
 	}
 
-	if (table->n_foreign_key_checks_running > 0) {
-
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: You are trying to DISCARD table ", stderr);
-		ut_print_name(stderr, trx, TRUE, table->name);
-		fputs("\n"
-		      "InnoDB: though there is a foreign key check"
-		      " running on it.\n"
-		      "InnoDB: Cannot discard the table.\n",
-		      stderr);
-
-		err = DB_ERROR;
+	return(table);
+}
 
-		goto funct_exit;
-	}
+/*********************************************************************//**
+Do the foreign key constraint checks.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace_foreign_key_checks(
+/*======================================*/
+	const trx_t*		trx,	/*!< in: transaction handle */
+	const dict_table_t*	table)	/*!< in: table to be discarded */
+{
+	const dict_foreign_t*	foreign;
 
 	/* Check if the table is referenced by foreign key constraints from
 	some other table (not the table itself) */
 
-	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign && foreign->foreign_table == table;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-	while (foreign && foreign->foreign_table == table) {
-		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
 	}
 
-	if (foreign && trx->check_foreigns) {
+	if (!srv_read_only_mode && foreign && trx->check_foreigns) {
 
 		FILE*	ef	= dict_foreign_err_file;
 
 		/* We only allow discarding a referenced table if
 		FOREIGN_KEY_CHECKS is set to 0 */
 
-		err = DB_CANNOT_DROP_CONSTRAINT;
-
 		mutex_enter(&dict_foreign_err_mutex);
+
 		rewind(ef);
+
 		ut_print_timestamp(ef);
 
 		fputs("  Cannot DISCARD table ", ef);
-		ut_print_name(stderr, trx, TRUE, name);
+		ut_print_name(stderr, trx, TRUE, table->name);
 		fputs("\n"
 		      "because it is referenced by ", ef);
 		ut_print_name(stderr, trx, TRUE, foreign->foreign_table_name);
 		putc('\n', ef);
+
 		mutex_exit(&dict_foreign_err_mutex);
 
-		goto funct_exit;
+		return(DB_CANNOT_DROP_CONSTRAINT);
 	}
 
-	dict_hdr_get_new_id(&new_id, NULL, NULL);
+	return(DB_SUCCESS);
+}
 
-	/* Remove all locks except the table-level S and X locks. */
-	lock_remove_all_on_table(table, FALSE);
+/*********************************************************************//**
+Cleanup after the DISCARD TABLESPACE operation.
+@return error code. */
+static
+dberr_t
+row_discard_tablespace_end(
+/*=======================*/
+	trx_t*		trx,	/*!< in/out: transaction handle */
+	dict_table_t*	table,	/*!< in/out: table to be discarded */
+	dberr_t		err)	/*!< in: error code */
+{
+	if (table != 0) {
+		dict_table_close(table, TRUE, FALSE);
+	}
 
-	info = pars_info_create();
+	DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+			log_make_checkpoint_at(LSN_MAX, TRUE);
+			DBUG_SUICIDE(););
 
-	pars_info_add_str_literal(info, "table_name", name);
-	pars_info_add_ull_literal(info, "new_id", new_id);
+	trx_commit_for_mysql(trx);
 
-	err = que_eval_sql(info,
-			   "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
-			   "old_id CHAR;\n"
-			   "BEGIN\n"
-			   "SELECT ID INTO old_id\n"
-			   "FROM SYS_TABLES\n"
-			   "WHERE NAME = :table_name\n"
-			   "LOCK IN SHARE MODE;\n"
-			   "IF (SQL % NOTFOUND) THEN\n"
-			   "       COMMIT WORK;\n"
-			   "       RETURN;\n"
-			   "END IF;\n"
-			   "UPDATE SYS_TABLES SET ID = :new_id\n"
-			   " WHERE ID = old_id;\n"
-			   "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
-			   " WHERE TABLE_ID = old_id;\n"
-			   "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
-			   " WHERE TABLE_ID = old_id;\n"
-			   "COMMIT WORK;\n"
-			   "END;\n"
-			   , FALSE, trx);
+	DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
+			log_make_checkpoint_at(LSN_MAX, TRUE);
+			DBUG_SUICIDE(););
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Do the DISCARD TABLESPACE operation.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace(
+/*===================*/
+	trx_t*		trx,	/*!< in/out: transaction handle */
+	dict_table_t*	table)	/*!< in/out: table to be discarded */
+{
+	dberr_t		err;
+
+	/* How do we prevent crashes caused by ongoing operations on
+	the table? Old operations could try to access non-existent
+	pages. MySQL will block all DML on the table using MDL and a
+	DISCARD will not start unless all existing operations on the
+	table to be discarded are completed.
+
+	1) Acquire the data dictionary latch in X mode. To prevent any
+	internal operations that MySQL is not aware of and also for
+	the internal SQL parser.
+
+	2) Purge and rollback: we assign a new table id for the
+	table. Since purge and rollback look for the table based on
+	the table id, they see the table as 'dropped' and discard
+	their operations.
+
+	3) Insert buffer: we remove all entries for the tablespace in
+	the insert buffer tree.
+
+	4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0,
+	we do not allow the discard. */
+
+	/* Play safe and remove all insert buffer entries, though we should
+	have removed them already when DISCARD TABLESPACE was called */
+
+	ibuf_delete_for_discarded_space(table->space);
+
+	table_id_t	new_id;
+
+	/* Set the TABLESPACE DISCARD flag in the table definition on disk. */
+
+	err = row_import_update_discarded_flag(trx, table->id, true, true);
 
 	if (err != DB_SUCCESS) {
-		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
-		trx->error_state = DB_SUCCESS;
-	} else {
-		dict_table_change_id_in_cache(table, new_id);
+		return(err);
+	}
 
-		success = fil_discard_tablespace(table->space);
+	/* Update the index root pages in the system tables, on disk */
 
-		if (!success) {
-			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
-			trx->error_state = DB_SUCCESS;
+	err = row_import_update_index_root(trx, table, true, true);
 
-			err = DB_ERROR;
-		} else {
-			dict_index_t*	index;
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
 
-			/* Set the flag which tells that now it is legal to
-			IMPORT a tablespace for this table */
-			table->tablespace_discarded = TRUE;
-			table->ibd_file_missing = TRUE;
+	/* Drop all the FTS auxiliary tables. */
+	if (dict_table_has_fts_index(table)
+	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
 
-			/* check adaptive hash entries */
-			index = dict_table_get_first_index(table);
-			while (index) {
-				ulint ref_count =
-					btr_search_info_get_ref_count(
-						index->search_info, index);
-				if (ref_count) {
-					fprintf(stderr, "InnoDB: Warning:"
-						" hash index ref_count (%lu) is not zero"
-						" after fil_discard_tablespace().\n"
-						"index: \"%s\""
-						" table: \"%s\"\n",
-						ref_count,
-						index->name,
-						table->name);
-				}
-				index = dict_table_get_next_index(index);
-			}
-		}
+		fts_drop_tables(trx, table);
 	}
 
-funct_exit:
-	trx_commit_for_mysql(trx);
+	/* Assign a new table ID to the table definition so that purge
+	can ignore the changes. Update the system table on disk. */
 
-	row_mysql_unlock_data_dictionary(trx);
+	err = row_mysql_table_id_reassign(table, trx, &new_id);
 
-	trx->op_info = "";
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	/* Discard the physical file that is used for the tablespace. */
+
+	err = fil_discard_tablespace(table->space);
+
+	switch(err) {
+	case DB_SUCCESS:
+	case DB_IO_ERROR:
+	case DB_TABLESPACE_NOT_FOUND:
+		/* All persistent operations successful, update the
+		data dictionary memory cache. */
+
+		table->ibd_file_missing = TRUE;
+
+		table->flags2 |= DICT_TF2_DISCARDED;
+
+		dict_table_change_id_in_cache(table, new_id);
+
+		/* Reset the root page numbers. */
+
+		for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+		     index != 0;
+		     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+			index->page = FIL_NULL;
+			index->space = FIL_NULL;
+		}
+
+		/* If the tablespace did not already exist or we couldn't
+		write to it, we treat that as a successful DISCARD. It is
+		unusable anyway. */
+
+		err = DB_SUCCESS;
+		break;
+
+	default:
+		/* We need to rollback the disk changes, something failed. */
 
-	return((int) err);
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	return(err);
 }
 
-/*****************************************************************//**
-Imports a tablespace. The space id in the .ibd file must match the space id
-of the table in the data dictionary.
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function renames the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set to TRUE.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
-row_import_tablespace_for_mysql(
-/*============================*/
+dberr_t
+row_discard_tablespace_for_mysql(
+/*=============================*/
 	const char*	name,	/*!< in: table name */
 	trx_t*		trx)	/*!< in: transaction handle */
 {
+	dberr_t		err;
 	dict_table_t*	table;
-	ibool		success;
-	ib_uint64_t	current_lsn;
-	ulint		err		= DB_SUCCESS;
 
-	trx_start_if_not_started(trx);
+	/* Open the table and start the transaction if not started. */
 
-	trx->op_info = "importing tablespace";
+	table = row_discard_tablespace_begin(name, trx);
 
-	current_lsn = log_get_lsn();
+	if (table == 0) {
+		err = DB_TABLE_NOT_FOUND;
+	} else if (table->space == TRX_SYS_SPACE) {
+		char	table_name[MAX_FULL_NAME_LEN + 1];
 
-	/* Enlarge the fatal lock wait timeout during import. */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
-	mutex_exit(&kernel_mutex);
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
 
-	/* It is possible, though very improbable, that the lsn's in the
-	tablespace to be imported have risen above the current system lsn, if
-	a lengthy purge, ibuf merge, or rollback was performed on a backup
-	taken with ibbackup. If that is the case, reset page lsn's in the
-	file. We assume that mysqld was shut down after it performed these
-	cleanup operations on the .ibd file, so that it stamped the latest lsn
-	to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			    ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
 
-	TODO: reset also the trx id's in clustered index records and write
-	a new space id to each data page. That would allow us to import clean
-	.ibd files from another MySQL installation. */
+		err = DB_ERROR;
 
-	success = fil_reset_too_high_lsns(name, current_lsn);
+	} else if (table->n_foreign_key_checks_running > 0) {
+		char	table_name[MAX_FULL_NAME_LEN + 1];
 
-	if (!success) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: cannot reset lsn's in table ", stderr);
-		ut_print_name(stderr, trx, TRUE, name);
-		fputs("\n"
-		      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
-		      stderr);
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
+
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			    ER_DISCARD_FK_CHECKS_RUNNING, table_name);
 
 		err = DB_ERROR;
 
-		row_mysql_lock_data_dictionary(trx);
+	} else {
+		/* Do foreign key constraint checks. */
 
-		goto funct_exit;
-	}
+		err = row_discard_tablespace_foreign_key_checks(trx, table);
 
-	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations */
+		if (err == DB_SUCCESS) {
+			err = row_discard_tablespace(trx, table);
+		}
+	}
 
-	row_mysql_lock_data_dictionary(trx);
+	return(row_discard_tablespace_end(trx, table, err));
+}
 
-	table = dict_table_get_low(name, DICT_ERR_IGNORE_NONE);
+/*********************************************************************//**
+Sets a table lock of the requested mode (LOCK_X or LOCK_S) on a table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode,		/*!< in: LOCK_X or LOCK_S */
+	const char*	op_info)	/*!< in: string for trx->op_info */
+{
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	dberr_t		err;
+	sel_node_t*	node;
 
-	if (!table) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: table ", stderr);
-		ut_print_name(stderr, trx, TRUE, name);
-		fputs("\n"
-		      "InnoDB: does not exist in the InnoDB data dictionary\n"
-		      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
-		      stderr);
+	ut_ad(trx);
+	ut_ad(mode == LOCK_X || mode == LOCK_S);
 
-		err = DB_TABLE_NOT_FOUND;
+	heap = mem_heap_create(512);
 
-		goto funct_exit;
-	}
+	trx->op_info = op_info;
 
-	if (table->space == 0) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: table ", stderr);
-		ut_print_name(stderr, trx, TRUE, name);
-		fputs("\n"
-		      "InnoDB: is in the system tablespace 0"
-		      " which cannot be imported\n", stderr);
-		err = DB_ERROR;
+	node = sel_node_create(heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+	thr->graph->state = QUE_FORK_ACTIVE;
 
-		goto funct_exit;
-	}
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
 
-	if (!table->tablespace_discarded) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: you are trying to"
-		      " IMPORT a tablespace\n"
-		      "InnoDB: ", stderr);
-		ut_print_name(stderr, trx, TRUE, name);
-		fputs(", though you have not called DISCARD on it yet\n"
-		      "InnoDB: during the lifetime of the mysqld process!\n",
-		      stderr);
+	thr = que_fork_get_first_thr(
+		static_cast<que_fork_t*>(que_node_get_parent(thr)));
 
-		err = DB_ERROR;
+	que_thr_move_to_run_state_for_mysql(thr, trx);
 
-		goto funct_exit;
-	}
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
 
-	/* Play safe and remove all insert buffer entries, though we should
-	have removed them already when DISCARD TABLESPACE was called */
+	err = lock_table(0, table, mode, thr);
 
-	ibuf_delete_for_discarded_space(table->space);
+	trx->error_state = err;
 
-	success = fil_open_single_table_tablespace(
-		TRUE, table->space,
-		table->flags == DICT_TF_COMPACT ? 0 : table->flags,
-		table->name, trx);
-	if (success) {
-		table->ibd_file_missing = FALSE;
-		table->tablespace_discarded = FALSE;
+	if (err == DB_SUCCESS) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
 	} else {
-		if (table->ibd_file_missing) {
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: cannot find or open in the"
-			      " database directory the .ibd file of\n"
-			      "InnoDB: table ", stderr);
-			ut_print_name(stderr, trx, TRUE, name);
-			fputs("\n"
-			      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
-			      stderr);
-		}
+		que_thr_stop_for_mysql(thr);
 
-		err = DB_ERROR;
-	}
+		if (err != DB_QUE_THR_SUSPENDED) {
+			ibool	was_lock_wait;
 
-funct_exit:
-	trx_commit_for_mysql(trx);
+			was_lock_wait = row_mysql_handle_errors(
+				&err, trx, thr, NULL);
 
-	row_mysql_unlock_data_dictionary(trx);
+			if (was_lock_wait) {
+				goto run_again;
+			}
+		} else {
+			que_thr_t*	run_thr;
+			que_node_t*	parent;
 
-	trx->op_info = "";
+			parent = que_node_get_parent(thr);
+
+			run_thr = que_fork_start_command(
+				static_cast<que_fork_t*>(parent));
+
+			ut_a(run_thr == thr);
+
+			/* There was a lock wait but the thread was not
+			in a ready to run or running state. */
+			trx->error_state = DB_LOCK_WAIT;
 
-	/* Restore the fatal semaphore wait timeout */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
-	mutex_exit(&kernel_mutex);
+			goto run_again;
+		}
+	}
 
-	return((int) err);
+	que_graph_free(thr->graph);
+	trx->op_info = "";
+
+	return(err);
 }
 
 /*********************************************************************//**
 Truncates a table for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_truncate_table_for_mysql(
 /*=========================*/
 	dict_table_t*	table,	/*!< in: table handle */
 	trx_t*		trx)	/*!< in: transaction handle */
 {
 	dict_foreign_t*	foreign;
-	ulint		err;
+	dberr_t		err;
 	mem_heap_t*	heap;
 	byte*		buf;
 	dtuple_t*	tuple;
@@ -2861,17 +3186,16 @@ row_truncate_table_for_mysql(
 	table_id_t	new_id;
 	ulint		recreate_space = 0;
 	pars_info_t*	info = NULL;
+	ibool		has_internal_doc_id;
+	ulint		old_space = table->space;
 
 	/* How do we prevent crashes caused by ongoing operations on
 	the table? Old operations could try to access non-existent
 	pages.
 
 	1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
-	MySQL table lock on the table before we can do TRUNCATE
-	TABLE. Then there are no running queries on the table. This is
-	guaranteed, because in ha_innobase::store_lock(), we do not
-	weaken the TL_WRITE lock requested by MySQL when executing
-	SQLCOM_TRUNCATE.
+	InnoDB table lock on the table before we can do TRUNCATE
+	TABLE. Then there are no running queries on the table.
 
 	2) Purge and rollback: we assign a new table id for the
 	table. Since purge and rollback look for the table based on
@@ -2914,9 +3238,15 @@ row_truncate_table_for_mysql(
 		return(DB_ERROR);
 	}
 
-	trx->op_info = "truncating table";
+	if (dict_table_is_discarded(table)) {
+		return(DB_TABLESPACE_DELETED);
+	} else if (table->ibd_file_missing) {
+		return(DB_TABLESPACE_NOT_FOUND);
+	}
 
-	trx_start_if_not_started(trx);
+	trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+
+	trx->op_info = "truncating table";
 
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
@@ -2932,16 +3262,22 @@ row_truncate_table_for_mysql(
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
+	dict_stats_wait_bg_to_stop_using_table(table, trx);
+
 	/* Check if the table is referenced by foreign key constraints from
 	some other table (not the table itself) */
 
-	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign != 0 && foreign->foreign_table == table;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-	while (foreign && foreign->foreign_table == table) {
-		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+		/* Do nothing. */
 	}
 
-	if (foreign && trx->check_foreigns) {
+	if (!srv_read_only_mode
+	    && foreign
+	    && trx->check_foreigns) {
+
 		FILE*	ef	= dict_foreign_err_file;
 
 		/* We only allow truncating a referenced table if
@@ -2965,7 +3301,7 @@ row_truncate_table_for_mysql(
 
 	/* TODO: could we replace the counter n_foreign_key_checks_running
 	with lock checks on the table? Acquire here an exclusive lock on the
-	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
 	they can cope with the table having been truncated here? Foreign key
 	checks take an IS or IX lock on the table. */
 
@@ -2982,18 +3318,41 @@ row_truncate_table_for_mysql(
 		goto funct_exit;
 	}
 
-	/* Remove all locks except the table-level S and X locks. */
+	/* Remove all locks except the table-level X lock. */
+
 	lock_remove_all_on_table(table, FALSE);
 
+	/* Ensure that the table will be dropped by
+	trx_rollback_active() in case of a crash. */
+
 	trx->table_id = table->id;
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	/* Assign an undo segment for the transaction, so that the
+	transaction will be recovered after a crash. */
+
+	mutex_enter(&trx->undo_mutex);
+
+	err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+	mutex_exit(&trx->undo_mutex);
+
+	if (err != DB_SUCCESS) {
+
+		goto funct_exit;
+	}
 
 	if (table->space && !table->dir_path_of_temp_table) {
 		/* Discard and create the single-table tablespace. */
 		ulint	space	= table->space;
 		ulint	flags	= fil_space_get_flags(space);
 
+		ut_a(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY));
+
+		dict_get_and_save_data_dir_path(table, true);
+
 		if (flags != ULINT_UNDEFINED
-		    && fil_discard_tablespace(space)) {
+		    && fil_discard_tablespace(space) == DB_SUCCESS) {
 
 			dict_index_t*	index;
 
@@ -3006,14 +3365,18 @@ row_truncate_table_for_mysql(
 
 			if (space == ULINT_UNDEFINED
 			    || fil_create_new_single_table_tablespace(
-				    space, table->name, FALSE, flags,
-				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				    space, table->name,
+				    table->data_dir_path,
+				    flags, table->flags2,
+				    FIL_IBD_FILE_INITIAL_SIZE)
+			    != DB_SUCCESS) {
 				dict_table_x_unlock_indexes(table);
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: TRUNCATE TABLE %s failed to"
-					" create a new tablespace\n",
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"TRUNCATE TABLE %s failed to "
+					"create a new tablespace",
 					table->name);
+
 				table->ibd_file_missing = 1;
 				err = DB_ERROR;
 				goto funct_exit;
@@ -3028,21 +3391,6 @@ row_truncate_table_for_mysql(
 			table->space = space;
 			index = dict_table_get_first_index(table);
 			do {
-				ulint ref_count =
-					btr_search_info_get_ref_count(
-						index->search_info, index);
-				/* check adaptive hash entries */
-				if (ref_count) {
-					fprintf(stderr, "InnoDB: Warning:"
-						" hash index ref_count (%lu) is not zero"
-						" after fil_discard_tablespace().\n"
-						"index: \"%s\""
-						" table: \"%s\"\n",
-						ref_count,
-						index->name,
-						table->name);
-				}
-
 				index->space = space;
 				index = dict_table_get_next_index(index);
 			} while (index);
@@ -3069,7 +3417,7 @@ row_truncate_table_for_mysql(
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -3092,7 +3440,8 @@ row_truncate_table_for_mysql(
 
 		rec = btr_pcur_get_rec(&pcur);
 
-		field = rec_get_nth_field_old(rec, 0, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
 		ut_ad(len == 8);
 
 		if (memcmp(buf, field, len) != 0) {
@@ -3114,7 +3463,7 @@ row_truncate_table_for_mysql(
 
 		if (root_page_no != FIL_NULL) {
 			page_rec_write_field(
-				rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 				root_page_no, &mtr);
 			/* We will need to commit and restart the
 			mini-transaction in order to avoid deadlocks.
@@ -3135,38 +3484,110 @@ next_rec:
 	mtr_commit(&mtr);
 
 	mem_heap_free(heap);
-
 	/* Done with index truncation, release index tree locks,
 	subsequent work relates to table level metadata change */
 	dict_table_x_unlock_indexes(table);
 
 	dict_hdr_get_new_id(&new_id, NULL, NULL);
 
+	/* Create new FTS auxiliary tables with the new_id, and
+	drop the old index later, only if everything runs successfully. */
+	has_internal_doc_id = dict_table_has_fts_index(table)
+			      || DICT_TF2_FLAG_IS_SET(
+				table, DICT_TF2_FTS_HAS_DOC_ID);
+	if (has_internal_doc_id) {
+		dict_table_t	fts_table;
+		ulint		i;
+
+		fts_table.name = table->name;
+		fts_table.id = new_id;
+
+		err = fts_create_common_tables(
+			trx, &fts_table, table->name, TRUE);
+
+		for (i = 0;
+		     i < ib_vector_size(table->fts->indexes)
+		     && err == DB_SUCCESS;
+		     i++) {
+
+			dict_index_t*	fts_index;
+
+			fts_index = static_cast<dict_index_t*>(
+				ib_vector_getp(table->fts->indexes, i));
+
+			err = fts_create_index_tables_low(
+				trx, fts_index, table->name, new_id);
+		}
+
+		if (err != DB_SUCCESS) {
+			trx->error_state = DB_SUCCESS;
+			trx_rollback_to_savepoint(trx, NULL);
+			trx->error_state = DB_SUCCESS;
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Unable to truncate FTS index for"
+			      " table", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs("\n", stderr);
+
+			goto funct_exit;
+		} else {
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+		}
+	}
+
 	info = pars_info_create();
 
-	pars_info_add_int4_literal(info, "space", (lint) table->space);
+	pars_info_add_int4_literal(info, "new_space", (lint) table->space);
 	pars_info_add_ull_literal(info, "old_id", table->id);
 	pars_info_add_ull_literal(info, "new_id", new_id);
 
 	err = que_eval_sql(info,
-			   "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+			   "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n"
 			   "BEGIN\n"
 			   "UPDATE SYS_TABLES"
-			   " SET ID = :new_id, SPACE = :space\n"
+			   " SET ID = :new_id, SPACE = :new_space\n"
 			   " WHERE ID = :old_id;\n"
 			   "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
 			   " WHERE TABLE_ID = :old_id;\n"
 			   "UPDATE SYS_INDEXES"
-			   " SET TABLE_ID = :new_id, SPACE = :space\n"
+			   " SET TABLE_ID = :new_id, SPACE = :new_space\n"
 			   " WHERE TABLE_ID = :old_id;\n"
-			   "COMMIT WORK;\n"
 			   "END;\n"
 			   , FALSE, trx);
 
+	if (err == DB_SUCCESS && old_space != table->space) {
+		info = pars_info_create();
+
+		pars_info_add_int4_literal(info, "old_space", (lint) old_space);
+
+		pars_info_add_int4_literal(
+			info, "new_space", (lint) table->space);
+
+		err = que_eval_sql(info,
+				   "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+				   "BEGIN\n"
+				   "UPDATE SYS_TABLESPACES"
+				   " SET SPACE = :new_space\n"
+				   " WHERE SPACE = :old_space;\n"
+				   "UPDATE SYS_DATAFILES"
+				   " SET SPACE = :new_space"
+				   " WHERE SPACE = :old_space;\n"
+				   "END;\n"
+				   , FALSE, trx);
+	}
+	DBUG_EXECUTE_IF("ib_ddl_crash_before_fts_truncate", err = DB_ERROR;);
+
 	if (err != DB_SUCCESS) {
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
+
+		/* Update system table failed.  Table in memory metadata
+		could be in an inconsistent state, mark the in-memory
+		table->corrupted to be true. In the long run, this should
+		be fixed by atomic truncate table */
+		table->corrupted = true;
+
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Unable to assign a new identifier to table ",
 		      stderr);
@@ -3174,20 +3595,51 @@ next_rec:
 		fputs("\n"
 		      "InnoDB: after truncating it.  Background processes"
 		      " may corrupt the table!\n", stderr);
+
+		/* Failed to update the table id, so drop the new
+		FTS auxiliary tables */
+		if (has_internal_doc_id) {
+			ut_ad(trx->state == TRX_STATE_NOT_STARTED);
+
+			table_id_t	id = table->id;
+
+			table->id = new_id;
+
+			fts_drop_tables(trx, table);
+
+			table->id = id;
+
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+		}
+
 		err = DB_ERROR;
 	} else {
+		/* Drop the old FTS index */
+		if (has_internal_doc_id) {
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+			fts_drop_tables(trx, table);
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+		}
+
+		DBUG_EXECUTE_IF("ib_truncate_crash_after_fts_drop",
+				DBUG_SUICIDE(););
+
 		dict_table_change_id_in_cache(table, new_id);
+
+		/* Reset the Doc ID in cache to 0 */
+		if (has_internal_doc_id && table->fts->cache) {
+			table->fts->fts_status |= TABLE_DICT_LOCKED;
+			fts_update_next_doc_id(trx, table, NULL, 0);
+			fts_cache_clear(table->fts->cache, TRUE);
+			fts_cache_init(table->fts->cache);
+			table->fts->fts_status &= ~TABLE_DICT_LOCKED;
+		}
 	}
 
 	/* Reset auto-increment. */
 	dict_table_autoinc_lock(table);
 	dict_table_autoinc_initialize(table, 1);
 	dict_table_autoinc_unlock(table);
-	dict_update_statistics(
-		table,
-		FALSE, /* update even if stats are initialized */
-		TRUE,
-		FALSE /* update even if not changed too much */);
 
 	trx_commit_for_mysql(trx);
 
@@ -3195,11 +3647,13 @@ funct_exit:
 
 	row_mysql_unlock_data_dictionary(trx);
 
+	dict_stats_update(table, DICT_STATS_EMPTY_TABLE);
+
 	trx->op_info = "";
 
 	srv_wake_master_thread();
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -3211,22 +3665,29 @@ by the transaction, the transaction will be committed.  Otherwise, the
 data dictionary will remain locked.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_drop_table_for_mysql(
 /*=====================*/
 	const char*	name,	/*!< in: table name */
 	trx_t*		trx,	/*!< in: transaction handle */
-	ibool		drop_db)/*!< in: TRUE=dropping whole database */
+	bool		drop_db,/*!< in: true=dropping whole database */
+	bool		nonatomic)
+				/*!< in: whether it is permitted
+				to release and reacquire dict_operation_lock */
 {
+	dberr_t		err;
 	dict_foreign_t*	foreign;
 	dict_table_t*	table;
-	dict_index_t*	index;
+	ibool		print_msg;
 	ulint		space_id;
-	ulint		err;
-	const char*	table_name;
+	char*		filepath = NULL;
+	const char*	tablename_minus_db;
+	char*		tablename =  NULL;
+	bool		ibd_file_missing;
 	ulint		namelen;
-	ibool		locked_dictionary	= FALSE;
-	pars_info_t*    info			= NULL;
+	bool		locked_dictionary	= false;
+	pars_info_t*	info			= NULL;
+	mem_heap_t*	heap			= NULL;
 
 	ut_a(name != NULL);
 
@@ -3240,21 +3701,23 @@ row_drop_table_for_mysql(
 		return(DB_ERROR);
 	}
 
-	trx->op_info = "dropping table";
-
-	trx_start_if_not_started(trx);
-
 	/* The table name is prefixed with the database name and a '/'.
 	Certain table names starting with 'innodb_' have their special
 	meaning regardless of the database name.  Thus, we need to
 	ignore the database name prefix in the comparisons. */
-	table_name = strchr(name, '/');
-	ut_a(table_name);
-	table_name++;
-	namelen = strlen(table_name) + 1;
+	tablename_minus_db = strchr(name, '/');
+
+	if (tablename_minus_db) {
+		tablename_minus_db++;
+	} else {
+		/* Ancillary FTS tables don't have '/' characters. */
+		tablename_minus_db = name;
+	}
+
+	namelen = strlen(tablename_minus_db) + 1;
 
 	if (namelen == sizeof S_innodb_monitor
-	    && !memcmp(table_name, S_innodb_monitor,
+	    && !memcmp(tablename_minus_db, S_innodb_monitor,
 		       sizeof S_innodb_monitor)) {
 
 		/* Table name equals "innodb_monitor":
@@ -3263,17 +3726,17 @@ row_drop_table_for_mysql(
 		srv_print_innodb_monitor = FALSE;
 		srv_print_innodb_lock_monitor = FALSE;
 	} else if (namelen == sizeof S_innodb_lock_monitor
-		   && !memcmp(table_name, S_innodb_lock_monitor,
+		   && !memcmp(tablename_minus_db, S_innodb_lock_monitor,
 			      sizeof S_innodb_lock_monitor)) {
 		srv_print_innodb_monitor = FALSE;
 		srv_print_innodb_lock_monitor = FALSE;
 	} else if (namelen == sizeof S_innodb_tablespace_monitor
-		   && !memcmp(table_name, S_innodb_tablespace_monitor,
+		   && !memcmp(tablename_minus_db, S_innodb_tablespace_monitor,
 			      sizeof S_innodb_tablespace_monitor)) {
 
 		srv_print_innodb_tablespace_monitor = FALSE;
 	} else if (namelen == sizeof S_innodb_table_monitor
-		   && !memcmp(table_name, S_innodb_table_monitor,
+		   && !memcmp(tablename_minus_db, S_innodb_table_monitor,
 			      sizeof S_innodb_table_monitor)) {
 
 		srv_print_innodb_table_monitor = FALSE;
@@ -3282,13 +3745,21 @@ row_drop_table_for_mysql(
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
 
+	trx->op_info = "dropping table";
+
+	/* This function is called recursively via fts_drop_tables(). */
+	if (trx->state == TRX_STATE_NOT_STARTED) {
+		trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+	}
+
 	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
 		/* Prevent foreign key checks etc. while we are dropping the
 		table */
 
 		row_mysql_lock_data_dictionary(trx);
 
-		locked_dictionary = TRUE;
+		locked_dictionary = true;
+		nonatomic = true;
 	}
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
@@ -3296,8 +3767,10 @@ row_drop_table_for_mysql(
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	table = dict_table_get_low(
-		name, DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT);
+	table = dict_table_open_on_name(
+		name, TRUE, FALSE,
+		static_cast<dict_err_ignore_t>(
+			DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
 
 	if (!table) {
 		err = DB_TABLE_NOT_FOUND;
@@ -3318,6 +3791,65 @@ row_drop_table_for_mysql(
 		goto funct_exit;
 	}
 
+	/* Turn on this drop bit before we could release the dictionary
+	latch */
+	table->to_be_dropped = true;
+
+	if (nonatomic) {
+		/* This trx did not acquire any locks on dictionary
+		table records yet. Thus it is safe to release and
+		reacquire the data dictionary latches. */
+		if (table->fts) {
+			ut_ad(!table->fts->add_wq);
+			ut_ad(lock_trx_has_sys_table_locks(trx) == 0);
+
+			row_mysql_unlock_data_dictionary(trx);
+			fts_optimize_remove_table(table);
+			row_mysql_lock_data_dictionary(trx);
+		}
+
+		/* Do not bother to deal with persistent stats for temp
+		tables since we know temp tables do not use persistent
+		stats. */
+		if (!dict_table_is_temporary(table)) {
+			dict_stats_wait_bg_to_stop_using_table(
+				table, trx);
+		}
+	}
+
+	/* make sure background stats thread is not running on the table */
+	ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS));
+
+	/* Delete the link file if used. */
+	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+		fil_delete_link_file(name);
+	}
+
+	if (!dict_table_is_temporary(table)) {
+
+		dict_stats_recalc_pool_del(table);
+
+		/* Remove stats for this table and all of its indexes from the
+		persistent storage if it exists and if there are stats for this
+		table in there. This function creates its own trx and commits
+		it. */
+		char	errstr[1024];
+		err = dict_stats_drop_table(name, errstr, sizeof(errstr));
+
+		if (err != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_WARN, "%s", errstr);
+		}
+	}
+
+	/* Move the table to the non-LRU list so that it isn't
+	considered for eviction. */
+
+	if (table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(table);
+	}
+
+	dict_table_close(table, TRUE, FALSE);
+
 	/* Check if the table is referenced by foreign key constraints from
 	some other table (not the table itself) */
 
@@ -3328,7 +3860,9 @@ check_next_foreign:
 		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
 	}
 
-	if (foreign && trx->check_foreigns
+	if (!srv_read_only_mode
+	    && foreign
+	    && trx->check_foreigns
 	    && !(drop_db && dict_tables_have_same_db(
 			 name, foreign->foreign_table_name_lookup))) {
 		FILE*	ef	= dict_foreign_err_file;
@@ -3357,25 +3891,34 @@ check_next_foreign:
 		goto check_next_foreign;
 	}
 
-	if (table->n_mysql_handles_opened > 0) {
-		ibool	added;
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+	they can cope with the table having been dropped here? Foreign key
+	checks take an IS or IX lock on the table. */
 
-		added = row_add_table_to_background_drop_list(table->name);
+	if (table->n_foreign_key_checks_running > 0) {
+
+		const char*	save_tablename = table->name;
+		ibool		added;
+
+		added = row_add_table_to_background_drop_list(save_tablename);
 
 		if (added) {
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Warning: MySQL is"
-			      " trying to drop table ", stderr);
-			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs("  InnoDB: You are trying to drop table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, save_tablename);
 			fputs("\n"
-			      "InnoDB: though there are still"
-			      " open handles to it.\n"
-			      "InnoDB: Adding the table to the"
-			      " background drop queue.\n",
+			      "InnoDB: though there is a"
+			      " foreign key check running on it.\n"
+			      "InnoDB: Adding the table to"
+			      " the background drop queue.\n",
 			      stderr);
 
 			/* We return DB_SUCCESS to MySQL though the drop will
 			happen lazily later */
+
 			err = DB_SUCCESS;
 		} else {
 			/* The table is already in the background drop list */
@@ -3385,34 +3928,42 @@ check_next_foreign:
 		goto funct_exit;
 	}
 
-	/* TODO: could we replace the counter n_foreign_key_checks_running
-	with lock checks on the table? Acquire here an exclusive lock on the
-	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
-	they can cope with the table having been dropped here? Foreign key
-	checks take an IS or IX lock on the table. */
+	/* Remove all locks that are on the table or its records, if there
+	are no references to the table but it has record locks, we release
+	the record locks unconditionally. One use case is:
 
-	if (table->n_foreign_key_checks_running > 0) {
+		CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
 
-		const char*	table_name = table->name;
-		ibool		added;
+	If after the user transaction has done the SELECT and there is a
+	problem in completing the CREATE TABLE operation, MySQL will drop
+	the table. InnoDB will create a new background transaction to do the
+	actual drop, the trx instance that is passed to this function. To
+	preserve existing behaviour we remove the locks but ideally we
+	shouldn't have to. There should never be record locks on a table
+	that is going to be dropped. */
 
-		added = row_add_table_to_background_drop_list(table_name);
+	if (table->n_ref_count == 0) {
+		lock_remove_all_on_table(table, TRUE);
+		ut_a(table->n_rec_locks == 0);
+	} else if (table->n_ref_count > 0 || table->n_rec_locks > 0) {
+		ibool	added;
+
+		added = row_add_table_to_background_drop_list(table->name);
 
 		if (added) {
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: You are trying to drop table ",
-			      stderr);
-			ut_print_name(stderr, trx, TRUE, table_name);
+			fputs("  InnoDB: Warning: MySQL is"
+			      " trying to drop table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
 			fputs("\n"
-			      "InnoDB: though there is a"
-			      " foreign key check running on it.\n"
-			      "InnoDB: Adding the table to"
-			      " the background drop queue.\n",
+			      "InnoDB: though there are still"
+			      " open handles to it.\n"
+			      "InnoDB: Adding the table to the"
+			      " background drop queue.\n",
 			      stderr);
 
 			/* We return DB_SUCCESS to MySQL though the drop will
 			happen lazily later */
-
 			err = DB_SUCCESS;
 		} else {
 			/* The table is already in the background drop list */
@@ -3422,21 +3973,54 @@ check_next_foreign:
 		goto funct_exit;
 	}
 
-	/* Remove all locks there are on the table or its records */
-	lock_remove_all_on_table(table, TRUE);
+	/* The "to_be_dropped" marks table that is to be dropped, but
+	has not been dropped, instead, was put in the background drop
+	list due to being used by concurrent DML operations. Clear it
+	here since there are no longer any concurrent activities on it,
+	and it is free to be dropped */
+	table->to_be_dropped = false;
 
-	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-	trx->table_id = table->id;
+	/* If we get this far then the table to be dropped must not have
+	any table or record locks on it. */
+
+	ut_a(!lock_table_has_locks(table));
+
+	switch (trx_get_dict_operation(trx)) {
+	case TRX_DICT_OP_NONE:
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = table->id;
+	case TRX_DICT_OP_TABLE:
+		break;
+	case TRX_DICT_OP_INDEX:
+		/* If the transaction was previously flagged as
+		TRX_DICT_OP_INDEX, we should be dropping auxiliary
+		tables for full-text indexes. */
+		ut_ad(strstr(table->name, "/FTS_") != NULL);
+	}
 
 	/* Mark all indexes unavailable in the data dictionary cache
 	before starting to drop the table. */
 
-	for (index = dict_table_get_first_index(table);
+	unsigned*	page_no;
+	unsigned*	page_nos;
+	heap = mem_heap_create(
+		200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos);
+	tablename = mem_heap_strdup(heap, name);
+
+	page_no = page_nos = static_cast<unsigned*>(
+		mem_heap_alloc(
+			heap,
+			UT_LIST_GET_LEN(table->indexes) * sizeof *page_no));
+
+	for (dict_index_t* index = dict_table_get_first_index(table);
 	     index != NULL;
 	     index = dict_table_get_next_index(index)) {
 		rw_lock_x_lock(dict_index_get_lock(index));
-		ut_ad(!index->to_be_dropped);
-		index->to_be_dropped = TRUE;
+		/* Save the page numbers so that we can restore them
+		if the operation fails. */
+		*page_no++ = index->page;
+		/* Mark the index unusable. */
+		index->page = FIL_NULL;
 		rw_lock_x_unlock(dict_index_get_lock(index));
 	}
 
@@ -3455,6 +4039,7 @@ check_next_foreign:
 			   "table_id CHAR;\n"
 			   "index_id CHAR;\n"
 			   "foreign_id CHAR;\n"
+			   "space_id INT;\n"
 			   "found INT;\n"
 
 			   "DECLARE CURSOR cur_fk IS\n"
@@ -3477,6 +4062,12 @@ check_next_foreign:
 			   "IF (SQL % NOTFOUND) THEN\n"
 			   "       RETURN;\n"
 			   "END IF;\n"
+			   "SELECT SPACE INTO space_id\n"
+			   "FROM SYS_TABLES\n"
+			   "WHERE NAME = :table_name;\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "       RETURN;\n"
+			   "END IF;\n"
 			   "found := 1;\n"
 			   "SELECT ID INTO sys_foreign_id\n"
 			   "FROM SYS_TABLES\n"
@@ -3511,8 +4102,6 @@ check_next_foreign:
 			   "       IF (SQL % NOTFOUND) THEN\n"
 			   "               found := 0;\n"
 			   "       ELSE\n"
-			   "               DELETE FROM SYS_STATS\n"
-			   "               WHERE INDEX_ID = index_id;\n"
 			   "               DELETE FROM SYS_FIELDS\n"
 			   "               WHERE INDEX_ID = index_id;\n"
 			   "               DELETE FROM SYS_INDEXES\n"
@@ -3521,45 +4110,95 @@ check_next_foreign:
 			   "       END IF;\n"
 			   "END LOOP;\n"
 			   "CLOSE cur_idx;\n"
+			   "DELETE FROM SYS_TABLESPACES\n"
+			   "WHERE SPACE = space_id;\n"
+			   "DELETE FROM SYS_DATAFILES\n"
+			   "WHERE SPACE = space_id;\n"
 			   "DELETE FROM SYS_COLUMNS\n"
 			   "WHERE TABLE_ID = table_id;\n"
 			   "DELETE FROM SYS_TABLES\n"
-			   "WHERE ID = table_id;\n"
+			   "WHERE NAME = :table_name;\n"
 			   "END;\n"
 			   , FALSE, trx);
 
 	switch (err) {
-		ibool		is_temp;
-		const char*	name_or_path;
-		mem_heap_t*	heap;
+		ibool	is_temp;
 
 	case DB_SUCCESS:
-
-		heap = mem_heap_create(200);
-
 		/* Clone the name, in case it has been allocated
 		from table->heap, which will be freed by
 		dict_table_remove_from_cache(table) below. */
-		name = mem_heap_strdup(heap, name);
 		space_id = table->space;
+		ibd_file_missing = table->ibd_file_missing;
+
+		is_temp = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY);
+
+		/* If there is a temp path then the temp flag is set.
+		However, during recovery, we might have a temp flag but
+		not know the temp path */
+		ut_a(table->dir_path_of_temp_table == NULL || is_temp);
+		if (dict_table_is_discarded(table)
+		    || table->ibd_file_missing) {
+			/* Do not attempt to drop known-to-be-missing
+			tablespaces. */
+			space_id = 0;
+		}
+
+		/* We do not allow temporary tables with a remote path. */
+		ut_a(!(is_temp && DICT_TF_HAS_DATA_DIR(table->flags)));
+
+		if (space_id && DICT_TF_HAS_DATA_DIR(table->flags)) {
+			dict_get_and_save_data_dir_path(table, true);
+			ut_a(table->data_dir_path);
 
-		if (table->dir_path_of_temp_table != NULL) {
-			name_or_path = mem_heap_strdup(
-				heap, table->dir_path_of_temp_table);
-			is_temp = TRUE;
+			filepath = os_file_make_remote_pathname(
+				table->data_dir_path, table->name, "ibd");
+		} else if (table->dir_path_of_temp_table) {
+			filepath = fil_make_ibd_name(
+				table->dir_path_of_temp_table, true);
 		} else {
-			name_or_path = name;
-			is_temp = (table->flags >> DICT_TF2_SHIFT)
-				& DICT_TF2_TEMPORARY;
+			filepath = fil_make_ibd_name(tablename, false);
+		}
+
+		if (dict_table_has_fts_index(table)
+		    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			ut_ad(table->n_ref_count == 0);
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+			err = fts_drop_tables(trx, table);
+
+			if (err != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr," InnoDB: Error: (%s) not "
+					"able to remove ancillary FTS tables "
+					"for table ", ut_strerr(err));
+				ut_print_name(stderr, trx, TRUE, tablename);
+				fputs("\n", stderr);
+
+				goto funct_exit;
+			}
+		}
+
+		/* The table->fts flag can be set on the table for which
+		the cluster index is being rebuilt. Such table might not have
+		DICT_TF2_FTS flag set. So keep this out of above
+		dict_table_has_fts_index condition */
+		if (table->fts) {
+			/* Need to set TABLE_DICT_LOCKED bit, since
+			fts_que_graph_free_check_lock would try to acquire
+			dict mutex lock */
+			table->fts->fts_status |= TABLE_DICT_LOCKED;
+
+			fts_free(table);
 		}
 
 		dict_table_remove_from_cache(table);
 
-		if (dict_load_table(name, TRUE, DICT_ERR_IGNORE_NONE) != NULL) {
+		if (dict_load_table(tablename, TRUE,
+				    DICT_ERR_IGNORE_NONE) != NULL) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Error: not able to remove table ",
 			      stderr);
-			ut_print_name(stderr, trx, TRUE, name);
+			ut_print_name(stderr, trx, TRUE, tablename);
 			fputs(" from the dictionary cache!\n", stderr);
 			err = DB_ERROR;
 		}
@@ -3567,25 +4206,46 @@ check_next_foreign:
 		/* Do not drop possible .ibd tablespace if something went
 		wrong: we do not want to delete valuable data of the user */
 
-		if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) {
-			if (!fil_space_for_table_exists_in_mem(space_id,
-							       name_or_path,
-							       is_temp, FALSE,
-							       !is_temp)) {
+		/* Don't spam the log if we can't find the tablespace of
+		a temp table or if the tablespace has been discarded. */
+		print_msg = !(is_temp || ibd_file_missing);
+
+		if (err == DB_SUCCESS && space_id > TRX_SYS_SPACE) {
+			if (!is_temp
+			    && !fil_space_for_table_exists_in_mem(
+					space_id, tablename, FALSE,
+					print_msg, false, NULL, 0)) {
+				/* This might happen if we are dropping a
+				discarded tablespace */
 				err = DB_SUCCESS;
 
+				if (print_msg) {
+					char msg_tablename[MAX_FULL_NAME_LEN + 1];
+
+					innobase_format_name(
+						msg_tablename, sizeof(tablename),
+						tablename, FALSE);
+
+					ib_logf(IB_LOG_LEVEL_INFO,
+						"Removed the table %s from "
+						"InnoDB's data dictionary",
+						msg_tablename);
+				}
+
+				/* Force a delete of any discarded
+				or temporary files. */
+
+				fil_delete_file(filepath);
+
+			} else if (fil_delete_tablespace(
+					space_id,
+					BUF_REMOVE_FLUSH_NO_WRITE)
+				   != DB_SUCCESS) {
 				fprintf(stderr,
 					"InnoDB: We removed now the InnoDB"
 					" internal data dictionary entry\n"
 					"InnoDB: of table ");
-				ut_print_name(stderr, trx, TRUE, name);
-				fprintf(stderr, ".\n");
-			} else if (!fil_delete_tablespace(space_id, FALSE)) {
-				fprintf(stderr,
-					"InnoDB: We removed now the InnoDB"
-					" internal data dictionary entry\n"
-					"InnoDB: of table ");
-				ut_print_name(stderr, trx, TRUE, name);
+				ut_print_name(stderr, trx, TRUE, tablename);
 				fprintf(stderr, ".\n");
 
 				ut_print_timestamp(stderr);
@@ -3593,13 +4253,21 @@ check_next_foreign:
 					"  InnoDB: Error: not able to"
 					" delete tablespace %lu of table ",
 					(ulong) space_id);
-				ut_print_name(stderr, trx, TRUE, name);
+				ut_print_name(stderr, trx, TRUE, tablename);
 				fputs("!\n", stderr);
 				err = DB_ERROR;
 			}
 		}
 
-		mem_heap_free(heap);
+		break;
+
+	case DB_OUT_OF_FILE_SPACE:
+		err = DB_MUST_GET_MORE_FILE_SPACE;
+
+		row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+		/* raise error */
+		ut_error;
 		break;
 
 	case DB_TOO_MANY_CONCURRENT_TRXS:
@@ -3608,31 +4276,42 @@ check_next_foreign:
 		and return the DB_TOO_MANY_CONCURRENT_TRXS
 		error. */
 
+	default:
+		/* This is some error we do not expect. Print
+		the error number and rollback transaction */
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "InnoDB: unknown error code %lu"
+			" while dropping table:", (ulong) err);
+		ut_print_name(stderr, trx, TRUE, tablename);
+		fprintf(stderr, ".\n");
+
+		trx->error_state = DB_SUCCESS;
+		trx_rollback_to_savepoint(trx, NULL);
+		trx->error_state = DB_SUCCESS;
+
 		/* Mark all indexes available in the data dictionary
 		cache again. */
 
-		for (index = dict_table_get_first_index(table);
+		page_no = page_nos;
+
+		for (dict_index_t* index = dict_table_get_first_index(table);
 		     index != NULL;
 		     index = dict_table_get_next_index(index)) {
 			rw_lock_x_lock(dict_index_get_lock(index));
-			index->to_be_dropped = FALSE;
+			ut_a(index->page == FIL_NULL);
+			index->page = *page_no++;
 			rw_lock_x_unlock(dict_index_get_lock(index));
 		}
-		break;
-
-	case DB_OUT_OF_FILE_SPACE:
-		err = DB_MUST_GET_MORE_FILE_SPACE;
-
-		row_mysql_handle_errors(&err, trx, NULL, NULL);
-
-		/* Fall through to raise error */
-
-	default:
-		/* No other possible error returns */
-		ut_error;
 	}
 
 funct_exit:
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	if (filepath) {
+		mem_free(filepath);
+	}
 
 	if (locked_dictionary) {
 		trx_commit_for_mysql(trx);
@@ -3644,7 +4323,7 @@ funct_exit:
 
 	srv_wake_master_thread();
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -3668,9 +4347,9 @@ row_mysql_drop_temp_tables(void)
 	mtr_start(&mtr);
 
 	btr_pcur_open_at_index_side(
-		TRUE,
+		true,
 		dict_table_get_first_index(dict_sys->sys_tables),
-		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+		BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
 
 	for (;;) {
 		const rec_t*	rec;
@@ -3685,23 +4364,32 @@ row_mysql_drop_temp_tables(void)
 			break;
 		}
 
+		/* The high order bit of N_COLS is set unless
+		ROW_FORMAT=REDUNDANT. */
 		rec = btr_pcur_get_rec(&pcur);
-		field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
-		if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__NAME, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+		if (len != 4
+		    || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) {
 			continue;
 		}
 
-		/* Because this is not a ROW_FORMAT=REDUNDANT table,
-		the is_temp flag is valid.  Examine it. */
-
-		field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+		/* Older versions of InnoDB, which only supported tables
+		in ROW_FORMAT=REDUNDANT could write garbage to
+		SYS_TABLES.MIX_LEN, where we now store the is_temp flag.
+		Above, we assumed is_temp=0 if ROW_FORMAT=REDUNDANT. */
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
 		if (len != 4
 		    || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
 			continue;
 		}
 
 		/* This is a temporary table. */
-		field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__NAME, &len);
 		if (len == UNIV_SQL_NULL || len == 0) {
 			/* Corrupted SYS_TABLES.NAME */
 			continue;
@@ -3712,7 +4400,7 @@ row_mysql_drop_temp_tables(void)
 		btr_pcur_store_position(&pcur, &mtr);
 		btr_pcur_commit_specify_mtr(&pcur, &mtr);
 
-		table = dict_table_get_low(table_name, DICT_ERR_IGNORE_ALL);
+		table = dict_table_get_low(table_name);
 
 		if (table) {
 			row_drop_table_for_mysql(table_name, trx, FALSE);
@@ -3735,15 +4423,15 @@ row_mysql_drop_temp_tables(void)
 Drop all foreign keys in a database, see Bug#18942.
 Called at the end of row_drop_database_for_mysql().
 @return	error code or DB_SUCCESS */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 drop_all_foreign_keys_in_db(
 /*========================*/
 	const char*	name,	/*!< in: database name which ends to '/' */
 	trx_t*		trx)	/*!< in: transaction handle */
 {
 	pars_info_t*	pinfo;
-	ulint		err;
+	dberr_t		err;
 
 	ut_a(name[strlen(name) - 1] == '/');
 
@@ -3795,37 +4483,76 @@ drop_all_foreign_keys_in_db(
 Drops a database for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 row_drop_database_for_mysql(
 /*========================*/
 	const char*	name,	/*!< in: database name which ends to '/' */
 	trx_t*		trx)	/*!< in: transaction handle */
 {
-	dict_table_t* table;
-	char*	table_name;
-	int	err	= DB_SUCCESS;
-	ulint	namelen	= strlen(name);
+	dict_table_t*	table;
+	char*		table_name;
+	dberr_t		err	= DB_SUCCESS;
+	ulint		namelen	= strlen(name);
 
 	ut_a(name != NULL);
 	ut_a(name[namelen - 1] == '/');
 
 	trx->op_info = "dropping database";
 
-	trx_start_if_not_started(trx);
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	trx_start_if_not_started_xa(trx);
 loop:
 	row_mysql_lock_data_dictionary(trx);
 
 	while ((table_name = dict_get_first_table_name_in_db(name))) {
 		ut_a(memcmp(table_name, name, namelen) == 0);
 
-		table = dict_table_get_low(table_name, DICT_ERR_IGNORE_NONE);
+		table = dict_table_open_on_name(
+			table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>(
+				DICT_ERR_IGNORE_INDEX_ROOT
+				| DICT_ERR_IGNORE_CORRUPT));
+
+		if (!table) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Cannot load table %s from InnoDB internal "
+				"data dictionary during drop database",
+				table_name);
+			mem_free(table_name);
+			err = DB_TABLE_NOT_FOUND;
+			break;
 
-		ut_a(table);
+		}
+
+		if (!row_is_mysql_tmp_table_name(table->name)) {
+			/* There could be orphan temp tables left from
+			interrupted alter table. Leave them, and handle
+			the rest.*/
+			if (table->can_be_evicted) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Orphan table encountered during "
+					"DROP DATABASE. This is possible if "
+					"'%s.frm' was lost.", table->name);
+			}
+
+			if (table->ibd_file_missing) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Missing %s.ibd file for table %s.",
+					table->name, table->name);
+			}
+		}
+
+		dict_table_close(table, TRUE, FALSE);
+
+		/* The dict_table_t object must not be accessed before
+		dict_table_open() or after dict_table_close(). But this is OK
+		if we are holding the dict_sys->mutex. */
+		ut_ad(mutex_own(&dict_sys->mutex));
 
 		/* Wait until MySQL does not have any queries running on
 		the table */
 
-		if (table->n_mysql_handles_opened > 0) {
+		if (table->n_ref_count > 0) {
 			row_mysql_unlock_data_dictionary(trx);
 
 			ut_print_timestamp(stderr);
@@ -3851,8 +4578,8 @@ loop:
 		if (err != DB_SUCCESS) {
 			fputs("InnoDB: DROP DATABASE ", stderr);
 			ut_print_name(stderr, trx, TRUE, name);
-			fprintf(stderr, " failed with error %lu for table ",
-				(ulint) err);
+			fprintf(stderr, " failed with error (%s) for table ",
+				ut_strerr(err));
 			ut_print_name(stderr, trx, TRUE, table_name);
 			putc('\n', stderr);
 			mem_free(table_name);
@@ -3865,7 +4592,7 @@ loop:
 	if (err == DB_SUCCESS) {
 		/* after dropping all tables try to drop all leftover
 		foreign keys in case orphaned ones exist */
-		err = (int) drop_all_foreign_keys_in_db(name, trx);
+		err = drop_all_foreign_keys_in_db(name, trx);
 
 		if (err != DB_SUCCESS) {
 			fputs("InnoDB: DROP DATABASE ", stderr);
@@ -3887,9 +4614,9 @@ loop:
 /*********************************************************************//**
 Checks if a table name contains the string "/#sql" which denotes temporary
 tables in MySQL.
-@return	TRUE if temporary table */
-static
-ibool
+@return	true if temporary table */
+UNIV_INTERN __attribute__((warn_unused_result))
+bool
 row_is_mysql_tmp_table_name(
 /*========================*/
 	const char*	name)	/*!< in: table name in the form
@@ -3902,8 +4629,8 @@ row_is_mysql_tmp_table_name(
 /****************************************************************//**
 Delete a single constraint.
 @return	error code or DB_SUCCESS */
-static
-int
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_delete_constraint_low(
 /*======================*/
 	const char*	id,		/*!< in: constraint id */
@@ -3913,7 +4640,7 @@ row_delete_constraint_low(
 
 	pars_info_add_str_literal(info, "id", id);
 
-	return((int) que_eval_sql(info,
+	return(que_eval_sql(info,
 			    "PROCEDURE DELETE_CONSTRAINT () IS\n"
 			    "BEGIN\n"
 			    "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
@@ -3925,8 +4652,8 @@ row_delete_constraint_low(
 /****************************************************************//**
 Delete a single constraint.
 @return	error code or DB_SUCCESS */
-static
-int
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_delete_constraint(
 /*==================*/
 	const char*	id,		/*!< in: constraint id */
@@ -3935,7 +4662,7 @@ row_delete_constraint(
 	mem_heap_t*	heap,		/*!< in: memory heap */
 	trx_t*		trx)		/*!< in: transaction handle */
 {
-	ulint		err;
+	dberr_t	err;
 
 	/* New format constraints have ids <databasename>/<constraintname>. */
 	err = row_delete_constraint_low(
@@ -3952,23 +4679,24 @@ row_delete_constraint(
 		err = row_delete_constraint_low(id, trx);
 	}
 
-	return((int) err);
+	return(err);
 }
 
 /*********************************************************************//**
 Renames a table for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 row_rename_table_for_mysql(
 /*=======================*/
 	const char*	old_name,	/*!< in: old table name */
 	const char*	new_name,	/*!< in: new table name */
-	trx_t*		trx,		/*!< in: transaction handle */
-	ibool		commit)		/*!< in: if TRUE then commit trx */
+	trx_t*		trx,		/*!< in/out: transaction */
+	bool		commit)		/*!< in: whether to commit trx */
 {
-	dict_table_t*	table;
-	ulint		err			= DB_ERROR;
+	dict_table_t*	table			= NULL;
+	ibool		dict_locked		= FALSE;
+	dberr_t		err			= DB_ERROR;
 	mem_heap_t*	heap			= NULL;
 	const char**	constraints_to_drop	= NULL;
 	ulint		n_constraints_to_drop	= 0;
@@ -3978,7 +4706,7 @@ row_rename_table_for_mysql(
 
 	ut_a(old_name != NULL);
 	ut_a(new_name != NULL);
-	ut_ad(trx->state == TRX_ACTIVE);
+	ut_ad(trx->state == TRX_STATE_ACTIVE);
 
 	if (srv_created_new_raw || srv_force_recovery) {
 		fputs("InnoDB: A new raw disk partition was initialized or\n"
@@ -4007,7 +4735,10 @@ row_rename_table_for_mysql(
 	old_is_tmp = row_is_mysql_tmp_table_name(old_name);
 	new_is_tmp = row_is_mysql_tmp_table_name(new_name);
 
-	table = dict_table_get_low(old_name, DICT_ERR_IGNORE_NONE);
+	dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
+
+	table = dict_table_open_on_name(old_name, dict_locked, FALSE,
+					DICT_ERR_IGNORE_NONE);
 
 	if (!table) {
 		err = DB_TABLE_NOT_FOUND;
@@ -4026,18 +4757,19 @@ row_rename_table_for_mysql(
 		      "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
 		      stderr);
 		goto funct_exit;
-	} else if (table->ibd_file_missing) {
+
+	} else if (table->ibd_file_missing
+		   && !dict_table_is_discarded(table)) {
+
 		err = DB_TABLE_NOT_FOUND;
-		ut_print_timestamp(stderr);
 
-		fputs("  InnoDB: Error: table ", stderr);
-		ut_print_name(stderr, trx, TRUE, old_name);
-		fputs(" does not have an .ibd file"
-		      " in the database directory.\n"
-		      "InnoDB: You can look for further help from\n"
-		      "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
-		      stderr);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Table %s does not have an .ibd file in the database "
+			"directory. See " REFMAN "innodb-troubleshooting.html",
+			old_name);
+
 		goto funct_exit;
+
 	} else if (new_is_tmp) {
 		/* MySQL is doing an ALTER TABLE command and it renames the
 		original table to a temporary table name. We want to preserve
@@ -4052,7 +4784,6 @@ row_rename_table_for_mysql(
 			&constraints_to_drop);
 
 		if (err != DB_SUCCESS) {
-
 			goto funct_exit;
 		}
 	}
@@ -4087,15 +4818,45 @@ row_rename_table_for_mysql(
 	err = que_eval_sql(info,
 			   "PROCEDURE RENAME_TABLE () IS\n"
 			   "BEGIN\n"
-			   "UPDATE SYS_TABLES SET NAME = :new_table_name\n"
+			   "UPDATE SYS_TABLES"
+			   " SET NAME = :new_table_name\n"
 			   " WHERE NAME = :old_table_name;\n"
 			   "END;\n"
 			   , FALSE, trx);
 
-	if (err != DB_SUCCESS) {
+	/* SYS_TABLESPACES and SYS_DATAFILES track non-system tablespaces
+	which have space IDs > 0. */
+	if (err == DB_SUCCESS
+	    && table->space != TRX_SYS_SPACE
+	    && !table->ibd_file_missing) {
+		/* Make a new pathname to update SYS_DATAFILES. */
+		char*	new_path = row_make_new_pathname(table, new_name);
 
+		info = pars_info_create();
+
+		pars_info_add_str_literal(info, "new_table_name", new_name);
+		pars_info_add_str_literal(info, "new_path_name", new_path);
+		pars_info_add_int4_literal(info, "space_id", table->space);
+
+		err = que_eval_sql(info,
+				   "PROCEDURE RENAME_SPACE () IS\n"
+				   "BEGIN\n"
+				   "UPDATE SYS_TABLESPACES"
+				   " SET NAME = :new_table_name\n"
+				   " WHERE SPACE = :space_id;\n"
+				   "UPDATE SYS_DATAFILES"
+				   " SET PATH = :new_path_name\n"
+				   " WHERE SPACE = :space_id;\n"
+				   "END;\n"
+				   , FALSE, trx);
+
+		mem_free(new_path);
+	}
+	if (err != DB_SUCCESS) {
 		goto end;
-	} else if (!new_is_tmp) {
+	}
+
+	if (!new_is_tmp) {
 		/* Rename all constraints. */
 		char	new_table_name[MAX_TABLE_NAME_LEN] = "";
 		char	old_table_utf8[MAX_TABLE_NAME_LEN] = "";
@@ -4222,6 +4983,24 @@ row_rename_table_for_mysql(
 		}
 	}
 
+	if (dict_table_has_fts_index(table)
+	    && !dict_tables_have_same_db(old_name, new_name)) {
+		err = fts_rename_aux_tables(table, new_name, trx);
+
+		if (err != DB_SUCCESS && (table->space != 0)) {
+			char*	orig_name = table->name;
+
+			/* If rename fails and table has its own tablespace,
+			we need to call fts_rename_aux_tables again to
+			revert the ibd file rename, which is not under the
+			control of trx. Also notice the parent table name
+			in cache is not changed yet. */
+			table->name = const_cast<char*>(new_name);
+			fts_rename_aux_tables(table, old_name, trx);
+			table->name = orig_name;
+		}
+	}
+
 end:
 	if (err != DB_SUCCESS) {
 		if (err == DB_DUPLICATE_KEY) {
@@ -4261,18 +5040,18 @@ end:
 			      "InnoDB: succeed.\n", stderr);
 		}
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
 	} else {
 		/* The following call will also rename the .ibd data file if
 		the table is stored in a single-table tablespace */
 
-		if (!dict_table_rename_in_cache(table, new_name,
-						!new_is_tmp)) {
+		err = dict_table_rename_in_cache(
+			table, new_name, !new_is_tmp);
+		if (err != DB_SUCCESS) {
 			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			trx->error_state = DB_SUCCESS;
-			err = DB_ERROR;
 			goto funct_exit;
 		}
 
@@ -4280,7 +5059,8 @@ end:
 		an ALTER, not in a RENAME. */
 
 		err = dict_load_foreigns(
-			new_name, FALSE, !old_is_tmp || trx->check_foreigns,
+			new_name, NULL,
+			false, !old_is_tmp || trx->check_foreigns,
 			DICT_ERR_IGNORE_NONE);
 
 		if (err != DB_SUCCESS) {
@@ -4309,27 +5089,18 @@ end:
 				      stderr);
 			}
 
-			ut_a(dict_table_rename_in_cache(table,
-							old_name, FALSE));
+			ut_a(DB_SUCCESS == dict_table_rename_in_cache(
+				table, old_name, FALSE));
 			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			trx->error_state = DB_SUCCESS;
-		} else {
-			if (old_is_tmp && !new_is_tmp) {
-				/* After ALTER TABLE the table statistics
-				needs to be rebuilt.  Even if we close
-				table below there could be other
-				transactions using this table (e.g.
-				SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`),
-				thus we can't remove table from dictionary cache
-				here. Therefore, we initialize the
-				transient statistics here. */
-				dict_stats_update_transient(table);
-			}
 		}
 	}
 
 funct_exit:
+	if (table != NULL) {
+		dict_table_close(table, dict_locked, FALSE);
+	}
 
 	if (commit) {
 		trx_commit_for_mysql(trx);
@@ -4348,9 +5119,9 @@ funct_exit:
 Checks that the index contains entries in an ascending order, unique
 constraint is not broken, and calculates the number of index entries
 in the read view of the current transaction.
-@return	TRUE if ok */
+@return	true if ok */
 UNIV_INTERN
-ibool
+bool
 row_check_index_for_mysql(
 /*======================*/
 	row_prebuilt_t*		prebuilt,	/*!< in: prebuilt struct
@@ -4365,7 +5136,7 @@ row_check_index_for_mysql(
 	byte*		buf;
 	ulint		ret;
 	rec_t*		rec;
-	ibool		is_ok		= TRUE;
+	bool		is_ok		= true;
 	int		cmp;
 	ibool		contains_null;
 	ulint		i;
@@ -4378,7 +5149,23 @@ row_check_index_for_mysql(
 
 	*n_rows = 0;
 
-	buf = mem_alloc(UNIV_PAGE_SIZE);
+	if (dict_index_is_clust(index)) {
+		/* The clustered index of a table is always available.
+		During online ALTER TABLE that rebuilds the table, the
+		clustered index in the old table will have
+		index->online_log pointing to the new table. All
+		indexes of the old table will remain valid and the new
+		table will be inaccessible to MySQL until the
+		completion of the ALTER TABLE. */
+	} else if (dict_index_is_online_ddl(index)
+		   || (index->type & DICT_FTS)) {
+		/* Full-text indexes are implemented by auxiliary tables,
+		not the B-tree. We also skip secondary indexes that are
+		being created online. */
+		return(true);
+	}
+
+	buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE));
 	heap = mem_heap_create(100);
 
 	cnt = 1000;
@@ -4440,6 +5227,7 @@ func_exit:
 				    dtuple_get_nth_field(prev_entry, i))) {
 
 				contains_null = TRUE;
+				break;
 			}
 		}
 
@@ -4456,7 +5244,7 @@ not_ok:
 			      "InnoDB: record ", stderr);
 			rec_print_new(stderr, rec, offsets);
 			putc('\n', stderr);
-			is_ok = FALSE;
+			is_ok = false;
 		} else if (dict_index_is_unique(index)
 			   && !contains_null
 			   && matched_fields
@@ -4479,14 +5267,15 @@ not_ok:
 				* sizeof *offsets;
 
 			tmp_heap = mem_heap_create(size);
-			offsets = mem_heap_dup(tmp_heap, offsets, size);
+
+			offsets = static_cast<ulint*>(
+				mem_heap_dup(tmp_heap, offsets, size));
 		}
 
 		mem_heap_empty(heap);
 
-		prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec,
-						    index, offsets,
-						    &n_ext, heap);
+		prev_entry = row_rec_to_index_entry(
+			rec, index, offsets, &n_ext, heap);
 
 		if (UNIV_LIKELY_NULL(tmp_heap)) {
 			mem_heap_free(tmp_heap);
@@ -4500,9 +5289,9 @@ not_ok:
 
 /*********************************************************************//**
 Determines if a table is a magic monitor table.
-@return	TRUE if monitor table */
+@return	true if monitor table */
 UNIV_INTERN
-ibool
+bool
 row_is_magic_monitor_table(
 /*=======================*/
 	const char*	table_name)	/*!< in: name of the table, in the
@@ -4511,19 +5300,45 @@ row_is_magic_monitor_table(
 	const char*	name; /* table_name without database/ */
 	ulint		len;
 
-	name = strchr(table_name, '/');
-	ut_a(name != NULL);
-	name++;
+	name = dict_remove_db_name(table_name);
 	len = strlen(name) + 1;
 
-	if (STR_EQ(name, len, S_innodb_monitor)
-	    || STR_EQ(name, len, S_innodb_lock_monitor)
-	    || STR_EQ(name, len, S_innodb_tablespace_monitor)
-	    || STR_EQ(name, len, S_innodb_table_monitor)
-	    || STR_EQ(name, len, S_innodb_mem_validate)) {
+	return(STR_EQ(name, len, S_innodb_monitor)
+	       || STR_EQ(name, len, S_innodb_lock_monitor)
+	       || STR_EQ(name, len, S_innodb_tablespace_monitor)
+	       || STR_EQ(name, len, S_innodb_table_monitor)
+#ifdef UNIV_MEM_DEBUG
+	       || STR_EQ(name, len, S_innodb_mem_validate)
+#endif /* UNIV_MEM_DEBUG */
+	       );
+}
 
-		return(TRUE);
-	}
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void)
+/*================*/
+{
+	mutex_create(
+		row_drop_list_mutex_key,
+		&row_drop_list_mutex, SYNC_NO_ORDER_CHECK);
+
+	UT_LIST_INIT(row_mysql_drop_list);
+
+	row_mysql_drop_list_inited = TRUE;
+}
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void)
+/*================*/
+{
+	ut_a(UT_LIST_GET_LEN(row_mysql_drop_list) == 0);
+
+	mutex_free(&row_drop_list_mutex);
 
-	return(FALSE);
+	row_mysql_drop_list_inited = FALSE;
 }
diff --git a/storage/xtradb/row/row0purge.c b/storage/xtradb/row/row0purge.cc
index efcfdc3bac5..1b836c26c25 100644
--- a/storage/xtradb/row/row0purge.c
+++ b/storage/xtradb/row/row0purge.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0purge.c
+@file row/row0purge.cc
 Purge obsolete records
 
 Created 3/14/1997 Heikki Tuuri
@@ -42,7 +42,10 @@ Created 3/14/1997 Heikki Tuuri
 #include "row0upd.h"
 #include "row0vers.h"
 #include "row0mysql.h"
+#include "row0log.h"
 #include "log0log.h"
+#include "srv0mon.h"
+#include "srv0start.h"
 
 /*************************************************************************
 IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -61,18 +64,19 @@ UNIV_INTERN
 purge_node_t*
 row_purge_node_create(
 /*==================*/
-	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
-	mem_heap_t*	heap)	/*!< in: memory heap where created */
+	que_thr_t*	parent,		/*!< in: parent node  */
+	mem_heap_t*	heap)		/*!< in: memory heap where created */
 {
 	purge_node_t*	node;
 
 	ut_ad(parent && heap);
 
-	node = mem_heap_alloc(heap, sizeof(purge_node_t));
+	node = static_cast<purge_node_t*>(
+		mem_heap_zalloc(heap, sizeof(*node)));
 
 	node->common.type = QUE_NODE_PURGE;
 	node->common.parent = parent;
-
+	node->done = TRUE;
 	node->heap = mem_heap_create(256);
 
 	return(node);
@@ -90,138 +94,127 @@ row_purge_reposition_pcur(
 	purge_node_t*	node,	/*!< in: row purge node */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ibool	found;
-
 	if (node->found_clust) {
-		found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+		ibool	found;
 
-		return(found);
-	}
+		found = btr_pcur_restore_position(mode, &node->pcur, mtr);
 
-	found = row_search_on_row_ref(&(node->pcur), mode, node->table,
-				      node->ref, mtr);
-	node->found_clust = found;
+		return(found);
+	} else {
+		node->found_clust = row_search_on_row_ref(
+			&node->pcur, mode, node->table, node->ref, mtr);
 
-	if (found) {
-		btr_pcur_store_position(&(node->pcur), mtr);
+		if (node->found_clust) {
+			btr_pcur_store_position(&node->pcur, mtr);
+		}
 	}
 
-	return(found);
+	return(node->found_clust);
 }
 
 /***********************************************************//**
 Removes a delete marked clustered index record if possible.
-@return TRUE if success, or if not found, or if modified after the
-delete marking */
-static
-ibool
+@retval true if the row was not found, was modified later, or was removed
+@retval false if the record could not be deleted (e.g. out of file space) */
+static __attribute__((nonnull, warn_unused_result))
+bool
 row_purge_remove_clust_if_poss_low(
 /*===============================*/
-	purge_node_t*	node,	/*!< in: row purge node */
+	purge_node_t*	node,	/*!< in/out: row purge node */
 	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
 {
-	dict_index_t*	index;
-	btr_pcur_t*	pcur;
-	btr_cur_t*	btr_cur;
-	ibool		success;
-	ulint		err;
-	mtr_t		mtr;
-	rec_t*		rec;
-	mem_heap_t*	heap		= NULL;
-	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	dict_index_t*		index;
+	bool			success		= true;
+	mtr_t			mtr;
+	rec_t*			rec;
+	mem_heap_t*		heap		= NULL;
+	ulint*			offsets;
+	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
 	rec_offs_init(offsets_);
 
-	index = dict_table_get_first_index(node->table);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
-	pcur = &(node->pcur);
-	btr_cur = btr_pcur_get_btr_cur(pcur);
+	index = dict_table_get_first_index(node->table);
 
 	log_free_check();
 	mtr_start(&mtr);
 
-	success = row_purge_reposition_pcur(mode, node, &mtr);
-
-	if (!success) {
-		/* The record is already removed */
-
-		btr_pcur_commit_specify_mtr(pcur, &mtr);
-
-		return(TRUE);
+	if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+		/* The record was already removed. */
+		goto func_exit;
 	}
 
-	rec = btr_pcur_get_rec(pcur);
-
-	if (node->roll_ptr != row_get_rec_roll_ptr(
-		    rec, index, rec_get_offsets(rec, index, offsets_,
-						ULINT_UNDEFINED, &heap))) {
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-		/* Someone else has modified the record later: do not remove */
-		btr_pcur_commit_specify_mtr(pcur, &mtr);
+	rec = btr_pcur_get_rec(&node->pcur);
 
-		return(TRUE);
-	}
+	offsets = rec_get_offsets(
+		rec, index, offsets_, ULINT_UNDEFINED, &heap);
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
+	if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) {
+		/* Someone else has modified the record later: do not remove */
+		goto func_exit;
 	}
 
 	if (mode == BTR_MODIFY_LEAF) {
-		success = btr_cur_optimistic_delete(btr_cur, &mtr);
+		success = btr_cur_optimistic_delete(
+			btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
 	} else {
+		dberr_t	err;
 		ut_ad(mode == BTR_MODIFY_TREE);
-		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
-					   RB_NONE, &mtr);
+		btr_cur_pessimistic_delete(
+			&err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0,
+			RB_NONE, &mtr);
 
-		if (err == DB_SUCCESS) {
-			success = TRUE;
-		} else if (err == DB_OUT_OF_FILE_SPACE) {
-			success = FALSE;
-		} else {
+		switch (err) {
+		case DB_SUCCESS:
+			break;
+		case DB_OUT_OF_FILE_SPACE:
+			success = false;
+			break;
+		default:
 			ut_error;
 		}
 	}
 
-	btr_pcur_commit_specify_mtr(pcur, &mtr);
+func_exit:
+	if (heap) {
+		mem_heap_free(heap);
+	}
+
+	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
 
 	return(success);
 }
 
 /***********************************************************//**
 Removes a clustered index record if it has not been modified after the delete
-marking. */
-static
-void
+marking.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of running out
+of file space. */
+static __attribute__((nonnull, warn_unused_result))
+bool
 row_purge_remove_clust_if_poss(
 /*===========================*/
-	purge_node_t*	node)	/*!< in: row purge node */
+	purge_node_t*	node)	/*!< in/out: row purge node */
 {
-	ibool	success;
-	ulint	n_tries	= 0;
-
-	/*	fputs("Purge: Removing clustered record\n", stderr); */
-
-	success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF);
-	if (success) {
-
-		return;
+	if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+		return(true);
 	}
-retry:
-	success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE);
-	/* The delete operation may fail if we have little
-	file space left: TODO: easiest to crash the database
-	and restart with more file space */
 
-	if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
-		n_tries++;
+	for (ulint n_tries = 0;
+	     n_tries < BTR_CUR_RETRY_DELETE_N_TIMES;
+	     n_tries++) {
+		if (row_purge_remove_clust_if_poss_low(
+			    node, BTR_MODIFY_TREE)) {
+			return(true);
+		}
 
 		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
-
-		goto retry;
 	}
 
-	ut_a(success);
+	return(false);
 }
 
 /***********************************************************//**
@@ -233,21 +226,21 @@ is newer than the purge view.
 NOTE: This function should only be called by the purge thread, only
 while holding a latch on the leaf page of the secondary index entry
 (or keeping the buffer pool watch on the page).  It is possible that
-this function first returns TRUE and then FALSE, if a user transaction
+this function first returns true and then false, if a user transaction
 inserts a record that the secondary index entry would refer to.
 However, in that case, the user transaction would also re-insert the
 secondary index entry after purge has removed it and released the leaf
 page latch.
-@return	TRUE if the secondary index record can be purged */
+@return	true if the secondary index record can be purged */
 UNIV_INTERN
-ibool
+bool
 row_purge_poss_sec(
 /*===============*/
 	purge_node_t*	node,	/*!< in/out: row purge node */
 	dict_index_t*	index,	/*!< in: secondary index */
 	const dtuple_t*	entry)	/*!< in: secondary index entry */
 {
-	ibool	can_delete;
+	bool	can_delete;
 	mtr_t	mtr;
 
 	ut_ad(!dict_index_is_clust(index));
@@ -267,7 +260,7 @@ row_purge_poss_sec(
 Removes a secondary index entry if possible, by modifying the
 index tree.  Does not try to buffer the delete.
 @return	TRUE if success or if not found */
-static
+static __attribute__((nonnull, warn_unused_result))
 ibool
 row_purge_remove_sec_if_poss_tree(
 /*==============================*/
@@ -278,13 +271,35 @@ row_purge_remove_sec_if_poss_tree(
 	btr_pcur_t		pcur;
 	btr_cur_t*		btr_cur;
 	ibool			success	= TRUE;
-	ulint			err;
+	dberr_t			err;
 	mtr_t			mtr;
 	enum row_search_result	search_result;
 
 	log_free_check();
 	mtr_start(&mtr);
 
+	if (*index->name == TEMP_INDEX_PREFIX) {
+		/* The index->online_status may change if the
+		index->name starts with TEMP_INDEX_PREFIX (meaning
+		that the index is or was being created online). It is
+		protected by index->lock. */
+		mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+		if (dict_index_is_online_ddl(index)) {
+			/* Online secondary index creation will not
+			copy any delete-marked records. Therefore
+			there is nothing to be purged. We must also
+			skip the purge when a completed index is
+			dropped by rollback_inplace_alter_table(). */
+			goto func_exit_no_pcur;
+		}
+	} else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+		ut_ad(!dict_index_is_online_ddl(index));
+	}
+
 	search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE,
 					       &pcur, &mtr);
 
@@ -326,7 +341,7 @@ row_purge_remove_sec_if_poss_tree(
 		      & rec_get_info_bits(btr_cur_get_rec(btr_cur),
 					  dict_table_is_comp(index->table)));
 
-		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
 					   RB_NONE, &mtr);
 		switch (UNIV_EXPECT(err, DB_SUCCESS)) {
 		case DB_SUCCESS:
@@ -341,6 +356,7 @@ row_purge_remove_sec_if_poss_tree(
 
 func_exit:
 	btr_pcur_close(&pcur);
+func_exit_no_pcur:
 	mtr_commit(&mtr);
 
 	return(success);
@@ -349,9 +365,10 @@ func_exit:
 /***************************************************************
 Removes a secondary index entry without modifying the index tree,
 if possible.
-@return	TRUE if success or if not found */
-static
-ibool
+@retval	true if success or if not found
+@retval	false if row_purge_remove_sec_if_poss_tree() should be invoked */
+static __attribute__((nonnull, warn_unused_result))
+bool
 row_purge_remove_sec_if_poss_leaf(
 /*==============================*/
 	purge_node_t*	node,	/*!< in: row purge node */
@@ -360,23 +377,50 @@ row_purge_remove_sec_if_poss_leaf(
 {
 	mtr_t			mtr;
 	btr_pcur_t		pcur;
+	ulint			mode;
 	enum row_search_result	search_result;
+	bool			success	= true;
 
 	log_free_check();
 
 	mtr_start(&mtr);
 
+	if (*index->name == TEMP_INDEX_PREFIX) {
+		/* The index->online_status may change if the
+		index->name starts with TEMP_INDEX_PREFIX (meaning
+		that the index is or was being created online). It is
+		protected by index->lock. */
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+		if (dict_index_is_online_ddl(index)) {
+			/* Online secondary index creation will not
+			copy any delete-marked records. Therefore
+			there is nothing to be purged. We must also
+			skip the purge when a completed index is
+			dropped by rollback_inplace_alter_table(). */
+			goto func_exit_no_pcur;
+		}
+
+		mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED | BTR_DELETE;
+	} else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+		ut_ad(!dict_index_is_online_ddl(index));
+
+		mode = BTR_MODIFY_LEAF | BTR_DELETE;
+	}
+
 	/* Set the purge node for the call to row_purge_poss_sec(). */
 	pcur.btr_cur.purge_node = node;
 	/* Set the query thread, so that ibuf_insert_low() will be
 	able to invoke thd_get_trx(). */
-	pcur.btr_cur.thr = que_node_get_parent(node);
+	pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
 
 	search_result = row_search_index_entry(
-		index, entry, BTR_MODIFY_LEAF | BTR_DELETE, &pcur, &mtr);
+		index, entry, mode, &pcur, &mtr);
 
 	switch (search_result) {
-		ibool	success;
 	case ROW_FOUND:
 		/* Before attempting to purge a record, check
 		if it is safe to do so. */
@@ -389,11 +433,10 @@ row_purge_remove_sec_if_poss_leaf(
 				      btr_cur_get_rec(btr_cur),
 				      dict_table_is_comp(index->table)));
 
-			if (!btr_cur_optimistic_delete(btr_cur, &mtr)) {
+			if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
 
 				/* The index entry could not be deleted. */
-				success = FALSE;
-				goto func_exit;
+				success = false;
 			}
 		}
 		/* fall through (the index entry is still needed,
@@ -404,9 +447,8 @@ row_purge_remove_sec_if_poss_leaf(
 		/* The deletion was buffered. */
 	case ROW_NOT_FOUND:
 		/* The index entry does not exist, nothing to do. */
-		success = TRUE;
-	func_exit:
 		btr_pcur_close(&pcur);
+	func_exit_no_pcur:
 		mtr_commit(&mtr);
 		return(success);
 	}
@@ -417,19 +459,26 @@ row_purge_remove_sec_if_poss_leaf(
 
 /***********************************************************//**
 Removes a secondary index entry if possible. */
-UNIV_INLINE
+UNIV_INLINE __attribute__((nonnull(1,2)))
 void
 row_purge_remove_sec_if_poss(
 /*=========================*/
 	purge_node_t*	node,	/*!< in: row purge node */
 	dict_index_t*	index,	/*!< in: index */
-	dtuple_t*	entry)	/*!< in: index entry */
+	const dtuple_t*	entry)	/*!< in: index entry */
 {
 	ibool	success;
 	ulint	n_tries		= 0;
 
 	/*	fputs("Purge: Removing secondary record\n", stderr); */
 
+	if (!entry) {
+		/* The node->row must have lacked some fields of this
+		index. This is possible when the undo log record was
+		written before this index was created. */
+		return;
+	}
+
 	if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) {
 
 		return;
@@ -453,18 +502,17 @@ retry:
 }
 
 /***********************************************************//**
-Purges a delete marking of a record. */
-static
-void
+Purges a delete marking of a record.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of
+running out of file space */
+static __attribute__((nonnull, warn_unused_result))
+bool
 row_purge_del_mark(
 /*===============*/
-	purge_node_t*	node)	/*!< in: row purge node */
+	purge_node_t*	node)	/*!< in/out: row purge node */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-
-	ut_ad(node);
 
 	heap = mem_heap_create(1024);
 
@@ -476,19 +524,19 @@ row_purge_del_mark(
 			break;
 		}
 
-		index = node->index;
-
-		/* Build the index entry */
-		entry = row_build_index_entry(node->row, NULL, index, heap);
-		ut_a(entry);
-		row_purge_remove_sec_if_poss(node, index, entry);
+		if (node->index->type != DICT_FTS) {
+			dtuple_t*	entry = row_build_index_entry_low(
+				node->row, NULL, node->index, heap);
+			row_purge_remove_sec_if_poss(node, node->index, entry);
+			mem_heap_empty(heap);
+		}
 
 		node->index = dict_table_get_next_index(node->index);
 	}
 
 	mem_heap_free(heap);
 
-	row_purge_remove_clust_if_poss(node);
+	return(row_purge_remove_clust_if_poss(node));
 }
 
 /***********************************************************//**
@@ -499,21 +547,16 @@ void
 row_purge_upd_exist_or_extern_func(
 /*===============================*/
 #ifdef UNIV_DEBUG
-	const que_thr_t*thr,	/*!< in: query thread */
+	const que_thr_t*thr,		/*!< in: query thread */
 #endif /* UNIV_DEBUG */
-	purge_node_t*	node)	/*!< in: row purge node */
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec)	/*!< in: record to purge */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ibool		is_insert;
-	ulint		rseg_id;
-	ulint		page_no;
-	ulint		offset;
-	ulint		i;
-	mtr_t		mtr;
 
-	ut_ad(node);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
 	if (node->rec_type == TRX_UNDO_UPD_DEL_REC
 	    || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
@@ -530,15 +573,13 @@ row_purge_upd_exist_or_extern_func(
 			break;
 		}
 
-		index = node->index;
-
 		if (row_upd_changes_ord_field_binary(node->index, node->update,
 						     thr, NULL, NULL)) {
 			/* Build the older version of the index entry */
-			entry = row_build_index_entry(node->row, NULL,
-						      index, heap);
-			ut_a(entry);
-			row_purge_remove_sec_if_poss(node, index, entry);
+			dtuple_t*	entry = row_build_index_entry_low(
+				node->row, NULL, node->index, heap);
+			row_purge_remove_sec_if_poss(node, node->index, entry);
+			mem_heap_empty(heap);
 		}
 
 		node->index = dict_table_get_next_index(node->index);
@@ -548,18 +589,25 @@ row_purge_upd_exist_or_extern_func(
 
 skip_secondaries:
 	/* Free possible externally stored fields */
-	for (i = 0; i < upd_get_n_fields(node->update); i++) {
+	for (ulint i = 0; i < upd_get_n_fields(node->update); i++) {
 
 		const upd_field_t*	ufield
 			= upd_get_nth_field(node->update, i);
 
 		if (dfield_is_ext(&ufield->new_val)) {
+			trx_rseg_t*	rseg;
 			buf_block_t*	block;
 			ulint		internal_offset;
 			byte*		data_field;
+			dict_index_t*	index;
+			ibool		is_insert;
+			ulint		rseg_id;
+			ulint		page_no;
+			ulint		offset;
+			mtr_t		mtr;
 
 			/* We use the fact that new_val points to
-			node->undo_rec and get thus the offset of
+			undo_rec and get thus the offset of
 			dfield data inside the undo record. Then we
 			can calculate from node->roll_ptr the file
 			address of the new_val data */
@@ -567,20 +615,24 @@ skip_secondaries:
 			internal_offset
 				= ((const byte*)
 				   dfield_get_data(&ufield->new_val))
-				- node->undo_rec;
+				- undo_rec;
 
 			ut_a(internal_offset < UNIV_PAGE_SIZE);
 
 			trx_undo_decode_roll_ptr(node->roll_ptr,
 						 &is_insert, &rseg_id,
 						 &page_no, &offset);
+
+			rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+			ut_a(rseg != NULL);
+			ut_a(rseg->id == rseg_id);
+
 			mtr_start(&mtr);
 
 			/* We have to acquire an X-latch to the clustered
 			index tree */
 
 			index = dict_table_get_first_index(node->table);
-
 			mtr_x_lock(dict_index_get_lock(index), &mtr);
 
 			/* NOTE: we must also acquire an X-latch to the
@@ -594,10 +646,9 @@ skip_secondaries:
 
 			btr_root_get(index, &mtr);
 
-			/* We assume in purge of externally stored fields
-			that the space id of the undo log record is 0! */
+			block = buf_page_get(
+				rseg->space, 0, page_no, RW_X_LATCH, &mtr);
 
-			block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr);
 			buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
 			data_field = buf_block_get_frame(block)
@@ -616,26 +667,25 @@ skip_secondaries:
 }
 
 #ifdef UNIV_DEBUG
-# define row_purge_upd_exist_or_extern(thr,node)	\
-	row_purge_upd_exist_or_extern_func(thr,node)
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec)	\
+	row_purge_upd_exist_or_extern_func(thr,node,undo_rec)
 #else /* UNIV_DEBUG */
-# define row_purge_upd_exist_or_extern(thr,node)	\
-	row_purge_upd_exist_or_extern_func(node)
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec)	\
+	row_purge_upd_exist_or_extern_func(node,undo_rec)
 #endif /* UNIV_DEBUG */
 
 /***********************************************************//**
 Parses the row reference and other info in a modify undo log record.
-@return TRUE if purge operation required: NOTE that then the CALLER
-must unfreeze data dictionary! */
+@return true if purge operation required */
 static
-ibool
+bool
 row_purge_parse_undo_rec(
 /*=====================*/
-	purge_node_t*	node,	/*!< in: row undo node */
-	ibool*		updated_extern,
-				/*!< out: TRUE if an externally stored field
-				was updated */
-	que_thr_t*	thr)	/*!< in: query thread */
+	purge_node_t*		node,		/*!< in: row purge node */
+	trx_undo_rec_t*		undo_rec,	/*!< in: record to purge */
+	bool*			updated_extern, /*!< out: true if an externally
+						stored field was updated */
+	que_thr_t*		thr)		/*!< in: query thread */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -649,52 +699,39 @@ row_purge_parse_undo_rec(
 
 	ut_ad(node && thr);
 
-	trx = thr_get_trx(thr);
-
 	ptr = trx_undo_rec_get_pars(
-		node->undo_rec, &type, &node->cmpl_info,
+		undo_rec, &type, &node->cmpl_info,
 		updated_extern, &undo_no, &table_id);
+
 	node->rec_type = type;
 
-	if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+	if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) {
 
-		return(FALSE);
+		return(false);
 	}
 
 	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
 					       &info_bits);
 	node->table = NULL;
 
-	if (type == TRX_UNDO_UPD_EXIST_REC
-	    && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE
-	    && !(*updated_extern)) {
-
-		/* Purge requires no changes to indexes: we may return */
-
-		return(FALSE);
-	}
-
 	/* Prevent DROP TABLE etc. from running when we are doing the purge
 	for this row */
 
-	row_mysql_freeze_data_dictionary(trx);
-
-	mutex_enter(&(dict_sys->mutex));
-
-	node->table = dict_table_get_on_id_low(table_id);
+	rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__);
 
-	mutex_exit(&(dict_sys->mutex));
+	node->table = dict_table_open_on_id(
+		table_id, FALSE, DICT_TABLE_OP_NORMAL);
 
 	if (node->table == NULL) {
 		/* The table has been dropped: no need to do purge */
-err_exit:
-		row_mysql_unfreeze_data_dictionary(trx);
-		return(FALSE);
+		goto err_exit;
 	}
 
 	if (node->table->ibd_file_missing) {
 		/* We skip purge of missing .ibd files */
 
+		dict_table_close(node->table, FALSE, FALSE);
+
 		node->table = NULL;
 
 		goto err_exit;
@@ -703,14 +740,29 @@ err_exit:
 	clust_index = dict_table_get_first_index(node->table);
 
 	if (clust_index == NULL) {
-		/* The table was corrupt in the data dictionary */
+		/* The table was corrupt in the data dictionary.
+		dict_set_corrupted() works on an index, and
+		we do not have an index to call it with. */
+close_exit:
+		dict_table_close(node->table, FALSE, FALSE);
+err_exit:
+		rw_lock_s_unlock(&dict_operation_lock);
+		return(false);
+	}
 
-		goto err_exit;
+	if (type == TRX_UNDO_UPD_EXIST_REC
+	    && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+	    && !*updated_extern) {
+
+		/* Purge requires no changes to indexes: we may return */
+		goto close_exit;
 	}
 
 	ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
 				       node->heap);
 
+	trx = thr_get_trx(thr);
+
 	ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
 					     roll_ptr, info_bits, trx,
 					     node->heap, &(node->update));
@@ -724,10 +776,72 @@ err_exit:
 			node->heap);
 	}
 
-	return(TRUE);
+	return(true);
 }
 
 /***********************************************************//**
+Purges the parsed record.
+@return true if purged, false if skipped */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_record_func(
+/*==================*/
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec,	/*!< in: record to purge */
+#ifdef UNIV_DEBUG
+	const que_thr_t*thr,		/*!< in: query thread */
+#endif /* UNIV_DEBUG */
+	bool		updated_extern)	/*!< in: whether external columns
+					were updated */
+{
+	dict_index_t*	clust_index;
+	bool		purged		= true;
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	node->index = dict_table_get_next_index(clust_index);
+
+	switch (node->rec_type) {
+	case TRX_UNDO_DEL_MARK_REC:
+		purged = row_purge_del_mark(node);
+		if (!purged) {
+			break;
+		}
+		MONITOR_INC(MONITOR_N_DEL_ROW_PURGE);
+		break;
+	default:
+		if (!updated_extern) {
+			break;
+		}
+		/* fall through */
+	case TRX_UNDO_UPD_EXIST_REC:
+		row_purge_upd_exist_or_extern(thr, node, undo_rec);
+		MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN);
+		break;
+	}
+
+	if (node->found_clust) {
+		btr_pcur_close(&node->pcur);
+		node->found_clust = FALSE;
+	}
+
+	if (node->table != NULL) {
+		dict_table_close(node->table, FALSE, FALSE);
+		node->table = NULL;
+	}
+
+	return(purged);
+}
+
+#ifdef UNIV_DEBUG
+# define row_purge_record(node,undo_rec,thr,updated_extern)	\
+	row_purge_record_func(node,undo_rec,thr,updated_extern)
+#else /* UNIV_DEBUG */
+# define row_purge_record(node,undo_rec,thr,updated_extern)	\
+	row_purge_record_func(node,undo_rec,updated_extern)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************//**
 Fetches an undo log record and does the purge for the recorded operation.
 If none left, or the current purge completed, returns the control to the
 parent node, which is always a query thread node. */
@@ -735,53 +849,57 @@ static __attribute__((nonnull))
 void
 row_purge(
 /*======*/
-	purge_node_t*	node,	/*!< in: row purge node */
-	que_thr_t*	thr)	/*!< in: query thread */
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec,	/*!< in: record to purge */
+	que_thr_t*	thr)		/*!< in: query thread */
 {
-	ibool		updated_extern;
+	if (undo_rec != &trx_purge_dummy_rec) {
+		bool	updated_extern;
 
-	ut_ad(node);
-	ut_ad(thr);
+		while (row_purge_parse_undo_rec(
+			       node, undo_rec, &updated_extern, thr)) {
 
-	node->undo_rec = trx_purge_fetch_next_rec(&node->roll_ptr,
-						  &node->reservation,
-						  node->heap);
-	if (!node->undo_rec) {
-		/* Purge completed for this query thread */
+			bool purged = row_purge_record(
+				node, undo_rec, thr, updated_extern);
 
-		thr->run_node = que_node_get_parent(node);
+			rw_lock_s_unlock(&dict_operation_lock);
 
-		return;
+			if (purged
+			    || srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+				return;
+			}
+
+			/* Retry the purge in a second. */
+			os_thread_sleep(1000000);
+		}
 	}
+}
 
-	if (node->undo_rec != &trx_purge_dummy_rec
-	    && row_purge_parse_undo_rec(node, &updated_extern, thr)) {
-		node->found_clust = FALSE;
+/***********************************************************//**
+Reset the purge query thread. */
+UNIV_INLINE
+void
+row_purge_end(
+/*==========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	purge_node_t*	node;
 
-		node->index = dict_table_get_next_index(
-			dict_table_get_first_index(node->table));
+	ut_ad(thr);
 
-		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
-			row_purge_del_mark(node);
+	node = static_cast<purge_node_t*>(thr->run_node);
 
-		} else if (updated_extern
-			   || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
 
-			row_purge_upd_exist_or_extern(thr, node);
-		}
+	thr->run_node = que_node_get_parent(node);
 
-		if (node->found_clust) {
-			btr_pcur_close(&(node->pcur));
-		}
+	node->undo_recs = NULL;
 
-		row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
-	}
+	node->done = TRUE;
 
-	/* Do some cleanup */
-	trx_purge_rec_release(node->reservation);
-	mem_heap_empty(node->heap);
+	ut_a(thr->run_node != NULL);
 
-	thr->run_node = node;
+	mem_heap_empty(node->heap);
 }
 
 /***********************************************************//**
@@ -798,11 +916,39 @@ row_purge_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<purge_node_t*>(thr->run_node);
+
+	node->table = NULL;
+	node->row = NULL;
+	node->ref = NULL;
+	node->index = NULL;
+	node->update = NULL;
+	node->found_clust = FALSE;
+	node->rec_type = ULINT_UNDEFINED;
+	node->cmpl_info = ULINT_UNDEFINED;
+
+	ut_a(!node->done);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
 
-	row_purge(node, thr);
+	if (!(node->undo_recs == NULL || ib_vector_is_empty(node->undo_recs))) {
+		trx_purge_rec_t*purge_rec;
+
+		purge_rec = static_cast<trx_purge_rec_t*>(
+			ib_vector_pop(node->undo_recs));
+
+		node->roll_ptr = purge_rec->roll_ptr;
+
+		row_purge(node, purge_rec->undo_rec, thr);
+
+		if (ib_vector_is_empty(node->undo_recs)) {
+			row_purge_end(thr);
+		} else {
+			thr->run_node = node;
+		}
+	} else {
+		row_purge_end(thr);
+	}
 
 	return(thr);
 }
diff --git a/storage/xtradb/row/row0quiesce.cc b/storage/xtradb/row/row0quiesce.cc
new file mode 100644
index 00000000000..79cced1c533
--- /dev/null
+++ b/storage/xtradb/row/row0quiesce.cc
@@ -0,0 +1,703 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0quiesce.cc
+Quiesce a tablespace.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0quiesce.h"
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0quiesce.ic"
+#endif
+
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+
+/*********************************************************************//**
+Serialise the user fields of one index into the meta data config file.
+@return DB_SUCCESS, or DB_IO_ERROR if any write to the file fails. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_index_fields(
+/*===========================*/
+	const dict_index_t*	index,	/*!< in: write the meta data for
+					this index */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	const ulint		n_fields = index->n_fields;
+	byte			buf[sizeof(ib_uint32_t) * 2];
+
+	for (ulint n = 0; n < n_fields; ++n) {
+		const dict_field_t&	cur = index->fields[n];
+
+		/* Fixed-size part: prefix_len, then fixed_len. */
+		mach_write_to_4(buf, cur.prefix_len);
+		mach_write_to_4(buf + sizeof(ib_uint32_t), cur.fixed_len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_9",
+				close(fileno(file)););
+
+		if (fwrite(buf, 1, sizeof(buf), file) != sizeof(buf)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing index fields.");
+
+			return(DB_IO_ERROR);
+		}
+
+		/* Name part: [len, bytes]; len counts the NUL byte. */
+		const ib_uint32_t	len = strlen(cur.name) + 1;
+		ut_a(len > 1);
+
+		mach_write_to_4(buf, len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_10",
+				close(fileno(file)););
+
+		bool	ok = fwrite(buf, 1, sizeof(len), file) == sizeof(len);
+		ok = ok && fwrite(cur.name, 1, len, file) == len;
+
+		if (!ok) {
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing index column.");
+
+			return(DB_IO_ERROR);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the index count, then the meta data, name and fields of each index.
+@return DB_SUCCESS, or DB_IO_ERROR as soon as one write fails. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_indexes(
+/*======================*/
+	const dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	{
+		byte		row[sizeof(ib_uint32_t)];
+
+		/* Write the number of indexes in the table. */
+		mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes));
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_11",
+				close(fileno(file)););
+
+		if (fwrite(row, 1,  sizeof(row), file) != sizeof(row)) {
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing index count.");
+
+			return(DB_IO_ERROR);
+		}
+	}
+
+	dberr_t			err = DB_SUCCESS;
+
+	/* Write each index in list order; the loop ends on the first error. */
+	for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+	     index != 0 && err == DB_SUCCESS;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+		byte*		ptr;
+		byte		row[sizeof(index_id_t)
+				    + sizeof(ib_uint32_t) * 8];
+		/* Fixed part: 8-byte id followed by eight 4-byte values. */
+		ptr = row;
+
+		ut_ad(sizeof(index_id_t) == 8);
+		mach_write_to_8(ptr, index->id);
+		ptr += sizeof(index_id_t);
+
+		mach_write_to_4(ptr, index->space);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->page);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->type);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->trx_id_offset);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_user_defined_cols);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_uniq);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_nullable);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_fields);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing index meta-data.");
+
+			return(DB_IO_ERROR);
+		}
+
+		/* Write the index name as [len, byte array], reusing row as
+		scratch space for len. NUL byte is included in the length. */
+		ib_uint32_t	len = strlen(index->name) + 1;
+		ut_a(len > 1);
+
+		mach_write_to_4(row, len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+		    || fwrite(index->name, 1, len, file) != len) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing index name.");
+
+			return(DB_IO_ERROR);
+		}
+		/* Finally the per-field records of this index. */
+		err = row_quiesce_write_index_fields(index, file, thd);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Write the column section of the meta data config file. For every column
+of the table the attributes of its dict_col_t structure are written,
+followed by the column name. All attributes are serialized as
+ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	byte			buf[sizeof(ib_uint32_t) * 7];
+	const ulint		n_cols = table->n_cols;
+
+	for (ulint n = 0; n < n_cols; ++n) {
+		const dict_col_t*	col = &table->cols[n];
+
+		/* The seven attributes, listed in the order in which
+		they are laid out in the file. */
+		const ulint		attr[] = {
+			col->prtype,
+			col->mtype,
+			col->len,
+			col->mbminmaxlen,
+			col->ind,
+			col->ord_part,
+			col->max_prefix
+		};
+
+		/* Pack each attribute as a 4-byte value. */
+		for (ulint k = 0; k < sizeof(attr) / sizeof(attr[0]); ++k) {
+			mach_write_to_4(
+				buf + k * sizeof(ib_uint32_t), attr[k]);
+		}
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_2",
+				close(fileno(file)););
+
+		if (fwrite(buf, 1, sizeof(buf), file) != sizeof(buf)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing table column data.");
+
+			return(DB_IO_ERROR);
+		}
+
+		/* The column name follows as [len, byte array]. */
+		const char*		col_name = dict_table_get_col_name(
+			table, dict_col_get_no(col));
+
+		/* Include the NUL byte in the length. */
+		const ib_uint32_t	len = strlen(col_name) + 1;
+
+		ut_a(len > 1);
+
+		mach_write_to_4(buf, len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_3",
+				close(fileno(file)););
+
+		/* Write the length first; the name is attempted only
+		if the length was written in full. */
+		bool	ok = fwrite(buf, 1, sizeof(len), file) == sizeof(len);
+		ok = ok && fwrite(col_name, 1, len, file) == len;
+
+		if (!ok) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				errno, strerror(errno),
+				"while writing column name.");
+
+			return(DB_IO_ERROR);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file header (version, names, table attributes).
+@return DB_SUCCESS, or DB_IO_ERROR if a write fails. */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_header(
+/*=====================*/
+	const dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	byte			value[sizeof(ib_uint32_t)];
+
+	/* Write the meta-data version number. */
+	mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file)););
+
+	if (fwrite(&value, 1,  sizeof(value), file) != sizeof(value)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno),
+			"while writing meta-data version number.");
+
+		return(DB_IO_ERROR);
+	}
+
+	/* Write the server hostname. */
+	ib_uint32_t		len;
+	const char*		hostname = server_get_hostname();
+
+	/* Play it safe and check for NULL. */
+	if (hostname == 0) {
+		static const char	NullHostname[] = "Hostname unknown";
+
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Unable to determine server hostname.");
+
+		hostname = NullHostname;
+	}
+
+	/* Written as [len, byte array]; len includes the NUL byte. */
+	len = strlen(hostname) + 1;
+	mach_write_to_4(value, len);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file)););
+
+	if (fwrite(&value, 1,  sizeof(value), file) != sizeof(value)
+	    || fwrite(hostname, 1,  len, file) != len) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno),
+			"while writing hostname.");
+
+		return(DB_IO_ERROR);
+	}
+
+	/* The table name includes the NUL byte. */
+	ut_a(table->name != 0);
+	len = strlen(table->name) + 1;
+
+	/* Write the table name. */
+	mach_write_to_4(value, len);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+	if (fwrite(&value, 1,  sizeof(value), file) != sizeof(value)
+	    || fwrite(table->name, 1,  len, file) != len) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno),
+			"while writing table name.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte		row[sizeof(ib_uint32_t) * 3];
+
+	/* Write the next autoinc value (8 bytes; row holds 12). */
+	mach_write_to_8(row, table->autoinc);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+	if (fwrite(row, 1,  sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno),
+			"while writing table autoinc value.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte*		ptr = row;
+	/* Reuse row for the three 4-byte values that end the header. */
+	/* Write the system page size. */
+	mach_write_to_4(ptr, UNIV_PAGE_SIZE);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the table->flags. */
+	mach_write_to_4(ptr, table->flags);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the number of columns in the table. */
+	mach_write_to_4(ptr, table->n_cols);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+	if (fwrite(row, 1,  sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno),
+			"while writing table meta-data.");
+
+		return(DB_IO_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data (.cfg) file: header, columns, then indexes.
+@return DB_SUCCESS, or DB_IO_ERROR if create, write, flush or close fails */
+static	__attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+	dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	THD*			thd)	/*!< in/out: session */
+{
+	char			name[OS_FILE_MAX_PATH];
+
+	srv_get_meta_data_filename(table, name, sizeof(name));
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Writing table metadata to '%s'", name);
+
+	FILE*	file = fopen(name, "w+b");
+
+	if (file == NULL) {
+		ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+			 name, errno, strerror(errno));
+
+		return(DB_IO_ERROR);
+	}
+
+	char	msg[BUFSIZ];
+	dberr_t	err = row_quiesce_write_header(table, file, thd);
+
+	if (err == DB_SUCCESS) {
+		err = row_quiesce_write_table(table, file, thd);
+	}
+
+	if (err == DB_SUCCESS) {
+		err = row_quiesce_write_indexes(table, file, thd);
+	}
+
+	/* stdio may buffer the data, so a failed flush means the file is
+	incomplete even if every fwrite() above reported success. */
+	if (fflush(file) != 0) {
+		ut_snprintf(msg, sizeof(msg), "%s flush() failed", name);
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno), msg);
+
+		err = DB_IO_ERROR;
+	}
+
+	/* Close the stream even after an earlier error. NOTE(review):
+	"flose()" below looks like a typo for fclose(); the text is kept
+	because it is a user-visible warning - confirm before changing. */
+	if (fclose(file) != 0) {
+		ut_snprintf(msg, sizeof(msg), "%s flose() failed", name);
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			errno, strerror(errno), msg);
+
+		err = DB_IO_ERROR;
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Scan the index list of a table for an index that has DICT_FTS set.
+@return true if an FTS index exists on the table */
+static
+bool
+row_quiesce_table_has_fts_index(
+/*============================*/
+	const dict_table_t*	table)	/*!< in: quiesce this table */
+{
+	const dict_index_t*	cur;
+
+	/* The whole scan runs between the dict mutex enter/exit calls. */
+	dict_mutex_enter_for_mysql();
+
+	cur = UT_LIST_GET_FIRST(table->indexes);
+
+	while (cur != 0 && !(cur->type & DICT_FTS)) {
+		cur = UT_LIST_GET_NEXT(indexes, cur);
+	}
+
+	/* cur is non-NULL only if the scan stopped at an FTS index. */
+	const bool	found = (cur != 0);
+
+	dict_mutex_exit_for_mysql();
+
+	return(found);
+}
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+UNIV_INTERN
+void
+row_quiesce_table_start(
+/*====================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+{
+	ut_a(trx->mysql_thd != 0);
+	ut_a(srv_n_purge_threads > 0);
+	ut_ad(!srv_read_only_mode);
+
+	char		table_name[MAX_FULL_NAME_LEN + 1];
+
+	innobase_format_name(
+		table_name, sizeof(table_name), table->name, FALSE);
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Sync to disk of '%s' started.", table_name);
+
+	if (trx_purge_state() != PURGE_STATE_DISABLED) {
+		trx_purge_stop();
+	}
+
+	ut_a(table->id > 0);
+
+	/* Merge the change buffer; log progress on every 20th pass. */
+	for (ulint count = 0;
+	     ibuf_contract_in_background(table->id, TRUE) != 0
+	     && !trx_is_interrupted(trx);
+	     ++count) {
+		if (!(count % 20)) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Merging change buffer entries for '%s'",
+				table_name);
+		}
+	}
+
+	bool	aborted = trx_is_interrupted(trx);
+
+	if (!aborted) {
+		buf_LRU_flush_or_remove_pages(
+			table->space, BUF_REMOVE_FLUSH_WRITE, trx);
+
+		/* Check again: an interrupt may have arrived meanwhile. */
+		aborted = trx_is_interrupted(trx);
+	}
+
+	if (aborted) {
+		ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!");
+	} else if (row_quiesce_write_cfg(table, trx->mysql_thd)
+		   != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"There was an error writing to the "
+			"meta data file");
+	} else {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Table '%s' flushed to disk", table_name);
+	}
+
+	/* The state becomes QUIESCE_COMPLETE on every path above. */
+	dberr_t	err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx);
+	ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Cleanup after table quiesce: wait for the quiesce to complete, remove the
+meta data file and resume purge. */
+UNIV_INTERN
+void
+row_quiesce_table_complete(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+{
+	char		table_name[MAX_FULL_NAME_LEN + 1];
+	char		cfg_name[OS_FILE_MAX_PATH];
+
+	ut_a(trx->mysql_thd != 0);
+
+	innobase_format_name(
+		table_name, sizeof(table_name), table->name, FALSE);
+
+	/* Even if the transaction has been killed, the quiesce must be
+	allowed to finish first. A warning is printed on the first pass
+	and then on every 60th pass, i.e. about once a minute. */
+	for (ulint n_waits = 0;
+	     table->quiesce != QUIESCE_COMPLETE;
+	     ++n_waits) {
+
+		if (n_waits % 60 == 0) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Waiting for quiesce of '%s' to complete",
+				table_name);
+		}
+
+		/* Sleep for a second. */
+		os_thread_sleep(1000000);
+	}
+
+	/* The user has resumed normal operations, so the .cfg file is
+	removed; left behind, it would cause problems when the user
+	tries to drop the database (remove directory). */
+	srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name));
+
+	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Deleting the meta-data file '%s'", cfg_name);
+
+	/* Counterpart of the trx_purge_stop() call made in
+	row_quiesce_table_start(). */
+	if (trx_purge_state() != PURGE_STATE_DISABLED) {
+		trx_purge_run();
+	}
+
+	const dberr_t	err = row_quiesce_set_state(table, QUIESCE_NONE, trx);
+	ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Move a table to the given quiesce state (NONE -> START -> COMPLETE -> NONE).
+@return DB_SUCCESS, or DB_UNSUPPORTED (read-only mode, system tablespace). */
+UNIV_INTERN
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	ib_quiesce_t	state,		/*!< in: quiesce state to set */
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	ut_a(srv_n_purge_threads > 0);
+
+	if (srv_read_only_mode) {
+
+		ib_senderrf(trx->mysql_thd,
+			    IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+		return(DB_UNSUPPORTED);
+
+	} else if (table->space == TRX_SYS_SPACE) {
+
+		char	table_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
+
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+			    ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+		return(DB_UNSUPPORTED);
+	} else if (row_quiesce_table_has_fts_index(table)) {
+		/* Warning only: no return, the state is still changed. */
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+			    ER_NOT_SUPPORTED_YET,
+			    "FLUSH TABLES on tables that have an FTS index. "
+			    "FTS auxiliary tables will not be flushed.");
+
+	} else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+		/* If this flag is set then the table may not have any active
+		FTS indexes but it will still have the auxiliary tables. */
+
+		ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+			    ER_NOT_SUPPORTED_YET,
+			    "FLUSH TABLES on a table that had an FTS index, "
+			    "created on a hidden column, the "
+			    "auxiliary tables haven't been dropped as yet. "
+			    "FTS auxiliary tables will not be flushed.");
+	}
+
+	row_mysql_lock_data_dictionary(trx);
+
+	dict_table_x_lock_indexes(table);
+	/* Assert that the requested transition is the expected next step. */
+	switch (state) {
+	case QUIESCE_START:
+		ut_a(table->quiesce == QUIESCE_NONE);
+		break;
+
+	case QUIESCE_COMPLETE:
+		ut_a(table->quiesce == QUIESCE_START);
+		break;
+
+	case QUIESCE_NONE:
+		ut_a(table->quiesce == QUIESCE_COMPLETE);
+		break;
+	}
+
+	table->quiesce = state;
+
+	dict_table_x_unlock_indexes(table);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	return(DB_SUCCESS);
+}
+
diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.cc
index 2c33bdc4b15..be786f954fb 100644
--- a/storage/xtradb/row/row0row.c
+++ b/storage/xtradb/row/row0row.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0row.c
+@file row/row0row.cc
 General row routines
 
 Created 4/20/1996 Heikki Tuuri
@@ -50,32 +50,30 @@ Created 4/20/1996 Heikki Tuuri
 /*****************************************************************//**
 When an insert or purge to a table is performed, this function builds
 the entry to be inserted into or purged from an index on the table.
-@return index entry which should be inserted or purged, or NULL if the
-externally stored columns in the clustered index record are
-unavailable and ext != NULL */
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
 UNIV_INTERN
 dtuple_t*
-row_build_index_entry(
-/*==================*/
-	const dtuple_t*	row,	/*!< in: row which should be
-				inserted or purged */
-	row_ext_t*	ext,	/*!< in: externally stored column prefixes,
-				or NULL */
-	dict_index_t*	index,	/*!< in: index on the table */
-	mem_heap_t*	heap)	/*!< in: memory heap from which the memory for
-				the index entry is allocated */
+row_build_index_entry_low(
+/*======================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	dict_index_t*		index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory for the index entry
+					is allocated */
 {
 	dtuple_t*	entry;
 	ulint		entry_len;
 	ulint		i;
 
-	ut_ad(row && index && heap);
-	ut_ad(dtuple_check_typed(row));
-
 	entry_len = dict_index_get_n_fields(index);
 	entry = dtuple_create(heap, entry_len);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		dtuple_set_n_fields_cmp(entry, entry_len);
 		/* There may only be externally stored columns
 		in a clustered index B-tree of a user table. */
@@ -96,8 +94,19 @@ row_build_index_entry(
 			= dtuple_get_nth_field(entry, i);
 		const dfield_t*		dfield2
 			= dtuple_get_nth_field(row, col_no);
-		ulint			len
-			= dfield_get_len(dfield2);
+		ulint			len;
+
+#if DATA_MISSING != 0
+# error "DATA_MISSING != 0"
+#endif
+		if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype
+				  == DATA_MISSING)) {
+			/* The field has not been initialized in the row.
+			This should be from trx_undo_rec_get_partial_row(). */
+			return(NULL);
+		}
+
+		len = dfield_get_len(dfield2);
 
 		dfield_copy(dfield, dfield2);
 
@@ -124,7 +133,7 @@ row_build_index_entry(
 		stored off-page. */
 		ut_ad(col->ord_part);
 
-		if (UNIV_LIKELY_NULL(ext)) {
+		if (ext) {
 			/* See if the column is stored externally. */
 			const byte*	buf = row_ext_lookup(ext, col_no,
 							     &len);
@@ -166,13 +175,11 @@ row_build_index_entry(
 			len = dtype_get_at_most_n_mbchars(
 				col->prtype, col->mbminmaxlen,
 				ind_field->prefix_len, len,
-				dfield_get_data(dfield));
+				static_cast<char*>(dfield_get_data(dfield)));
 			dfield_set_len(dfield, len);
 		}
 	}
 
-	ut_ad(dtuple_check_typed(entry));
-
 	return(entry);
 }
 
@@ -211,21 +218,23 @@ row_build(
 					of an index, or NULL if
 					index->table should be
 					consulted instead */
+	const dtuple_t*		add_cols,
+					/*!< in: default values of
+					added columns, or NULL */
+	const ulint*		col_map,/*!< in: mapping of old column
+					numbers to new ones, or NULL */
 	row_ext_t**		ext,	/*!< out, own: cache of
 					externally stored column
 					prefixes, or NULL */
 	mem_heap_t*		heap)	/*!< in: memory heap from which
 					the memory needed is allocated */
 {
+	const byte*		copy;
 	dtuple_t*		row;
-	const dict_table_t*	table;
-	ulint			n_fields;
 	ulint			n_ext_cols;
 	ulint*			ext_cols	= NULL; /* remove warning */
 	ulint			len;
-	ulint			row_len;
 	byte*			buf;
-	ulint			i;
 	ulint			j;
 	mem_heap_t*		tmp_heap	= NULL;
 	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
@@ -233,6 +242,8 @@ row_build(
 
 	ut_ad(index && rec && heap);
 	ut_ad(dict_index_is_clust(index));
+	ut_ad(!mutex_own(&trx_sys->mutex));
+	ut_ad(!col_map || col_table);
 
 	if (!offsets) {
 		offsets = rec_get_offsets(rec, index, offsets_,
@@ -241,7 +252,7 @@ row_build(
 		ut_ad(rec_offs_validate(rec, index, offsets));
 	}
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 	if (rec_offs_any_null_extern(rec, offsets)) {
 		/* This condition can occur during crash recovery
 		before trx_rollback_active() has completed execution,
@@ -252,59 +263,91 @@ row_build(
 		ut_a(trx_undo_roll_ptr_is_insert(
 			     row_get_rec_roll_ptr(rec, index, offsets)));
 	}
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (type != ROW_COPY_POINTERS) {
 		/* Take a copy of rec to heap */
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
-		rec = rec_copy(buf, rec, offsets);
-		/* Avoid a debug assertion in rec_offs_validate(). */
-		rec_offs_make_valid(rec, index, (ulint*) offsets);
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+		copy = rec_copy(buf, rec, offsets);
+	} else {
+		copy = rec;
 	}
 
-	table = index->table;
-	row_len = dict_table_get_n_cols(table);
+	n_ext_cols = rec_offs_n_extern(offsets);
+	if (n_ext_cols) {
+		ext_cols = static_cast<ulint*>(
+			mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols));
+	}
 
-	row = dtuple_create(heap, row_len);
+	/* Avoid a debug assertion in rec_offs_validate(). */
+	rec_offs_make_valid(copy, index, const_cast<ulint*>(offsets));
 
-	dict_table_copy_types(row, table);
+	if (!col_table) {
+		ut_ad(!col_map);
+		ut_ad(!add_cols);
+		col_table = index->table;
+	}
+
+	if (add_cols) {
+		ut_ad(col_map);
+		row = dtuple_copy(add_cols, heap);
+		/* dict_table_copy_types() would set the fields to NULL */
+		for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) {
+			dict_col_copy_type(
+				dict_table_get_nth_col(col_table, i),
+				dfield_get_type(dtuple_get_nth_field(row, i)));
+		}
+	} else {
+		row = dtuple_create(heap, dict_table_get_n_cols(col_table));
+		dict_table_copy_types(row, col_table);
+	}
 
 	dtuple_set_info_bits(row, rec_get_info_bits(
-				     rec, dict_table_is_comp(table)));
+				     copy, rec_offs_comp(offsets)));
 
-	n_fields = rec_offs_n_fields(offsets);
-	n_ext_cols = rec_offs_n_extern(offsets);
-	if (n_ext_cols) {
-		ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols);
-	}
+	j = 0;
 
-	for (i = j = 0; i < n_fields; i++) {
-		dict_field_t*		ind_field
+	for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+		const dict_field_t*	ind_field
 			= dict_index_get_nth_field(index, i);
+
+		if (ind_field->prefix_len) {
+			/* Column prefixes can only occur in key
+			fields, which cannot be stored externally. For
+			a column prefix, there should also be the full
+			field in the clustered index tuple. The row
+			tuple comprises full fields, not prefixes. */
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+			continue;
+		}
+
 		const dict_col_t*	col
 			= dict_field_get_col(ind_field);
 		ulint			col_no
 			= dict_col_get_no(col);
-		dfield_t*		dfield
-			= dtuple_get_nth_field(row, col_no);
-
-		if (ind_field->prefix_len == 0) {
 
-			const byte*	field = rec_get_nth_field(
-				rec, offsets, i, &len);
+		if (col_map) {
+			col_no = col_map[col_no];
 
-			dfield_set_data(dfield, field, len);
+			if (col_no == ULINT_UNDEFINED) {
+				/* dropped column */
+				continue;
+			}
 		}
 
+		dfield_t*	dfield = dtuple_get_nth_field(row, col_no);
+
+		const byte*	field = rec_get_nth_field(
+			copy, offsets, i, &len);
+
+		dfield_set_data(dfield, field, len);
+
 		if (rec_offs_nth_extern(offsets, i)) {
 			dfield_set_ext(dfield);
 
-			if (UNIV_LIKELY_NULL(col_table)) {
-				ut_a(col_no
-				     < dict_table_get_n_cols(col_table));
-				col = dict_table_get_nth_col(
-					col_table, col_no);
-			}
+			col = dict_table_get_nth_col(col_table, col_no);
 
 			if (col->ord_part) {
 				/* We will have to fetch prefixes of
@@ -315,14 +358,20 @@ row_build(
 		}
 	}
 
+	rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets));
+
 	ut_ad(dtuple_check_typed(row));
 
 	if (!ext) {
 		/* REDUNDANT and COMPACT formats store a local
 		768-byte prefix of each externally stored
-		column. No cache is needed. */
-		ut_ad(dict_table_get_format(index->table)
-		      < DICT_TF_FORMAT_ZIP);
+		column. No cache is needed.
+
+		During online table rebuild,
+		row_log_table_apply_delete_low()
+		may use a cache that was set up by
+		row_log_table_delete(). */
+
 	} else if (j) {
 		*ext = row_ext_create(j, ext_cols, index->table->flags, row,
 				      heap);
@@ -361,7 +410,7 @@ row_rec_to_index_entry_low(
 	ulint		rec_len;
 
 	ut_ad(rec && heap && index);
-	/* Because this function may be invoked by row0merge.c
+	/* Because this function may be invoked by row0merge.cc
 	on a record whose header is in different format, the check
 	rec_offs_validate(rec, index, offsets) must be avoided here. */
 	ut_ad(n_ext);
@@ -369,14 +418,6 @@ row_rec_to_index_entry_low(
 
 	rec_len = rec_offs_n_fields(offsets);
 
-	if (srv_use_sys_stats_table
-	    && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)) {
-		if (rec_len < dict_index_get_n_fields(index)) {
-			/* the new record should be extended */
-			rec_len = dict_index_get_n_fields(index);
-		}
-	}
-
 	entry = dtuple_create(heap, rec_len);
 
 	dtuple_set_n_fields_cmp(entry,
@@ -388,14 +429,6 @@ row_rec_to_index_entry_low(
 	for (i = 0; i < rec_len; i++) {
 
 		dfield = dtuple_get_nth_field(entry, i);
-
-		if (srv_use_sys_stats_table
-		    && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)
-		    && i >= rec_offs_n_fields(offsets)) {
-			dfield_set_null(dfield);
-			continue;
-		}
-
 		field = rec_get_nth_field(rec, offsets, i, &len);
 
 		dfield_set_data(dfield, field, len);
@@ -414,28 +447,14 @@ row_rec_to_index_entry_low(
 /*******************************************************************//**
 Converts an index record to a typed data tuple. NOTE that externally
 stored (often big) fields are NOT copied to heap.
-@return	own: index entry built; see the NOTE below! */
+@return	own: index entry built */
 UNIV_INTERN
 dtuple_t*
 row_rec_to_index_entry(
 /*===================*/
-	ulint			type,	/*!< in: ROW_COPY_DATA, or
-					ROW_COPY_POINTERS: the former
-					copies also the data fields to
-					heap as the latter only places
-					pointers to data fields on the
-					index page */
-	const rec_t*		rec,	/*!< in: record in the index;
-					NOTE: in the case
-					ROW_COPY_POINTERS the data
-					fields in the row will point
-					directly into this record,
-					therefore, the buffer page of
-					this record must be at least
-					s-latched and the latch held
-					as long as the dtuple is used! */
+	const rec_t*		rec,	/*!< in: record in the index */
 	const dict_index_t*	index,	/*!< in: index */
-	ulint*			offsets,/*!< in/out: rec_get_offsets(rec) */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec) */
 	ulint*			n_ext,	/*!< out: number of externally
 					stored columns */
 	mem_heap_t*		heap)	/*!< in: memory heap from which
@@ -443,23 +462,21 @@ row_rec_to_index_entry(
 {
 	dtuple_t*	entry;
 	byte*		buf;
+	const rec_t*	copy_rec;
 
 	ut_ad(rec && heap && index);
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	if (type == ROW_COPY_DATA) {
-		/* Take a copy of rec to heap */
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
-		rec = rec_copy(buf, rec, offsets);
-		/* Avoid a debug assertion in rec_offs_validate(). */
-		rec_offs_make_valid(rec, index, offsets);
-#ifdef UNIV_BLOB_NULL_DEBUG
-	} else {
-		ut_a(!rec_offs_any_null_extern(rec, offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
-	}
+	/* Take a copy of rec to heap */
+	buf = static_cast<byte*>(
+		mem_heap_alloc(heap, rec_offs_size(offsets)));
 
-	entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
+	copy_rec = rec_copy(buf, rec, offsets);
+
+	rec_offs_make_valid(copy_rec, index, const_cast<ulint*>(offsets));
+	entry = row_rec_to_index_entry_low(
+		copy_rec, index, offsets, n_ext, heap);
+	rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets));
 
 	dtuple_set_info_bits(entry,
 			     rec_get_info_bits(rec, rec_offs_comp(offsets)));
@@ -517,7 +534,8 @@ row_build_row_ref(
 	if (type == ROW_COPY_DATA) {
 		/* Take a copy of rec to heap */
 
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
 
 		rec = rec_copy(buf, rec, offsets);
 		/* Avoid a debug assertion in rec_offs_validate(). */
@@ -829,8 +847,6 @@ row_search_index_entry(
 	return(ROW_FOUND);
 }
 
-#include <my_sys.h>
-
 /*******************************************************************//**
 Formats the raw data in "data" (in InnoDB on-disk format) that is of
 type DATA_INT using "prtype" and writes the result to "buf".
@@ -858,24 +874,17 @@ row_raw_format_int(
 {
 	ulint	ret;
 
-	if (data_len <= sizeof(ullint)) {
+	if (data_len <= sizeof(ib_uint64_t)) {
 
-		ullint		value;
+		ib_uint64_t	value;
 		ibool		unsigned_type = prtype & DATA_UNSIGNED;
 
-		value = mach_read_int_type((const byte*) data,
-					   data_len, unsigned_type);
-
-		if (unsigned_type) {
-
-			ret = ut_snprintf(buf, buf_size, "%llu",
-					  value) + 1;
-		} else {
-
-			ret = ut_snprintf(buf, buf_size, "%lld",
-					  (long long) value) + 1;
-		}
+		value = mach_read_int_type(
+			(const byte*) data, data_len, unsigned_type);
 
+		ret = ut_snprintf(
+			buf, buf_size,
+			unsigned_type ? UINT64PF : INT64PF, value) + 1;
 	} else {
 
 		*format_in_hex = TRUE;
@@ -1035,6 +1044,8 @@ test_row_raw_format_int()
 	ulint	ret;
 	char	buf[128];
 	ibool	format_in_hex;
+	speedo_t speedo;
+	ulint	i;
 
 #define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
 		      ret_expected, buf_expected, format_in_hex_expected)\
@@ -1217,9 +1228,6 @@ test_row_raw_format_int()
 
 	/* speed test */
 
-	speedo_t	speedo;
-	ulint		i;
-
 	speedo_reset(&speedo);
 
 	for (i = 0; i < 1000000; i++) {
diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.cc
index a902854d4ca..95ebc143e61 100644
--- a/storage/xtradb/row/row0sel.c
+++ b/storage/xtradb/row/row0sel.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************//**
-@file row/row0sel.c
+@file row/row0sel.cc
 Select
 
 Created 12/19/1997 Heikki Tuuri
@@ -57,6 +57,7 @@ Created 12/19/1997 Heikki Tuuri
 #include "read0read.h"
 #include "buf0lru.h"
 #include "ha_prototypes.h"
+#include "srv0start.h"
 #include "m_string.h" /* for my_sys.h */
 #include "my_sys.h" /* DEBUG_SYNC_C */
 
@@ -109,12 +110,12 @@ row_sel_sec_rec_is_for_blob(
 {
 	ulint	len;
 	byte	buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
-	ulint	zip_size = dict_table_flags_to_zip_size(table->flags);
+	ulint	zip_size = dict_tf_get_zip_size(table->flags);
 
 	/* This function should never be invoked on an Antelope format
 	table, because they should always contain enough prefix in the
 	clustered index record. */
-	ut_ad(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+	ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
 	ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
 	ut_ad(prefix_len >= sec_len);
 	ut_ad(prefix_len > 0);
@@ -273,7 +274,9 @@ sel_node_create(
 {
 	sel_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(sel_node_t));
+	node = static_cast<sel_node_t*>(
+		mem_heap_alloc(heap, sizeof(sel_node_t)));
+
 	node->common.type = QUE_NODE_SELECT;
 	node->state = SEL_NODE_OPEN;
 
@@ -335,7 +338,8 @@ UNIV_INLINE
 void
 sel_assign_into_var_values(
 /*=======================*/
-	sym_node_t*	var,	/*!< in: first variable in a list of variables */
+	sym_node_t*	var,	/*!< in: first variable in a list of
+				variables */
 	sel_node_t*	node)	/*!< in: select node */
 {
 	que_node_t*	exp;
@@ -345,15 +349,15 @@ sel_assign_into_var_values(
 		return;
 	}
 
-	exp = node->select_list;
+	for (exp = node->select_list;
+	     var != 0;
+	     var = static_cast<sym_node_t*>(que_node_get_next(var))) {
 
-	while (var) {
 		ut_ad(exp);
 
 		eval_node_copy_val(var->alias, exp);
 
 		exp = que_node_get_next(exp);
-		var = que_node_get_next(var);
 	}
 }
 
@@ -370,12 +374,12 @@ sel_reset_aggregate_vals(
 
 	ut_ad(node->is_aggregate);
 
-	func_node = node->select_list;
+	for (func_node = static_cast<func_node_t*>(node->select_list);
+	     func_node != 0;
+	     func_node = static_cast<func_node_t*>(
+		     	que_node_get_next(func_node))) {
 
-	while (func_node) {
 		eval_node_set_int_val(func_node, 0);
-
-		func_node = que_node_get_next(func_node);
 	}
 
 	node->aggregate_already_fetched = FALSE;
@@ -501,8 +505,9 @@ sel_col_prefetch_buf_alloc(
 
 	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
 
-	column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
-					 * sizeof(sel_buf_t));
+	column->prefetch_buf = static_cast<sel_buf_t*>(
+		mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
+
 	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
 		sel_buf = column->prefetch_buf + i;
 
@@ -541,8 +546,8 @@ Pops the column values for a prefetched, cached row from the column prefetch
 buffers and places them to the val fields in the column nodes. */
 static
 void
-sel_pop_prefetched_row(
-/*===================*/
+sel_dequeue_prefetched_row(
+/*=======================*/
 	plan_t*	plan)	/*!< in: plan node for a table */
 {
 	sym_node_t*	column;
@@ -583,7 +588,7 @@ sel_pop_prefetched_row(
 		column values to be able to free it later: therefore
 		we swap the values for sel_buf and val */
 
-		sel_buf->data = dfield_get_data(val);
+		sel_buf->data = static_cast<byte*>(dfield_get_data(val));
 		sel_buf->len = dfield_get_len(val);
 		sel_buf->val_buf_size = que_node_get_val_buf_size(column);
 
@@ -603,8 +608,8 @@ Pushes the column values for a prefetched, cached row to the column prefetch
 buffers from the val fields in the column nodes. */
 UNIV_INLINE
 void
-sel_push_prefetched_row(
-/*====================*/
+sel_enqueue_prefetched_row(
+/*=======================*/
 	plan_t*	plan)	/*!< in: plan node for a table */
 {
 	sym_node_t*	column;
@@ -631,14 +636,14 @@ sel_push_prefetched_row(
 
 	ut_ad(pos < SEL_MAX_N_PREFETCH);
 
-	column = UT_LIST_GET_FIRST(plan->columns);
+	for (column = UT_LIST_GET_FIRST(plan->columns);
+	     column != 0;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
 
-	while (column) {
 		if (!column->copy_val) {
 			/* There is no sense to push pointers to database
 			page fields when we do not keep latch on the page! */
-
-			goto next_col;
+			continue;
 		}
 
 		if (!column->prefetch_buf) {
@@ -651,7 +656,7 @@ sel_push_prefetched_row(
 
 		val = que_node_get_val(column);
 
-		data = dfield_get_data(val);
+		data = static_cast<byte*>(dfield_get_data(val));
 		len = dfield_get_len(val);
 		val_buf_size = que_node_get_val_buf_size(column);
 
@@ -665,16 +670,14 @@ sel_push_prefetched_row(
 		sel_buf->data = data;
 		sel_buf->len = len;
 		sel_buf->val_buf_size = val_buf_size;
-next_col:
-		column = UT_LIST_GET_NEXT(col_var_list, column);
 	}
 }
 
 /*********************************************************************//**
 Builds a previous version of a clustered index record for a consistent read
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_sel_build_prev_vers(
 /*====================*/
 	read_view_t*	read_view,	/*!< in: read view */
@@ -691,7 +694,7 @@ row_sel_build_prev_vers(
 					afterwards */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint	err;
+	dberr_t	err;
 
 	if (*old_vers_heap) {
 		mem_heap_empty(*old_vers_heap);
@@ -707,10 +710,9 @@ row_sel_build_prev_vers(
 
 /*********************************************************************//**
 Builds the last committed version of a clustered index record for a
-semi-consistent read.
-@return	DB_SUCCESS or error code */
-static
-ulint
+semi-consistent read. */
+static __attribute__((nonnull))
+void
 row_sel_build_committed_vers_for_mysql(
 /*===================================*/
 	dict_index_t*	clust_index,	/*!< in: clustered index */
@@ -726,18 +728,16 @@ row_sel_build_committed_vers_for_mysql(
 					afterwards */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint	err;
-
 	if (prebuilt->old_vers_heap) {
 		mem_heap_empty(prebuilt->old_vers_heap);
 	} else {
-		prebuilt->old_vers_heap = mem_heap_create(200);
+		prebuilt->old_vers_heap = mem_heap_create(
+			rec_offs_size(*offsets));
 	}
 
-	err = row_vers_build_for_semi_consistent_read(
+	row_vers_build_for_semi_consistent_read(
 		rec, mtr, clust_index, offsets, offset_heap,
 		prebuilt->old_vers_heap, old_vers);
-	return(err);
 }
 
 /*********************************************************************//**
@@ -757,13 +757,14 @@ row_sel_test_end_conds(
 	/* All conditions in end_conds are comparisons of a column to an
 	expression */
 
-	cond = UT_LIST_GET_FIRST(plan->end_conds);
+	for (cond = UT_LIST_GET_FIRST(plan->end_conds);
+	     cond != 0;
+	     cond = UT_LIST_GET_NEXT(cond_list, cond)) {
 
-	while (cond) {
 		/* Evaluate the left side of the comparison, i.e., get the
 		column value if there is an indirection */
 
-		eval_sym(cond->args);
+		eval_sym(static_cast<sym_node_t*>(cond->args));
 
 		/* Do the comparison */
 
@@ -771,8 +772,6 @@ row_sel_test_end_conds(
 
 			return(FALSE);
 		}
-
-		cond = UT_LIST_GET_NEXT(cond_list, cond);
 	}
 
 	return(TRUE);
@@ -810,8 +809,8 @@ row_sel_test_other_conds(
 Retrieves the clustered index record corresponding to a record in a
 non-clustered index. Does the necessary locking.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_sel_get_clust_rec(
 /*==================*/
 	sel_node_t*	node,	/*!< in: select_node */
@@ -829,7 +828,7 @@ row_sel_get_clust_rec(
 	dict_index_t*	index;
 	rec_t*		clust_rec;
 	rec_t*		old_vers;
-	ulint		err;
+	dberr_t		err;
 	mem_heap_t*	heap		= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets		= offsets_;
@@ -863,7 +862,7 @@ row_sel_get_clust_rec(
 		ut_a(node->read_view);
 
 		/* In a rare case it is possible that no clust rec is found
-		for a delete-marked secondary index record: if in row0umod.c
+		for a delete-marked secondary index record: if in row0umod.cc
 		in row_undo_mod_remove_clust_low() we have already removed
 		the clust rec, while purge is still cleaning and removing
 		secondary index records associated with earlier versions of
@@ -899,7 +898,9 @@ row_sel_get_clust_rec(
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(&plan->clust_pcur),
 			clust_rec, index, offsets,
-			node->row_lock_mode, lock_type, thr);
+			static_cast<enum lock_mode>(node->row_lock_mode),
+			lock_type,
+			thr);
 
 		switch (err) {
 		case DB_SUCCESS:
@@ -981,7 +982,7 @@ err_exit:
 Sets a lock on a record.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
 UNIV_INLINE
-enum db_err
+dberr_t
 sel_set_rec_lock(
 /*=============*/
 	const buf_block_t*	block,	/*!< in: buffer block of rec */
@@ -994,11 +995,11 @@ sel_set_rec_lock(
 	que_thr_t*		thr)	/*!< in: query thread */
 {
 	trx_t*		trx;
-	enum db_err	err;
+	dberr_t		err;
 
 	trx = thr_get_trx(thr);
 
-	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
 		if (buf_LRU_buf_pool_running_out()) {
 
 			return(DB_LOCK_TABLE_FULL);
@@ -1007,10 +1008,12 @@ sel_set_rec_lock(
 
 	if (dict_index_is_clust(index)) {
 		err = lock_clust_rec_read_check_and_lock(
-			0, block, rec, index, offsets, mode, type, thr);
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
 	} else {
 		err = lock_sec_rec_read_check_and_lock(
-			0, block, rec, index, offsets, mode, type, thr);
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
 	}
 
 	return(err);
@@ -1081,7 +1084,7 @@ row_sel_open_pcur(
 		(FALSE: no init) */
 
 		btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
-					    &(plan->pcur), FALSE, mtr);
+					    &(plan->pcur), false, 0, mtr);
 	}
 
 	ut_ad(plan->n_rows_prefetched == 0);
@@ -1211,6 +1214,9 @@ row_sel_try_search_shortcut(
 	sel_node_t*	node,	/*!< in: select node for a consistent read */
 	plan_t*		plan,	/*!< in: plan for a unique search in clustered
 				index */
+	ibool		search_latch_locked,
+				/*!< in: whether the search holds
+				btr_search_latch */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	dict_index_t*	index;
@@ -1227,10 +1233,13 @@ row_sel_try_search_shortcut(
 	ut_ad(plan->unique_search);
 	ut_ad(!plan->must_get_clust);
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED));
+	if (search_latch_locked) {
+		ut_ad(rw_lock_own(btr_search_get_latch(index),
+				  RW_LOCK_SHARED));
+	}
 #endif /* UNIV_SYNC_DEBUG */
 
-	row_sel_open_pcur(plan, TRUE, mtr);
+	row_sel_open_pcur(plan, search_latch_locked, mtr);
 
 	rec = btr_pcur_get_rec(&(plan->pcur));
 
@@ -1305,8 +1314,8 @@ func_exit:
 /*********************************************************************//**
 Performs a select step.
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_sel(
 /*====*/
 	sel_node_t*	node,	/*!< in: select node */
@@ -1339,7 +1348,7 @@ row_sel(
 	&mtr must be committed before we move
 	to the next non-clustered record */
 	ulint		found_flag;
-	ulint		err;
+	dberr_t		err;
 	mem_heap_t*	heap				= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets				= offsets_;
@@ -1375,7 +1384,7 @@ table_loop:
 	index = plan->index;
 
 	if (plan->n_rows_prefetched > 0) {
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table_no_mtr;
 	}
@@ -1415,7 +1424,9 @@ table_loop:
 			rw_lock_s_lock(btr_search_get_latch(index));
 		}
 
-		found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+		found_flag = row_sel_try_search_shortcut(node, plan,
+							 search_latch_locked,
+							 &mtr);
 
 		if (found_flag == SEL_FOUND) {
 
@@ -1818,13 +1829,13 @@ skip_lock:
 		goto next_table;
 	}
 
-	sel_push_prefetched_row(plan);
+	sel_enqueue_prefetched_row(plan);
 
 	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
 
 		/* The prefetch buffer is now full */
 
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table;
 	}
@@ -1923,7 +1934,7 @@ table_exhausted:
 	if (plan->n_rows_prefetched > 0) {
 		/* The table became exhausted during a prefetch */
 
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table_no_mtr;
 	}
@@ -2029,14 +2040,11 @@ row_sel_step(
 /*=========*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint		i_lock_mode;
-	sym_node_t*	table_node;
 	sel_node_t*	node;
-	ulint		err;
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<sel_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
 
@@ -2054,7 +2062,7 @@ row_sel_step(
 		/* It may be that the current session has not yet started
 		its transaction, or it has been committed: */
 
-		trx_start_if_not_started(thr_get_trx(thr));
+		trx_start_if_not_started_xa(thr_get_trx(thr));
 
 		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
 
@@ -2063,24 +2071,32 @@ row_sel_step(
 			node->read_view = trx_assign_read_view(
 				thr_get_trx(thr));
 		} else {
+			sym_node_t*	table_node;
+			enum lock_mode	i_lock_mode;
+
 			if (node->set_x_locks) {
 				i_lock_mode = LOCK_IX;
 			} else {
 				i_lock_mode = LOCK_IS;
 			}
 
-			table_node = node->table_list;
+			for (table_node = node->table_list;
+			     table_node != 0;
+			     table_node = static_cast<sym_node_t*>(
+					que_node_get_next(table_node))) {
+
+				dberr_t	err = lock_table(
+					0, table_node->table, i_lock_mode,
+					thr);
 
-			while (table_node) {
-				err = lock_table(0, table_node->table,
-						 i_lock_mode, thr);
 				if (err != DB_SUCCESS) {
-					thr_get_trx(thr)->error_state = err;
+					trx_t*	trx;
+
+					trx = thr_get_trx(thr);
+					trx->error_state = err;
 
 					return(NULL);
 				}
-
-				table_node = que_node_get_next(table_node);
 			}
 		}
 
@@ -2104,7 +2120,7 @@ row_sel_step(
 		}
 	}
 
-	err = row_sel(node, thr);
+	dberr_t	err = row_sel(node, thr);
 
 	/* NOTE! if queries are parallelized, the following assignment may
 	have problems; the assignment should be made only if thr is the
@@ -2135,7 +2151,7 @@ fetch_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<fetch_node_t*>(thr->run_node);
 	sel_node = node->cursor_def;
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
@@ -2148,12 +2164,12 @@ fetch_step(
 				sel_assign_into_var_values(node->into_list,
 							   sel_node);
 			} else {
-				void* ret = (*node->func->func)(
+				ibool ret = (*node->func->func)(
 					sel_node, node->func->arg);
 
 				if (!ret) {
 					sel_node->state
-						= SEL_NODE_NO_MORE_ROWS;
+						 = SEL_NODE_NO_MORE_ROWS;
 				}
 			}
 		}
@@ -2194,21 +2210,22 @@ row_fetch_print(
 	void*	row,		/*!< in:  sel_node_t* */
 	void*	user_arg)	/*!< in:  not used */
 {
-	sel_node_t*	node = row;
 	que_node_t*	exp;
 	ulint		i = 0;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
 
 	UT_NOT_USED(user_arg);
 
 	fprintf(stderr, "row_fetch_print: row %p\n", row);
 
-	exp = node->select_list;
+	for (exp = node->select_list;
+	     exp != 0;
+	     exp = que_node_get_next(exp), i++) {
 
-	while (exp) {
 		dfield_t*	dfield = que_node_get_val(exp);
 		const dtype_t*	type = dfield_get_type(dfield);
 
-		fprintf(stderr, " column %lu:\n", (ulong)i);
+		fprintf(stderr, " column %lu:\n", (ulong) i);
 
 		dtype_print(type);
 		putc('\n', stderr);
@@ -2220,9 +2237,6 @@ row_fetch_print(
 		} else {
 			fputs(" <NULL>;\n", stderr);
 		}
-
-		exp = que_node_get_next(exp);
-		i++;
 	}
 
 	return((void*)42);
@@ -2243,7 +2257,7 @@ row_printf_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<row_printf_node_t*>(thr->run_node);
 
 	sel_node = node->sel_node;
 
@@ -2486,6 +2500,7 @@ row_sel_convert_mysql_key_to_innobase(
 				dfield_set_len(dfield, len
 					       - (ulint) (key_ptr - key_end));
 			}
+                        ut_ad(0);
 		}
 
 		n_fields++;
@@ -2542,13 +2557,23 @@ row_sel_store_row_id_to_prebuilt(
 	ut_memcpy(prebuilt->row_id, data, len);
 }
 
+#ifdef UNIV_DEBUG
+/** Convert a non-SQL-NULL field to MySQL format; idx and field are forwarded. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+	row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Convert a non-SQL-NULL field to MySQL format; idx and field are dropped. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+	row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
 /**************************************************************//**
 Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
-function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
-static
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+static __attribute__((nonnull))
 void
-row_sel_field_store_in_mysql_format(
-/*================================*/
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
 	byte*		dest,	/*!< in/out: buffer where to store; NOTE
 				that BLOBs are not in themselves
 				stored here: the caller must allocate
@@ -2560,10 +2585,22 @@ row_sel_field_store_in_mysql_format(
 				Its following fields are referenced:
 				type, is_unsigned, mysql_col_len,
 				mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+	const dict_index_t* index,
+				/*!< in: InnoDB index */
+	ulint		field_no,
+				/*!< in: templ->rec_field_no or
+				templ->clust_rec_field_no or
+				templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
 	const byte*	data,	/*!< in: data to store */
 	ulint		len)	/*!< in: length of the data */
 {
-	byte*	ptr;
+	byte*			ptr;
+#ifdef UNIV_DEBUG
+	const dict_field_t*	field
+		= dict_index_get_nth_field(index, field_no);
+#endif /* UNIV_DEBUG */
 
 	ut_ad(len != UNIV_SQL_NULL);
 	UNIV_MEM_ASSERT_RW(data, len);
@@ -2660,17 +2697,30 @@ row_sel_field_store_in_mysql_format(
 		ut_ad(templ->mysql_col_len >= len);
 		ut_ad(templ->mbmaxlen >= templ->mbminlen);
 
+		/* If field_no equals templ->icp_rec_field_no,
+		we are examining a row pointed to by "icp_rec_field_no".
+		It is possible that icp_rec_field_no refers to
+		a field in a secondary index while templ->rec_field_no
+		points to a field in a primary index. The length
+		should still be equal, unless the field pointed to
+		by icp_rec_field_no has a prefix. */
 		ut_ad(templ->mbmaxlen > templ->mbminlen
-		      || templ->mysql_col_len == len);
+		      || templ->mysql_col_len == len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+
 		/* The following assertion would fail for old tables
 		containing UTF-8 ENUM columns due to Bug #9526. */
 		ut_ad(!templ->mbmaxlen
 		      || !(templ->mysql_col_len % templ->mbmaxlen));
-		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
+		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+		ut_ad(!(field->prefix_len % templ->mbmaxlen));
 
 		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
 			/* Pad with spaces. This undoes the stripping
-			done in row0mysql.c, function
+			done in row0mysql.cc, function
 			row_mysql_store_col_in_innobase_format(). */
 
 			memset(dest + len, 0x20, templ->mysql_col_len - len);
@@ -2691,26 +2741,42 @@ row_sel_field_store_in_mysql_format(
 	case DATA_DECIMAL:
 		/* Above are the valid column types for MySQL data. */
 #endif /* UNIV_DEBUG */
+		ut_ad(field->prefix_len
+		      ? field->prefix_len == len
+		      : templ->mysql_col_len == len);
 		memcpy(dest, data, len);
 	}
 }
 
+#ifdef UNIV_DEBUG
+/** Convert a field from Innobase format to MySQL format; i is forwarded. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
+#else /* UNIV_DEBUG */
+/** Convert a field from Innobase format to MySQL format; i is dropped. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,o,f,t)
+#endif /* UNIV_DEBUG */
 /**************************************************************//**
 Convert a field in the Innobase format to a field in the MySQL format. */
 static __attribute__((warn_unused_result))
 ibool
-row_sel_store_mysql_field(
-/*======================*/
+row_sel_store_mysql_field_func(
+/*===========================*/
 	byte*			mysql_rec,	/*!< out: record in the
 						MySQL format */
 	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt struct */
 	const rec_t*		rec,		/*!< in: InnoDB record;
 						must be protected by
 						a page latch */
+#ifdef UNIV_DEBUG
+	const dict_index_t*	index,		/*!< in: index of rec */
+#endif
 	const ulint*		offsets,	/*!< in: array returned by
 						rec_get_offsets() */
 	ulint			field_no,	/*!< in: templ->rec_field_no or
-						templ->clust_rec_field_no */
+						templ->clust_rec_field_no or
+						templ->icp_rec_field_no */
 	const mysql_row_templ_t*templ)		/*!< in: row template */
 {
 	const byte*	data;
@@ -2723,7 +2789,7 @@ row_sel_store_mysql_field(
 	ut_ad(field_no == templ->clust_rec_field_no
 	      || field_no == templ->rec_field_no
 	      || field_no == templ->icp_rec_field_no);
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_offs_validate(rec, index, offsets));
 
 	if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
 
@@ -2731,6 +2797,9 @@ row_sel_store_mysql_field(
 		/* Copy an externally stored field to a temporary heap */
 
 		ut_a(!prebuilt->trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!btr_search_own_any());
+#endif
 		ut_ad(field_no == templ->clust_rec_field_no);
 
 		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
@@ -2772,7 +2841,7 @@ row_sel_store_mysql_field(
 
 		row_sel_field_store_in_mysql_format(
 			mysql_rec + templ->mysql_col_offset,
-			templ, data, len);
+			templ, index, field_no, data, len);
 
 		if (heap != prebuilt->blob_heap) {
 			mem_heap_free(heap);
@@ -2816,12 +2885,13 @@ row_sel_store_mysql_field(
 					UNIV_PAGE_SIZE);
 			}
 
-			data = mem_heap_dup(prebuilt->blob_heap, data, len);
+			data = static_cast<byte*>(
+				mem_heap_dup(prebuilt->blob_heap, data, len));
 		}
 
 		row_sel_field_store_in_mysql_format(
 			mysql_rec + templ->mysql_col_offset,
-			templ, data, len);
+			templ, index, field_no, data, len);
 	}
 
 	ut_ad(len != UNIV_SQL_NULL);
@@ -2855,10 +2925,14 @@ row_sel_store_mysql_rec(
 	ibool		rec_clust,	/*!< in: TRUE if rec is in the
 					clustered index instead of
 					prebuilt->index */
+	const dict_index_t* index,	/*!< in: index of rec */
 	const ulint*	offsets)	/*!< in: array returned by
  					rec_get_offsets(rec) */
 {
-	ulint		i;
+	ulint	i;
+
+	ut_ad(rec_clust || index == prebuilt->index);
+	ut_ad(!rec_clust || dict_index_is_clust(index));
 
 	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
 		mem_heap_free(prebuilt->blob_heap);
@@ -2867,25 +2941,41 @@ row_sel_store_mysql_rec(
 
 	for (i = 0; i < prebuilt->n_template; i++) {
 		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+		const ulint		field_no
+			= rec_clust
+			? templ->clust_rec_field_no
+			: templ->rec_field_no;
+		/* We should never deliver column prefixes to MySQL,
+		except for evaluating innobase_index_cond(). */
+		ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+		      == 0);
 
 		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
-					       rec, offsets,
-					       rec_clust
-					       ? templ->clust_rec_field_no
-					       : templ->rec_field_no,
-					       templ)) {
+					       rec, index, offsets,
+					       field_no, templ)) {
 			return(FALSE);
 		}
 	}
 
+	/* FIXME: We only need to read the doc_id if an FTS indexed
+	column is being updated.
+	NOTE: the record must be a clustered index record. A secondary
+	index might not have the Doc ID. */
+	if (dict_table_has_fts_index(prebuilt->table)
+	    && dict_index_is_clust(index)) {
+
+		prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
+			prebuilt->table, rec, NULL);
+	}
+
 	return(TRUE);
 }
 
 /*********************************************************************//**
 Builds a previous version of a clustered index record for a consistent read
 @return	DB_SUCCESS or error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_sel_build_prev_vers_for_mysql(
 /*==============================*/
 	read_view_t*	read_view,	/*!< in: read view */
@@ -2902,7 +2992,7 @@ row_sel_build_prev_vers_for_mysql(
 					afterwards */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ulint	err;
+	dberr_t	err;
 
 	if (prebuilt->old_vers_heap) {
 		mem_heap_empty(prebuilt->old_vers_heap);
@@ -2921,8 +3011,8 @@ Retrieves the clustered index record corresponding to a record in a
 non-clustered index. Does the necessary locking. Used in the MySQL
 interface.
 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
-static
-enum db_err
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_sel_get_clust_rec_for_mysql(
 /*============================*/
 	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct in the handle */
@@ -2949,7 +3039,7 @@ row_sel_get_clust_rec_for_mysql(
 	dict_index_t*	clust_index;
 	const rec_t*	clust_rec;
 	rec_t*		old_vers;
-	enum db_err	err;
+	dberr_t		err;
 	trx_t*		trx;
 
 	*out_rec = NULL;
@@ -2976,7 +3066,7 @@ row_sel_get_clust_rec_for_mysql(
 	    < dict_index_get_n_unique(clust_index)) {
 
 		/* In a rare case it is possible that no clust rec is found
-		for a delete-marked secondary index record: if in row0umod.c
+		for a delete-marked secondary index record: if in row0umod.cc
 		in row_undo_mod_remove_clust_low() we have already removed
 		the clust rec, while purge is still cleaning and removing
 		secondary index records associated with earlier versions of
@@ -3000,7 +3090,6 @@ row_sel_get_clust_rec_for_mysql(
 			rec_print(stderr, clust_rec, clust_index);
 			putc('\n', stderr);
 			trx_print(stderr, trx, 600);
-
 			fputs("\n"
 			      "InnoDB: Submit a detailed bug report"
 			      " to http://bugs.mysql.com\n", stderr);
@@ -3024,7 +3113,10 @@ row_sel_get_clust_rec_for_mysql(
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(&prebuilt->clust_pcur),
 			clust_rec, clust_index, *offsets,
-			prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
+			static_cast<enum lock_mode>(prebuilt->select_lock_type),
+			LOCK_REC_NOT_GAP,
+			thr);
+
 		switch (err) {
 		case DB_SUCCESS:
 		case DB_SUCCESS_LOCKED_REC:
@@ -3096,7 +3188,10 @@ row_sel_get_clust_rec_for_mysql(
 func_exit:
 	*out_rec = clust_rec;
 
-	if (prebuilt->select_lock_type != LOCK_NONE) {
+	/* Store the current position if select_lock_type is not
+	LOCK_NONE or if we are scanning using InnoDB APIs */
+	if (prebuilt->select_lock_type != LOCK_NONE
+	    || prebuilt->innodb_api) {
 		/* We may use the cursor in update or in unlock_row():
 		store its position */
 
@@ -3212,8 +3307,8 @@ row_sel_copy_cached_field_for_mysql(
 Pops a cached row for MySQL from the fetch cache. */
 UNIV_INLINE
 void
-row_sel_pop_cached_row_for_mysql(
-/*=============================*/
+row_sel_dequeue_cached_row_for_mysql(
+/*=================================*/
 	byte*		buf,		/*!< in/out: buffer where to copy the
 					row */
 	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct */
@@ -3239,14 +3334,10 @@ row_sel_pop_cached_row_for_mysql(
 			/* Copy NULL bit of the current field from cached_rec
 			to buf */
 			if (templ->mysql_null_bit_mask) {
-				/*buf[templ->mysql_null_byte_offset]
+				buf[templ->mysql_null_byte_offset]
 					^= (buf[templ->mysql_null_byte_offset]
 					    ^ cached_rec[templ->mysql_null_byte_offset])
-					& (byte)templ->mysql_null_bit_mask;*/
-                                byte *null_byte= buf + templ->mysql_null_byte_offset;
-                                (*null_byte)&= ~templ->mysql_null_bit_mask;
-                                (*null_byte)|= cached_rec[templ->mysql_null_byte_offset] & 
-                                               templ->mysql_null_bit_mask;
+					& (byte) templ->mysql_null_bit_mask;
 			}
 		}
 	} else if (prebuilt->mysql_prefix_len > 63) {
@@ -3276,6 +3367,39 @@ row_sel_pop_cached_row_for_mysql(
 }
 
 /********************************************************************//**
+Initialise the prefetch cache. */
+UNIV_INLINE
+void
+row_sel_prefetch_cache_init(
+/*========================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct whose fetch_cache[] is filled */
+{
+	ulint	i;
+	ulint	sz;	/* total bytes: every row plus its two guards */
+	byte*	ptr;	/* write cursor into the single allocation */
+
+	/* One allocation for all rows; each row gets 4 + 4 guard bytes. */
+	sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
+	ptr = static_cast<byte*>(mem_alloc(sz));
+
+	for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
+
+		/* A user has reported memory corruption in these
+		buffers on Linux. Surround each row buffer with a
+		4-byte magic number to help track a possible bug. */
+
+		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+		ptr += 4;	/* step over the leading guard */
+
+		prebuilt->fetch_cache[i] = ptr;	/* row data starts here */
+		ptr += prebuilt->mysql_row_len;
+
+		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+		ptr += 4;	/* step over the trailing guard */
+	}
+}
+
+/********************************************************************//**
 Get the last fetch cache buffer from the queue.
 @return pointer to buffer. */
 UNIV_INLINE
@@ -3284,29 +3408,14 @@ row_sel_fetch_last_buf(
 /*===================*/
 	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
 {
-        ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
-	ut_a(!prebuilt->templ_contains_blob);
+	ut_ad(!prebuilt->templ_contains_blob);
+	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
 
-	if (UNIV_UNLIKELY(prebuilt->fetch_cache[0] == NULL)) {
-		ulint	i;
+	if (prebuilt->fetch_cache[0] == NULL) {
 		/* Allocate memory for the fetch cache */
 		ut_ad(prebuilt->n_fetch_cached == 0);
 
-		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
-			byte*	buf;
-
-			/* A user has reported memory corruption in these
-			buffers in Linux. Put magic numbers there to help
-			to track a possible bug. */
-
-			buf = mem_alloc(prebuilt->mysql_row_len + 8);
-
-			prebuilt->fetch_cache[i] = buf + 4;
-
-			mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
-			mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
-					ROW_PREBUILT_FETCH_MAGIC_N);
-		}
+		row_sel_prefetch_cache_init(prebuilt);
 	}
 
 	ut_ad(prebuilt->fetch_cache_first == 0);
@@ -3318,10 +3427,10 @@ row_sel_fetch_last_buf(
 
 /********************************************************************//**
 Pushes a row for MySQL to the fetch cache. */
-UNIV_INLINE 
+UNIV_INLINE
 void
-row_sel_push_cache_row_for_mysql(
-/*=============================*/
+row_sel_enqueue_cache_row_for_mysql(
+/*================================*/
 	byte*		mysql_rec,	/*!< in/out: MySQL record */
 	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
 {
@@ -3334,14 +3443,14 @@ row_sel_push_cache_row_for_mysql(
 		ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
 	}
 
-        ++prebuilt->n_fetch_cached;
+	++prebuilt->n_fetch_cached;
 }
 
 /*********************************************************************//**
 Tries to do a shortcut to fetch a clustered index record with a unique key,
 using the hash index if possible (not always). We assume that the search
 mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
-btr search latch has been locked in S-mode.
+btr search latch has been locked in S-mode if AHI is enabled.
 @return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
 static
 ulint
@@ -3438,6 +3547,8 @@ row_search_idx_cond_check(
 		return(ICP_MATCH);
 	}
 
+	MONITOR_INC(MONITOR_ICP_ATTEMPTS);
+
 	/* Convert to MySQL format those fields that are needed for
 	evaluating the index condition. */
 
@@ -3449,7 +3560,7 @@ row_search_idx_cond_check(
 		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
 
 		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
-					       rec, offsets,
+					       rec, prebuilt->index, offsets,
 					       templ->icp_rec_field_no,
 					       templ)) {
 			return(ICP_NO_MATCH);
@@ -3462,7 +3573,7 @@ row_search_idx_cond_check(
 	index, if the case of the column has been updated in
 	the past, or a record has been deleted and a record
 	inserted in a different case. */
-	result = handler_index_cond_check(prebuilt->idx_cond);
+	result = innobase_index_cond(prebuilt->idx_cond);
 	switch (result) {
 	case ICP_MATCH:
 		/* Convert the remaining fields to MySQL format.
@@ -3470,23 +3581,28 @@ row_search_idx_cond_check(
 		this until we have fetched the clustered index record. */
 		if (!prebuilt->need_to_access_clustered
 		    || dict_index_is_clust(prebuilt->index)) {
-			if (!row_sel_store_mysql_rec(mysql_rec, prebuilt,
-						     rec,
-						     FALSE, offsets)) {
+			if (!row_sel_store_mysql_rec(
+				    mysql_rec, prebuilt, rec, FALSE,
+				    prebuilt->index, offsets)) {
 				ut_ad(dict_index_is_clust(prebuilt->index));
-				result = ICP_NO_MATCH;
+				return(ICP_NO_MATCH);
 			}
 		}
-		/* fall through */
+		MONITOR_INC(MONITOR_ICP_MATCH);
+		return(result);
 	case ICP_NO_MATCH:
+		MONITOR_INC(MONITOR_ICP_NO_MATCH);
+		return(result);
 	case ICP_OUT_OF_RANGE:
+		MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+		return(result);
+        case ICP_ERROR:
         case ICP_ABORTED_BY_USER:
 		return(result);
-        default: ;
 	}
 
 	ut_error;
-	return(ICP_ABORTED_BY_USER); /* Keep compiler happy */
+	return(result);
 }
 
 /********************************************************************//**
@@ -3499,7 +3615,7 @@ position and fetch next or fetch prev must not be tried to the cursor!
 DB_LOCK_TABLE_FULL, DB_CORRUPTION, DB_SEARCH_ABORTED_BY_USER or
 DB_TOO_BIG_RECORD */
 UNIV_INTERN
-ulint
+dberr_t
 row_search_for_mysql(
 /*=================*/
 	byte*		buf,		/*!< in/out: buffer for the fetched
@@ -3528,11 +3644,10 @@ row_search_for_mysql(
 	dict_index_t*	clust_index;
 	que_thr_t*	thr;
 	const rec_t*	rec;
-	const rec_t*	result_rec;
+	const rec_t*	result_rec = NULL;
 	const rec_t*	clust_rec;
-	ulint		err				= DB_SUCCESS;
+	dberr_t		err				= DB_SUCCESS;
 	ibool		unique_search			= FALSE;
-	ibool		unique_search_from_clust_index	= FALSE;
 	ibool		mtr_has_extra_clust_latch	= FALSE;
 	ibool		moves_up			= FALSE;
 	ibool		set_also_gap_locks		= TRUE;
@@ -3552,46 +3667,43 @@ row_search_for_mysql(
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets				= offsets_;
 	ibool		table_lock_waited		= FALSE;
-	ibool		problematic_use = FALSE;
+	byte*		next_buf			= 0;
 
 	rec_offs_init(offsets_);
 
 	ut_ad(index && pcur && search_tuple);
 
+	/* We don't support FTS queries from the HANDLER interfaces, because
+	we implemented FTS as reversed inverted index with auxiliary tables.
+	So anything related to traditional index query would not apply to
+	it. */
+	if (index->type & DICT_FTS) {
+		return(DB_END_OF_INDEX);
+	}
+
 	ut_ad(!trx->has_search_latch);
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!btr_search_own_any());
 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 #endif /* UNIV_SYNC_DEBUG */
 
-	if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB: Error:\n"
-			"InnoDB: MySQL is trying to use a table handle"
-			" but the .ibd file for\n"
-			"InnoDB: table %s does not exist.\n"
-			"InnoDB: Have you deleted the .ibd file"
-			" from the database directory under\n"
-			"InnoDB: the MySQL datadir, or have you used"
-			" DISCARD TABLESPACE?\n"
-			"InnoDB: Look from\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
-			"InnoDB: how you can resolve the problem.\n",
-			prebuilt->table->name);
+	if (dict_table_is_discarded(prebuilt->table)) {
 
-		return(DB_ERROR);
-	}
+		return(DB_TABLESPACE_DELETED);
+
+	} else if (prebuilt->table->ibd_file_missing) {
+
+		return(DB_TABLESPACE_NOT_FOUND);
 
-	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+	} else if (!prebuilt->index_usable) {
 
 		return(DB_MISSING_HISTORY);
-	}
 
-	if (dict_index_is_corrupted(index)) {
+	} else if (dict_index_is_corrupted(index)) {
+
 		return(DB_CORRUPTION);
-	}
 
-	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+	} else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to free a corrupt\n"
 			"InnoDB: table handle. Magic n %lu, table name ",
@@ -3638,7 +3750,6 @@ row_search_for_mysql(
 	fprintf(stderr, "N tables locked %lu\n",
 		(ulong) trx->mysql_n_tables_locked);
 #endif
-
 	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
 	is set or session is using a READ COMMITED isolation level. Then
 	we are able to remove the record locks set here on an individual
@@ -3680,11 +3791,10 @@ row_search_for_mysql(
 			prebuilt->fetch_cache_first = 0;
 
 		} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
-			row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
 
 			prebuilt->n_rows_fetched++;
 
-			srv_n_rows_read++;
 			err = DB_SUCCESS;
 			goto func_exit;
 		}
@@ -3763,12 +3873,11 @@ row_search_for_mysql(
 	    && dict_index_is_clust(index)
 	    && !prebuilt->templ_contains_blob
 	    && !prebuilt->used_in_HANDLER
-	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
+	    && !prebuilt->innodb_api) {
 
 		mode = PAGE_CUR_GE;
 
-		unique_search_from_clust_index = TRUE;
-
 		if (trx->mysql_n_tables_locked == 0
 		    && prebuilt->select_lock_type == LOCK_NONE
 		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
@@ -3813,17 +3922,16 @@ row_search_for_mysql(
 					case ICP_NO_MATCH:
 					case ICP_OUT_OF_RANGE:
                                         case ICP_ABORTED_BY_USER:
+                                        case ICP_ERROR:
 						goto shortcut_mismatch;
 					case ICP_MATCH:
 						goto shortcut_match;
-                                        default: ;
 					}
-                                        ut_error;
 				}
 
-				if (!row_sel_store_mysql_rec(buf, prebuilt,
-                                                             rec, FALSE,
-							     offsets)) {
+				if (!row_sel_store_mysql_rec(
+					    buf, prebuilt,
+					    rec, FALSE, index, offsets)) {
 					/* Only fresh inserts may contain
 					incomplete externally stored
 					columns. Pretend that such
@@ -3845,8 +3953,6 @@ row_search_for_mysql(
 				/* ut_print_name(stderr, index->name);
 				fputs(" shortcut\n", stderr); */
 
-				srv_n_rows_read++;
-
 				err = DB_SUCCESS;
 				goto release_search_latch;
 
@@ -3889,9 +3995,16 @@ release_search_latch:
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!btr_search_own_any());
 #endif
-	ut_ad(prebuilt->sql_stat_start || trx->state == TRX_ACTIVE);
-	ut_ad(trx->state == TRX_NOT_STARTED
-	      || trx->state == TRX_ACTIVE);
+
+	/* The state of a running trx can only be changed by the
+	thread that is currently serving the transaction. Because we
+	are that thread, we can read trx->state without holding any
+	mutex. */
+	ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
+
+	ut_ad(trx->state == TRX_STATE_NOT_STARTED
+	      || trx->state == TRX_STATE_ACTIVE);
+
 	ut_ad(prebuilt->sql_stat_start
 	      || prebuilt->select_lock_type != LOCK_NONE
 	      || trx->read_view);
@@ -3928,17 +4041,6 @@ release_search_latch:
 
 	/* Do some start-of-statement preparations */
 
-	if (!prebuilt->mysql_has_locked) {
-		if (!(prebuilt->table->flags & (DICT_TF2_TEMPORARY << DICT_TF2_SHIFT))) {
-			fprintf(stderr, "InnoDB: Error: row_search_for_mysql() is called without ha_innobase::external_lock()\n");
-			if (trx->mysql_thd != NULL) {
-				innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
-			}
-		}
-		problematic_use = TRUE;
-	}
-retry_check:
-	
 	if (!prebuilt->sql_stat_start) {
 		/* No need to set an intention lock or assign a read view */
 
@@ -3950,18 +4052,6 @@ retry_check:
 			      " perform a consistent read\n"
 			      "InnoDB: but the read view is not assigned!\n",
 			      stderr);
-			if (problematic_use) {
-				fprintf(stderr, "InnoDB: It may be caused by calling "
-						"without ha_innobase::external_lock()\n"
-						"InnoDB: For the first-aid, avoiding the crash. "
-						"But it should be fixed ASAP.\n");
-				if (prebuilt->table->flags & (DICT_TF2_TEMPORARY << DICT_TF2_SHIFT)
-				    && trx->mysql_thd != NULL) {
-					innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
-				}
-				prebuilt->sql_stat_start = TRUE;
-				goto retry_check;
-			}
 			trx_print(stderr, trx, 600);
 			fputc('\n', stderr);
 			ut_error;
@@ -4033,12 +4123,12 @@ wait_table_again:
 
 			/* Try to place a gap lock on the next index record
 			to prevent phantoms in ORDER BY ... DESC queries */
-			const rec_t*	next = page_rec_get_next_const(rec);
+			const rec_t*	next_rec = page_rec_get_next_const(rec);
 
-			offsets = rec_get_offsets(next, index, offsets,
+			offsets = rec_get_offsets(next_rec, index, offsets,
 						  ULINT_UNDEFINED, &heap);
 			err = sel_set_rec_lock(btr_pcur_get_block(pcur),
-					       next, index, offsets,
+					       next_rec, index, offsets,
 					       prebuilt->select_lock_type,
 					       LOCK_GAP, thr);
 
@@ -4051,16 +4141,10 @@ wait_table_again:
 				goto lock_wait_or_error;
 			}
 		}
-	} else {
-		if (mode == PAGE_CUR_G) {
-			btr_pcur_open_at_index_side(
-				TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
-				&mtr);
-		} else if (mode == PAGE_CUR_L) {
-			btr_pcur_open_at_index_side(
-				FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
-				&mtr);
-		}
+	} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
+		btr_pcur_open_at_index_side(
+			mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
+			pcur, false, 0, &mtr);
 	}
 
 rec_loop:
@@ -4086,10 +4170,12 @@ rec_loop:
 #ifdef UNIV_SEARCH_DEBUG
 	/*
 	fputs("Using ", stderr);
-	dict_index_name_print(stderr, index);
+	dict_index_name_print(stderr, trx, index);
 	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
 	page_get_page_no(page_align(rec)));
-	rec_print(rec);
+	rec_print(stderr, rec, index);
+	printf("delete-mark: %lu\n",
+	       rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
 	*/
 #endif /* UNIV_SEARCH_DEBUG */
 
@@ -4159,7 +4245,8 @@ rec_loop:
 	if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
 
 wrong_offs:
-		if (srv_pass_corrupt_table && !trx_sys_sys_space(index->table->space)) {
+		if (srv_pass_corrupt_table && index->table->space != 0 &&
+		    index->table->space < SRV_LOG_SPACE_FIRST_ID) {
 			index->table->is_corrupt = TRUE;
 			fil_space_set_corrupt(index->table->space);
 		}
@@ -4215,6 +4302,9 @@ wrong_offs:
 
 	/* Calculate the 'offsets' associated with 'rec' */
 
+	ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX);
+	ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
+
 	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
 
 	if (UNIV_UNLIKELY(srv_force_recovery > 0
@@ -4279,8 +4369,10 @@ wrong_offs:
 			btr_pcur_store_position(pcur, &mtr);
 
 			err = DB_RECORD_NOT_FOUND;
-			/* ut_print_name(stderr, index->name);
-			fputs(" record not found 3\n", stderr); */
+#if 0
+			ut_print_name(stderr, trx, FALSE, index->name);
+			fputs(" record not found 3\n", stderr);
+#endif
 
 			goto normal_return;
 		}
@@ -4318,8 +4410,10 @@ wrong_offs:
 			btr_pcur_store_position(pcur, &mtr);
 
 			err = DB_RECORD_NOT_FOUND;
-			/* ut_print_name(stderr, index->name);
-			fputs(" record not found 4\n", stderr); */
+#if 0
+			ut_print_name(stderr, trx, FALSE, index->name);
+			fputs(" record not found 4\n", stderr);
+#endif
 
 			goto normal_return;
 		}
@@ -4344,8 +4438,7 @@ wrong_offs:
 		if (!set_also_gap_locks
 		    || srv_locks_unsafe_for_binlog
 		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
-		    || (unique_search
-			&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+		    || (unique_search && !rec_get_deleted_flag(rec, comp))) {
 
 			goto no_gap_lock;
 		} else {
@@ -4405,46 +4498,43 @@ no_gap_lock:
 
 			/* The following call returns 'offsets'
 			associated with 'old_vers' */
-			err = row_sel_build_committed_vers_for_mysql(
+			row_sel_build_committed_vers_for_mysql(
 				clust_index, prebuilt, rec,
 				&offsets, &heap, &old_vers, &mtr);
 
-			if (err != DB_SUCCESS) {
+			/* Check whether it was a deadlock or not, if not
+			a deadlock and the transaction had to wait then
+			release the lock it is waiting on. */
 
-				goto lock_wait_or_error;
-			}
+			err = lock_trx_handle_wait(trx);
 
-			mutex_enter(&kernel_mutex);
-			if (trx->was_chosen_as_deadlock_victim) {
-				mutex_exit(&kernel_mutex);
-				err = DB_DEADLOCK;
+			switch (err) {
+			case DB_SUCCESS:
+				/* The lock was granted while we were
+				searching for the last committed version.
+				Do a normal locking read. */
 
+				offsets = rec_get_offsets(
+					rec, index, offsets, ULINT_UNDEFINED,
+					&heap);
+				goto locks_ok;
+			case DB_DEADLOCK:
 				goto lock_wait_or_error;
+			case DB_LOCK_WAIT:
+				err = DB_SUCCESS;
+				break;
+			default:
+				ut_error;
 			}
-			if (UNIV_LIKELY(trx->wait_lock != NULL)) {
-				lock_cancel_waiting_and_release(
-					trx->wait_lock);
-				mutex_exit(&kernel_mutex);
-
-				if (old_vers == NULL) {
-					/* The row was not yet committed */
-
-					goto next_rec;
-				}
 
-				did_semi_consistent_read = TRUE;
-				rec = old_vers;
-			} else {
-				mutex_exit(&kernel_mutex);
+			if (old_vers == NULL) {
+				/* The row was not yet committed */
 
-				/* The lock was granted while we were
-				searching for the last committed version.
-				Do a normal locking read. */
-
-				offsets = rec_get_offsets(rec, index, offsets,
-							  ULINT_UNDEFINED,
-							  &heap);
+				goto next_rec;
 			}
+
+			did_semi_consistent_read = TRUE;
+			rec = old_vers;
 			break;
 		default:
 
@@ -4518,9 +4608,11 @@ no_gap_lock:
                                 case ICP_ABORTED_BY_USER:
 				        err = DB_SEARCH_ABORTED_BY_USER;
 					goto idx_cond_failed;
+                                case ICP_ERROR:
+				        err = DB_ERROR;
+					goto idx_cond_failed;
 				case ICP_MATCH:
 					goto requires_clust_rec;
-                                default: ;
 				}
 
 				ut_error;
@@ -4528,12 +4620,13 @@ no_gap_lock:
 		}
 	}
 
+locks_ok:
 	/* NOTE that at this point rec can be an old version of a clustered
 	index record built for a consistent read. We cannot assume after this
 	point that rec is on a buffer pool page. Functions like
 	page_rec_is_comp() cannot be used! */
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
+	if (rec_get_deleted_flag(rec, comp)) {
 
 		/* The record is delete-marked: we can skip it */
 
@@ -4555,7 +4648,9 @@ no_gap_lock:
 		delete marked record and the record following it.
 
 		For now this is applicable only to clustered indexes while
-		doing a unique search. There is scope for further optimization
+		doing a unique search except for HANDLER queries because
+		HANDLER allows NEXT and PREV even in unique search on
+		clustered index. There is scope for further optimization
 		applicable to unique secondary indexes. Current behaviour is
 		to widen the scope of a lock on an already delete marked record
 		if the same record is deleted twice by the same transaction */
@@ -4575,27 +4670,28 @@ no_gap_lock:
 	case ICP_NO_MATCH:
 		if (did_semi_consistent_read) {
 			row_unlock_for_mysql(prebuilt, TRUE);
- 		}
+		}
 		goto next_rec;
         case ICP_ABORTED_BY_USER:
 	        err = DB_SEARCH_ABORTED_BY_USER;
 		goto idx_cond_failed;
+        case ICP_ERROR:
+	        err = DB_ERROR;
+		goto idx_cond_failed;
 	case ICP_OUT_OF_RANGE:
 		err = DB_RECORD_NOT_FOUND;
 		goto idx_cond_failed;
 	case ICP_MATCH:
 		break;
-        default:
-	        ut_error;
- 	}
+	}
 
 	/* Get the clustered index record if needed, if we did not do the
 	search using the clustered index. */
+
 	if (index != clust_index && prebuilt->need_to_access_clustered) {
  
 requires_clust_rec:
 		ut_ad(index != clust_index);
-
 		/* We use a 'goto' to the preceding label if a consistent
 		read of a secondary index record requires us to look up old
 		versions of the associated clustered index record. */
@@ -4638,7 +4734,7 @@ requires_clust_rec:
 			goto lock_wait_or_error;
 		}
 
-		if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
+		if (rec_get_deleted_flag(clust_rec, comp)) {
 
 			/* The record is delete marked: we can skip it */
 
@@ -4660,14 +4756,21 @@ requires_clust_rec:
 		ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
 
 		if (prebuilt->idx_cond) {
-			/* Convert the remaining fields to
-			MySQL format. We were unable to do
-			this in row_search_idx_cond_check(),
-			because the condition is on the
-			secondary index and the requested
-			column is in the clustered index. */
-			if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec,
-						     TRUE, offsets)) {
+			/* Convert the record to MySQL format. We were
+			unable to do this in row_search_idx_cond_check(),
+			because the condition is on the secondary index
+			and the requested column is in the clustered index.
+			We convert all fields, including those that
+			may have been used in ICP, because the
+			secondary index may contain a column prefix
+			rather than the full column. Also, as noted
+			in Bug #56680, the column in the secondary
+			index may be in the wrong case, and the
+			authoritative case is in result_rec, the
+			appropriate version of the clustered index record. */
+			if (!row_sel_store_mysql_rec(
+				    buf, prebuilt, result_rec,
+				    TRUE, clust_index, offsets)) {
 				goto next_rec;
 			}
 		}
@@ -4693,8 +4796,10 @@ requires_clust_rec:
 	    && !prebuilt->templ_contains_blob
 	    && !prebuilt->clust_index_was_generated
 	    && !prebuilt->used_in_HANDLER
+	    && !prebuilt->innodb_api
 	    && prebuilt->template_type
-	    != ROW_MYSQL_DUMMY_TEMPLATE) {
+	    != ROW_MYSQL_DUMMY_TEMPLATE
+	    && !prebuilt->in_fts_query) {
 
 		/* Inside an update, for example, we do not cache rows,
 		since we may use the cursor position to do the actual
@@ -4705,31 +4810,63 @@ requires_clust_rec:
 		not cache rows because there the cursor is a scrollable
 		cursor. */
 
-                ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
-
-                /* We only convert from InnoDB row format to MySQL row
-                format when ICP is disabled. */
-
-		if (!prebuilt->idx_cond
-		    && !row_sel_store_mysql_rec(
-                            row_sel_fetch_last_buf(prebuilt),
-                            prebuilt, result_rec,
-			    result_rec != rec, offsets)) {
-			/* Only fresh inserts may contain incomplete
-			externally stored columns. Pretend that such
-			records do not exist. Such records may only be
-			accessed at the READ UNCOMMITTED isolation
-			level or when rolling back a recovered
-			transaction. Rollback happens at a lower
-			level, not here. */
-			goto next_rec;
-		}
+		ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+		/* We only convert from InnoDB row format to MySQL row
+		format when ICP is disabled. */
 
-                row_sel_push_cache_row_for_mysql(buf, prebuilt);
+		if (!prebuilt->idx_cond) {
 
-                if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
-                        goto next_rec;
+			/* We use next_buf to track the allocation of buffers
+			where we store and enqueue the buffers for our
+			pre-fetch optimisation.
+
+			If next_buf == 0 then we store the converted record
+			directly into the MySQL record buffer (buf). If it is
+			!= 0 then we allocate a pre-fetch buffer and store the
+			converted record there.
+
+			If the conversion fails and the MySQL record buffer
+			was not written to then we reset next_buf so that
+			we can re-use the MySQL record buffer in the next
+			iteration. */
+
+			next_buf = next_buf
+				 ? row_sel_fetch_last_buf(prebuilt) : buf;
+
+			if (!row_sel_store_mysql_rec(
+				next_buf, prebuilt, result_rec,
+				result_rec != rec,
+				result_rec != rec ? clust_index : index,
+				offsets)) {
+
+				if (next_buf == buf) {
+					ut_a(prebuilt->n_fetch_cached == 0);
+					next_buf = 0;
+				}
+
+				/* Only fresh inserts may contain incomplete
+				externally stored columns. Pretend that such
+				records do not exist. Such records may only be
+				accessed at the READ UNCOMMITTED isolation
+				level or when rolling back a recovered
+				transaction. Rollback happens at a lower
+				level, not here. */
+				goto next_rec;
+			}
+
+			if (next_buf != buf) {
+				row_sel_enqueue_cache_row_for_mysql(
+					next_buf, prebuilt);
+			}
+		} else {
+			row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
 		}
+
+		if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
+			goto next_rec;
+		}
+
 	} else {
 		if (UNIV_UNLIKELY
 		    (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
@@ -4750,11 +4887,13 @@ requires_clust_rec:
 			       rec_offs_size(offsets));
 			mach_write_to_4(buf,
 					rec_offs_extra_size(offsets) + 4);
-		} else if (!prebuilt->idx_cond) {
+		} else if (!prebuilt->idx_cond && !prebuilt->innodb_api) {
 			/* The record was not yet converted to MySQL format. */
 			if (!row_sel_store_mysql_rec(
-				    buf, prebuilt,
-				    result_rec, result_rec != rec, offsets)) {
+				    buf, prebuilt, result_rec,
+				    result_rec != rec,
+				    result_rec != rec ? clust_index : index,
+				    offsets)) {
 				/* Only fresh inserts may contain
 				incomplete externally stored
 				columns. Pretend that such records do
@@ -4768,13 +4907,10 @@ requires_clust_rec:
 		}
 
 		if (prebuilt->clust_index_was_generated) {
-			if (result_rec != rec) {
-				offsets = rec_get_offsets(
-					rec, index, offsets, ULINT_UNDEFINED,
-					&heap);
-			}
-			row_sel_store_row_id_to_prebuilt(prebuilt, rec,
-							 index, offsets);
+			row_sel_store_row_id_to_prebuilt(
+				prebuilt, result_rec,
+				result_rec == rec ? index : clust_index,
+				offsets);
 		}
 	}
 
@@ -4790,13 +4926,20 @@ requires_clust_rec:
 	err = DB_SUCCESS;
 
 idx_cond_failed:
-	if (!unique_search_from_clust_index
+	if (!unique_search
+	    || !dict_index_is_clust(index)
+	    || direction != 0
 	    || prebuilt->select_lock_type != LOCK_NONE
-	    || prebuilt->used_in_HANDLER) {
+	    || prebuilt->used_in_HANDLER
+	    || prebuilt->innodb_api) {
 
 		/* Inside an update always store the cursor position */
 
 		btr_pcur_store_position(pcur, &mtr);
+
+		if (prebuilt->innodb_api) {
+			prebuilt->innodb_api_rec = result_rec;
+		}
 	}
 
 	goto normal_return;
@@ -4813,7 +4956,18 @@ next_rec:
 	/*-------------------------------------------------------------*/
 	/* PHASE 5: Move the cursor to the next index record */
 
-	/*TODO: with ICP, do this when switching pages, every N pages */
+	/* NOTE: For moves_up==FALSE, the mini-transaction will be
+	committed and restarted every time when switching b-tree
+	pages. For moves_up==TRUE in index condition pushdown, we can
+	scan an entire secondary index tree within a single
+	mini-transaction. As long as the prebuilt->idx_cond does not
+	match, we do not need to consult the clustered index or
+	return records to MySQL, and thus we can avoid repositioning
+	the cursor. What prevents us from buffer-fixing all leaf pages
+	within the mini-transaction is the btr_leaf_page_release()
+	call in btr_pcur_move_to_next_page(). Only the leaf page where
+	the cursor is positioned will remain buffer-fixed. */
+
 	if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
 		/* We must commit mtr if we are moving to the next
 		non-clustered index record, because we could break the
@@ -4912,7 +5066,7 @@ lock_table_wait:
 			on the same user record, we cannot use
 			row_unlock_for_mysql() to unlock any records, and
 			we must thus reset the new rec lock info. Since
-			in lock0lock.c we have blocked the inheriting of gap
+			in lock0lock.cc we have blocked the inheriting of gap
 			X-locks, we actually do not have any new record locks
 			set in this case.
 
@@ -4947,8 +5101,23 @@ normal_return:
 
 	mtr_commit(&mtr);
 
-	if (prebuilt->n_fetch_cached > 0) {
-		row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+	if (prebuilt->idx_cond != 0) {
+
+		/* When ICP is active we don't write to the MySQL buffer
+		directly, only to buffers that are enqueued in the pre-fetch
+		queue. We need to dequeue the first buffer and copy the contents
+		to the record buffer that was passed in by MySQL. */
+
+		if (prebuilt->n_fetch_cached > 0) {
+			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+			err = DB_SUCCESS;
+		}
+
+	} else if (next_buf != 0) {
+
+		/* We may or may not have enqueued some buffers to the
+		pre-fetch queue, but we definitely wrote to the record
+		buffer passed to us by MySQL. */
 
 		err = DB_SUCCESS;
 	}
@@ -4958,9 +5127,6 @@ normal_return:
 	dict_index_name_print(stderr, index);
 	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
 #endif /* UNIV_SEARCH_DEBUG */
-	if (err == DB_SUCCESS) {
-		srv_n_rows_read++;
-	}
 
 func_exit:
 	trx->op_info = "";
@@ -4987,6 +5153,9 @@ func_exit:
 	ut_ad(!btr_search_own_any());
 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 #endif /* UNIV_SYNC_DEBUG */
+
+	DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
+
 	return(err);
 }
 
@@ -5005,25 +5174,38 @@ row_search_check_if_query_cache_permitted(
 	dict_table_t*	table;
 	ibool		ret	= FALSE;
 
-	table = dict_table_get(norm_name, FALSE, DICT_ERR_IGNORE_NONE);
+	/* Disable query cache altogether for all tables if recovered XA
+	transactions in prepared state exist. This is because we do not
+	restore the table locks for those transactions and we may wrongly
+	set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See
+	"Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH
+	QUERY CACHE ENABLED".
+	Read trx_sys->n_prepared_recovered_trx without mutex protection,
+	not possible to end up with a torn read since n_prepared_recovered_trx
+	is word size. */
+	if (trx_sys->n_prepared_recovered_trx > 0) {
+
+		return(FALSE);
+	}
+
+	table = dict_table_open_on_name(norm_name, FALSE, FALSE,
+					DICT_ERR_IGNORE_NONE);
 
 	if (table == NULL) {
 
 		return(FALSE);
 	}
 
-	mutex_enter(&kernel_mutex);
-
 	/* Start the transaction if it is not started yet */
 
-	trx_start_if_not_started_low(trx);
+	trx_start_if_not_started(trx);
 
 	/* If there are locks on the table or some trx has invalidated the
 	cache up to our trx id, then ret = FALSE.
 	We do not check what type locks there are on the table, though only
 	IX type locks actually would require ret = FALSE. */
 
-	if (UT_LIST_GET_LEN(table->locks) == 0
+	if (lock_table_get_n_locks(table) == 0
 	    && trx->id >= table->query_cache_inv_trx_id) {
 
 		ret = TRUE;
@@ -5036,13 +5218,12 @@ row_search_check_if_query_cache_permitted(
 
 			trx->read_view =
 				read_view_open_now(trx->id,
-						   trx->prebuilt_view, TRUE);
-			trx->prebuilt_view = trx->read_view;
+						   trx->prebuilt_view);
 			trx->global_read_view = trx->read_view;
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	dict_table_close(table, FALSE, FALSE);
 
 	return(ret);
 }
@@ -5138,7 +5319,7 @@ Read the max AUTOINC value from an index.
 @return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
 column name can't be found in index */
 UNIV_INTERN
-ulint
+dberr_t
 row_search_max_autoinc(
 /*===================*/
 	dict_index_t*	index,		/*!< in: index to search */
@@ -5148,7 +5329,7 @@ row_search_max_autoinc(
 	ulint		i;
 	ulint		n_cols;
 	dict_field_t*	dfield = NULL;
-	ulint		error = DB_SUCCESS;
+	dberr_t		error = DB_SUCCESS;
 
 	n_cols = dict_index_get_n_ordering_defined_by_user(index);
 
@@ -5170,12 +5351,11 @@ row_search_max_autoinc(
 
 		mtr_start(&mtr);
 
-		/* Open at the high/right end (FALSE), and INIT
-		cursor (TRUE) */
+		/* Open at the high/right end (false), and init cursor */
 		btr_pcur_open_at_index_side(
-			FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+			false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
 
-		if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
+		if (!page_is_empty(btr_pcur_get_page(&pcur))) {
 			const rec_t*	rec;
 
 			rec = row_search_autoinc_get_rec(&pcur, &mtr);
diff --git a/storage/xtradb/row/row0uins.c b/storage/xtradb/row/row0uins.cc
index 57c8c512698..7b50d8b62ae 100644
--- a/storage/xtradb/row/row0uins.c
+++ b/storage/xtradb/row/row0uins.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0uins.c
+@file row/row0uins.cc
 Fresh insert undo
 
 Created 2/25/1997 Heikki Tuuri
@@ -38,6 +38,7 @@ Created 2/25/1997 Heikki Tuuri
 #include "mach0data.h"
 #include "row0undo.h"
 #include "row0vers.h"
+#include "row0log.h"
 #include "trx0trx.h"
 #include "trx0rec.h"
 #include "row0row.h"
@@ -60,25 +61,61 @@ introduced where a call to log_free_check() is bypassed. */
 Removes a clustered index record. The pcur in node was positioned on the
 record, now it is detached.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static  __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_ins_remove_clust_rec(
 /*==========================*/
 	undo_node_t*	node)	/*!< in: undo node */
 {
 	btr_cur_t*	btr_cur;
 	ibool		success;
-	ulint		err;
-	ulint		n_tries		= 0;
+	dberr_t		err;
+	ulint		n_tries	= 0;
 	mtr_t		mtr;
+	dict_index_t*	index	= node->pcur.btr_cur.index;
+	bool		online;
+
+	ut_ad(dict_index_is_clust(index));
 
 	mtr_start(&mtr);
 
-	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
-					    &mtr);
+	/* This is similar to row_undo_mod_clust(). The DDL thread may
+	already have copied this row from the log to the new table.
+	We must log the removal, so that the row will be correctly
+	purged. However, we can log the removal out of sync with the
+	B-tree modification. */
+
+	online = dict_index_is_online_ddl(index);
+	if (online) {
+		ut_ad(node->trx->dict_operation_lock_mode
+		      != RW_X_LATCH);
+		ut_ad(node->table->id != DICT_INDEXES_ID);
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+	}
+
+	success = btr_pcur_restore_position(
+		online
+		? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+		: BTR_MODIFY_LEAF, &node->pcur, &mtr);
 	ut_a(success);
 
+	btr_cur = btr_pcur_get_btr_cur(&node->pcur);
+
+	ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index)
+	      == node->trx->id);
+
+	if (online && dict_index_is_online_ddl(index)) {
+		const rec_t*	rec	= btr_cur_get_rec(btr_cur);
+		mem_heap_t*	heap	= NULL;
+		const ulint*	offsets	= rec_get_offsets(
+			rec, index, NULL, ULINT_UNDEFINED, &heap);
+		row_log_table_delete(
+			rec, index, offsets, true, node->trx->id);
+		mem_heap_free(heap);
+	}
+
 	if (node->table->id == DICT_INDEXES_ID) {
+		ut_ad(!online);
 		ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
 
 		/* Drop the index tree associated with the row in
@@ -90,22 +127,17 @@ row_undo_ins_remove_clust_rec(
 
 		mtr_start(&mtr);
 
-		success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
-						    &(node->pcur), &mtr);
+		success = btr_pcur_restore_position(
+			BTR_MODIFY_LEAF, &node->pcur, &mtr);
 		ut_a(success);
 	}
 
-	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
-
-	success = btr_cur_optimistic_delete(btr_cur, &mtr);
-
-	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
-
-	if (success) {
-		trx_undo_rec_release(node->trx, node->undo_no);
-
-		return(DB_SUCCESS);
+	if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
+		err = DB_SUCCESS;
+		goto func_exit;
 	}
+
+	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
 retry:
 	/* If did not succeed, try pessimistic descent to tree */
 	mtr_start(&mtr);
@@ -114,7 +146,7 @@ retry:
 					    &(node->pcur), &mtr);
 	ut_a(success);
 
-	btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+	btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
 				   trx_is_recv(node->trx)
 				   ? RB_RECOVERY
 				   : RB_NORMAL, &mtr);
@@ -135,8 +167,8 @@ retry:
 		goto retry;
 	}
 
-	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
-
+func_exit:
+	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
 	trx_undo_rec_release(node->trx, node->undo_no);
 
 	return(err);
@@ -145,8 +177,8 @@ retry:
 /***************************************************************//**
 Removes a secondary index entry if found.
 @return	DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_ins_remove_sec_low(
 /*========================*/
 	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
@@ -157,22 +189,31 @@ row_undo_ins_remove_sec_low(
 {
 	btr_pcur_t		pcur;
 	btr_cur_t*		btr_cur;
-	ulint			err;
+	dberr_t			err	= DB_SUCCESS;
 	mtr_t			mtr;
 	enum row_search_result	search_result;
 
+	log_free_check();
+
 	mtr_start(&mtr);
 
-	btr_cur = btr_pcur_get_btr_cur(&pcur);
+	if (mode == BTR_MODIFY_LEAF) {
+		mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+		mtr_x_lock(dict_index_get_lock(index), &mtr);
+	}
 
-	ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+	if (row_log_online_op_try(index, entry, 0)) {
+		goto func_exit_no_pcur;
+	}
 
 	search_result = row_search_index_entry(index, entry, mode,
 					       &pcur, &mtr);
 
 	switch (search_result) {
 	case ROW_NOT_FOUND:
-		err = DB_SUCCESS;
 		goto func_exit;
 	case ROW_FOUND:
 		break;
@@ -184,23 +225,24 @@ row_undo_ins_remove_sec_low(
 		ut_error;
 	}
 
-	if (mode == BTR_MODIFY_LEAF) {
-		err = btr_cur_optimistic_delete(btr_cur, &mtr)
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	if (mode != BTR_MODIFY_TREE) {
+		err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
 			? DB_SUCCESS : DB_FAIL;
 	} else {
-		ut_ad(mode == BTR_MODIFY_TREE);
-
 		/* No need to distinguish RB_RECOVERY here, because we
 		are deleting a secondary index record: the distinction
 		between RB_NORMAL and RB_RECOVERY only matters when
 		deleting a record that contains externally stored
 		columns. */
 		ut_ad(!dict_index_is_clust(index));
-		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
 					   RB_NORMAL, &mtr);
 	}
 func_exit:
 	btr_pcur_close(&pcur);
+func_exit_no_pcur:
 	mtr_commit(&mtr);
 
 	return(err);
@@ -210,14 +252,14 @@ func_exit:
 Removes a secondary index entry from the index if found. Tries first
 optimistic, then pessimistic descent down the tree.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_ins_remove_sec(
 /*====================*/
 	dict_index_t*	index,	/*!< in: index */
 	dtuple_t*	entry)	/*!< in: index entry to insert */
 {
-	ulint	err;
+	dberr_t	err;
 	ulint	n_tries	= 0;
 
 	/* Try first optimistic descent to the B-tree */
@@ -255,7 +297,8 @@ static
 void
 row_undo_ins_parse_undo_rec(
 /*========================*/
-	undo_node_t*	node)	/*!< in/out: row undo node */
+	undo_node_t*	node,		/*!< in/out: row undo node */
+	ibool		dict_locked)	/*!< in: TRUE if own dict_sys->mutex */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -263,7 +306,7 @@ row_undo_ins_parse_undo_rec(
 	table_id_t	table_id;
 	ulint		type;
 	ulint		dummy;
-	ibool		dummy_extern;
+	bool		dummy_extern;
 
 	ut_ad(node);
 
@@ -273,18 +316,26 @@ row_undo_ins_parse_undo_rec(
 	node->rec_type = type;
 
 	node->update = NULL;
-	node->table = dict_table_get_on_id(table_id, node->trx);
+	node->table = dict_table_open_on_id(
+		table_id, dict_locked, DICT_TABLE_OP_NORMAL);
 
 	/* Skip the UNDO if we can't find the table or the .ibd file. */
 	if (UNIV_UNLIKELY(node->table == NULL)) {
 	} else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+close_table:
+		dict_table_close(node->table, dict_locked, FALSE);
 		node->table = NULL;
 	} else {
 		clust_index = dict_table_get_first_index(node->table);
 
 		if (clust_index != NULL) {
-			ptr = trx_undo_rec_get_row_ref(
+			trx_undo_rec_get_row_ref(
 				ptr, clust_index, &node->ref, node->heap);
+
+			if (!row_undo_search_clust_to_pcur(node)) {
+				goto close_table;
+			}
+
 		} else {
 			ut_print_timestamp(stderr);
 			fprintf(stderr, "  InnoDB: table ");
@@ -292,10 +343,68 @@ row_undo_ins_parse_undo_rec(
 				      node->table->name);
 			fprintf(stderr, " has no indexes, "
 				"ignoring the table\n");
+			goto close_table;
+		}
+	}
+}
+
+/***************************************************************//**
+Removes secondary index records.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_rec(
+/*========================*/
+	undo_node_t*	node)	/*!< in/out: row undo node */
+{
+	dberr_t		err	= DB_SUCCESS;
+	dict_index_t*	index	= node->index;
+	mem_heap_t*	heap;
 
-			node->table = NULL;
+	heap = mem_heap_create(1024);
+
+	while (index != NULL) {
+		dtuple_t*	entry;
+
+		if (index->type & DICT_FTS) {
+			dict_table_next_uncorrupted_index(index);
+			continue;
+		}
+
+		/* An insert undo record TRX_UNDO_INSERT_REC will
+		always contain all fields of the index. It does not
+		matter if any indexes were created afterwards; all
+		index entries can be reconstructed from the row. */
+		entry = row_build_index_entry(
+			node->row, node->ext, index, heap);
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The database must have crashed after
+			inserting a clustered index record but before
+			writing all the externally stored columns of
+			that record, or a statement is being rolled
+			back because an error occurred while storing
+			off-page columns.
+
+			Because secondary index entries are inserted
+			after the clustered index record, we may
+			assume that the secondary index record does
+			not exist. */
+		} else {
+			err = row_undo_ins_remove_sec(index, entry);
+
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+				goto func_exit;
+			}
 		}
+
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(index);
 	}
+
+func_exit:
+	node->index = index;
+	mem_heap_free(heap);
+	return(err);
 }
 
 /***********************************************************//**
@@ -306,17 +415,21 @@ if it figures out that an index record will be removed in the purge
 anyway, it will remove it in the rollback.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 UNIV_INTERN
-ulint
+dberr_t
 row_undo_ins(
 /*=========*/
 	undo_node_t*	node)	/*!< in: row undo node */
 {
-	ut_ad(node);
+	dberr_t	err;
+	ibool	dict_locked;
+
 	ut_ad(node->state == UNDO_NODE_INSERT);
 
-	row_undo_ins_parse_undo_rec(node);
+	dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
+
+	row_undo_ins_parse_undo_rec(node, dict_locked);
 
-	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+	if (node->table == NULL) {
 		trx_undo_rec_release(node->trx, node->undo_no);
 
 		return(DB_SUCCESS);
@@ -324,43 +437,40 @@ row_undo_ins(
 
 	/* Iterate over all the indexes and undo the insert.*/
 
+	node->index = dict_table_get_first_index(node->table);
+	ut_ad(dict_index_is_clust(node->index));
 	/* Skip the clustered index (the first index) */
-	node->index = dict_table_get_next_index(
-		dict_table_get_first_index(node->table));
+	node->index = dict_table_get_next_index(node->index);
 
 	dict_table_skip_corrupt_index(node->index);
 
-	while (node->index != NULL) {
-		dtuple_t*	entry;
-		ulint		err;
+	err = row_undo_ins_remove_sec_rec(node);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      node->index, node->heap);
-		if (UNIV_UNLIKELY(!entry)) {
-			/* The database must have crashed after
-			inserting a clustered index record but before
-			writing all the externally stored columns of
-			that record, or a statement is being rolled
-			back because an error occurred while storing
-			off-page columns.
+	if (err == DB_SUCCESS) {
 
-			Because secondary index entries are inserted
-			after the clustered index record, we may
-			assume that the secondary index record does
-			not exist. */
-		} else {
-			log_free_check();
-			err = row_undo_ins_remove_sec(node->index, entry);
+		log_free_check();
 
-			if (err != DB_SUCCESS) {
+		if (node->table->id == DICT_INDEXES_ID) {
 
-				return(err);
+			if (!dict_locked) {
+				mutex_enter(&dict_sys->mutex);
 			}
 		}
 
-		dict_table_next_uncorrupted_index(node->index);
+		// FIXME: We need to update the dict_index_t::space and
+		// page number fields too.
+		err = row_undo_ins_remove_clust_rec(node);
+
+		if (node->table->id == DICT_INDEXES_ID
+		    && !dict_locked) {
+
+			mutex_exit(&dict_sys->mutex);
+		}
 	}
 
-	log_free_check();
-	return(row_undo_ins_remove_clust_rec(node));
+	dict_table_close(node->table, dict_locked, FALSE);
+
+	node->table = NULL;
+
+	return(err);
 }
diff --git a/storage/xtradb/row/row0umod.c b/storage/xtradb/row/row0umod.cc
index 69831fee8ac..3c70c3e662b 100644
--- a/storage/xtradb/row/row0umod.c
+++ b/storage/xtradb/row/row0umod.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0umod.c
+@file row/row0umod.cc
 Undo modify of a row
 
 Created 2/27/1997 Heikki Tuuri
@@ -37,6 +37,7 @@ Created 2/27/1997 Heikki Tuuri
 #include "mach0data.h"
 #include "row0undo.h"
 #include "row0vers.h"
+#include "row0log.h"
 #include "trx0trx.h"
 #include "trx0rec.h"
 #include "row0row.h"
@@ -71,11 +72,20 @@ introduced where a call to log_free_check() is bypassed. */
 /***********************************************************//**
 Undoes a modify in a clustered index record.
 @return	DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_clust_low(
 /*===================*/
 	undo_node_t*	node,	/*!< in: row undo node */
+	ulint**		offsets,/*!< out: rec_get_offsets() on the record */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: memory heap that can be emptied */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
+	const dtuple_t**rebuilt_old_pk,
+				/*!< out: row_log_table_get_pk()
+				before the update, or NULL if
+				the table is not being rebuilt online or
+				the PRIMARY KEY definition does not change */
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr,	/*!< in: mtr; must be committed before
 				latching any further pages */
@@ -83,12 +93,12 @@ row_undo_mod_clust_low(
 {
 	btr_pcur_t*	pcur;
 	btr_cur_t*	btr_cur;
-	ulint		err;
+	dberr_t		err;
 #ifdef UNIV_DEBUG
 	ibool		success;
 #endif /* UNIV_DEBUG */
 
-	pcur = &(node->pcur);
+	pcur = &node->pcur;
 	btr_cur = btr_pcur_get_btr_cur(pcur);
 
 #ifdef UNIV_DEBUG
@@ -97,31 +107,40 @@ row_undo_mod_clust_low(
 	btr_pcur_restore_position(mode, pcur, mtr);
 
 	ut_ad(success);
+	ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur),
+			     btr_cur_get_index(btr_cur))
+	      == thr_get_trx(thr)->id);
+
+	if (mode != BTR_MODIFY_LEAF
+	    && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) {
+		*rebuilt_old_pk = row_log_table_get_pk(
+			btr_cur_get_rec(btr_cur),
+			btr_cur_get_index(btr_cur), NULL, &heap);
+	} else {
+		*rebuilt_old_pk = NULL;
+	}
 
-	if (mode == BTR_MODIFY_LEAF) {
+	if (mode != BTR_MODIFY_TREE) {
+		ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF);
 
-		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
-						| BTR_NO_UNDO_LOG_FLAG
-						| BTR_KEEP_SYS_FLAG,
-						btr_cur, node->update,
-						node->cmpl_info, thr, mtr);
+		err = btr_cur_optimistic_update(
+			BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG
+			| BTR_KEEP_SYS_FLAG,
+			btr_cur, offsets, offsets_heap,
+			node->update, node->cmpl_info,
+			thr, thr_get_trx(thr)->id, mtr);
 	} else {
-		mem_heap_t*	heap		= NULL;
 		big_rec_t*	dummy_big_rec;
 
-		ut_ad(mode == BTR_MODIFY_TREE);
-
 		err = btr_cur_pessimistic_update(
 			BTR_NO_LOCKING_FLAG
 			| BTR_NO_UNDO_LOG_FLAG
 			| BTR_KEEP_SYS_FLAG,
-			btr_cur, &heap, &dummy_big_rec, node->update,
-			node->cmpl_info, thr, mtr);
+			btr_cur, offsets, offsets_heap, heap,
+			&dummy_big_rec, node->update,
+			node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
 
 		ut_a(!dummy_big_rec);
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
 	}
 
 	return(err);
@@ -133,8 +152,8 @@ This is attempted when the record was inserted by updating a
 delete-marked record and there no longer exist transactions
 that would see the delete-marked record.
 @return	DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_remove_clust_low(
 /*==========================*/
 	undo_node_t*	node,	/*!< in: row undo node */
@@ -143,7 +162,7 @@ row_undo_mod_remove_clust_low(
 	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
 {
 	btr_cur_t*	btr_cur;
-	ulint		err;
+	dberr_t		err;
 	ulint		trx_id_offset;
 
 	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
@@ -164,7 +183,7 @@ row_undo_mod_remove_clust_low(
 	if (!trx_id_offset) {
 		mem_heap_t*	heap	= NULL;
 		ulint		trx_id_col;
-		ulint*		offsets;
+		const ulint*	offsets;
 		ulint		len;
 
 		trx_id_col = dict_index_get_sys_col_pos(
@@ -196,7 +215,7 @@ row_undo_mod_remove_clust_low(
 				   dict_table_is_comp(node->table)));
 
 	if (mode == BTR_MODIFY_LEAF) {
-		err = btr_cur_optimistic_delete(btr_cur, mtr)
+		err = btr_cur_optimistic_delete(btr_cur, 0, mtr)
 			? DB_SUCCESS
 			: DB_FAIL;
 	} else {
@@ -205,7 +224,7 @@ row_undo_mod_remove_clust_low(
 		/* This operation is analogous to purge, we can free also
 		inherited externally stored fields */
 
-		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
 					   thr_is_recv(thr)
 					   ? RB_RECOVERY_PURGE_REC
 					   : RB_NONE, mtr);
@@ -222,8 +241,8 @@ row_undo_mod_remove_clust_low(
 Undoes a modify in a clustered index record. Sets also the node state for the
 next round of undo.
 @return	DB_SUCCESS or error code: we may run out of file space */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_clust(
 /*===============*/
 	undo_node_t*	node,	/*!< in: row undo node */
@@ -231,20 +250,42 @@ row_undo_mod_clust(
 {
 	btr_pcur_t*	pcur;
 	mtr_t		mtr;
-	ulint		err;
+	dberr_t		err;
+	dict_index_t*	index;
+	bool		online;
 
-	ut_ad(node && thr);
+	ut_ad(thr_get_trx(thr) == node->trx);
+	ut_ad(node->trx->dict_operation_lock_mode);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)
+	      || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
 	log_free_check();
-
-	pcur = &(node->pcur);
+	pcur = &node->pcur;
+	index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur));
 
 	mtr_start(&mtr);
 
+	online = dict_index_is_online_ddl(index);
+	if (online) {
+		ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH);
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+	}
+
+	mem_heap_t*	heap		= mem_heap_create(1024);
+	mem_heap_t*	offsets_heap	= NULL;
+	ulint*		offsets		= NULL;
+	const dtuple_t*	rebuilt_old_pk;
+
 	/* Try optimistic processing of the record, keeping changes within
 	the index page */
 
-	err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+	err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+				     heap, &rebuilt_old_pk,
+				     thr, &mtr, online
+				     ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+				     : BTR_MODIFY_LEAF);
 
 	if (err != DB_SUCCESS) {
 		btr_pcur_commit_specify_mtr(pcur, &mtr);
@@ -254,17 +295,56 @@ row_undo_mod_clust(
 
 		mtr_start(&mtr);
 
-		err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+		err = row_undo_mod_clust_low(
+			node, &offsets, &offsets_heap, heap, &rebuilt_old_pk,
+			thr, &mtr, BTR_MODIFY_TREE);
+		ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+	}
+
+	/* Online rebuild cannot be initiated while we are holding
+	dict_operation_lock and index->lock. (It can be aborted.) */
+	ut_ad(online || !dict_index_is_online_ddl(index));
+
+	if (err == DB_SUCCESS && online) {
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+		      || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+		switch (node->rec_type) {
+		case TRX_UNDO_DEL_MARK_REC:
+			row_log_table_insert(
+				btr_pcur_get_rec(pcur), index, offsets);
+			break;
+		case TRX_UNDO_UPD_EXIST_REC:
+			row_log_table_update(
+				btr_pcur_get_rec(pcur), index, offsets,
+				rebuilt_old_pk);
+			break;
+		case TRX_UNDO_UPD_DEL_REC:
+			row_log_table_delete(
+				btr_pcur_get_rec(pcur), index, offsets,
+				true, node->trx->id);
+			break;
+		default:
+			ut_ad(0);
+			break;
+		}
 	}
 
+	ut_ad(rec_get_trx_id(btr_pcur_get_rec(pcur), index)
+	      == node->new_trx_id);
+
 	btr_pcur_commit_specify_mtr(pcur, &mtr);
 
 	if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
 
 		mtr_start(&mtr);
 
-		err = row_undo_mod_remove_clust_low(node, thr, &mtr,
-						    BTR_MODIFY_LEAF);
+		/* It is not necessary to call row_log_table,
+		because the record is delete-marked and would thus
+		be omitted from the rebuilt copy of the table. */
+		err = row_undo_mod_remove_clust_low(
+			node, thr, &mtr, BTR_MODIFY_LEAF);
 		if (err != DB_SUCCESS) {
 			btr_pcur_commit_specify_mtr(pcur, &mtr);
 
@@ -275,6 +355,9 @@ row_undo_mod_clust(
 
 			err = row_undo_mod_remove_clust_low(node, thr, &mtr,
 							    BTR_MODIFY_TREE);
+
+			ut_ad(err == DB_SUCCESS
+			      || err == DB_OUT_OF_FILE_SPACE);
 		}
 
 		btr_pcur_commit_specify_mtr(pcur, &mtr);
@@ -284,14 +367,18 @@ row_undo_mod_clust(
 
 	trx_undo_rec_release(node->trx, node->undo_no);
 
+	if (offsets_heap) {
+		mem_heap_free(offsets_heap);
+	}
+	mem_heap_free(heap);
 	return(err);
 }
 
 /***********************************************************//**
 Delete marks or removes a secondary index entry if found.
 @return	DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_del_mark_or_remove_sec_low(
 /*====================================*/
 	undo_node_t*	node,	/*!< in: row undo node */
@@ -305,7 +392,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
 	btr_cur_t*		btr_cur;
 	ibool			success;
 	ibool			old_has;
-	ulint			err;
+	dberr_t			err	= DB_SUCCESS;
 	mtr_t			mtr;
 	mtr_t			mtr_vers;
 	enum row_search_result	search_result;
@@ -313,9 +400,30 @@ row_undo_mod_del_mark_or_remove_sec_low(
 	log_free_check();
 	mtr_start(&mtr);
 
-	btr_cur = btr_pcur_get_btr_cur(&pcur);
+	if (*index->name == TEMP_INDEX_PREFIX) {
+		/* The index->online_status may change if the
+		index->name starts with TEMP_INDEX_PREFIX (meaning
+		that the index is or was being created online). It is
+		protected by index->lock. */
+		if (mode == BTR_MODIFY_LEAF) {
+			mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+			mtr_s_lock(dict_index_get_lock(index), &mtr);
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+			mtr_x_lock(dict_index_get_lock(index), &mtr);
+		}
 
-	ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+		if (row_log_online_op_try(index, entry, 0)) {
+			goto func_exit_no_pcur;
+		}
+	} else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+		ut_ad(!dict_index_is_online_ddl(index));
+	}
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
 
 	search_result = row_search_index_entry(index, entry, mode,
 					       &pcur, &mtr);
@@ -331,8 +439,6 @@ row_undo_mod_del_mark_or_remove_sec_low(
 		In normal processing, if an update ends in a deadlock
 		before it has inserted all updated secondary index
 		records, then the undo will not find those records. */
-
-		err = DB_SUCCESS;
 		goto func_exit;
 	case ROW_FOUND:
 		break;
@@ -364,16 +470,14 @@ row_undo_mod_del_mark_or_remove_sec_low(
 	} else {
 		/* Remove the index record */
 
-		if (mode == BTR_MODIFY_LEAF) {
-			success = btr_cur_optimistic_delete(btr_cur, &mtr);
+		if (mode != BTR_MODIFY_TREE) {
+			success = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
 			if (success) {
 				err = DB_SUCCESS;
 			} else {
 				err = DB_FAIL;
 			}
 		} else {
-			ut_ad(mode == BTR_MODIFY_TREE);
-
 			/* No need to distinguish RB_RECOVERY_PURGE here,
 			because we are deleting a secondary index record:
 			the distinction between RB_NORMAL and
@@ -381,7 +485,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
 			record that contains externally stored
 			columns. */
 			ut_ad(!dict_index_is_clust(index));
-			btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+			btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
 						   RB_NORMAL, &mtr);
 
 			/* The delete operation may fail if we have little
@@ -394,6 +498,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
 
 func_exit:
 	btr_pcur_close(&pcur);
+func_exit_no_pcur:
 	mtr_commit(&mtr);
 
 	return(err);
@@ -404,12 +509,12 @@ Delete marks or removes a secondary index entry if found.
 NOTE that if we updated the fields of a delete-marked secondary index record
 so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
 return to the original values because we do not know them. But this should
-not cause problems because in row0sel.c, in queries we always retrieve the
+not cause problems because in row0sel.cc, in queries we always retrieve the
 clustered index record or an earlier version of it, if the secondary index
 record through which we do the search is delete-marked.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_del_mark_or_remove_sec(
 /*================================*/
 	undo_node_t*	node,	/*!< in: row undo node */
@@ -417,7 +522,7 @@ row_undo_mod_del_mark_or_remove_sec(
 	dict_index_t*	index,	/*!< in: index */
 	dtuple_t*	entry)	/*!< in: index entry */
 {
-	ulint	err;
+	dberr_t	err;
 
 	err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
 						      entry, BTR_MODIFY_LEAF);
@@ -436,42 +541,67 @@ Delete unmarks a secondary index entry which must be found. It might not be
 delete-marked at the moment, but it does not harm to unmark it anyway. We also
 need to update the fields of the secondary index record if we updated its
 fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
-@return	DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+@retval	DB_SUCCESS on success
+@retval	DB_FAIL if BTR_MODIFY_TREE should be tried
+@retval	DB_OUT_OF_FILE_SPACE when running out of tablespace
+@retval	DB_DUPLICATE_KEY if the value was missing
+	and an insert would lead to a duplicate */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_del_unmark_sec_and_undo_update(
 /*========================================*/
 	ulint		mode,	/*!< in: search mode: BTR_MODIFY_LEAF or
 				BTR_MODIFY_TREE */
 	que_thr_t*	thr,	/*!< in: query thread */
 	dict_index_t*	index,	/*!< in: index */
-	const dtuple_t*	entry)	/*!< in: index entry */
+	dtuple_t*	entry)	/*!< in: index entry */
 {
-	mem_heap_t*		heap;
 	btr_pcur_t		pcur;
-	btr_cur_t*		btr_cur;
+	btr_cur_t*		btr_cur		= btr_pcur_get_btr_cur(&pcur);
 	upd_t*			update;
-	ulint			err		= DB_SUCCESS;
+	dberr_t			err		= DB_SUCCESS;
 	big_rec_t*		dummy_big_rec;
 	mtr_t			mtr;
 	trx_t*			trx		= thr_get_trx(thr);
+	const ulint		flags
+		= BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG;
 	enum row_search_result	search_result;
 
-	/* Ignore indexes that are being created. */
-	if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) {
-
-		return(DB_SUCCESS);
-	}
+	ut_ad(trx->id);
 
 	log_free_check();
 	mtr_start(&mtr);
 
-	ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);
+	if (*index->name == TEMP_INDEX_PREFIX) {
+		/* The index->online_status may change if the
+		index->name starts with TEMP_INDEX_PREFIX (meaning
+		that the index is or was being created online). It is
+		protected by index->lock. */
+		if (mode == BTR_MODIFY_LEAF) {
+			mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+			mtr_s_lock(dict_index_get_lock(index), &mtr);
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+			mtr_x_lock(dict_index_get_lock(index), &mtr);
+		}
+
+		if (row_log_online_op_try(index, entry, trx->id)) {
+			goto func_exit_no_pcur;
+		}
+	} else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+		ut_ad(!dict_index_is_online_ddl(index));
+	}
 
 	search_result = row_search_index_entry(index, entry, mode,
 					       &pcur, &mtr);
 
 	switch (search_result) {
+		mem_heap_t*	heap;
+		mem_heap_t*	offsets_heap;
+		ulint*		offsets;
 	case ROW_BUFFERED:
 	case ROW_NOT_DELETED_REF:
 		/* These are invalid outcomes, because the mode passed
@@ -479,98 +609,205 @@ row_undo_mod_del_unmark_sec_and_undo_update(
 		flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
 		ut_error;
 	case ROW_NOT_FOUND:
-		fputs("InnoDB: error in sec index entry del undo in\n"
-		      "InnoDB: ", stderr);
-		dict_index_name_print(stderr, trx, index);
-		fputs("\n"
-		      "InnoDB: tuple ", stderr);
-		dtuple_print(stderr, entry);
-		fputs("\n"
-		      "InnoDB: record ", stderr);
-		rec_print(stderr, btr_pcur_get_rec(&pcur), index);
-		putc('\n', stderr);
-		trx_print(stderr, trx, 0);
-		fputs("\n"
-		      "InnoDB: Submit a detailed bug report"
-		      " to http://bugs.mysql.com\n", stderr);
-		ut_ad(0);
+		if (*index->name != TEMP_INDEX_PREFIX) {
+			/* During online secondary index creation, it
+			is possible that MySQL is waiting for a
+			meta-data lock upgrade before invoking
+			ha_innobase::commit_inplace_alter_table()
+			while this ROLLBACK is executing. InnoDB has
+			finished building the index, but it does not
+			yet exist in MySQL. In this case, we suppress
+			the printout to the error log. */
+			fputs("InnoDB: error in sec index entry del undo in\n"
+			      "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+			fputs("\n"
+			      "InnoDB: tuple ", stderr);
+			dtuple_print(stderr, entry);
+			fputs("\n"
+			      "InnoDB: record ", stderr);
+			rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+			putc('\n', stderr);
+			trx_print(stderr, trx, 0);
+			fputs("\n"
+			      "InnoDB: Submit a detailed bug report"
+			      " to http://bugs.mysql.com\n", stderr);
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"record in index %s was not found"
+				" on rollback, trying to insert",
+				index->name);
+		}
+
+		if (btr_cur->up_match >= dict_index_get_n_unique(index)
+		    || btr_cur->low_match >= dict_index_get_n_unique(index)) {
+			if (*index->name != TEMP_INDEX_PREFIX) {
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"record in index %s was not found on"
+					" rollback, and a duplicate exists",
+					index->name);
+			}
+			err = DB_DUPLICATE_KEY;
+			break;
+		}
+
+		/* Insert the missing record that we were trying to
+		delete-unmark. */
+		big_rec_t*	big_rec;
+		rec_t*		insert_rec;
+		offsets = NULL;
+		offsets_heap = NULL;
+
+		err = btr_cur_optimistic_insert(
+			flags, btr_cur, &offsets, &offsets_heap,
+			entry, &insert_rec, &big_rec,
+			0, thr, &mtr);
+		ut_ad(!big_rec);
+
+		if (err == DB_FAIL && mode == BTR_MODIFY_TREE) {
+			err = btr_cur_pessimistic_insert(
+				flags, btr_cur,
+				&offsets, &offsets_heap,
+				entry, &insert_rec, &big_rec,
+				0, thr, &mtr);
+			/* There are no off-page columns in
+			secondary indexes. */
+			ut_ad(!big_rec);
+		}
+
+		if (err == DB_SUCCESS) {
+			page_update_max_trx_id(
+				btr_cur_get_block(btr_cur),
+				btr_cur_get_page_zip(btr_cur),
+				trx->id, &mtr);
+		}
+
+		if (offsets_heap) {
+			mem_heap_free(offsets_heap);
+		}
+
 		break;
 	case ROW_FOUND:
-		btr_cur = btr_pcur_get_btr_cur(&pcur);
-		err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
-						   btr_cur, FALSE, thr, &mtr);
+		err = btr_cur_del_mark_set_sec_rec(
+			BTR_NO_LOCKING_FLAG,
+			btr_cur, FALSE, thr, &mtr);
 		ut_a(err == DB_SUCCESS);
-		heap = mem_heap_create(100);
-
+		heap = mem_heap_create(
+			sizeof(upd_t)
+			+ dtuple_get_n_fields(entry) * sizeof(upd_field_t));
+		offsets_heap = NULL;
+		offsets = rec_get_offsets(
+			btr_cur_get_rec(btr_cur),
+			index, NULL, ULINT_UNDEFINED, &offsets_heap);
 		update = row_upd_build_sec_rec_difference_binary(
-			index, entry, btr_cur_get_rec(btr_cur), trx, heap);
+			btr_cur_get_rec(btr_cur), index, offsets, entry, heap);
 		if (upd_get_n_fields(update) == 0) {
 
 			/* Do nothing */
 
-		} else if (mode == BTR_MODIFY_LEAF) {
+		} else if (mode != BTR_MODIFY_TREE) {
 			/* Try an optimistic updating of the record, keeping
 			changes within the page */
 
+			/* TODO: pass offsets, not &offsets */
 			err = btr_cur_optimistic_update(
-				BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
-				btr_cur, update, 0, thr, &mtr);
+				flags, btr_cur, &offsets, &offsets_heap,
+				update, 0, thr, thr_get_trx(thr)->id, &mtr);
 			switch (err) {
 			case DB_OVERFLOW:
 			case DB_UNDERFLOW:
 			case DB_ZIP_OVERFLOW:
 				err = DB_FAIL;
+			default:
+				break;
 			}
 		} else {
-			ut_a(mode == BTR_MODIFY_TREE);
 			err = btr_cur_pessimistic_update(
-				BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
-				btr_cur, &heap, &dummy_big_rec,
-				update, 0, thr, &mtr);
+				flags, btr_cur, &offsets, &offsets_heap,
+				heap, &dummy_big_rec,
+				update, 0, thr, thr_get_trx(thr)->id, &mtr);
 			ut_a(!dummy_big_rec);
 		}
 
 		mem_heap_free(heap);
+		mem_heap_free(offsets_heap);
 	}
 
 	btr_pcur_close(&pcur);
+func_exit_no_pcur:
 	mtr_commit(&mtr);
 
 	return(err);
 }
 
 /***********************************************************//**
+Flags a secondary index corrupted. */
+static __attribute__((nonnull))
+void
+row_undo_mod_sec_flag_corrupted(
+/*============================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	dict_index_t*	index)	/*!< in: secondary index */
+{
+	ut_ad(!dict_index_is_clust(index));
+
+	switch (trx->dict_operation_lock_mode) {
+	case RW_S_LATCH:
+		/* Because row_undo() is holding an S-latch
+		on the data dictionary during normal rollback,
+		we can only mark the index corrupted in the
+		data dictionary cache. TODO: fix this somehow.*/
+		mutex_enter(&dict_sys->mutex);
+		dict_set_corrupted_index_cache_only(index, index->table);
+		mutex_exit(&dict_sys->mutex);
+		break;
+	default:
+		ut_ad(0);
+		/* fall through */
+	case RW_X_LATCH:
+		/* This should be the rollback of a data dictionary
+		transaction. */
+		dict_set_corrupted(index, trx, "rollback");
+	}
+}
+
+/***********************************************************//**
 Undoes a modify in secondary indexes when undo record type is UPD_DEL.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_upd_del_sec(
 /*=====================*/
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ulint		err	= DB_SUCCESS;
+	dberr_t		err	= DB_SUCCESS;
 
 	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
 	ut_ad(!node->undo_row);
+
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
 
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
-
-		if (!node->index) {
-			break;
+		if (index->type & DICT_FTS) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
+		/* During online index creation,
+		HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should
+		guarantee that any active transaction has not modified
+		indexed columns such that col->ord_part was 0 at the
+		time when the undo log record was written. When we get
+		to roll back an undo log entry TRX_UNDO_UPD_DEL_REC,
+		it should always cover all affected indexes. */
+		entry = row_build_index_entry(
+			node->row, node->ext, index, heap);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      index, heap);
 		if (UNIV_UNLIKELY(!entry)) {
 			/* The database must have crashed after
 			inserting a clustered index record but before
@@ -586,15 +823,14 @@ row_undo_mod_upd_del_sec(
 			err = row_undo_mod_del_mark_or_remove_sec(
 				node, thr, index, entry);
 
-			if (err != DB_SUCCESS) {
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
 
 				break;
 			}
 		}
 
 		mem_heap_empty(heap);
-
-		node->index = dict_table_get_next_index(node->index);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
@@ -605,35 +841,41 @@ row_undo_mod_upd_del_sec(
 /***********************************************************//**
 Undoes a modify in secondary indexes when undo record type is DEL_MARK.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_del_mark_sec(
 /*======================*/
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ulint		err;
+	dberr_t		err	= DB_SUCCESS;
 
 	ut_ad(!node->undo_row);
 
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
 
-		if (!node->index) {
-			break;
+		if (index->type == DICT_FTS) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
+		/* During online index creation,
+		HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should
+		guarantee that any active transaction has not modified
+		indexed columns such that col->ord_part was 0 at the
+		time when the undo log record was written. When we get
+		to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+		it should always cover all affected indexes. */
+		entry = row_build_index_entry(
+			node->row, node->ext, index, heap);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      index, heap);
 		ut_a(entry);
+
 		err = row_undo_mod_del_unmark_sec_and_undo_update(
 			BTR_MODIFY_LEAF, thr, index, entry);
 		if (err == DB_FAIL) {
@@ -641,152 +883,154 @@ row_undo_mod_del_mark_sec(
 				BTR_MODIFY_TREE, thr, index, entry);
 		}
 
-		if (err != DB_SUCCESS) {
-
-			mem_heap_free(heap);
-
-			return(err);
+		if (err == DB_DUPLICATE_KEY) {
+			row_undo_mod_sec_flag_corrupted(
+				thr_get_trx(thr), index);
+			err = DB_SUCCESS;
+			/* Do not return any error to the caller. The
+			duplicate will be reported by ALTER TABLE or
+			CREATE UNIQUE INDEX. Unfortunately we cannot
+			report the duplicate key value to the DDL
+			thread, because the altered_table object is
+			private to its call stack. */
+		} else if (err != DB_SUCCESS) {
+			break;
 		}
 
-		node->index = dict_table_get_next_index(node->index);
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /***********************************************************//**
 Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo_mod_upd_exist_sec(
 /*=======================*/
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ulint		err;
+	dberr_t		err	= DB_SUCCESS;
 
-	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+	if (node->index == NULL
+	    || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
 		/* No change in secondary indexes */
 
-		return(DB_SUCCESS);
+		return(err);
 	}
 
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
-
-		if (!node->index) {
-			break;
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
+
+		if (index->type == DICT_FTS
+		    || !row_upd_changes_ord_field_binary(
+			index, node->update, thr, node->row, node->ext)) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
-
-		if (row_upd_changes_ord_field_binary(node->index, node->update,
-						     thr,
-						     node->row, node->ext)) {
-
-			/* Build the newest version of the index entry */
-			entry = row_build_index_entry(node->row, node->ext,
-						      index, heap);
-			if (UNIV_UNLIKELY(!entry)) {
-				/* The server must have crashed in
-				row_upd_clust_rec_by_insert() before
-				the updated externally stored columns (BLOBs)
-				of the new clustered index entry were
-				written. */
-
-				/* The table must be in DYNAMIC or COMPRESSED
-				format.  REDUNDANT and COMPACT formats
-				store a local 768-byte prefix of each
-				externally stored column. */
-				ut_a(dict_table_get_format(index->table)
-				     >= DICT_TF_FORMAT_ZIP);
-
-				/* This is only legitimate when
-				rolling back an incomplete transaction
-				after crash recovery. */
-				ut_a(thr_get_trx(thr)->is_recovered);
-
-				/* The server must have crashed before
-				completing the insert of the new
-				clustered index entry and before
-				inserting to the secondary indexes.
-				Because node->row was not yet written
-				to this index, we can ignore it.  But
-				we must restore node->undo_row. */
-			} else {
-				/* NOTE that if we updated the fields of a
-				delete-marked secondary index record so that
-				alphabetically they stayed the same, e.g.,
-				'abc' -> 'aBc', we cannot return to the
-				original values because we do not know them.
-				But this should not cause problems because
-				in row0sel.c, in queries we always retrieve
-				the clustered index record or an earlier
-				version of it, if the secondary index record
-				through which we do the search is
-				delete-marked. */
-
-				err = row_undo_mod_del_mark_or_remove_sec(
-					node, thr, index, entry);
-				if (err != DB_SUCCESS) {
-					mem_heap_free(heap);
-
-					return(err);
-				}
-
-				mem_heap_empty(heap);
+		/* Build the newest version of the index entry */
+		entry = row_build_index_entry(node->row, node->ext,
+					      index, heap);
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The server must have crashed in
+			row_upd_clust_rec_by_insert() before
+			the updated externally stored columns (BLOBs)
+			of the new clustered index entry were written. */
+
+			/* The table must be in DYNAMIC or COMPRESSED
+			format.  REDUNDANT and COMPACT formats
+			store a local 768-byte prefix of each
+			externally stored column. */
+			ut_a(dict_table_get_format(index->table)
+			     >= UNIV_FORMAT_B);
+
+			/* This is only legitimate when
+			rolling back an incomplete transaction
+			after crash recovery. */
+			ut_a(thr_get_trx(thr)->is_recovered);
+
+			/* The server must have crashed before
+			completing the insert of the new
+			clustered index entry and before
+			inserting to the secondary indexes.
+			Because node->row was not yet written
+			to this index, we can ignore it.  But
+			we must restore node->undo_row. */
+		} else {
+			/* NOTE that if we updated the fields of a
+			delete-marked secondary index record so that
+			alphabetically they stayed the same, e.g.,
+			'abc' -> 'aBc', we cannot return to the
+			original values because we do not know them.
+			But this should not cause problems because
+			in row0sel.cc, in queries we always retrieve
+			the clustered index record or an earlier
+			version of it, if the secondary index record
+			through which we do the search is
+			delete-marked. */
+
+			err = row_undo_mod_del_mark_or_remove_sec(
+				node, thr, index, entry);
+			if (err != DB_SUCCESS) {
+				break;
 			}
+		}
 
-			/* We may have to update the delete mark in the
-			secondary index record of the previous version of
-			the row. We also need to update the fields of
-			the secondary index record if we updated its fields
-			but alphabetically they stayed the same, e.g.,
-			'abc' -> 'aBc'. */
-			entry = row_build_index_entry(node->undo_row,
-						      node->undo_ext,
-						      index, heap);
-			ut_a(entry);
+		mem_heap_empty(heap);
+		/* We may have to update the delete mark in the
+		secondary index record of the previous version of
+		the row. We also need to update the fields of
+		the secondary index record if we updated its fields
+		but alphabetically they stayed the same, e.g.,
+		'abc' -> 'aBc'. */
+		entry = row_build_index_entry(node->undo_row,
+					      node->undo_ext,
+					      index, heap);
+		ut_a(entry);
 
+		err = row_undo_mod_del_unmark_sec_and_undo_update(
+			BTR_MODIFY_LEAF, thr, index, entry);
+		if (err == DB_FAIL) {
 			err = row_undo_mod_del_unmark_sec_and_undo_update(
-				BTR_MODIFY_LEAF, thr, index, entry);
-			if (err == DB_FAIL) {
-				err = row_undo_mod_del_unmark_sec_and_undo_update(
-					BTR_MODIFY_TREE, thr, index, entry);
-			}
-
-			if (err != DB_SUCCESS) {
-				mem_heap_free(heap);
+				BTR_MODIFY_TREE, thr, index, entry);
+		}
 
-				return(err);
-			}
+		if (err == DB_DUPLICATE_KEY) {
+			row_undo_mod_sec_flag_corrupted(
+				thr_get_trx(thr), index);
+			err = DB_SUCCESS;
+		} else if (err != DB_SUCCESS) {
+			break;
 		}
 
-		node->index = dict_table_get_next_index(node->index);
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /***********************************************************//**
 Parses the row reference and other info in a modify undo log record. */
-static
+static __attribute__((nonnull))
 void
 row_undo_mod_parse_undo_rec(
 /*========================*/
-	undo_node_t*	node,	/*!< in: row undo node */
-	que_thr_t*	thr)	/*!< in: query thread */
+	undo_node_t*	node,		/*!< in: row undo node */
+	ibool		dict_locked)	/*!< in: TRUE if own dict_sys->mutex */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -797,16 +1041,14 @@ row_undo_mod_parse_undo_rec(
 	ulint		info_bits;
 	ulint		type;
 	ulint		cmpl_info;
-	ibool		dummy_extern;
-	trx_t*		trx;
+	bool		dummy_extern;
 
-	ut_ad(node && thr);
-	trx = thr_get_trx(thr);
 	ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
 				    &dummy_extern, &undo_no, &table_id);
 	node->rec_type = type;
 
-	node->table = dict_table_get_on_id(table_id, trx);
+	node->table = dict_table_open_on_id(
+		table_id, dict_locked, DICT_TABLE_OP_NORMAL);
 
 	/* TODO: other fixes associated with DROP TABLE + rollback in the
 	same table by another user */
@@ -817,6 +1059,8 @@ row_undo_mod_parse_undo_rec(
 	}
 
 	if (node->table->ibd_file_missing) {
+		dict_table_close(node->table, dict_locked, FALSE);
+
 		/* We skip undo operations to missing .ibd files */
 		node->table = NULL;
 
@@ -832,30 +1076,42 @@ row_undo_mod_parse_undo_rec(
 				       node->heap);
 
 	trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
-				       roll_ptr, info_bits, trx,
+				       roll_ptr, info_bits, node->trx,
 				       node->heap, &(node->update));
 	node->new_trx_id = trx_id;
 	node->cmpl_info = cmpl_info;
+
+	if (!row_undo_search_clust_to_pcur(node)) {
+
+		dict_table_close(node->table, dict_locked, FALSE);
+
+		node->table = NULL;
+	}
 }
 
 /***********************************************************//**
 Undoes a modify operation on a row of a table.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 row_undo_mod(
 /*=========*/
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err;
+	dberr_t	err;
+	ibool	dict_locked;
 
 	ut_ad(node && thr);
 	ut_ad(node->state == UNDO_NODE_MODIFY);
 
-	row_undo_mod_parse_undo_rec(node, thr);
+	dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH;
+
+	ut_ad(thr_get_trx(thr) == node->trx);
 
-	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+	row_undo_mod_parse_undo_rec(node, dict_locked);
+
+	if (node->table == NULL) {
 		/* It is already undone, or will be undone by another query
 		thread, or table was dropped */
 
@@ -865,30 +1121,37 @@ row_undo_mod(
 		return(DB_SUCCESS);
 	}
 
-	node->index = dict_table_get_next_index(
-		dict_table_get_first_index(node->table));
+	node->index = dict_table_get_first_index(node->table);
+	ut_ad(dict_index_is_clust(node->index));
+	/* Skip the clustered index (the first index) */
+	node->index = dict_table_get_next_index(node->index);
 
 	/* Skip all corrupted secondary index */
 	dict_table_skip_corrupt_index(node->index);
 
-	if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
-
+	switch (node->rec_type) {
+	case TRX_UNDO_UPD_EXIST_REC:
 		err = row_undo_mod_upd_exist_sec(node, thr);
-
-	} else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
-
+		break;
+	case TRX_UNDO_DEL_MARK_REC:
 		err = row_undo_mod_del_mark_sec(node, thr);
-	} else {
-		ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+		break;
+	case TRX_UNDO_UPD_DEL_REC:
 		err = row_undo_mod_upd_del_sec(node, thr);
+		break;
+	default:
+		ut_error;
+		err = DB_ERROR;
 	}
 
-	if (err != DB_SUCCESS) {
+	if (err == DB_SUCCESS) {
 
-		return(err);
+		err = row_undo_mod_clust(node, thr);
 	}
 
-	err = row_undo_mod_clust(node, thr);
+	dict_table_close(node->table, dict_locked, FALSE);
+
+	node->table = NULL;
 
 	return(err);
 }
diff --git a/storage/xtradb/row/row0undo.c b/storage/xtradb/row/row0undo.cc
index 5a0a99570c4..9977a1e8f04 100644
--- a/storage/xtradb/row/row0undo.c
+++ b/storage/xtradb/row/row0undo.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0undo.c
+@file row/row0undo.cc
 Row undo
 
 Created 1/8/1997 Heikki Tuuri
@@ -135,7 +135,8 @@ row_undo_node_create(
 
 	ut_ad(trx && parent && heap);
 
-	undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+	undo = static_cast<undo_node_t*>(
+		mem_heap_alloc(heap, sizeof(undo_node_t)));
 
 	undo->common.type = QUE_NODE_UNDO;
 	undo->common.parent = parent;
@@ -200,7 +201,7 @@ row_undo_search_clust_to_pcur(
 	} else {
 		row_ext_t**	ext;
 
-		if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+		if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
 			/* In DYNAMIC or COMPRESSED format, there is
 			no prefix of externally stored columns in the
 			clustered index record. Build a cache of
@@ -215,7 +216,8 @@ row_undo_search_clust_to_pcur(
 		}
 
 		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
-				      offsets, NULL, ext, node->heap);
+				      offsets, NULL,
+				      NULL, NULL, ext, node->heap);
 		if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
 			node->undo_row = dtuple_copy(node->row, node->heap);
 			row_upd_replace(node->undo_row, &node->undo_ext,
@@ -243,14 +245,14 @@ Fetches an undo log record and does the undo for the recorded operation.
 If none left, or a partial rollback completed, returns control to the
 parent node, which is always a query thread node.
 @return	DB_SUCCESS if operation successfully completed, else error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_undo(
 /*=====*/
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint		err;
+	dberr_t		err;
 	trx_t*		trx;
 	roll_ptr_t	roll_ptr;
 	ibool		locked_data_dict;
@@ -331,17 +333,17 @@ row_undo_step(
 /*==========*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint		err;
+	dberr_t		err;
 	undo_node_t*	node;
 	trx_t*		trx;
 
 	ut_ad(thr);
 
-	srv_activity_count++;
+	srv_inc_activity_count();
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<undo_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
 
@@ -352,12 +354,12 @@ row_undo_step(
 	if (err != DB_SUCCESS) {
 		/* SQL error detected */
 
-		fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
-			(ulong) err);
+		fprintf(stderr, "InnoDB: Fatal error (%s) in rollback.\n",
+			ut_strerr(err));
 
 		if (err == DB_OUT_OF_FILE_SPACE) {
 			fprintf(stderr,
-				"InnoDB: Error 13 means out of tablespace.\n"
+				"InnoDB: Out of tablespace.\n"
 				"InnoDB: Consider increasing"
 				" your tablespace.\n");
 
diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.cc
index d1b6d098882..4cf1c604c47 100644
--- a/storage/xtradb/row/row0upd.c
+++ b/storage/xtradb/row/row0upd.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0upd.c
+@file row/row0upd.cc
 Update of a row
 
 Created 12/27/1996 Heikki Tuuri
@@ -31,6 +31,7 @@ Created 12/27/1996 Heikki Tuuri
 #include "row0upd.ic"
 #endif
 
+#include "ha_prototypes.h"
 #include "dict0dict.h"
 #include "trx0undo.h"
 #include "rem0rec.h"
@@ -43,8 +44,9 @@ Created 12/27/1996 Heikki Tuuri
 #include "que0que.h"
 #include "row0ext.h"
 #include "row0ins.h"
-#include "row0sel.h"
+#include "row0log.h"
 #include "row0row.h"
+#include "row0sel.h"
 #include "rem0cmp.h"
 #include "lock0lock.h"
 #include "log0log.h"
@@ -178,8 +180,8 @@ NOTE that this function will temporarily commit mtr and lose the
 pcur position!
 
 @return	DB_SUCCESS or an error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_check_references_constraints(
 /*=================================*/
 	upd_node_t*	node,	/*!< in: row update node */
@@ -197,7 +199,7 @@ row_upd_check_references_constraints(
 	trx_t*		trx;
 	const rec_t*	rec;
 	ulint		n_ext;
-	ulint		err;
+	dberr_t		err;
 	ibool		got_s_lock	= FALSE;
 
 	if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
@@ -212,11 +214,12 @@ row_upd_check_references_constraints(
 
 	heap = mem_heap_create(500);
 
-	entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
-				       &n_ext, heap);
+	entry = row_rec_to_index_entry(rec, index, offsets, &n_ext, heap);
 
 	mtr_commit(mtr);
 
+	DEBUG_SYNC_C("foreign_constraint_check_for_update");
+
 	mtr_start(mtr);
 
 	if (trx->dict_operation_lock_mode == 0) {
@@ -225,6 +228,7 @@ row_upd_check_references_constraints(
 		row_mysql_freeze_data_dictionary(trx);
 	}
 
+run_again:
 	foreign = UT_LIST_GET_FIRST(table->referenced_list);
 
 	while (foreign) {
@@ -238,20 +242,21 @@ row_upd_check_references_constraints(
 			|| row_upd_changes_first_fields_binary(
 				entry, index, node->update,
 				foreign->n_fields))) {
+			dict_table_t*	foreign_table = foreign->foreign_table;
 
-			if (foreign->foreign_table == NULL) {
-				dict_table_get(foreign->foreign_table_name_lookup,
-					       FALSE,
-					       DICT_ERR_IGNORE_NONE);
-			}
+			dict_table_t*	ref_table = NULL;
 
-			if (foreign->foreign_table) {
-				mutex_enter(&(dict_sys->mutex));
+			if (foreign_table == NULL) {
 
-				(foreign->foreign_table
-				 ->n_foreign_key_checks_running)++;
+				ref_table = dict_table_open_on_name(
+					foreign->foreign_table_name_lookup,
+					FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+			}
 
-				mutex_exit(&(dict_sys->mutex));
+			if (foreign_table) {
+				os_inc_counter(dict_sys->mutex,
+					       foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			/* NOTE that if the thread ends up waiting for a lock
@@ -262,20 +267,20 @@ row_upd_check_references_constraints(
 			err = row_ins_check_foreign_constraint(
 				FALSE, foreign, table, entry, thr);
 
-			if (foreign->foreign_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				ut_a(foreign->foreign_table
-				     ->n_foreign_key_checks_running > 0);
-
-				(foreign->foreign_table
-				 ->n_foreign_key_checks_running)--;
-
-				mutex_exit(&(dict_sys->mutex));
+			if (foreign_table) {
+				os_dec_counter(dict_sys->mutex,
+					       foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
-			if (err != DB_SUCCESS) {
+			if (ref_table != NULL) {
+				dict_table_close(ref_table, FALSE, FALSE);
+			}
 
+			/* A table's foreign key was dropped; try again */
+			if (err == DB_DICT_CHANGED) {
+				goto run_again;
+			} else if (err != DB_SUCCESS) {
 				goto func_exit;
 			}
 		}
@@ -292,6 +297,8 @@ func_exit:
 
 	mem_heap_free(heap);
 
+	DEBUG_SYNC_C("foreign_constraint_check_for_update_done");
+
 	return(err);
 }
 
@@ -306,7 +313,9 @@ upd_node_create(
 {
 	upd_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(upd_node_t));
+	node = static_cast<upd_node_t*>(
+		mem_heap_alloc(heap, sizeof(upd_node_t)));
+
 	node->common.type = QUE_NODE_UPDATE;
 
 	node->state = UPD_NODE_UPDATE_CLUSTERED;
@@ -350,7 +359,7 @@ row_upd_rec_sys_fields_in_recovery(
 {
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_trx_id_and_roll_ptr(
 			page_zip, rec, offsets, pos, trx_id, roll_ptr);
 	} else {
@@ -391,7 +400,7 @@ row_upd_index_entry_sys_field(
 	pos = dict_index_get_sys_col_pos(index, type);
 
 	dfield = dtuple_get_nth_field(entry, pos);
-	field = dfield_get_data(dfield);
+	field = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (type == DATA_TRX_ID) {
 		trx_write_trx_id(field, val);
@@ -442,12 +451,6 @@ row_upd_changes_field_size_or_external(
 				0);
 		}
 
-		if (srv_use_sys_stats_table
-		    && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)
-		    && upd_field->field_no >= rec_offs_n_fields(offsets)) {
-			return(TRUE);
-		}
-
 		old_len = rec_offs_nth_size(offsets, upd_field->field_no);
 
 		if (rec_offs_comp(offsets)
@@ -472,6 +475,47 @@ row_upd_changes_field_size_or_external(
 
 	return(FALSE);
 }
+
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+UNIV_INTERN
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+	const upd_t*	update)	/*!< in: update vector */
+{
+	const upd_field_t*	upd_field;
+	const dfield_t*		new_val;
+	ulint			new_len;
+	ulint                   n_fields;
+	ulint			i;
+
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		const byte*	field_ref;
+
+		upd_field = upd_get_nth_field(update, i);
+		new_val = &(upd_field->new_val);
+		new_len = dfield_get_len(new_val);
+
+		if (!dfield_is_ext(new_val)) {
+			continue;
+		}
+
+		ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		field_ref = static_cast<const byte*>(dfield_get_data(new_val))
+			    + new_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+		if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
 #endif /* !UNIV_HOTBACKUP */
 
 /***********************************************************//**
@@ -552,7 +596,7 @@ row_upd_rec_in_place(
 #endif /* UNIV_BLOB_DEBUG */
 	}
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_rec(page_zip, rec, index, offsets, 0);
 	}
 }
@@ -567,7 +611,7 @@ byte*
 row_upd_write_sys_vals_to_log(
 /*==========================*/
 	dict_index_t*	index,	/*!< in: clustered index */
-	trx_t*		trx,	/*!< in: transaction */
+	trx_id_t	trx_id,	/*!< in: transaction id */
 	roll_ptr_t	roll_ptr,/*!< in: roll ptr of the undo log record */
 	byte*		log_ptr,/*!< pointer to a buffer of size > 20 opened
 				in mlog */
@@ -583,7 +627,7 @@ row_upd_write_sys_vals_to_log(
 	trx_write_roll_ptr(log_ptr, roll_ptr);
 	log_ptr += DATA_ROLL_PTR_LEN;
 
-	log_ptr += mach_ull_write_compressed(log_ptr, trx->id);
+	log_ptr += mach_ull_write_compressed(log_ptr, trx_id);
 
 	return(log_ptr);
 }
@@ -681,9 +725,11 @@ row_upd_index_write_log(
 			} else {
 				mlog_close(mtr, log_ptr);
 
-				mlog_catenate_string(mtr,
-						     dfield_get_data(new_val),
-						     len);
+				mlog_catenate_string(
+					mtr,
+					static_cast<byte*>(
+						dfield_get_data(new_val)),
+					len);
 
 				log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
 				buf_end = log_ptr + MLOG_BUF_MARGIN;
@@ -784,10 +830,10 @@ UNIV_INTERN
 upd_t*
 row_upd_build_sec_rec_difference_binary(
 /*====================================*/
+	const rec_t*	rec,	/*!< in: secondary index record */
 	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
 	const dtuple_t*	entry,	/*!< in: entry to insert */
-	const rec_t*	rec,	/*!< in: secondary index record */
-	trx_t*		trx,	/*!< in: transaction */
 	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
 {
 	upd_field_t*	upd_field;
@@ -797,18 +843,16 @@ row_upd_build_sec_rec_difference_binary(
 	upd_t*		update;
 	ulint		n_diff;
 	ulint		i;
-	ulint		offsets_[REC_OFFS_SMALL_SIZE];
-	const ulint*	offsets;
-	rec_offs_init(offsets_);
 
 	/* This function is used only for a secondary index */
 	ut_a(!dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry));
+	ut_ad(!rec_offs_any_extern(offsets));
 
 	update = upd_create(dtuple_get_n_fields(entry), heap);
 
 	n_diff = 0;
-	offsets = rec_get_offsets(rec, index, offsets_,
-				  ULINT_UNDEFINED, &heap);
 
 	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
 
@@ -822,7 +866,7 @@ row_upd_build_sec_rec_difference_binary(
 		and also in the case where we have a column prefix index
 		and the last characters in the index field are spaces; the
 		latter case probably caused the assertion failures reported at
-		row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+		row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */
 
 		/* NOTE: we compare the fields as binary strings!
 		(No collation) */
@@ -833,7 +877,7 @@ row_upd_build_sec_rec_difference_binary(
 
 			dfield_copy(&(upd_field->new_val), dfield);
 
-			upd_field_set_field_no(upd_field, i, index, trx);
+			upd_field_set_field_no(upd_field, i, index, NULL);
 
 			n_diff++;
 		}
@@ -851,12 +895,15 @@ the equal ordering fields. NOTE: we compare the fields as binary strings!
 @return own: update vector of differing fields, excluding roll ptr and
 trx id */
 UNIV_INTERN
-upd_t*
+const upd_t*
 row_upd_build_difference_binary(
 /*============================*/
 	dict_index_t*	index,	/*!< in: clustered index */
 	const dtuple_t*	entry,	/*!< in: entry to insert */
 	const rec_t*	rec,	/*!< in: clustered index record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index), or NULL */
+	bool		no_sys,	/*!< in: skip the system columns
+				DB_TRX_ID and DB_ROLL_PTR */
 	trx_t*		trx,	/*!< in: transaction */
 	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
 {
@@ -866,11 +913,9 @@ row_upd_build_difference_binary(
 	ulint		len;
 	upd_t*		update;
 	ulint		n_diff;
-	ulint		roll_ptr_pos;
 	ulint		trx_id_pos;
 	ulint		i;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-	const ulint*	offsets;
 	rec_offs_init(offsets_);
 
 	/* This function is used only for a clustered index */
@@ -880,26 +925,19 @@ row_upd_build_difference_binary(
 
 	n_diff = 0;
 
-	roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
 	trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+	ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR)
+	      == trx_id_pos + 1);
 
-	offsets = rec_get_offsets(rec, index, offsets_,
-				  ULINT_UNDEFINED, &heap);
+	if (!offsets) {
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+	} else {
+		ut_ad(rec_offs_validate(rec, index, offsets));
+	}
 
 	for (i = 0; i < dtuple_get_n_fields(entry); i++) {
 
-		if (srv_use_sys_stats_table
-		    && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)
-		    && i >= rec_offs_n_fields(offsets)) {
-			dfield = dtuple_get_nth_field(entry, i);
-
-			upd_field = upd_get_nth_field(update, n_diff);
-			dfield_copy(&(upd_field->new_val), dfield);
-			upd_field_set_field_no(upd_field, i, index, trx);
-			n_diff++;
-			goto skip_compare;
-		}
-
 		data = rec_get_nth_field(rec, offsets, i, &len);
 
 		dfield = dtuple_get_nth_field(entry, i);
@@ -907,13 +945,13 @@ row_upd_build_difference_binary(
 		/* NOTE: we compare the fields as binary strings!
 		(No collation) */
 
-		if (i == trx_id_pos || i == roll_ptr_pos) {
+		if (no_sys && (i == trx_id_pos || i == trx_id_pos + 1)) {
 
-			goto skip_compare;
+			continue;
 		}
 
-		if (UNIV_UNLIKELY(!dfield_is_ext(dfield)
-				  != !rec_offs_nth_extern(offsets, i))
+		if (!dfield_is_ext(dfield)
+		    != !rec_offs_nth_extern(offsets, i)
 		    || !dfield_data_is_binary_equal(dfield, len, data)) {
 
 			upd_field = upd_get_nth_field(update, n_diff);
@@ -924,8 +962,6 @@ row_upd_build_difference_binary(
 
 			n_diff++;
 		}
-skip_compare:
-		;
 	}
 
 	update->n_fields = n_diff;
@@ -953,11 +989,11 @@ row_upd_ext_fetch(
 					out: fetched length of the prefix */
 	mem_heap_t*	heap)		/*!< in: heap where to allocate */
 {
-	byte*	buf = mem_heap_alloc(heap, *len);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, *len));
+
+	*len = btr_copy_externally_stored_field_prefix(
+		buf, *len, zip_size, data, local_len);
 
-	*len = btr_copy_externally_stored_field_prefix(buf, *len,
-						       zip_size,
-						       data, local_len);
 	/* We should never update records containing a half-deleted BLOB. */
 	ut_a(*len);
 
@@ -991,7 +1027,7 @@ row_upd_index_replace_new_col_val(
 	}
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (field->prefix_len > 0) {
 		ibool		fetch_ext = dfield_is_ext(dfield)
@@ -1042,10 +1078,12 @@ row_upd_index_replace_new_col_val(
 		stored part of the column.  The data
 		will have to be copied. */
 		ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
-		buf = mem_heap_alloc(heap, uf->orig_len);
+		buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len));
+
 		/* Copy the locally stored prefix. */
 		memcpy(buf, data,
 		       uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+
 		/* Copy the BLOB pointer. */
 		memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
 		       data + len - BTR_EXTERN_FIELD_REF_SIZE,
@@ -1192,7 +1230,9 @@ row_upd_replace(
 	table = index->table;
 	ut_ad(n_cols == dict_table_get_n_cols(table));
 
-	ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols);
+	ext_cols = static_cast<ulint*>(
+		mem_heap_alloc(heap, n_cols * sizeof *ext_cols));
+
 	n_ext_cols = 0;
 
 	dtuple_set_info_bits(row, update->info_bits);
@@ -1313,7 +1353,7 @@ row_upd_changes_ord_field_binary_func(
 		if (UNIV_LIKELY(ind_field->prefix_len == 0)
 		    || dfield_is_null(dfield)) {
 			/* do nothing special */
-		} else if (UNIV_LIKELY_NULL(ext)) {
+		} else if (ext) {
 			/* Silence a compiler warning without
 			silencing a Valgrind error. */
 			dfield_len = 0;
@@ -1344,7 +1384,8 @@ row_upd_changes_ord_field_binary_func(
 			dfield_len -= BTR_EXTERN_FIELD_REF_SIZE;
 			ut_a(dict_index_is_clust(index)
 			     || ind_field->prefix_len <= dfield_len);
-			buf = dfield_get_data(dfield);
+
+			buf = static_cast<byte*>(dfield_get_data(dfield));
 copy_dfield:
 			ut_a(dfield_len > 0);
 			dfield_copy(&dfield_ext, dfield);
@@ -1397,6 +1438,52 @@ row_upd_changes_some_index_ord_field_binary(
 }
 
 /***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether the Doc ID column is changed */
+UNIV_INTERN
+bool
+row_upd_changes_doc_id(
+/*===================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+{
+	ulint		col_no;
+	dict_index_t*	clust_index;
+	fts_t*		fts = table->fts;
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Convert from index-specific column number to table-global
+	column number. */
+	col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+	return(col_no == fts->doc_col);
+}
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+{
+	ulint		col_no;
+	dict_index_t*	clust_index;
+	fts_t*		fts = table->fts;
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Convert from index-specific column number to table-global
+	column number. */
+	col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+	return(dict_table_is_fts_column(fts->indexes, col_no));
+}
+
+/***********************************************************//**
 Checks if an update vector changes some of the first ordering fields of an
 index record. This is only used in foreign key checks and we can assume
 that index does not contain column prefixes.
@@ -1530,7 +1617,7 @@ row_upd_store_row(
 	offsets = rec_get_offsets(rec, clust_index, offsets_,
 				  ULINT_UNDEFINED, &heap);
 
-	if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+	if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
 		/* In DYNAMIC or COMPRESSED format, there is no prefix
 		of externally stored columns in the clustered index
 		record. Build a cache of column prefixes. */
@@ -1544,7 +1631,7 @@ row_upd_store_row(
 	}
 
 	node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
-			      NULL, ext, node->heap);
+			      NULL, NULL, NULL, ext, node->heap);
 	if (node->is_delete) {
 		node->upd_row = NULL;
 		node->upd_ext = NULL;
@@ -1563,8 +1650,8 @@ row_upd_store_row(
 Updates a secondary index entry of a row.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_sec_index_entry(
 /*====================*/
 	upd_node_t*	node,	/*!< in: row update node */
@@ -1578,11 +1665,13 @@ row_upd_sec_index_entry(
 	dict_index_t*		index;
 	btr_cur_t*		btr_cur;
 	ibool			referenced;
-	ulint			err	= DB_SUCCESS;
+	dberr_t			err	= DB_SUCCESS;
 	trx_t*			trx	= thr_get_trx(thr);
-	ulint			mode	= BTR_MODIFY_LEAF;
+	ulint			mode;
 	enum row_search_result	search_result;
 
+	ut_ad(trx->id);
+
 	index = node->index;
 
 	referenced = row_upd_index_is_referenced(index, trx);
@@ -1593,22 +1682,78 @@ row_upd_sec_index_entry(
 	entry = row_build_index_entry(node->row, node->ext, index, heap);
 	ut_a(entry);
 
+	log_free_check();
+
+#ifdef UNIV_DEBUG
+	/* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+	Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+	if (!trx->ddl) {
+		DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+				    "before_row_upd_sec_index_entry");
+	}
+#endif /* UNIV_DEBUG */
+
 	mtr_start(&mtr);
 
+	if (*index->name == TEMP_INDEX_PREFIX) {
+		/* The index->online_status may change if the
+		index->name starts with TEMP_INDEX_PREFIX (meaning
+		that the index is or was being created online). It is
+		protected by index->lock. */
+
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+		switch (dict_index_get_online_status(index)) {
+		case ONLINE_INDEX_COMPLETE:
+			/* This is a normal index. Do not log anything.
+			Perform the update on the index tree directly. */
+			break;
+		case ONLINE_INDEX_CREATION:
+			/* Log a DELETE and optionally INSERT. */
+			row_log_online_op(index, entry, 0);
+
+			if (!node->is_delete) {
+				mem_heap_empty(heap);
+				entry = row_build_index_entry(
+					node->upd_row, node->upd_ext,
+					index, heap);
+				ut_a(entry);
+				row_log_online_op(index, entry, trx->id);
+			}
+			/* fall through */
+		case ONLINE_INDEX_ABORTED:
+		case ONLINE_INDEX_ABORTED_DROPPED:
+			mtr_commit(&mtr);
+			goto func_exit;
+		}
+
+		/* We can only buffer delete-mark operations if there
+		are no foreign key constraints referring to the index. */
+		mode = referenced
+			? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+			: BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+			| BTR_DELETE_MARK;
+	} else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+		ut_ad(!dict_index_is_online_ddl(index));
+
+		/* We can only buffer delete-mark operations if there
+		are no foreign key constraints referring to the index. */
+		mode = referenced
+			? BTR_MODIFY_LEAF
+			: BTR_MODIFY_LEAF | BTR_DELETE_MARK;
+	}
+
 	/* Set the query thread, so that ibuf_insert_low() will be
 	able to invoke thd_get_trx(). */
 	btr_pcur_get_btr_cur(&pcur)->thr = thr;
 
-	/* We can only try to use the insert/delete buffer to buffer
-	delete-mark operations if the index we're modifying has no foreign
-	key constraints referring to it. */
-	if (!referenced) {
-		mode |= BTR_DELETE_MARK;
-	}
-
 	search_result = row_search_index_entry(index, entry,
 					       UNIV_UNLIKELY(trx->fake_changes)
-					       ? BTR_SEARCH_LEAF : mode,
+					       ? BTR_SEARCH_LEAF
+					       : (btr_latch_mode)mode,
 					       &pcur, &mtr);
 
 	btr_cur = btr_pcur_get_btr_cur(&pcur);
@@ -1624,6 +1769,20 @@ row_upd_sec_index_entry(
 		break;
 
 	case ROW_NOT_FOUND:
+		if (*index->name == TEMP_INDEX_PREFIX) {
+			/* When online CREATE INDEX copied the update
+			that we already made to the clustered index,
+			and completed the secondary index creation
+			before we got here, the old secondary index
+			record would not exist. The CREATE INDEX
+			should be waiting for a MySQL meta-data lock
+			upgrade at least until this UPDATE
+			returns. After that point, the
+			TEMP_INDEX_PREFIX would be dropped from the
+			index name in commit_inplace_alter_table(). */
+			break;
+		}
+
 		fputs("InnoDB: error in sec index entry update in\n"
 		      "InnoDB: ", stderr);
 		dict_index_name_print(stderr, trx, index);
@@ -1634,9 +1793,7 @@ row_upd_sec_index_entry(
 		      "InnoDB: record ", stderr);
 		rec_print(stderr, rec, index);
 		putc('\n', stderr);
-
 		trx_print(stderr, trx, 0);
-
 		fputs("\n"
 		      "InnoDB: Submit a detailed bug report"
 		      " to http://bugs.mysql.com\n", stderr);
@@ -1645,11 +1802,9 @@ row_upd_sec_index_entry(
 	case ROW_FOUND:
 		/* Delete mark the old index record; it can already be
 		delete marked if we return after a lock wait in
-		row_ins_index_entry below */
-
+		row_ins_sec_index_entry() below */
 		if (!rec_get_deleted_flag(
-			rec, dict_table_is_comp(index->table))) {
-
+			    rec, dict_table_is_comp(index->table))) {
 			err = btr_cur_del_mark_set_sec_rec(
 				0, btr_cur, TRUE, thr, &mtr);
 
@@ -1679,13 +1834,15 @@ row_upd_sec_index_entry(
 		goto func_exit;
 	}
 
+	mem_heap_empty(heap);
+
 	/* Build a new index entry */
 	entry = row_build_index_entry(node->upd_row, node->upd_ext,
 				      index, heap);
 	ut_a(entry);
 
 	/* Insert new index entry */
-	err = row_ins_index_entry(index, entry, 0, TRUE, thr);
+	err = row_ins_sec_index_entry(index, entry, thr);
 
 func_exit:
 	mem_heap_free(heap);
@@ -1698,8 +1855,8 @@ Updates the secondary index record if it is changed in the row update or
 deletes it if this is a delete.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_sec_step(
 /*=============*/
 	upd_node_t*	node,	/*!< in: row update node */
@@ -1785,7 +1942,9 @@ row_upd_clust_rec_by_insert_inherit_func(
 		len = dfield_get_len(dfield);
 		ut_a(len != UNIV_SQL_NULL);
 		ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
-		data = dfield_get_data(dfield);
+
+		data = static_cast<byte*>(dfield_get_data(dfield));
+
 		data += len - BTR_EXTERN_FIELD_REF_SIZE;
 		/* The pointer must not be zero. */
 		ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
@@ -1810,8 +1969,8 @@ fields of the clustered index record change. This should be quite rare in
 database applications.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_clust_rec_by_insert(
 /*========================*/
 	upd_node_t*	node,	/*!< in/out: row update node */
@@ -1827,7 +1986,7 @@ row_upd_clust_rec_by_insert(
 	trx_t*		trx;
 	dict_table_t*	table;
 	dtuple_t*	entry;
-	ulint		err;
+	dberr_t		err;
 	ibool		change_ownership	= FALSE;
 	rec_t*		rec;
 	ulint*		offsets			= NULL;
@@ -1852,7 +2011,7 @@ row_upd_clust_rec_by_insert(
 	default:
 		ut_error;
 	case UPD_NODE_INSERT_BLOB:
-		/* A lock wait occurred in row_ins_index_entry() in
+		/* A lock wait occurred in row_ins_clust_index_entry() in
 		the previous invocation of this function. Mark the
 		off-page columns in the entry inherited. */
 
@@ -1863,7 +2022,7 @@ row_upd_clust_rec_by_insert(
 		}
 		/* fall through */
 	case UPD_NODE_INSERT_CLUSTERED:
-		/* A lock wait occurred in row_ins_index_entry() in
+		/* A lock wait occurred in row_ins_clust_index_entry() in
 		the previous invocation of this function. */
 		break;
 	case UPD_NODE_UPDATE_CLUSTERED:
@@ -1876,8 +2035,8 @@ row_upd_clust_rec_by_insert(
 		ut_ad(page_rec_is_user_rec(rec));
 
 		err = btr_cur_del_mark_set_clust_rec(
-			BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur),
-			rec, index, offsets, TRUE, thr, mtr);
+			btr_cur_get_block(btr_cur), rec, index, offsets,
+			thr, mtr);
 		if (err != DB_SUCCESS) {
 err_exit:
 			mtr_commit(mtr);
@@ -1915,9 +2074,9 @@ err_exit:
 
 	mtr_commit(mtr);
 
-	err = row_ins_index_entry(index, entry,
-				  node->upd_ext ? node->upd_ext->n_ext : 0,
-				  TRUE, thr);
+	err = row_ins_clust_index_entry(
+		index, entry, thr,
+		node->upd_ext ? node->upd_ext->n_ext : 0);
 	node->state = change_ownership
 		? UPD_NODE_INSERT_BLOB
 		: UPD_NODE_INSERT_CLUSTERED;
@@ -1943,11 +2102,17 @@ err_exit:
 		offsets = rec_get_offsets(rec, index, offsets,
 					  ULINT_UNDEFINED, &heap);
 		ut_ad(page_rec_is_user_rec(rec));
+		ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
 
 		btr_cur_disown_inherited_fields(
 			btr_cur_get_page_zip(btr_cur),
 			rec, index, offsets, node->update, mtr);
 
+		/* It is not necessary to call row_log_table for
+		this, because during online table rebuild, purge will
+		not free any BLOBs in the table, whether or not they
+		are owned by the clustered index record. */
+
 		mtr_commit(mtr);
 	}
 
@@ -1961,20 +2126,24 @@ Updates a clustered index record of a row when the ordering fields do
 not change.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_clust_rec(
 /*==============*/
 	upd_node_t*	node,	/*!< in: row update node */
 	dict_index_t*	index,	/*!< in: clustered index */
+	ulint*		offsets,/*!< in: rec_get_offsets() on node->pcur */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: memory heap, can be emptied */
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr)	/*!< in: mtr; gets committed here */
 {
-	mem_heap_t*	heap	= NULL;
-	big_rec_t*	big_rec	= NULL;
+	mem_heap_t*	heap		= NULL;
+	big_rec_t*	big_rec		= NULL;
 	btr_pcur_t*	pcur;
 	btr_cur_t*	btr_cur;
-	ulint		err;
+	dberr_t		err;
+	const dtuple_t*	rebuilt_old_pk	= NULL;
 
 	ut_ad(node);
 	ut_ad(dict_index_is_clust(index));
@@ -1982,33 +2151,48 @@ row_upd_clust_rec(
 	pcur = node->pcur;
 	btr_cur = btr_pcur_get_btr_cur(pcur);
 
-	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+	ut_ad(btr_cur_get_index(btr_cur) == index);
+	ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
 				    dict_table_is_comp(index->table)));
+	ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets));
+
+	if (dict_index_is_online_ddl(index)) {
+		rebuilt_old_pk = row_log_table_get_pk(
+			btr_cur_get_rec(btr_cur), index, offsets, &heap);
+	}
 
 	/* Try optimistic updating of the record, keeping changes within
 	the page; we do not check locks because we assume the x-lock on the
 	record to update */
 
 	if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
-		err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
-					      btr_cur, node->update,
-					      node->cmpl_info, thr, mtr);
+		err = btr_cur_update_in_place(
+			BTR_NO_LOCKING_FLAG, btr_cur,
+			offsets, node->update,
+			node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
 	} else {
-		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
-						btr_cur, node->update,
-						node->cmpl_info, thr, mtr);
+		err = btr_cur_optimistic_update(
+			BTR_NO_LOCKING_FLAG, btr_cur,
+			&offsets, offsets_heap, node->update,
+			node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+	}
+
+	if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+		row_log_table_update(btr_cur_get_rec(btr_cur),
+				     index, offsets, rebuilt_old_pk);
 	}
 
 	mtr_commit(mtr);
 
 	if (UNIV_LIKELY(err == DB_SUCCESS)) {
 
-		return(DB_SUCCESS);
+		goto func_exit;
 	}
 
 	if (buf_LRU_buf_pool_running_out()) {
 
-		return(DB_LOCK_TABLE_FULL);
+		err = DB_LOCK_TABLE_FULL;
+		goto func_exit;
 	}
 	/* We may have to modify the tree structure: do a pessimistic descent
 	down the index tree */
@@ -2022,25 +2206,23 @@ row_upd_clust_rec(
 	Therefore we can assert that the restoration of the cursor succeeds. */
 
 	ut_a(btr_pcur_restore_position(
-		(UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)
-		 ? BTR_SEARCH_TREE : BTR_MODIFY_TREE),
-		pcur, mtr));
+		 UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)
+		 ? BTR_SEARCH_TREE : BTR_MODIFY_TREE,
+		 pcur, mtr));
 
 	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
 				    dict_table_is_comp(index->table)));
 
-	err = btr_cur_pessimistic_update(
-		 BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
-		 &heap, &big_rec, node->update,
-		 node->cmpl_info, thr, mtr);
-
-	/* skip store extern for fake_changes */
-	if (err == DB_SUCCESS && big_rec
-	    && UNIV_LIKELY(!(thr_get_trx(thr)->fake_changes))) {
-		ulint	offsets_[REC_OFFS_NORMAL_SIZE];
-		rec_t*	rec;
-		rec_offs_init(offsets_);
+	if (!heap) {
+		heap = mem_heap_create(1024);
+	}
 
+	err = btr_cur_pessimistic_update(
+		BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
+		&offsets, offsets_heap, heap, &big_rec,
+		node->update, node->cmpl_info,
+		thr, thr_get_trx(thr)->id, mtr);
+	if (big_rec && UNIV_LIKELY(!(thr_get_trx(thr)->fake_changes))) {
 		ut_a(err == DB_SUCCESS);
 		/* Write out the externally stored
 		columns while still x-latching
@@ -2063,12 +2245,10 @@ row_upd_clust_rec(
 		portion of the file, in case the file was somehow
 		truncated in the crash. */
 
-		rec = btr_cur_get_rec(btr_cur);
 		DEBUG_SYNC_C("before_row_upd_extern");
 		err = btr_store_big_rec_extern_fields(
-			index, btr_cur_get_block(btr_cur), rec,
-			rec_get_offsets(rec, index, offsets_,
-					ULINT_UNDEFINED, &heap),
+			index, btr_cur_get_block(btr_cur),
+			btr_cur_get_rec(btr_cur), offsets,
 			big_rec, mtr, BTR_STORE_UPDATE);
 		DEBUG_SYNC_C("after_row_upd_extern");
 		/* If writing big_rec fails (for example, because of
@@ -2087,9 +2267,14 @@ row_upd_clust_rec(
 		ut_a(err == DB_SUCCESS);
 	}
 
-	mtr_commit(mtr);
+	if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+		row_log_table_update(btr_cur_get_rec(btr_cur),
+				     index, offsets, rebuilt_old_pk);
+	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	mtr_commit(mtr);
+func_exit:
+	if (heap) {
 		mem_heap_free(heap);
 	}
 
@@ -2103,8 +2288,8 @@ row_upd_clust_rec(
 /***********************************************************//**
 Delete marks a clustered index record.
 @return	DB_SUCCESS if operation successfully completed, else error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_del_mark_clust_rec(
 /*=======================*/
 	upd_node_t*	node,	/*!< in: row update node */
@@ -2119,7 +2304,7 @@ row_upd_del_mark_clust_rec(
 {
 	btr_pcur_t*	pcur;
 	btr_cur_t*	btr_cur;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(node);
 	ut_ad(dict_index_is_clust(index));
@@ -2137,8 +2322,8 @@ row_upd_del_mark_clust_rec(
 	locks, because we assume that we have an x-lock on the record */
 
 	err = btr_cur_del_mark_set_clust_rec(
-		BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur),
-		btr_cur_get_rec(btr_cur), index, offsets, TRUE, thr, mtr);
+		btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur),
+		index, offsets, thr, mtr);
 	if (err == DB_SUCCESS && referenced) {
 		/* NOTE that the following call loses the position of pcur ! */
 
@@ -2155,8 +2340,8 @@ row_upd_del_mark_clust_rec(
 Updates the clustered index record.
 @return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
 in case of a lock wait, else error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd_clust_step(
 /*===============*/
 	upd_node_t*	node,	/*!< in: row update node */
@@ -2165,11 +2350,10 @@ row_upd_clust_step(
 	dict_index_t*	index;
 	btr_pcur_t*	pcur;
 	ibool		success;
-	ulint		err;
-	mtr_t*		mtr;
-	mtr_t		mtr_buf;
+	dberr_t		err;
+	mtr_t		mtr;
 	rec_t*		rec;
-	mem_heap_t*	heap		= NULL;
+	mem_heap_t*	heap	= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets;
 	ibool		referenced;
@@ -2182,9 +2366,8 @@ row_upd_clust_step(
 	pcur = node->pcur;
 
 	/* We have to restore the cursor to its position */
-	mtr = &mtr_buf;
 
-	mtr_start(mtr);
+	mtr_start(&mtr);
 
 	/* If the restoration does not succeed, then the same
 	transaction has deleted the record on which the cursor was,
@@ -2196,15 +2379,34 @@ row_upd_clust_step(
 
 	ut_a(pcur->rel_pos == BTR_PCUR_ON);
 
-	success = btr_pcur_restore_position(
-		(UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)
-		 ? BTR_SEARCH_LEAF : BTR_MODIFY_LEAF),
-		pcur, mtr);
+	ulint	mode;
+
+#ifdef UNIV_DEBUG
+	/* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+	Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+	if (!thr_get_trx(thr)->ddl) {
+		DEBUG_SYNC_C_IF_THD(
+			thr_get_trx(thr)->mysql_thd,
+			"innodb_row_upd_clust_step_enter");
+	}
+#endif /* UNIV_DEBUG */
+
+	if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) {
+		mode = BTR_SEARCH_LEAF;
+	} else if (dict_index_is_online_ddl(index)) {
+		ut_ad(node->table->id != DICT_INDEXES_ID);
+		mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+		mtr_s_lock(dict_index_get_lock(index), &mtr);
+	} else {
+		mode = BTR_MODIFY_LEAF;
+	}
+
+	success = btr_pcur_restore_position(mode, pcur, &mtr);
 
 	if (!success) {
 		err = DB_RECORD_NOT_FOUND;
 
-		mtr_commit(mtr);
+		mtr_commit(&mtr);
 
 		return(err);
 	}
@@ -2215,18 +2417,20 @@ row_upd_clust_step(
 
 	if (node->is_delete && node->table->id == DICT_INDEXES_ID) {
 
-		dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+		ut_ad(!dict_index_is_online_ddl(index));
 
-		mtr_commit(mtr);
+		dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr);
 
-		mtr_start(mtr);
+		mtr_commit(&mtr);
+
+		mtr_start(&mtr);
 
 		success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
-						    mtr);
+						    &mtr);
 		if (!success) {
 			err = DB_ERROR;
 
-			mtr_commit(mtr);
+			mtr_commit(&mtr);
 
 			return(err);
 		}
@@ -2241,26 +2445,27 @@ row_upd_clust_step(
 			0, btr_pcur_get_block(pcur),
 			rec, index, offsets, thr);
 		if (err != DB_SUCCESS) {
-			mtr_commit(mtr);
+			mtr_commit(&mtr);
 			goto exit_func;
 		}
 	}
 
+	ut_ad(lock_trx_has_rec_x_lock(thr_get_trx(thr), index->table,
+				      btr_pcur_get_block(pcur),
+				      page_rec_get_heap_no(rec)));
+
 	/* NOTE: the following function calls will also commit mtr */
 
 	if (node->is_delete) {
 		err = row_upd_del_mark_clust_rec(
-			node, index, offsets, thr, referenced, mtr);
+			node, index, offsets, thr, referenced, &mtr);
 
 		if (err == DB_SUCCESS) {
 			node->state = UPD_NODE_UPDATE_ALL_SEC;
 			node->index = dict_table_get_next_index(index);
 		}
-exit_func:
-		if (UNIV_LIKELY_NULL(heap)) {
-			mem_heap_free(heap);
-		}
-		return(err);
+
+		goto exit_func;
 	}
 
 	/* If the update is made for MySQL, we already have the update vector
@@ -2274,14 +2479,11 @@ exit_func:
 		row_upd_eval_new_vals(node->update);
 	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
-
 	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
 
-		err = row_upd_clust_rec(node, index, thr, mtr);
-		return(err);
+		err = row_upd_clust_rec(
+			node, index, offsets, &heap, thr, &mtr);
+		goto exit_func;
 	}
 
 	row_upd_store_row(node);
@@ -2301,20 +2503,21 @@ exit_func:
 		externally! */
 
 		err = row_upd_clust_rec_by_insert(
-			node, index, thr, referenced, mtr);
+			node, index, thr, referenced, &mtr);
 
 		if (err != DB_SUCCESS) {
 
-			return(err);
+			goto exit_func;
 		}
 
 		node->state = UPD_NODE_UPDATE_ALL_SEC;
 	} else {
-		err = row_upd_clust_rec(node, index, thr, mtr);
+		err = row_upd_clust_rec(
+			node, index, offsets, &heap, thr, &mtr);
 
 		if (err != DB_SUCCESS) {
 
-			return(err);
+			goto exit_func;
 		}
 
 		node->state = UPD_NODE_UPDATE_SOME_SEC;
@@ -2322,6 +2525,10 @@ exit_func:
 
 	node->index = dict_table_get_next_index(index);
 
+exit_func:
+	if (heap) {
+		mem_heap_free(heap);
+	}
 	return(err);
 }
 
@@ -2331,14 +2538,14 @@ to this node, we assume that we have a persistent cursor which was on a
 record, and the position of the cursor is stored in the cursor.
 @return DB_SUCCESS if operation successfully completed, else error
 code or DB_LOCK_WAIT */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 row_upd(
 /*====*/
 	upd_node_t*	node,	/*!< in: row update node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err	= DB_SUCCESS;
+	dberr_t		err	= DB_SUCCESS;
 
 	ut_ad(node && thr);
 
@@ -2356,26 +2563,38 @@ row_upd(
 		}
 	}
 
-	if (node->state == UPD_NODE_UPDATE_CLUSTERED
-	    || node->state == UPD_NODE_INSERT_CLUSTERED
-	    || node->state == UPD_NODE_INSERT_BLOB) {
-
+	switch (node->state) {
+	case UPD_NODE_UPDATE_CLUSTERED:
+	case UPD_NODE_INSERT_CLUSTERED:
+	case UPD_NODE_INSERT_BLOB:
 		log_free_check();
 		err = row_upd_clust_step(node, thr);
 
 		if (err != DB_SUCCESS) {
 
-			goto function_exit;
+			return(err);
 		}
 	}
 
-	if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+	if (node->index == NULL
+	    || (!node->is_delete
+		&& (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
 
-		goto function_exit;
+		return(DB_SUCCESS);
 	}
 
-	while (node->index != NULL) {
+#ifdef UNIV_DEBUG
+	/* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+	Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+	if (!thr_get_trx(thr)->ddl) {
+		DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+				    "after_row_upd_clust");
+	}
+#endif /* UNIV_DEBUG */
+
+	DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;);
 
+	do {
 		/* Skip corrupted index */
 		dict_table_skip_corrupt_index(node->index);
 
@@ -2383,32 +2602,32 @@ row_upd(
 			break;
 		}
 
-		log_free_check();
-		err = row_upd_sec_step(node, thr);
+		if (node->index->type != DICT_FTS) {
+			err = row_upd_sec_step(node, thr);
 
-		if (err != DB_SUCCESS) {
+			if (err != DB_SUCCESS) {
 
-			goto function_exit;
+				return(err);
+			}
 		}
 
 		node->index = dict_table_get_next_index(node->index);
-	}
+	} while (node->index != NULL);
 
-function_exit:
-	if (err == DB_SUCCESS) {
-		/* Do some cleanup */
+	ut_ad(err == DB_SUCCESS);
 
-		if (node->row != NULL) {
-			node->row = NULL;
-			node->ext = NULL;
-			node->upd_row = NULL;
-			node->upd_ext = NULL;
-			mem_heap_empty(node->heap);
-		}
+	/* Do some cleanup */
 
-		node->state = UPD_NODE_UPDATE_CLUSTERED;
+	if (node->row != NULL) {
+		node->row = NULL;
+		node->ext = NULL;
+		node->upd_row = NULL;
+		node->upd_ext = NULL;
+		mem_heap_empty(node->heap);
 	}
 
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
 	return(err);
 }
 
@@ -2425,16 +2644,16 @@ row_upd_step(
 	upd_node_t*	node;
 	sel_node_t*	sel_node;
 	que_node_t*	parent;
-	ulint		err		= DB_SUCCESS;
+	dberr_t		err		= DB_SUCCESS;
 	trx_t*		trx;
 
 	ut_ad(thr);
 
 	trx = thr_get_trx(thr);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	node = thr->run_node;
+	node = static_cast<upd_node_t*>(thr->run_node);
 
 	sel_node = node->select;
 
diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.cc
index 1535273d54f..bde796831c6 100644
--- a/storage/xtradb/row/row0vers.c
+++ b/storage/xtradb/row/row0vers.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0vers.c
+@file row/row0vers.cc
 Row versions
 
 Created 2/6/1997 Heikki Tuuri
@@ -48,91 +48,51 @@ Created 2/6/1997 Heikki Tuuri
 
 /*****************************************************************//**
 Finds out if an active transaction has inserted or modified a secondary
-index record. NOTE: the kernel mutex is temporarily released in this
-function!
-@return NULL if committed, else the active transaction */
-UNIV_INTERN
-trx_t*
-row_vers_impl_x_locked_off_kernel(
-/*==============================*/
-	const rec_t*	rec,	/*!< in: record in a secondary index */
-	dict_index_t*	index,	/*!< in: the secondary index */
-	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INLINE
+trx_id_t
+row_vers_impl_x_locked_low(
+/*=======================*/
+	const rec_t*	clust_rec,	/*!< in: clustered index record */
+	dict_index_t*	clust_index,	/*!< in: the clustered index */
+	const rec_t*	rec,		/*!< in: secondary index record */
+	dict_index_t*	index,		/*!< in: the secondary index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
-	dict_index_t*	clust_index;
-	rec_t*		clust_rec;
-	ulint*		clust_offsets;
-	rec_t*		version;
 	trx_id_t	trx_id;
-	mem_heap_t*	heap;
-	mem_heap_t*	heap2;
-	dtuple_t*	row;
-	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
-					warning */
-	trx_t*		trx;
-	ulint		rec_del;
-#ifdef UNIV_DEBUG
-	ulint		err;
-#endif /* UNIV_DEBUG */
-	mtr_t		mtr;
+	ibool		corrupt;
 	ulint		comp;
+	ulint		rec_del;
+	const rec_t*	version;
+	rec_t*		prev_version = NULL;
+	ulint*		clust_offsets;
+	mem_heap_t*	heap;
 
-	ut_ad(mutex_own(&kernel_mutex));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-
-	mutex_exit(&kernel_mutex);
-
-	mtr_start(&mtr);
-
-	/* Search for the clustered index record: this is a time-consuming
-	operation: therefore we release the kernel mutex; also, the release
-	is required by the latching order convention. The latch on the
-	clustered index locks the top of the stack of versions. We also
-	reserve purge_latch to lock the bottom of the version stack. */
-
-	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
-				      &clust_index, &mtr);
-	if (!clust_rec) {
-		/* In a rare case it is possible that no clust rec is found
-		for a secondary index record: if in row0umod.c
-		row_undo_mod_remove_clust_low() we have already removed the
-		clust rec, while purge is still cleaning and removing
-		secondary index records associated with earlier versions of
-		the clustered index record. In that case there cannot be
-		any implicit lock on the secondary index record, because
-		an active transaction which has modified the secondary index
-		record has also modified the clustered index record. And in
-		a rollback we always undo the modifications to secondary index
-		records before the clustered index record. */
-
-		mutex_enter(&kernel_mutex);
-		mtr_commit(&mtr);
-
-		return(NULL);
-	}
+	ut_ad(rec_offs_validate(rec, index, offsets));
 
 	heap = mem_heap_create(1024);
-	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
-					ULINT_UNDEFINED, &heap);
-	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
 
-	mtr_s_lock(&(purge_sys->latch), &mtr);
+	clust_offsets = rec_get_offsets(
+		clust_rec, clust_index, NULL, ULINT_UNDEFINED, &heap);
 
-	mutex_enter(&kernel_mutex);
+	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+	corrupt = FALSE;
 
-	trx = NULL;
-	if (!trx_is_active(trx_id)) {
+	if (!trx_rw_is_active(trx_id, &corrupt)) {
 		/* The transaction that modified or inserted clust_rec is no
-		longer active: no implicit lock on rec */
-		goto exit_func;
-	}
-
-	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
-				      clust_offsets, TRUE)) {
-		/* Corruption noticed: try to avoid a crash by returning */
-		goto exit_func;
+		longer active, or it is corrupt: no implicit lock on rec */
+		if (corrupt) {
+			lock_report_trx_id_insanity(
+				trx_id, clust_rec, clust_index, clust_offsets,
+				trx_sys_get_max_trx_id());
+		}
+		mem_heap_free(heap);
+		return(0);
 	}
 
 	comp = page_rec_is_comp(rec);
@@ -140,114 +100,110 @@ row_vers_impl_x_locked_off_kernel(
 	ut_ad(!!comp == dict_table_is_comp(index->table));
 	ut_ad(!comp == !page_rec_is_comp(clust_rec));
 
-	/* We look up if some earlier version, which was modified by the trx_id
-	transaction, of the clustered index record would require rec to be in
-	a different state (delete marked or unmarked, or have different field
-	values, or not existing). If there is such a version, then rec was
-	modified by the trx_id transaction, and it has an implicit x-lock on
-	rec. Note that if clust_rec itself would require rec to be in a
-	different state, then the trx_id transaction has not yet had time to
-	modify rec, and does not necessarily have an implicit x-lock on rec. */
-
 	rec_del = rec_get_deleted_flag(rec, comp);
-	trx = NULL;
-
-	version = clust_rec;
 
-	for (;;) {
-		rec_t*		prev_version;
-		ulint		vers_del;
+	/* We look up if some earlier version, which was modified by
+	the trx_id transaction, of the clustered index record would
+	require rec to be in a different state (delete marked or
+	unmarked, or have different field values, or not existing). If
+	there is such a version, then rec was modified by the trx_id
+	transaction, and it has an implicit x-lock on rec. Note that
+	if clust_rec itself would require rec to be in a different
+	state, then the trx_id transaction has not yet had time to
+	modify rec, and does not necessarily have an implicit x-lock
+	on rec. */
+
+	for (version = clust_rec;; version = prev_version) {
 		row_ext_t*	ext;
+		const dtuple_t*	row;
+		dtuple_t*	entry;
+		ulint		vers_del;
 		trx_id_t	prev_trx_id;
+		mem_heap_t*	old_heap = heap;
 
-		mutex_exit(&kernel_mutex);
-
-		/* While we retrieve an earlier version of clust_rec, we
-		release the kernel mutex, because it may take time to access
-		the disk. After the release, we have to check if the trx_id
-		transaction is still active. We keep the semaphore in mtr on
-		the clust_rec page, so that no other transaction can update
-		it and get an implicit x-lock on rec. */
+		/* We keep the semaphore in mtr on the clust_rec page, so
+		that no other transaction can update it and get an
+		implicit x-lock on rec until mtr_commit(mtr). */
 
-		heap2 = heap;
 		heap = mem_heap_create(1024);
-#ifdef UNIV_DEBUG
-		err =
-#endif /* UNIV_DEBUG */
-		trx_undo_prev_version_build(clust_rec, &mtr, version,
-					    clust_index, clust_offsets,
-					    heap, &prev_version);
-		mem_heap_free(heap2); /* free version and clust_offsets */
 
-		if (prev_version == NULL) {
-			mutex_enter(&kernel_mutex);
+		trx_undo_prev_version_build(
+			clust_rec, mtr, version, clust_index, clust_offsets,
+			heap, &prev_version);
 
-			if (!trx_is_active(trx_id)) {
-				/* Transaction no longer active: no
-				implicit x-lock */
+		/* Free version and clust_offsets. */
 
-				break;
-			}
-
-			/* If the transaction is still active,
-			clust_rec must be a fresh insert, because no
-			previous version was found. */
-			ut_ad(err == DB_SUCCESS);
+		mem_heap_free(old_heap);
 
-			/* It was a freshly inserted version: there is an
-			implicit x-lock on rec */
+		if (prev_version == NULL) {
 
-			trx = trx_get_on_id(trx_id);
+			/* clust_rec should be a fresh insert, because
+			no previous version was found or the transaction
+			has committed. The caller has to recheck as the
+			synopsis of this function states, whether trx_id
+			is active or not. */
 
 			break;
 		}
 
-		clust_offsets = rec_get_offsets(prev_version, clust_index,
-						NULL, ULINT_UNDEFINED, &heap);
+		clust_offsets = rec_get_offsets(
+			prev_version, clust_index, NULL, ULINT_UNDEFINED,
+			&heap);
 
 		vers_del = rec_get_deleted_flag(prev_version, comp);
+
 		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
 						 clust_offsets);
 
 		/* The stack of versions is locked by mtr.  Thus, it
 		is safe to fetch the prefixes for externally stored
 		columns. */
+
 		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
-				clust_offsets, NULL, &ext, heap);
+				clust_offsets,
+				NULL, NULL, NULL, &ext, heap);
+
 		entry = row_build_index_entry(row, ext, index, heap);
+
 		/* entry may be NULL if a record was inserted in place
 		of a deleted record, and the BLOB pointers of the new
 		record were not initialized yet.  But in that case,
 		prev_version should be NULL. */
-		ut_a(entry);
 
-		mutex_enter(&kernel_mutex);
+		ut_a(entry != NULL);
 
-		if (!trx_is_active(trx_id)) {
-			/* Transaction no longer active: no implicit x-lock */
-
-			break;
-		}
-
-		/* If we get here, we know that the trx_id transaction is
-		still active and it has modified prev_version. Let us check
-		if prev_version would require rec to be in a different
-		state. */
+		/* If we get here, we know that the trx_id transaction
+		modified prev_version. Let us check if prev_version
+		would require rec to be in a different state. */
 
 		/* The previous version of clust_rec must be
-		accessible, because the transaction is still active
-		and clust_rec was not a fresh insert. */
-		ut_ad(err == DB_SUCCESS);
+		accessible, because clust_rec was not a fresh insert.
+		There is no guarantee that the transaction is still
+		active. */
 
 		/* We check if entry and rec are identified in the alphabetical
 		ordering */
-		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+
+		if (!trx_rw_is_active(trx_id, &corrupt)) {
+			/* Transaction no longer active: no implicit
+			x-lock. This situation should only be possible
+			because we are not holding lock_sys->mutex. */
+			ut_ad(!lock_mutex_own());
+			if (corrupt) {
+				lock_report_trx_id_insanity(
+					trx_id,
+					prev_version, clust_index,
+					clust_offsets,
+					trx_sys_get_max_trx_id());
+			}
+			trx_id = 0;
+			break;
+		} else if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
 			/* The delete marks of rec and prev_version should be
 			equal for rec to be in the state required by
 			prev_version */
 
 			if (rec_del != vers_del) {
-				trx = trx_get_on_id(trx_id);
 
 				break;
 			}
@@ -257,38 +213,91 @@ row_vers_impl_x_locked_off_kernel(
 			alphabetical ordering, but the field values changed
 			still. For example, 'abc' -> 'ABC'. Check also that. */
 
-			dtuple_set_types_binary(entry,
-						dtuple_get_n_fields(entry));
-			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+			dtuple_set_types_binary(
+				entry, dtuple_get_n_fields(entry));
 
-				trx = trx_get_on_id(trx_id);
+			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
 
 				break;
 			}
+
 		} else if (!rec_del) {
 			/* The delete mark should be set in rec for it to be
 			in the state required by prev_version */
 
-			trx = trx_get_on_id(trx_id);
-
 			break;
 		}
 
 		if (trx_id != prev_trx_id) {
-			/* The versions modified by the trx_id transaction end
-			to prev_version: no implicit x-lock */
+			/* prev_version was the first version modified by
+			the trx_id transaction: no implicit x-lock */
 
+			trx_id = 0;
 			break;
 		}
+	}
 
-		version = prev_version;
-	}/* for (;;) */
+	mem_heap_free(heap);
+	return(trx_id);
+}
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INTERN
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	dict_index_t*	clust_index;
+	const rec_t*	clust_rec;
+	trx_id_t	trx_id;
+	mtr_t		mtr;
+
+	ut_ad(!lock_mutex_own());
+	ut_ad(!mutex_own(&trx_sys->mutex));
+
+	mtr_start(&mtr);
+
+	/* Search for the clustered index record. The latch on the
+	page of clust_rec locks the top of the stack of versions. The
+	bottom of the version stack is not locked; oldest versions may
+	disappear by the fact that transactions may be committed and
+	collected by the purge. This is not a problem, because we are
+	only interested in active transactions. */
+
+	clust_rec = row_get_clust_rec(
+		BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr);
+
+	if (UNIV_UNLIKELY(!clust_rec)) {
+		/* In a rare case it is possible that no clust rec is found
+		for a secondary index record: if in row0umod.cc
+		row_undo_mod_remove_clust_low() we have already removed the
+		clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case there cannot be
+		any implicit lock on the secondary index record, because
+		an active transaction which has modified the secondary index
+		record has also modified the clustered index record. And in
+		a rollback we always undo the modifications to secondary index
+		records before the clustered index record. */
+
+		trx_id = 0;
+	} else {
+		trx_id = row_vers_impl_x_locked_low(
+			clust_rec, clust_index, rec, index, offsets, &mtr);
+	}
 
-exit_func:
 	mtr_commit(&mtr);
-	mem_heap_free(heap);
 
-	return(trx);
+	return(trx_id);
 }
 
 /*****************************************************************//**
@@ -310,15 +319,7 @@ row_vers_must_preserve_del_marked(
 
 	mtr_s_lock(&(purge_sys->latch), mtr);
 
-	if (trx_purge_update_undo_must_exist(trx_id)) {
-
-		/* A purge operation is not yet allowed to remove this
-		delete marked record */
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(!read_view_sees_trx_id(purge_sys->view, trx_id));
 }
 
 /*****************************************************************//**
@@ -350,7 +351,6 @@ row_vers_old_has_index_entry(
 	mem_heap_t*	heap2;
 	const dtuple_t*	row;
 	const dtuple_t*	entry;
-	ulint		err;
 	ulint		comp;
 
 	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
@@ -358,7 +358,6 @@ row_vers_old_has_index_entry(
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
 #endif /* UNIV_SYNC_DEBUG */
-	mtr_s_lock(&(purge_sys->latch), mtr);
 
 	clust_index = dict_table_get_first_index(index->table);
 
@@ -371,11 +370,16 @@ row_vers_old_has_index_entry(
 	if (also_curr && !rec_get_deleted_flag(rec, comp)) {
 		row_ext_t*	ext;
 
-		/* The stack of versions is locked by mtr.
+		/* The top of the stack of versions is locked by the
+		mtr holding a latch on the page containing the
+		clustered index record. The bottom of the stack is
+		locked by the fact that the purge_sys->view must
+		'overtake' any read view of an active transaction.
 		Thus, it is safe to fetch the prefixes for
 		externally stored columns. */
 		row = row_build(ROW_COPY_POINTERS, clust_index,
-				rec, clust_offsets, NULL, &ext, heap);
+				rec, clust_offsets,
+				NULL, NULL, NULL, &ext, heap);
 		entry = row_build_index_entry(row, ext, index, heap);
 
 		/* If entry == NULL, the record contains unset BLOB
@@ -412,12 +416,12 @@ row_vers_old_has_index_entry(
 	for (;;) {
 		heap2 = heap;
 		heap = mem_heap_create(1024);
-		err = trx_undo_prev_version_build(rec, mtr, version,
-						  clust_index, clust_offsets,
-						  heap, &prev_version);
+		trx_undo_prev_version_build(rec, mtr, version,
+					    clust_index, clust_offsets,
+					    heap, &prev_version);
 		mem_heap_free(heap2); /* free version and clust_offsets */
 
-		if (err != DB_SUCCESS || !prev_version) {
+		if (!prev_version) {
 			/* Versions end here */
 
 			mem_heap_free(heap);
@@ -436,7 +440,7 @@ row_vers_old_has_index_entry(
 			externally stored columns. */
 			row = row_build(ROW_COPY_POINTERS, clust_index,
 					prev_version, clust_offsets,
-					NULL, &ext, heap);
+					NULL, NULL, NULL, &ext, heap);
 			entry = row_build_index_entry(row, ext, index, heap);
 
 			/* If entry == NULL, the record contains unset
@@ -469,7 +473,7 @@ read should see. We assume that the trx id stored in rec is such that
 the consistent read should not see rec in its present version.
 @return	DB_SUCCESS or DB_MISSING_HISTORY */
 UNIV_INTERN
-ulint
+dberr_t
 row_vers_build_for_consistent_read(
 /*===============================*/
 	const rec_t*	rec,	/*!< in: record in a clustered index; the
@@ -487,8 +491,9 @@ row_vers_build_for_consistent_read(
 				*old_vers is allocated; memory for possible
 				intermediate versions is allocated and freed
 				locally within the function */
-	rec_t**		old_vers)/*!< out, own: old version, or NULL if the
-				record does not exist in the view, that is,
+	rec_t**		old_vers)/*!< out, own: old version, or NULL
+				if the history is missing or the record
+				does not exist in the view, that is,
 				it was freshly inserted afterwards */
 {
 	const rec_t*	version;
@@ -496,7 +501,7 @@ row_vers_build_for_consistent_read(
 	trx_id_t	trx_id;
 	mem_heap_t*	heap		= NULL;
 	byte*		buf;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
@@ -511,7 +516,6 @@ row_vers_build_for_consistent_read(
 
 	ut_ad(!read_view_sees_trx_id(view, trx_id));
 
-	rw_lock_s_lock(&(purge_sys->latch));
 	version = rec;
 
 	for (;;) {
@@ -539,47 +543,42 @@ row_vers_build_for_consistent_read(
 				/* The view already sees this version: we can
 				copy it to in_heap and return */
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 				ut_a(!rec_offs_any_null_extern(
 					     version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+				buf = static_cast<byte*>(mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
 
-				buf = mem_heap_alloc(in_heap,
-						     rec_offs_size(*offsets));
 				*old_vers = rec_copy(buf, version, *offsets);
 				rec_offs_make_valid(*old_vers, index,
 						    *offsets);
 				err = DB_SUCCESS;
-
 				break;
 			}
 		}
 
 		err = trx_undo_prev_version_build(rec, mtr, version, index,
 						  *offsets, heap,
-						  &prev_version);
+						  &prev_version)
+			? DB_SUCCESS : DB_MISSING_HISTORY;
 		if (heap2) {
 			mem_heap_free(heap2); /* free version */
 		}
 
-		if (err != DB_SUCCESS) {
-			break;
-		}
-
 		if (prev_version == NULL) {
 			/* It was a freshly inserted version */
 			*old_vers = NULL;
-			err = DB_SUCCESS;
-
 			break;
 		}
 
 		*offsets = rec_get_offsets(prev_version, index, *offsets,
 					   ULINT_UNDEFINED, offset_heap);
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 		ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 		trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
 
@@ -588,11 +587,12 @@ row_vers_build_for_consistent_read(
 			/* The view already sees this version: we can copy
 			it to in_heap and return */
 
-			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			buf = static_cast<byte*>(
+				mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
+
 			*old_vers = rec_copy(buf, prev_version, *offsets);
 			rec_offs_make_valid(*old_vers, index, *offsets);
-			err = DB_SUCCESS;
-
 			break;
 		}
 
@@ -600,17 +600,15 @@ row_vers_build_for_consistent_read(
 	}/* for (;;) */
 
 	mem_heap_free(heap);
-	rw_lock_s_unlock(&(purge_sys->latch));
 
 	return(err);
 }
 
 /*****************************************************************//**
 Constructs the last committed version of a clustered index record,
-which should be seen by a semi-consistent read.
-@return	DB_SUCCESS or DB_MISSING_HISTORY */
+which should be seen by a semi-consistent read. */
 UNIV_INTERN
-ulint
+void
 row_vers_build_for_semi_consistent_read(
 /*====================================*/
 	const rec_t*	rec,	/*!< in: record in a clustered index; the
@@ -634,7 +632,6 @@ row_vers_build_for_semi_consistent_read(
 	const rec_t*	version;
 	mem_heap_t*	heap		= NULL;
 	byte*		buf;
-	ulint		err;
 	trx_id_t	rec_trx_id	= 0;
 
 	ut_ad(dict_index_is_clust(index));
@@ -646,16 +643,10 @@ row_vers_build_for_semi_consistent_read(
 
 	ut_ad(rec_offs_validate(rec, index, *offsets));
 
-	rw_lock_s_lock(&(purge_sys->latch));
-	/* The S-latch on purge_sys prevents the purge view from
-	changing.  Thus, if we have an uncommitted transaction at
-	this point, then purge cannot remove its undo log even if
-	the transaction could commit now. */
-
 	version = rec;
 
 	for (;;) {
-		trx_t*		version_trx;
+		trx_id_t*	version_trx_descr;
 		mem_heap_t*	heap2;
 		rec_t*		prev_version;
 		trx_id_t	version_trx_id;
@@ -665,33 +656,32 @@ row_vers_build_for_semi_consistent_read(
 			rec_trx_id = version_trx_id;
 		}
 
-		mutex_enter(&kernel_mutex);
-		version_trx = trx_get_on_id(version_trx_id);
-		if (version_trx &&
-		    (version_trx->state == TRX_COMMITTED_IN_MEMORY
-		     || version_trx->state == TRX_NOT_STARTED)) {
-
-			version_trx = NULL;
-		}
-		mutex_exit(&kernel_mutex);
-
-		if (!version_trx) {
-
+		mutex_enter(&trx_sys->mutex);
+		version_trx_descr = trx_find_descriptor(trx_sys->descriptors,
+							trx_sys->descr_n_used,
+							version_trx_id);
+		/* Because version_trx is a read-write transaction,
+		its state cannot change from or to NOT_STARTED while
+		we are holding the trx_sys->mutex.  It may change from
+		ACTIVE to PREPARED or COMMITTED. */
+		mutex_exit(&trx_sys->mutex);
+
+		if (!version_trx_descr) {
+committed_version_trx:
 			/* We found a version that belongs to a
 			committed transaction: return it. */
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 			ut_a(!rec_offs_any_null_extern(version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 			if (rec == version) {
 				*old_vers = rec;
-				err = DB_SUCCESS;
 				break;
 			}
 
 			/* We assume that a rolled-back transaction stays in
-			TRX_ACTIVE state until all the changes have been
+			TRX_STATE_ACTIVE state until all the changes have been
 			rolled back and the transaction is removed from
 			the global list of transactions. */
 
@@ -708,48 +698,48 @@ row_vers_build_for_semi_consistent_read(
 							   offset_heap);
 			}
 
-			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			buf = static_cast<byte*>(
+				mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
+
 			*old_vers = rec_copy(buf, version, *offsets);
 			rec_offs_make_valid(*old_vers, index, *offsets);
-			err = DB_SUCCESS;
-
 			break;
 		}
 
+		DEBUG_SYNC_C("after_row_vers_check_trx_active");
+
 		heap2 = heap;
 		heap = mem_heap_create(1024);
 
-		err = trx_undo_prev_version_build(rec, mtr, version, index,
-						  *offsets, heap,
-						  &prev_version);
-		if (heap2) {
-			mem_heap_free(heap2); /* free version */
+		if (!trx_undo_prev_version_build(rec, mtr, version, index,
+						 *offsets, heap,
+						 &prev_version)) {
+			mem_heap_free(heap);
+			heap = heap2;
+			heap2 = NULL;
+			goto committed_version_trx;
 		}
 
-		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-			break;
+		if (heap2) {
+			mem_heap_free(heap2); /* free version */
 		}
 
 		if (prev_version == NULL) {
 			/* It was a freshly inserted version */
 			*old_vers = NULL;
-			err = DB_SUCCESS;
-
 			break;
 		}
 
 		version = prev_version;
 		*offsets = rec_get_offsets(version, index, *offsets,
 					   ULINT_UNDEFINED, offset_heap);
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 		ut_a(!rec_offs_any_null_extern(version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 	}/* for (;;) */
 
 	if (heap) {
 		mem_heap_free(heap);
 	}
-	rw_lock_s_unlock(&(purge_sys->latch));
-
-	return(err);
 }
diff --git a/storage/xtradb/srv/srv0conc.cc b/storage/xtradb/srv/srv0conc.cc
new file mode 100644
index 00000000000..413d5c4eab2
--- /dev/null
+++ b/storage/xtradb/srv/srv0conc.cc
@@ -0,0 +1,618 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0conc.cc
+
+InnoDB concurrency manager
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "btr0types.h"
+#include "trx0trx.h"
+
+#include "mysql/plugin.h"
+
+/** Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket. */
+UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it. */
+UNIV_INTERN ulong	srv_adaptive_max_sleep_delay = 150000;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+UNIV_INTERN ulong	srv_thread_sleep_delay	= 10000;
+
+
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+UNIV_INTERN ulint	srv_max_n_threads	= 0;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. Value of 0 will disable the concurrency check. */
+
+UNIV_INTERN ulong	srv_thread_concurrency	= 0;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+
+/** This mutex protects srv_conc data structures */
+static os_fast_mutex_t	srv_conc_mutex;
+
+/** Concurrency list node */
+typedef UT_LIST_NODE_T(struct srv_conc_slot_t)	srv_conc_node_t;
+
+/** Slot for a thread waiting in the concurrency control queue. */
+struct srv_conc_slot_t{
+	os_event_t	event;		/*!< event to wait */
+	ibool		reserved;	/*!< TRUE if slot
+					reserved */
+	ibool		wait_ended;	/*!< TRUE when another thread has
+					already set the event and the thread
+					in this slot is free to proceed; but
+					reserved may still be TRUE at that
+					point */
+	srv_conc_node_t	srv_conc_queue;	/*!< queue node */
+};
+
+/** Queue of threads waiting to get in */
+typedef UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue_t;
+
+static srv_conc_queue_t	srv_conc_queue;
+
+/** Array of wait slots */
+static srv_conc_slot_t*	srv_conc_slots;
+
+#if defined(UNIV_PFS_MUTEX)
+/* Key to register srv_conc_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_conc_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+/** Variables tracking the active and waiting threads. */
+struct srv_conc_t {
+	char		pad[64  - (sizeof(ulint) + sizeof(lint))];
+			/*!< NOTE(review): padding; likely cache-line related */
+	/** Number of transactions that have declared_to_be_inside_innodb set.
+	It used to be a non-error for this value to drop below zero temporarily.
+	This is no longer true. We'll, however, keep the lint datatype to add
+	assertions to catch any corner cases that we may have missed. */
+
+	volatile lint	n_active;
+
+	/** Number of OS threads waiting in the FIFO for permission to
+	enter InnoDB */
+	volatile lint	n_waiting;
+};
+
+/* Control variables for tracking concurrency. */
+static srv_conc_t	srv_conc;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+	ulint		i;
+
+	/* Init the server concurrency restriction data structures */
+
+	os_fast_mutex_init(srv_conc_mutex_key, &srv_conc_mutex);
+
+	UT_LIST_INIT(srv_conc_queue);
+
+	srv_conc_slots = static_cast<srv_conc_slot_t*>(
+		mem_zalloc(OS_THREAD_MAX_N * sizeof(*srv_conc_slots)));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		srv_conc_slot_t*	conc_slot = &srv_conc_slots[i];
+
+		conc_slot->event = os_event_create();
+		ut_a(conc_slot->event);
+	}
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+	os_fast_mutex_free(&srv_conc_mutex);
+	mem_free(srv_conc_slots);
+	srv_conc_slots = NULL;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/*********************************************************************//**
+Note that a user thread is entering InnoDB. */
+static
+void
+srv_enter_innodb_with_tickets(
+/*==========================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+}
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB.  Setting
+srv_adaptive_max_sleep_delay > 0 switches the adaptive sleep calibration to
+ON. When set, we want to wait in the queue for as little time as possible.
+However, very short waits will result in a lot of context switches and that
+is also not desirable. When threads need to sleep multiple times we increment
+srv_thread_sleep_delay by one. When we see threads getting a slot without
+waiting and there are no other threads waiting in the queue, we try and reduce
+the wait as much as we can. Currently we reduce it by half each time. If the
+thread only had to wait for one turn before it was able to enter InnoDB we
+decrement it by one. This is to try and keep the sleep time stable around the
+"optimum" sleep time. */
+static
+void
+srv_conc_enter_innodb_with_atomics(
+/*===============================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	ulint	n_sleeps = 0;
+	ibool	notified_mysql = FALSE;
+
+	ut_a(!trx->declared_to_be_inside_innodb);
+
+	for (;;) {
+		ulint	sleep_in_us;
+
+		if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+			ulint	n_active;
+
+			/* Check if there are any free tickets. */
+			n_active = os_atomic_increment_lint(
+				&srv_conc.n_active, 1);
+
+			if (n_active <= srv_thread_concurrency) {
+
+				srv_enter_innodb_with_tickets(trx);
+
+				if (notified_mysql) {
+
+					(void) os_atomic_decrement_lint(
+						&srv_conc.n_waiting, 1);
+
+					thd_wait_end(trx->mysql_thd);
+				}
+
+				if (srv_adaptive_max_sleep_delay > 0) {
+					if (srv_thread_sleep_delay > 20
+					    && n_sleeps == 1) {
+
+						--srv_thread_sleep_delay;
+					}
+
+					if (srv_conc.n_waiting == 0) {
+						srv_thread_sleep_delay >>= 1;
+					}
+				}
+
+				return;
+			}
+
+			/* Since there were no free seats, we relinquish
+			the overbooked ticket. */
+
+			(void) os_atomic_decrement_lint(
+				&srv_conc.n_active, 1);
+		}
+
+		if (!notified_mysql) {
+			(void) os_atomic_increment_lint(
+				&srv_conc.n_waiting, 1);
+
+			/* Release possible search system latch this
+			thread has */
+
+			if (trx->has_search_latch) {
+				trx_search_latch_release_if_reserved(trx);
+			}
+
+			thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
+
+			notified_mysql = TRUE;
+		}
+
+		trx->op_info = "sleeping before entering InnoDB";
+
+		sleep_in_us = srv_thread_sleep_delay;
+
+		/* Guard against overflow when adaptive sleep delay is on. */
+
+		if (srv_adaptive_max_sleep_delay > 0
+		    && sleep_in_us > srv_adaptive_max_sleep_delay) {
+
+			sleep_in_us = srv_adaptive_max_sleep_delay;
+			srv_thread_sleep_delay = sleep_in_us;
+		}
+
+		os_thread_sleep(sleep_in_us);
+		trx->innodb_que_wait_timer += sleep_in_us;
+
+		trx->op_info = "";
+
+		++n_sleeps;
+
+		if (srv_adaptive_max_sleep_delay > 0 && n_sleeps > 1) {
+			++srv_thread_sleep_delay;
+		}
+	}
+}
+
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_with_atomics(
+/*==============================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	trx->n_tickets_to_enter_innodb = 0;
+	trx->declared_to_be_inside_innodb = FALSE;
+
+	(void) os_atomic_decrement_lint(&srv_conc.n_active, 1);
+}
+#else
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_without_atomics(
+/*=================================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	srv_conc_slot_t*	slot;
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	ut_ad(srv_conc.n_active > 0);
+	srv_conc.n_active--;
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+
+	slot = NULL;
+
+	if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+		/* Look for a slot where a thread is waiting and no other
+		thread has yet released the thread */
+
+		for (slot = UT_LIST_GET_FIRST(srv_conc_queue);
+		     slot != NULL && slot->wait_ended == TRUE;
+		     slot = UT_LIST_GET_NEXT(srv_conc_queue, slot)) {
+
+			/* No op */
+		}
+
+		if (slot != NULL) {
+			slot->wait_ended = TRUE;
+
+			/* We increment the count on behalf of the released
+			thread */
+
+			srv_conc.n_active++;
+		}
+	}
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+
+	if (slot != NULL) {
+		os_event_set(slot->event);
+	}
+}
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB. */
+static
+void
+srv_conc_enter_innodb_without_atomics(
+/*==================================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	ulint			i;
+	srv_conc_slot_t*	slot = NULL;
+	ibool			has_slept = FALSE;
+	ib_uint64_t		start_time = 0L;
+	ib_uint64_t		finish_time = 0L;
+	ulint			sec;
+	ulint			ms;
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+	if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) {
+		os_fast_mutex_unlock(&srv_conc_mutex);
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: trying to declare trx"
+		      " to enter InnoDB, but\n"
+		      "InnoDB: it already is declared.\n", stderr);
+		trx_print(stderr, trx, 0);
+		putc('\n', stderr);
+		return;
+	}
+
+	ut_ad(srv_conc.n_active >= 0);
+
+	if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+
+		srv_conc.n_active++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* If the transaction is not holding resources, let it sleep
+	for srv_thread_sleep_delay microseconds, and try again then */
+
+	if (!has_slept && !trx->has_search_latch
+	    && NULL == UT_LIST_GET_FIRST(trx->lock.trx_locks)) {
+
+		has_slept = TRUE; /* We let it sleep only once to avoid
+				starvation */
+
+		srv_conc.n_waiting++;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		trx->op_info = "sleeping before joining InnoDB queue";
+
+		/* Peter Zaitsev suggested that we take the sleep away
+		altogether. But the sleep may be good in pathological
+		situations of lots of thread switches. Simply put some
+		threads aside for a while to reduce the number of thread
+		switches. */
+		if (srv_thread_sleep_delay > 0) {
+			os_thread_sleep(srv_thread_sleep_delay);
+			trx->innodb_que_wait_timer += srv_thread_sleep_delay;
+		}
+
+		trx->op_info = "";
+
+		os_fast_mutex_lock(&srv_conc_mutex);
+
+		srv_conc.n_waiting--;
+
+		goto retry;
+	}
+
+	/* Too many threads inside: put the current thread to a queue */
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_conc_slots + i;
+
+		if (!slot->reserved) {
+
+			break;
+		}
+	}
+
+	if (i == OS_THREAD_MAX_N) {
+		/* Could not find a free wait slot, we must let the
+		thread enter */
+
+		srv_conc.n_active++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = 0;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* Release possible search system latch this thread has */
+	if (trx->has_search_latch) {
+		trx_search_latch_release_if_reserved(trx);
+	}
+
+	/* Add to the queue */
+	slot->reserved = TRUE;
+	slot->wait_ended = FALSE;
+
+	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+	os_event_reset(slot->event);
+
+	srv_conc.n_waiting++;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+
+	/* Go to wait for the event; when a thread leaves InnoDB it will
+	release this thread */
+
+	ut_ad(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (UNIV_UNLIKELY(trx->take_stats)) {
+		ut_usectime(&sec, &ms);
+		start_time = (ib_uint64_t)sec * 1000000 + ms;
+	} else {
+		start_time = 0;
+	}
+
+	trx->op_info = "waiting in InnoDB queue";
+
+	thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
+
+	os_event_wait(slot->event);
+	thd_wait_end(trx->mysql_thd);
+
+	trx->op_info = "";
+
+	if (UNIV_UNLIKELY(start_time != 0)) {
+		ut_usectime(&sec, &ms);
+		finish_time = (ib_uint64_t)sec * 1000000 + ms;
+		trx->innodb_que_wait_timer += (ulint)(finish_time - start_time);
+	}
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	srv_conc.n_waiting--;
+
+	/* NOTE that the thread which released this thread already
+	incremented the thread counter on behalf of this thread */
+
+	slot->reserved = FALSE;
+
+	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+}
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/*********************************************************************//**
+Puts an OS thread to wait if too many threads (>= srv_thread_concurrency) are
+inside InnoDB: in a FIFO queue without atomic builtins, else by sleeping. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	srv_conc_enter_innodb_with_atomics(trx);
+#else
+	srv_conc_enter_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!srv_thread_concurrency) {
+
+		return;
+	}
+
+	ut_ad(srv_conc.n_active >= 0);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	(void) os_atomic_increment_lint(&srv_conc.n_active, 1);
+#else
+	os_fast_mutex_lock(&srv_conc_mutex);
+	++srv_conc.n_active;
+	os_fast_mutex_unlock(&srv_conc_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+	trx->n_tickets_to_enter_innodb = 1;
+	trx->declared_to_be_inside_innodb = TRUE;
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	if ((trx->mysql_thd != NULL
+	     && thd_is_replication_slave_thread(trx->mysql_thd))
+	    || trx->declared_to_be_inside_innodb == FALSE) {
+
+		return;
+	}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	srv_conc_exit_innodb_with_atomics(trx);
+#else
+	srv_conc_exit_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void)
+/*==============================*/
+{
+	return(srv_conc.n_waiting);
+}
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void)
+/*=============================*/
+{
+	return(srv_conc.n_active);
+}
+
diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc
new file mode 100644
index 00000000000..d98315ae9a2
--- /dev/null
+++ b/storage/xtradb/srv/srv0mon.cc
@@ -0,0 +1,1910 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0mon.cc
+Database monitor counter interfaces
+
+Created 12/9/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "os0file.h"
+#include "mach0data.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "buf0buf.h"
+#include "trx0sys.h"
+#include "trx0rseg.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#ifdef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+
+/* Macro to standardize the counter names for counters in the
+"monitor_buf_page" module as they have very structured defines */
+#define	MONITOR_BUF_PAGE(name, description, code, op, op_code)	\
+	{"buffer_page_" op "_" name, "buffer_page_io",		\
+	 "Number of " description " Pages " op,			\
+	 MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START,		\
+	 MONITOR_##code##_##op_code}
+
+#define MONITOR_BUF_PAGE_READ(name, description, code)		\
+	 MONITOR_BUF_PAGE(name, description, code, "read", PAGE_READ)
+
+#define MONITOR_BUF_PAGE_WRITTEN(name, description, code)	\
+	 MONITOR_BUF_PAGE(name, description, code, "written", PAGE_WRITTEN)
+
+
+/** This array defines basic static information of monitor counters,
+including each monitor's name, module it belongs to, a short
+description and its property/type and corresponding monitor_id.
+Please note: If you add a monitor here, please add its corresponding
+monitor_id to "enum monitor_id_value" structure in srv0mon.h file. */
+
+static monitor_info_t	innodb_counter_info[] =
+{
+	/* A dummy item to mark the module start, this is
+	to accommodate the default value (0) set for the
+	global variables with the control system. */
+	{"module_start", "module_start", "module_start",
+	MONITOR_MODULE,
+	MONITOR_DEFAULT_START, MONITOR_DEFAULT_START},
+
+	/* ========== Counters for Server Metadata ========== */
+	{"module_metadata", "metadata", "Server Metadata",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_METADATA},
+
+	{"metadata_table_handles_opened", "metadata",
+	 "Number of table handles opened",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
+
+	{"metadata_table_handles_closed", "metadata",
+	 "Number of table handles closed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE},
+
+	{"metadata_table_reference_count", "metadata",
+	 "Table reference counter",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE},
+
+	{"metadata_mem_pool_size", "metadata",
+	 "Size of a memory pool InnoDB uses to store data dictionary"
+	 " and internal data structures in bytes",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_META_MEM_POOL},
+
+	/* ========== Counters for Lock Module ========== */
+	{"module_lock", "lock", "Lock Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK},
+
+	{"lock_deadlocks", "lock", "Number of deadlocks",
+	 MONITOR_DEFAULT_ON,
+	 MONITOR_DEFAULT_START, MONITOR_DEADLOCK},
+
+	{"lock_timeouts", "lock", "Number of lock timeouts",
+	 MONITOR_DEFAULT_ON,
+	 MONITOR_DEFAULT_START, MONITOR_TIMEOUT},
+
+	{"lock_rec_lock_waits", "lock",
+	 "Number of times enqueued into record lock wait queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LOCKREC_WAIT},
+
+	{"lock_table_lock_waits", "lock",
+	 "Number of times enqueued into table lock wait queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_WAIT},
+
+	{"lock_rec_lock_requests", "lock",
+	 "Number of record locks requested",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK_REQ},
+
+	{"lock_rec_lock_created", "lock", "Number of record locks created",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_RECLOCK_CREATED},
+
+	{"lock_rec_lock_removed", "lock",
+	 "Number of record locks removed from the lock queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_RECLOCK_REMOVED},
+
+	{"lock_rec_locks", "lock",
+	 "Current number of record locks on tables",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK},
+
+	{"lock_table_lock_created", "lock", "Number of table locks created",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_CREATED},
+
+	{"lock_table_lock_removed", "lock",
+	 "Number of table locks removed from the lock queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_REMOVED},
+
+	{"lock_table_locks", "lock",
+	 "Current number of table locks on tables",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_TABLELOCK},
+
+	{"lock_row_lock_current_waits", "lock",
+	 "Number of row locks currently being waited for"
+	 " (innodb_row_lock_current_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT},
+
+	{"lock_row_lock_time", "lock",
+	 "Time spent in acquiring row locks, in milliseconds"
+	 " (innodb_row_lock_time)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_WAIT_TIME},
+
+	{"lock_row_lock_time_max", "lock",
+	 "The maximum time to acquire a row lock, in milliseconds"
+	 " (innodb_row_lock_time_max)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_MAX_WAIT_TIME},
+
+	{"lock_row_lock_waits", "lock",
+	 "Number of times a row lock had to be waited for"
+	 " (innodb_row_lock_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_WAIT},
+
+	{"lock_row_lock_time_avg", "lock",
+	 "The average time to acquire a row lock, in milliseconds"
+	 " (innodb_row_lock_time_avg)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_AVG_WAIT_TIME},
+
+	/* ========== Counters for Buffer Manager and I/O ========== */
+	{"module_buffer", "buffer", "Buffer Manager Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_BUFFER},
+
+	{"buffer_pool_size", "server",
+	 "Server buffer pool size (all buffer pools) in bytes",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUFFER_POOL_SIZE},
+
+	{"buffer_pool_reads", "buffer",
+	 "Number of reads directly from disk (innodb_buffer_pool_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READS},
+
+	{"buffer_pool_read_requests", "buffer",
+	 "Number of logical read requests (innodb_buffer_pool_read_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_REQUESTS},
+
+	{"buffer_pool_write_requests", "buffer",
+	 "Number of write requests (innodb_buffer_pool_write_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST},
+
+	{"buffer_pool_wait_free", "buffer",
+	 "Number of times waited for free buffer"
+	 " (innodb_buffer_pool_wait_free)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WAIT_FREE},
+
+	{"buffer_pool_read_ahead", "buffer",
+	 "Number of pages read as read ahead (innodb_buffer_pool_read_ahead)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD},
+
+	{"buffer_pool_read_ahead_evicted", "buffer",
+	 "Read-ahead pages evicted without being accessed"
+	 " (innodb_buffer_pool_read_ahead_evicted)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED},
+
+	{"buffer_pool_pages_total", "buffer",
+	 "Total buffer pool size in pages (innodb_buffer_pool_pages_total)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL},
+
+	{"buffer_pool_pages_misc", "buffer",
+	 "Buffer pages for misc use such as row locks or the adaptive"
+	 " hash index (innodb_buffer_pool_pages_misc)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_MISC},
+
+	{"buffer_pool_pages_data", "buffer",
+	 "Buffer pages containing data (innodb_buffer_pool_pages_data)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA},
+
+	{"buffer_pool_bytes_data", "buffer",
+	 "Buffer bytes containing data (innodb_buffer_pool_bytes_data)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DATA},
+
+	{"buffer_pool_pages_dirty", "buffer",
+	 "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY},
+
+	{"buffer_pool_bytes_dirty", "buffer",
+	 "Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DIRTY},
+
+	{"buffer_pool_pages_free", "buffer",
+	 "Buffer pages currently free (innodb_buffer_pool_pages_free)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_FREE},
+
+	{"buffer_pages_created", "buffer",
+	 "Number of pages created (innodb_pages_created)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_CREATED},
+
+	{"buffer_pages_written", "buffer",
+	 "Number of pages written (innodb_pages_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+
+	{"buffer_pages_read", "buffer",
+	 "Number of pages read (innodb_pages_read)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
+
+	{"buffer_data_reads", "buffer",
+	 "Amount of data read in bytes (innodb_data_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_READ},
+
+	{"buffer_data_written", "buffer",
+	 "Amount of data written in bytes (innodb_data_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_WRITTEN},
+
+	/* Cumulative counter for scanning in flush batches */
+	{"buffer_flush_batch_scanned", "buffer",
+	 "Total pages scanned as part of flush batch",
+	 MONITOR_SET_OWNER,
+	 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+	 MONITOR_FLUSH_BATCH_SCANNED},
+
+	{"buffer_flush_batch_num_scan", "buffer",
+	 "Number of times buffer flush list flush is called",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+	 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL},
+
+	{"buffer_flush_batch_scanned_per_call", "buffer",
+	 "Pages scanned per flush batch scan",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+	 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL},
+
+	{"buffer_flush_batch_rescan", "buffer",
+	 "Number of times rescan of flush list forced",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_HP_RESCAN},
+
+	/* Cumulative counter for pages flushed in flush batches */
+	{"buffer_flush_batch_total_pages", "buffer",
+	 "Total pages flushed as part of flush batch",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_BATCH_COUNT,
+	 MONITOR_FLUSH_BATCH_TOTAL_PAGE},
+
+	{"buffer_flush_batches", "buffer",
+	 "Number of flush batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	 MONITOR_FLUSH_BATCH_COUNT},
+
+	{"buffer_flush_batch_pages", "buffer",
+	 "Pages queued as a flush batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	 MONITOR_FLUSH_BATCH_PAGES},
+
+	/* Cumulative counter for flush batches because of neighbor */
+	{"buffer_flush_neighbor_total_pages", "buffer",
+	 "Total neighbors flushed as part of neighbor flush",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_NEIGHBOR_COUNT,
+	 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE},
+
+	{"buffer_flush_neighbor", "buffer",
+	 "Number of times neighbors flushing is invoked",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	 MONITOR_FLUSH_NEIGHBOR_COUNT},
+
+	{"buffer_flush_neighbor_pages", "buffer",
+	 "Pages queued as a neighbor batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	 MONITOR_FLUSH_NEIGHBOR_PAGES},
+
+	{"buffer_flush_n_to_flush_requested", "buffer",
+	 "Number of pages requested for flushing.",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_REQUESTED},
+
+	{"buffer_flush_avg_page_rate", "buffer",
+	 "Average number of pages at which flushing is happening",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PAGE_RATE},
+
+	{"buffer_flush_lsn_avg_rate", "buffer",
+	 "Average redo generation rate",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_LSN_AVG_RATE},
+
+	{"buffer_flush_pct_for_dirty", "buffer",
+	 "Percent of IO capacity used to avoid max dirty page limit",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_DIRTY},
+
+	{"buffer_flush_pct_for_lsn", "buffer",
+	 "Percent of IO capacity used to avoid reusable redo space limit",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_LSN},
+
+	{"buffer_flush_sync_waits", "buffer",
+	 "Number of times a wait happens due to sync flushing",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_FLUSH_SYNC_WAITS},
+
+	/* Cumulative counter for flush batches for adaptive flushing  */
+	{"buffer_flush_adaptive_total_pages", "buffer",
+	 "Total pages flushed as part of adaptive flushing",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_ADAPTIVE_COUNT,
+	 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE},
+
+	{"buffer_flush_adaptive", "buffer",
+	 "Number of adaptive batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	 MONITOR_FLUSH_ADAPTIVE_COUNT},
+
+	{"buffer_flush_adaptive_pages", "buffer",
+	 "Pages queued as an adaptive batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	 MONITOR_FLUSH_ADAPTIVE_PAGES},
+
+	/* Cumulative counter for flush batches because of sync */
+	{"buffer_flush_sync_total_pages", "buffer",
+	 "Total pages flushed as part of sync batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_SYNC_COUNT,
+	 MONITOR_FLUSH_SYNC_TOTAL_PAGE},
+
+	{"buffer_flush_sync", "buffer",
+	 "Number of sync batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_SYNC_COUNT},
+
+	{"buffer_flush_sync_pages", "buffer",
+	 "Pages queued as a sync batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_SYNC_PAGES},
+
+	/* Cumulative counter for flush batches because of background */
+	{"buffer_flush_background_total_pages", "buffer",
+	 "Total pages flushed as part of background batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_BACKGROUND_COUNT,
+	 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE},
+
+	{"buffer_flush_background", "buffer",
+	 "Number of background batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	 MONITOR_FLUSH_BACKGROUND_COUNT},
+
+	{"buffer_flush_background_pages", "buffer",
+	 "Pages queued as a background batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	 MONITOR_FLUSH_BACKGROUND_PAGES},
+
+	/* Cumulative counter for LRU batch scan */
+	{"buffer_LRU_batch_scanned", "buffer",
+	 "Total pages scanned as part of LRU batch",
+	 MONITOR_SET_OWNER, MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_BATCH_SCANNED},
+
+	{"buffer_LRU_batch_num_scan", "buffer",
+	 "Number of times LRU batch is called",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+	 MONITOR_LRU_BATCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_batch_scanned_per_call", "buffer",
+	 "Pages scanned per LRU batch call",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+	 MONITOR_LRU_BATCH_SCANNED_PER_CALL},
+
+	/* Cumulative counter for LRU batch pages flushed */
+	{"buffer_LRU_batch_total_pages", "buffer",
+	 "Total pages flushed as part of LRU batches",
+	 MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT,
+	 MONITOR_LRU_BATCH_TOTAL_PAGE},
+
+	{"buffer_LRU_batches", "buffer",
+	 "Number of LRU batches",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+	 MONITOR_LRU_BATCH_COUNT},
+
+	{"buffer_LRU_batch_pages", "buffer",
+	 "Pages queued as an LRU batch",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+	 MONITOR_LRU_BATCH_PAGES},
+
+	/* Cumulative counter for single page LRU scans */
+	{"buffer_LRU_single_flush_scanned", "buffer",
+	 "Total pages scanned as part of single page LRU flush",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED},
+
+	{"buffer_LRU_single_flush_num_scan", "buffer",
+	 "Number of times single page LRU flush is called",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_single_flush_scanned_per_call", "buffer",
+	 "Page scanned per single LRU flush",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL},
+
+	{"buffer_LRU_single_flush_failure_count", "Buffer",
+	 "Number of times attempt to flush a single page from LRU failed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT},
+
+	{"buffer_LRU_get_free_search", "Buffer",
+	 "Number of searches performed for a clean page",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_SEARCH},
+
+	/* Cumulative counter for LRU search scans */
+	{"buffer_LRU_search_scanned", "buffer",
+	 "Total pages scanned as part of LRU search",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_SEARCH_SCANNED},
+
+	{"buffer_LRU_search_num_scan", "buffer",
+	 "Number of times LRU search is performed",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+	 MONITOR_LRU_SEARCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_search_scanned_per_call", "buffer",
+	 "Page scanned per single LRU search",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+	 MONITOR_LRU_SEARCH_SCANNED_PER_CALL},
+
+	/* Cumulative counter for LRU unzip search scans */
+	{"buffer_LRU_unzip_search_scanned", "buffer",
+	 "Total pages scanned as part of LRU unzip search",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED},
+
+	{"buffer_LRU_unzip_search_num_scan", "buffer",
+	 "Number of times LRU unzip search is performed",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_unzip_search_scanned_per_call", "buffer",
+	 "Page scanned per single LRU unzip search",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL},
+
+	/* ========== Counters for Buffer Page I/O ========== */
+	{"module_buffer_page", "buffer_page_io", "Buffer Page I/O Module",
+	 static_cast<monitor_type_t>(
+	 MONITOR_MODULE | MONITOR_GROUP_MODULE),
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_BUF_PAGE},
+
+	MONITOR_BUF_PAGE_READ("index_leaf","Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_non_leaf","Index Non-leaf",
+			      INDEX_NON_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf",
+			      INDEX_IBUF_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf",
+			      "Insert Buffer Index Non-Leaf",
+			       INDEX_IBUF_NON_LEAF),
+
+	MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG),
+
+	MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE),
+
+	MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List",
+			      IBUF_FREELIST),
+
+	MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap",
+			      IBUF_BITMAP),
+
+	MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM),
+
+	MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM),
+
+	MONITOR_BUF_PAGE_READ("fsp_hdr", "File Space Header", FSP_HDR),
+
+	MONITOR_BUF_PAGE_READ("xdes", "Extent Descriptor", XDES),
+
+	MONITOR_BUF_PAGE_READ("blob", "Uncompressed BLOB", BLOB),
+
+	MONITOR_BUF_PAGE_READ("zblob", "First Compressed BLOB", ZBLOB),
+
+	MONITOR_BUF_PAGE_READ("zblob2", "Subsequent Compressed BLOB", ZBLOB2),
+
+	MONITOR_BUF_PAGE_READ("other", "other/unknown (old version of InnoDB)",
+			      OTHER),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_leaf","Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_non_leaf","Index Non-leaf",
+				 INDEX_NON_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf",
+				 INDEX_IBUF_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf",
+				 "Insert Buffer Index Non-Leaf",
+				 INDEX_IBUF_NON_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE),
+
+	MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List",
+				 IBUF_FREELIST),
+
+	MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap",
+				 IBUF_BITMAP),
+
+	MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM),
+
+	MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System",
+				 TRX_SYSTEM),
+
+	MONITOR_BUF_PAGE_WRITTEN("fsp_hdr", "File Space Header", FSP_HDR),
+
+	MONITOR_BUF_PAGE_WRITTEN("xdes", "Extent Descriptor", XDES),
+
+	MONITOR_BUF_PAGE_WRITTEN("blob", "Uncompressed BLOB", BLOB),
+
+	MONITOR_BUF_PAGE_WRITTEN("zblob", "First Compressed BLOB", ZBLOB),
+
+	MONITOR_BUF_PAGE_WRITTEN("zblob2", "Subsequent Compressed BLOB",
+				 ZBLOB2),
+
+	MONITOR_BUF_PAGE_WRITTEN("other", "other/unknown (old version InnoDB)",
+				 OTHER),
+
+	/* ========== Counters for OS level operations ========== */
+	{"module_os", "os", "OS Level Operation",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_OS},
+
+	{"os_data_reads", "os",
+	 "Number of reads initiated (innodb_data_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_READ},
+
+	{"os_data_writes", "os",
+	 "Number of writes initiated (innodb_data_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_WRITE},
+
+	{"os_data_fsyncs", "os",
+	 "Number of fsync() calls (innodb_data_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FSYNC},
+
+	{"os_pending_reads", "os", "Number of reads pending",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OS_PENDING_READS},
+
+	{"os_pending_writes", "os", "Number of writes pending",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OS_PENDING_WRITES},
+
+	{"os_log_bytes_written", "os",
+	 "Bytes of log written (innodb_os_log_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN},
+
+	{"os_log_fsyncs", "os",
+	 "Number of fsync log writes (innodb_os_log_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC},
+
+	{"os_log_pending_fsyncs", "os",
+	 "Number of pending fsync write (innodb_os_log_pending_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC},
+
+	{"os_log_pending_writes", "os",
+	 "Number of pending log file writes (innodb_os_log_pending_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES},
+
+	/* ========== Counters for Transaction Module ========== */
+	{"module_trx", "transaction", "Transaction Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_TRX},
+
+	{"trx_rw_commits", "transaction", "Number of read-write transactions "
+	  "committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RW_COMMIT},
+
+	{"trx_ro_commits", "transaction", "Number of read-only transactions "
+	  "committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RO_COMMIT},
+
+	{"trx_nl_ro_commits", "transaction", "Number of non-locking "
+	 "auto-commit read-only transactions committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_NL_RO_COMMIT},
+
+	{"trx_commits_insert_update", "transaction",
+	 "Number of transactions committed with inserts and updates",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_COMMIT_UNDO},
+
+	{"trx_rollbacks", "transaction",
+	 "Number of transactions rolled back",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK},
+
+	{"trx_rollbacks_savepoint", "transaction",
+	 "Number of transactions rolled back to savepoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT},
+
+	{"trx_rollback_active", "transaction",
+	 "Number of resurrected active transactions rolled back",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_ACTIVE},
+
+	{"trx_active_transactions", "transaction",
+	 "Number of active transactions",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE},
+
+	{"trx_rseg_history_len", "transaction",
+	 "Length of the TRX_RSEG_HISTORY list",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN},
+
+	{"trx_undo_slots_used", "transaction", "Number of undo slots used",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_USED},
+
+	{"trx_undo_slots_cached", "transaction",
+	 "Number of undo slots cached",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_CACHED},
+
+	{"trx_rseg_current_size", "transaction",
+	 "Current rollback segment size in pages",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_RSEG_CUR_SIZE},
+
+	/* ========== Counters for Purge Module ========== */
+	{"module_purge", "purge", "Purge Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_PURGE},
+
+	{"purge_del_mark_records", "purge",
+	 "Number of delete-marked rows purged",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_N_DEL_ROW_PURGE},
+
+	{"purge_upd_exist_or_extern_records", "purge",
+	 "Number of purges on updates of existing records and "
+	 " updates on delete marked record with externally stored field",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_N_UPD_EXIST_EXTERN},
+
+	{"purge_invoked", "purge",
+	 "Number of times purge was invoked",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_INVOKED},
+
+	{"purge_undo_log_pages", "purge",
+	 "Number of undo log pages handled by the purge",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_N_PAGE_HANDLED},
+
+	{"purge_dml_delay_usec", "purge",
+	 "Microseconds DML to be delayed due to purge lagging",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_DML_PURGE_DELAY},
+
+	{"purge_stop_count", "purge",
+	 "Number of times purge was stopped",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_STOP_COUNT},
+
+	{"purge_resume_count", "purge",
+	 "Number of times purge was resumed",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_RESUME_COUNT},
+
+	/* ========== Counters for Recovery Module ========== */
+	{"module_log", "recovery", "Recovery Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY},
+
+	{"log_checkpoints", "recovery", "Number of checkpoints",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT},
+
+	{"log_lsn_last_flush", "recovery", "LSN of Last flush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_FLUSHDISK},
+
+	{"log_lsn_last_checkpoint", "recovery", "LSN at last checkpoint",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CHECKPOINT},
+
+	{"log_lsn_current", "recovery", "Current LSN value",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CURRENT},
+
+	{"log_lsn_checkpoint_age", "recovery",
+	 "Current LSN value minus LSN at last checkpoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE},
+
+	{"log_lsn_buf_pool_oldest", "recovery",
+	 "The oldest modified block LSN in the buffer pool",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_OLDEST_LSN},
+
+	{"log_max_modified_age_async", "recovery",
+	 "Maximum LSN difference; when exceeded, start asynchronous preflush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC},
+
+	{"log_max_modified_age_sync", "recovery",
+	 "Maximum LSN difference; when exceeded, start synchronous preflush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC},
+
+	{"log_pending_log_writes", "recovery", "Pending log writes",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_WRITE},
+
+	{"log_pending_checkpoint_writes", "recovery", "Pending checkpoints",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE},
+
+	{"log_num_log_io", "recovery", "Number of log I/Os",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LOG_IO},
+
+	{"log_waits", "recovery",
+	 "Number of log waits due to small log buffer (innodb_log_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WAITS},
+
+	{"log_write_requests", "recovery",
+	 "Number of log write requests (innodb_log_write_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITE_REQUEST},
+
+	{"log_writes", "recovery",
+	 "Number of log writes (innodb_log_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES},
+
+	/* ========== Counters for Page Compression ========== */
+	{"module_compress", "compression", "Page Compression Info",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_PAGE},
+
+	{"compress_pages_compressed", "compression",
+	 "Number of pages compressed", MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAGE_COMPRESS},
+
+	{"compress_pages_decompressed", "compression",
+	 "Number of pages decompressed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS},
+
+	{"compression_pad_increments", "compression",
+	 "Number of times padding is incremented to avoid compression failures",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAD_INCREMENTS},
+
+	{"compression_pad_decrements", "compression",
+	 "Number of times padding is decremented due to good compressibility",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
+
+	/* ========== Counters for Index ========== */
+	{"module_index", "index", "Index Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
+
+	{"index_splits", "index", "Number of index splits",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
+
+	{"index_merges", "index", "Number of index merges",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE},
+
+	/* ========== Counters for Adaptive Hash Index ========== */
+	{"module_adaptive_hash", "adaptive_hash_index", "Adpative Hash Index",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_ADAPTIVE_HASH},
+
+	{"adaptive_hash_searches", "adaptive_hash_index",
+	 "Number of successful searches using Adaptive Hash Index",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH},
+
+	{"adaptive_hash_searches_btree", "adaptive_hash_index",
+	 "Number of searches using B-tree on an index search",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE},
+
+	{"adaptive_hash_pages_added", "adaptive_hash_index",
+	 "Number of index pages on which the Adaptive Hash Index is built",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_ADDED},
+
+	{"adaptive_hash_pages_removed", "adaptive_hash_index",
+	 "Number of index pages whose corresponding Adaptive Hash Index"
+	 " entries were removed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_REMOVED},
+
+	{"adaptive_hash_rows_added", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows added",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_ADDED},
+
+	{"adaptive_hash_rows_removed", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows removed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVED},
+
+	{"adaptive_hash_rows_deleted_no_hash_entry", "adaptive_hash_index",
+	 "Number of rows deleted that did not have corresponding Adaptive Hash"
+	 " Index entries",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND},
+
+	{"adaptive_hash_rows_updated", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows updated",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_UPDATED},
+
+	/* ========== Counters for tablespace ========== */
+	{"module_file", "file_system", "Tablespace and File System Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_FIL_SYSTEM},
+
+	{"file_num_open_files", "file_system",
+	 "Number of files currently open (innodb_num_open_files)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED},
+
+	/* ========== Counters for Change Buffer ========== */
+	{"module_ibuf_system", "change_buffer", "InnoDB Change Buffer",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM},
+
+	{"ibuf_merges_insert", "change_buffer",
+	 "Number of inserted records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT},
+
+	{"ibuf_merges_delete_mark", "change_buffer",
+	 "Number of deleted records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE},
+
+	{"ibuf_merges_delete", "change_buffer",
+	 "Number of purge records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE},
+
+	{"ibuf_merges_discard_insert", "change_buffer",
+	 "Number of insert merged operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT},
+
+	{"ibuf_merges_discard_delete_mark", "change_buffer",
+	 "Number of deleted merged operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE},
+
+	{"ibuf_merges_discard_delete", "change_buffer",
+	 "Number of purge merged  operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE},
+
+	{"ibuf_merges", "change_buffer", "Number of change buffer merges",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES},
+
+	{"ibuf_size", "change_buffer", "Change buffer size in pages",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE},
+
+	/* ========== Counters for server operations ========== */
+	{"module_innodb", "innodb",
+	 "Counter for general InnoDB server wide operations and properties",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_SERVER},
+
+	{"innodb_master_thread_sleeps", "server",
+	 "Number of times (seconds) master thread sleeps",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_THREAD_SLEEP},
+
+	{"innodb_activity_count", "server", "Current server activity count",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SERVER_ACTIVITY},
+
+	{"innodb_master_active_loops", "server",
+	 "Number of times master thread performs its tasks when"
+	 " server is active",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_ACTIVE_LOOPS},
+
+	{"innodb_master_idle_loops", "server",
+	 "Number of times master thread performs its tasks when server is idle",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS},
+
+	{"innodb_background_drop_table_usec", "server",
+	 "Time (in microseconds) spent to process drop table list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
+
+	{"innodb_ibuf_merge_usec", "server",
+	 "Time (in microseconds) spent to process change buffer merge",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_IBUF_MERGE_MICROSECOND},
+
+	{"innodb_log_flush_usec", "server",
+	 "Time (in microseconds) spent to flush log records",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND},
+
+	{"innodb_mem_validate_usec", "server",
+	 "Time (in microseconds) spent to do memory validation",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_MEM_VALIDATE_MICROSECOND},
+
+	{"innodb_master_purge_usec", "server",
+	 "Time (in microseconds) spent by master thread to purge records",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_PURGE_MICROSECOND},
+
+	{"innodb_dict_lru_usec", "server",
+	 "Time (in microseconds) spent to process DICT LRU list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_MICROSECOND},
+
+	{"innodb_checkpoint_usec", "server",
+	 "Time (in microseconds) spent by master thread to do checkpoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND},
+
+	{"innodb_dblwr_writes", "server",
+	 "Number of doublewrite operations that have been performed"
+	 " (innodb_dblwr_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_WRITES},
+
+	{"innodb_dblwr_pages_written", "server",
+	 "Number of pages that have been written for doublewrite operations"
+	 " (innodb_dblwr_pages_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN},
+
+	{"innodb_page_size", "server",
+	 "InnoDB page size in bytes (innodb_page_size)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE},
+
+	{"innodb_rwlock_s_spin_waits", "server",
+	 "Number of rwlock spin waits due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS},
+
+	{"innodb_rwlock_x_spin_waits", "server",
+	 "Number of rwlock spin waits due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS},
+
+	{"innodb_rwlock_s_spin_rounds", "server",
+	 "Number of rwlock spin loop rounds due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS},
+
+	{"innodb_rwlock_x_spin_rounds", "server",
+	 "Number of rwlock spin loop rounds due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS},
+
+	{"innodb_rwlock_s_os_waits", "server",
+	 "Number of OS waits due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS},
+
+	{"innodb_rwlock_x_os_waits", "server",
+	 "Number of OS waits due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_OS_WAITS},
+
+	/* ========== Counters for DML operations ========== */
+	{"module_dml", "dml", "Statistics for DMLs",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_DML_STATS},
+
+	{"dml_reads", "dml", "Number of rows read",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_READ},
+
+	{"dml_inserts", "dml", "Number of rows inserted",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_INSERTED},
+
+	{"dml_deletes", "dml", "Number of rows deleted",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_DELETED},
+
+	{"dml_updates", "dml", "Number of rows updated",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_UPDTATED},
+
+	/* ========== Counters for DDL operations ========== */
+	{"module_ddl", "ddl", "Statistics for DDLs",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS},
+
+	{"ddl_background_drop_indexes", "ddl",
+	 "Number of indexes waiting to be dropped after failed index creation",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX},
+
+	{"ddl_background_drop_tables", "ddl",
+	 "Number of tables in background drop table list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE},
+
+	{"ddl_online_create_index", "ddl",
+	 "Number of indexes being created online",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ONLINE_CREATE_INDEX},
+
+	{"ddl_pending_alter_table", "ddl",
+	 "Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PENDING_ALTER_TABLE},
+
+	/* ===== Counters for ICP (Index Condition Pushdown) Module ===== */
+	{"module_icp", "icp", "Index Condition Pushdown",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_ICP},
+
+	{"icp_attempts", "icp",
+	 "Number of attempts for index push-down condition checks",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_ATTEMPTS},
+
+	{"icp_no_match", "icp", "Index push-down condition does not match",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_NO_MATCH},
+
+	{"icp_out_of_range", "icp", "Index push-down condition out of range",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_OUT_OF_RANGE},
+
+	{"icp_match", "icp", "Index push-down condition matches",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_MATCH},
+
+	/* ========== To turn on/off reset all counters ========== */
+	{"all", "All Counters", "Turn on/off and reset all counters",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_ALL_COUNTER}
+};
+
+/* "innodb_counter_value" holds the actual counter values, NUM_MONITOR slots */
+UNIV_INTERN monitor_value_t	innodb_counter_value[NUM_MONITOR];
+
+/* monitor_set_tbl records whether a monitor has been turned on/off. It is
+sized to ceil(NUM_MONITOR / NUM_BITS_ULINT) ulints (presumably a bitmap). */
+UNIV_INTERN ulint		monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT
+						- 1) / NUM_BITS_ULINT];
+
+#ifndef HAVE_ATOMIC_BUILTINS_64
+/** Mutex protecting atomic operations on platforms that lack built-in
+atomic memory access; made by srv_mon_create(), freed by srv_mon_free() */
+ib_mutex_t	monitor_mutex;
+
+/** Performance schema key passed to mutex_create() for monitor_mutex */
+UNIV_INTERN mysql_pfs_key_t	monitor_mutex_key;
+
+/****************************************************************//**
+Initialize the monitor subsystem: create monitor_mutex at SYNC_ANY_LATCH. */
+UNIV_INTERN
+void
+srv_mon_create(void)
+/*================*/
+{
+	mutex_create(monitor_mutex_key, &monitor_mutex, SYNC_ANY_LATCH);
+}
+/****************************************************************//**
+Close the monitor subsystem: free the mutex made by srv_mon_create(). */
+UNIV_INTERN
+void
+srv_mon_free(void)
+/*==============*/
+{
+	mutex_free(&monitor_mutex);
+}
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+
+/****************************************************************//**
+Look up a monitor's "monitor_info" entry by its monitor id, which is an
+index into the innodb_counter_info array. The id is asserted in range.
+@return	pointer to the corresponding monitor_info_t; NULL is returned
+only if an out-of-range id gets past the ut_a() check */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id)	/*!< id indexing into the
+					innodb_counter_info array */
+{
+	ut_a(monitor_id < NUM_MONITOR);
+	if (monitor_id >= NUM_MONITOR) {
+		return(NULL);
+	}
+	return(&innodb_counter_info[monitor_id]);
+}
+
+/****************************************************************//**
+Look up a monitor's name by its monitor id, which is an index into the
+innodb_counter_info array. The id is asserted to be in range.
+@return	the monitor_name of the corresponding entry; NULL is returned
+only if an out-of-range id gets past the ut_a() check */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id)	/*!< id index into the
+					innodb_counter_info array */
+{
+	ut_a(monitor_id < NUM_MONITOR);
+	if (monitor_id >= NUM_MONITOR) {
+		return(NULL);
+	}
+	return(innodb_counter_info[monitor_id].monitor_name);
+}
+
+/****************************************************************//**
+Turn on/off, reset monitor counters in a module. If module_id
+is MONITOR_ALL_COUNTER then the operation covers all monitor counters.
+A counter that is already on is reported and skipped when turning on. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to MONITOR_ALL_COUNTER, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option)	/*!< in: Turn on/off reset the
+					counter */
+{
+	ulint	ix;
+	ulint	start_id;
+	ibool	set_current_module = FALSE;
+
+	ut_a(module_id < NUM_MONITOR);
+	ut_a(UT_ARR_SIZE(innodb_counter_info) == NUM_MONITOR);
+
+	/* The module_id must be an ID of MONITOR_MODULE type */
+	ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE);
+
+	/* start with the first monitor in the module. If module_id
+	is MONITOR_ALL_COUNTER, this means we need to turn on all
+	monitor counters. */
+	if (module_id == MONITOR_ALL_COUNTER) {
+		start_id = 1;
+	} else if (innodb_counter_info[module_id].monitor_type
+		   & MONITOR_GROUP_MODULE) {
+		/* Counters in this module are set as a group together
+		and cannot be turned on/off individually. Need to set
+		the on/off bit in the module counter */
+		start_id = module_id;
+		set_current_module = TRUE;
+
+	} else {
+		start_id = module_id + 1;
+	}
+
+	for (ix = start_id; ix < NUM_MONITOR; ix++) {
+		/* if we hit the next module counter, we will
+		continue if we want to turn on all monitor counters,
+		and break if just turn on the counters in the
+		current module. */
+		if (innodb_counter_info[ix].monitor_type & MONITOR_MODULE) {
+
+			if (set_current_module) {
+				/* Continue to set on/off bit on current
+				module */
+				set_current_module = FALSE;
+			} else if (module_id == MONITOR_ALL_COUNTER) {
+				continue;
+			} else {
+				/* Hitting the next module, stop */
+				break;
+			}
+		}
+
+		/* Cannot turn on a monitor already been turned on. User
+		should be aware some counters are already on before
+		turn them on again (which could reset counter value) */
+		if (MONITOR_IS_ON(ix) && (set_option == MONITOR_TURN_ON)) {
+			fprintf(stderr, "Monitor '%s' is already enabled.\n",
+				srv_mon_get_name((monitor_id_t) ix));
+			continue;
+		}
+
+		/* For some existing counters (server status variables),
+		we will get its counter value at the start/stop time
+		to calculate the actual value during the time. */
+		if (innodb_counter_info[ix].monitor_type & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				static_cast<monitor_id_t>(ix), set_option);
+		}
+
+		/* Currently support 4 operations on the monitor counters:
+		turn on, turn off, reset and reset all operations. */
+		switch (set_option) {
+		case MONITOR_TURN_ON:
+			MONITOR_ON(ix);
+			MONITOR_INIT(ix);
+			MONITOR_SET_START(ix);
+			break;
+
+		case MONITOR_TURN_OFF:
+			MONITOR_OFF(ix);
+			MONITOR_SET_OFF(ix);
+			break;
+
+		case MONITOR_RESET_VALUE:
+			srv_mon_reset(static_cast<monitor_id_t>(ix));
+			break;
+
+		case MONITOR_RESET_ALL_VALUE:
+			srv_mon_reset_all(static_cast<monitor_id_t>(ix));
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+}
+
+/****************************************************************//**
+Get transaction system's rollback segment size in pages
+@return sum of curr_size over the non-NULL rseg_array slots, in pages */
+static
+ulint
+srv_mon_get_rseg_size(void)
+/*=======================*/
+{
+	ulint		i;
+	ulint		value = 0;
+
+	/* rseg_array is a static array, so we can go through it without
+	mutex protection. In addition, we only provide an estimate of the
+	total rollback segment size, so to avoid mutex contention we
+	don't acquire the rseg->mutex */
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		const trx_rseg_t*	rseg = trx_sys->rseg_array[i];
+
+		if (rseg != NULL) {
+			value += rseg->curr_size;
+		}
+	}
+
+	return(value);
+}
+
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduct the actual value. Please also refer to
+srv_export_innodb_status() for related global counters used by
+the existing status variables.*/
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option)	/*!< in: Turn on/off reset the
+					counter */
+{
+	mon_type_t		value;
+	monitor_info_t*		monitor_info;
+	ibool			update_min = FALSE;
+	buf_pool_stat_t		stat;
+	buf_pools_list_size_t	buf_pools_list_size;
+	ulint			LRU_len;
+	ulint			free_len;
+	ulint			flush_list_len;
+
+	monitor_info = srv_mon_get_info(monitor_id);
+
+	ut_a(monitor_info->monitor_type & MONITOR_EXISTING);
+	ut_a(monitor_id < NUM_MONITOR);
+
+	/* Get the value from corresponding global variable */
+	switch (monitor_id) {
+	case MONITOR_OVLD_META_MEM_POOL:
+		value = srv_mem_pool_size;
+		break;
+
+	/* export_vars.innodb_buffer_pool_reads. Num Reads from
+	disk (page not in buffer) */
+	case MONITOR_OVLD_BUF_POOL_READS:
+		value = srv_stats.buf_pool_reads;
+		break;
+
+	/* innodb_buffer_pool_read_requests, the number of logical
+	read requests */
+	case MONITOR_OVLD_BUF_POOL_READ_REQUESTS:
+		buf_get_total_stat(&stat);
+		value = stat.n_page_gets;
+		break;
+
+	/* innodb_buffer_pool_write_requests, the number of
+	write request */
+	case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST:
+		value = srv_stats.buf_pool_write_requests;
+		break;
+
+	/* innodb_buffer_pool_wait_free */
+	case MONITOR_OVLD_BUF_POOL_WAIT_FREE:
+		value = srv_stats.buf_pool_wait_free;
+		break;
+
+	/* innodb_buffer_pool_read_ahead */
+	case MONITOR_OVLD_BUF_POOL_READ_AHEAD:
+		buf_get_total_stat(&stat);
+		value = stat.n_ra_pages_read;
+		break;
+
+	/* innodb_buffer_pool_read_ahead_evicted */
+	case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED:
+		buf_get_total_stat(&stat);
+		value = stat.n_ra_pages_evicted;
+		break;
+
+	/* innodb_buffer_pool_pages_total */
+	case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL:
+		value = buf_pool_get_n_pages();
+		break;
+
+	/* innodb_buffer_pool_pages_misc */
+	case MONITOR_OVLD_BUF_POOL_PAGE_MISC:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = buf_pool_get_n_pages() - LRU_len - free_len;
+		break;
+
+	/* innodb_buffer_pool_pages_data */
+	case MONITOR_OVLD_BUF_POOL_PAGES_DATA:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = LRU_len;
+		break;
+
+	/* innodb_buffer_pool_bytes_data */
+	case MONITOR_OVLD_BUF_POOL_BYTES_DATA:
+		buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+		value = buf_pools_list_size.LRU_bytes
+			+ buf_pools_list_size.unzip_LRU_bytes;
+		break;
+
+	/* innodb_buffer_pool_pages_dirty */
+	case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = flush_list_len;
+		break;
+
+	/* innodb_buffer_pool_bytes_dirty */
+	case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
+		buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+		value = buf_pools_list_size.flush_list_bytes;
+		break;
+
+	/* innodb_buffer_pool_pages_free */
+	case MONITOR_OVLD_BUF_POOL_PAGES_FREE:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = free_len;
+		break;
+
+	/* innodb_pages_created, the number of pages created */
+	case MONITOR_OVLD_PAGE_CREATED:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_created;
+		break;
+
+	/* innodb_pages_written, the number of page written */
+	case MONITOR_OVLD_PAGES_WRITTEN:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_written;
+		break;
+
+	/* innodb_pages_read */
+	case MONITOR_OVLD_PAGES_READ:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_read;
+		break;
+
+	/* innodb_data_read, the amount of data read so far */
+	case MONITOR_OVLD_BYTE_READ:
+		value = srv_stats.data_read;
+		break;
+
+	/* innodb_data_written, the amount of data written so far */
+	case MONITOR_OVLD_BYTE_WRITTEN:
+		value = srv_stats.data_written;
+		break;
+
+	/* innodb_data_reads, the total number of data reads. */
+	case MONITOR_OVLD_OS_FILE_READ:
+		value = os_n_file_reads;
+		break;
+
+	/* innodb_data_writes, the total number of data writes*/
+	case MONITOR_OVLD_OS_FILE_WRITE:
+		value = os_n_file_writes;
+		break;
+
+	/* innodb_data_fsyncs, number of fsync() operations so far. */
+	case MONITOR_OVLD_OS_FSYNC:
+		value = os_n_fsyncs;
+		break;
+
+	/* innodb_os_log_written */
+	case MONITOR_OVLD_OS_LOG_WRITTEN:
+		value = (mon_type_t) srv_stats.os_log_written;
+		break;
+
+	/* innodb_os_log_fsyncs */
+	case MONITOR_OVLD_OS_LOG_FSYNC:
+		value = fil_n_log_flushes;
+		break;
+
+	/* innodb_os_log_pending_fsyncs */
+	case MONITOR_OVLD_OS_LOG_PENDING_FSYNC:
+		value = fil_n_pending_log_flushes;
+		update_min = TRUE;
+		break;
+
+	/* innodb_os_log_pending_writes */
+	case MONITOR_OVLD_OS_LOG_PENDING_WRITES:
+		value = srv_stats.os_log_pending_writes;
+		update_min = TRUE;
+		break;
+
+	/* innodb_log_waits */
+	case MONITOR_OVLD_LOG_WAITS:
+		value = srv_stats.log_waits;
+		break;
+
+	/* innodb_log_write_requests */
+	case MONITOR_OVLD_LOG_WRITE_REQUEST:
+		value = srv_stats.log_write_requests;
+		break;
+
+	/* innodb_log_writes */
+	case MONITOR_OVLD_LOG_WRITES:
+		value = srv_stats.log_writes;
+		break;
+
+	/* innodb_dblwr_writes */
+	case MONITOR_OVLD_SRV_DBLWR_WRITES:
+		value = srv_stats.dblwr_writes;
+		break;
+
+	/* innodb_dblwr_pages_written */
+	case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN:
+		value = srv_stats.dblwr_pages_written;
+		break;
+
+	/* innodb_page_size */
+	case MONITOR_OVLD_SRV_PAGE_SIZE:
+		value = UNIV_PAGE_SIZE;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS:
+		value = rw_lock_stats.rw_s_spin_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS:
+		value = rw_lock_stats.rw_x_spin_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS:
+		value = rw_lock_stats.rw_s_spin_round_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS:
+		value = rw_lock_stats.rw_x_spin_round_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_OS_WAITS:
+		value = rw_lock_stats.rw_s_os_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_OS_WAITS:
+		value = rw_lock_stats.rw_x_os_wait_count;
+		break;
+
+	case MONITOR_OVLD_BUFFER_POOL_SIZE:
+		value = srv_buf_pool_size;
+		break;
+
+	/* innodb_rows_read */
+	case MONITOR_OLVD_ROW_READ:
+		value = srv_stats.n_rows_read;
+		break;
+
+	/* innodb_rows_inserted */
+	case MONITOR_OLVD_ROW_INSERTED:
+		value = srv_stats.n_rows_inserted;
+		break;
+
+	/* innodb_rows_deleted */
+	case MONITOR_OLVD_ROW_DELETED:
+		value = srv_stats.n_rows_deleted;
+		break;
+
+	/* innodb_rows_updated */
+	case MONITOR_OLVD_ROW_UPDTATED:
+		value = srv_stats.n_rows_updated;
+		break;
+
+	/* innodb_row_lock_current_waits */
+	case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT:
+		value = srv_stats.n_lock_wait_current_count;
+		break;
+
+	/* innodb_row_lock_time */
+	case MONITOR_OVLD_LOCK_WAIT_TIME:
+		value = srv_stats.n_lock_wait_time / 1000;
+		break;
+
+	/* innodb_row_lock_time_max */
+	case MONITOR_OVLD_LOCK_MAX_WAIT_TIME:
+		value = lock_sys->n_lock_max_wait_time / 1000;
+		break;
+
+	/* innodb_row_lock_time_avg */
+	case MONITOR_OVLD_LOCK_AVG_WAIT_TIME:
+		if (srv_stats.n_lock_wait_count > 0) {
+			value = srv_stats.n_lock_wait_time / 1000
+				/ srv_stats.n_lock_wait_count;
+		} else {
+			value = 0;
+		}
+		break;
+
+	/* innodb_row_lock_waits */
+	case MONITOR_OVLD_ROW_LOCK_WAIT:
+		value = srv_stats.n_lock_wait_count;
+		break;
+
+	case MONITOR_RSEG_HISTORY_LEN:
+		value = trx_sys->rseg_history_len;
+		break;
+
+	case MONITOR_RSEG_CUR_SIZE:
+		value = srv_mon_get_rseg_size();
+		break;
+
+	case MONITOR_OVLD_N_FILE_OPENED:
+		value = fil_n_file_opened;
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_INSERT:
+		value = ibuf->n_merged_ops[IBUF_OP_INSERT];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DELETE:
+		value = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_PURGE:
+		value = ibuf->n_merged_ops[IBUF_OP_DELETE];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
+		value = ibuf->n_discarded_ops[IBUF_OP_INSERT];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
+		value = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
+		value = ibuf->n_discarded_ops[IBUF_OP_DELETE];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGES:
+		value = ibuf->n_merges;
+		break;
+
+	case MONITOR_OVLD_IBUF_SIZE:
+		value = ibuf->size;
+		break;
+
+	case MONITOR_OVLD_SERVER_ACTIVITY:
+		value = srv_get_activity_count();
+		break;
+
+	case MONITOR_OVLD_LSN_FLUSHDISK:
+		value = (mon_type_t) log_sys->flushed_to_disk_lsn;
+		break;
+
+	case MONITOR_OVLD_LSN_CURRENT:
+		value = (mon_type_t) log_sys->lsn;
+		break;
+
+	case MONITOR_OVLD_BUF_OLDEST_LSN:
+		value = (mon_type_t) buf_pool_get_oldest_modification();
+		break;
+
+	case MONITOR_OVLD_LSN_CHECKPOINT:
+		value = (mon_type_t) log_sys->last_checkpoint_lsn;
+		break;
+
+	case MONITOR_OVLD_MAX_AGE_ASYNC:
+		value = log_sys->max_modified_age_async;
+		break;
+
+	case MONITOR_OVLD_MAX_AGE_SYNC:
+		value = log_sys->max_modified_age_sync;
+		break;
+
+	case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
+		value = btr_cur_n_sea;
+		break;
+
+	case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE:
+		value = btr_cur_n_non_sea;
+		break;
+
+	default:
+		ut_error;
+	}
+
+	switch (set_option) {
+	case MONITOR_TURN_ON:
+		/* Save the initial counter value in mon_start_value
+		field */
+		MONITOR_SAVE_START(monitor_id, value);
+		return;
+
+	case MONITOR_TURN_OFF:
+		/* Save the counter value to mon_last_value when we
+		turn off the monitor but not yet reset. Note the
+		counter has not yet been set to off in the bitmap
+		table for normal turn off. We need to check the
+		counter status (on/off) to avoid resetting the value
+		for an already off counter */
+		if (MONITOR_IS_ON(monitor_id)) {
+			srv_mon_process_existing_counter(monitor_id,
+							 MONITOR_GET_VALUE);
+			MONITOR_SAVE_LAST(monitor_id);
+		}
+		return;
+
+	case MONITOR_GET_VALUE:
+		if (MONITOR_IS_ON(monitor_id)) {
+
+			/* If MONITOR_DISPLAY_CURRENT bit is on, we
+			only record the current value, rather than
+			incremental value over a period. Most of
+			this type of counters are resource related
+			counters such as number of buffer pages etc. */
+			if (monitor_info->monitor_type
+			    & MONITOR_DISPLAY_CURRENT) {
+				MONITOR_SET(monitor_id, value);
+			} else {
+				/* Most status counters are monotonically
+				increasing, no need to update their
+				minimum values. Only do so
+				if "update_min" set to TRUE */
+				MONITOR_SET_DIFF(monitor_id, value);
+
+				if (update_min
+				    && (MONITOR_VALUE(monitor_id)
+					< MONITOR_MIN_VALUE(monitor_id))) {
+					MONITOR_MIN_VALUE(monitor_id) =
+						MONITOR_VALUE(monitor_id);
+				}
+			}
+		}
+		return;
+
+	case MONITOR_RESET_VALUE:
+		if (!MONITOR_IS_ON(monitor_id)) {
+			MONITOR_LAST_VALUE(monitor_id) = 0;
+		}
+		return;
+
+	/* Nothing special for reset all operation for these existing
+	counters */
+	case MONITOR_RESET_ALL_VALUE:
+		return;
+	}
+}
+
+/*************************************************************//**
+Reset a monitor, creating a new baseline from the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	ibool	monitor_was_on;
+
+	monitor_was_on = MONITOR_IS_ON(monitor);
+
+	if (monitor_was_on) {
+		/* Temporarily turn off the counter for the resetting
+		operation; it is turned back on at the end */
+		MONITOR_OFF(monitor);
+	}
+
+	/* Before resetting the current monitor value, first
+	calculate and set the max/min value since monitor
+	start */
+	srv_mon_calc_max_since_start(monitor);
+	srv_mon_calc_min_since_start(monitor);
+
+	/* Monitors with MONITOR_DISPLAY_CURRENT bit
+	are not incremental, no need to remember
+	the reset value. */
+	if (innodb_counter_info[monitor].monitor_type
+	    & MONITOR_DISPLAY_CURRENT) {
+		MONITOR_VALUE_RESET(monitor) = 0;
+	} else {
+		/* Remember the new baseline: old baseline + current value */
+		MONITOR_VALUE_RESET(monitor) = MONITOR_VALUE_RESET(monitor)
+					       + MONITOR_VALUE(monitor);
+	}
+
+	/* Reset the counter value and its max/min fields */
+	MONITOR_VALUE(monitor) = 0;
+	MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;
+	MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;
+
+	MONITOR_FIELD((monitor), mon_reset_time) = time(NULL);
+
+	if (monitor_was_on) {
+		MONITOR_ON(monitor);
+	}
+}
+
+/*************************************************************//**
+Turn on monitor counters whose monitor_type has MONITOR_DEFAULT_ON set. */
+UNIV_INTERN
+void
+srv_mon_default_on(void)
+/*====================*/
+{
+	ulint   ix;
+
+	for (ix = 0; ix < NUM_MONITOR; ix++) {
+		if (innodb_counter_info[ix].monitor_type
+		    & MONITOR_DEFAULT_ON) {
+			/* Default-on counter: turn on, init, set start */
+			MONITOR_ON(ix);
+			MONITOR_INIT(ix);
+			MONITOR_SET_START(ix);
+		}
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
deleted file mode 100644
index 478fc0505bc..00000000000
--- a/storage/xtradb/srv/srv0srv.c
+++ /dev/null
@@ -1,4251 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2008, 2009 Google Inc.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file srv/srv0srv.c
-The database server main program
-
-NOTE: SQL Server 7 uses something which the documentation
-calls user mode scheduled threads (UMS threads). One such
-thread is usually allocated per processor. Win32
-documentation does not know any UMS threads, which suggests
-that the concept is internal to SQL Server 7. It may mean that
-SQL Server 7 does all the scheduling of threads itself, even
-in i/o waits. We should maybe modify InnoDB to use the same
-technique, because thread switches within NT may be too slow.
-
-SQL Server 7 also mentions fibers, which are cooperatively
-scheduled threads. They can boost performance by 5 %,
-according to the Delaney and Soukup's book.
-
-Windows 2000 will have something called thread pooling
-(see msdn website), which we could possibly use.
-
-Another possibility could be to use some very fast user space
-thread library. This might confuse NT though.
-
-Created 10/8/1995 Heikki Tuuri
-*******************************************************/
-
-/* Dummy comment */
-#include "m_string.h" /* for my_sys.h */
-#include "my_sys.h" /* DEBUG_SYNC_C */
-#include "srv0srv.h"
-
-#include "ut0mem.h"
-#include "ut0ut.h"
-#include "os0proc.h"
-#include "mem0mem.h"
-#include "mem0pool.h"
-#include "sync0sync.h"
-#include "que0que.h"
-#include "log0online.h"
-#include "log0recv.h"
-#include "pars0pars.h"
-#include "usr0sess.h"
-#include "lock0lock.h"
-#include "trx0purge.h"
-#include "ibuf0ibuf.h"
-#include "buf0flu.h"
-#include "buf0lru.h"
-#include "btr0sea.h"
-#include "dict0load.h"
-#include "dict0boot.h"
-#include "srv0start.h"
-#include "row0mysql.h"
-#include "ha_prototypes.h"
-#include "trx0i_s.h"
-#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
-#include "read0read.h"
-#include "mysql/plugin.h"
-#include "mysql/service_thd_wait.h"
-
-/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */
-ibool		innobase_thd_is_idle(const void* thd);
-ib_int64_t	innobase_thd_get_start_time(const void* thd);
-void		innobase_thd_kill(ulong thd_id);
-ulong		innobase_thd_get_thread_id(const void* thd);
-
-/* prototypes for new functions added to ha_innodb.cc */
-ibool	innobase_get_slow_log();
-
-/* The following counter is incremented whenever there is some user activity
-in the server */
-UNIV_INTERN ulint	srv_activity_count	= 0;
-
-/* The following is the maximum allowed duration of a lock wait. */
-UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
-
-/**/
-UNIV_INTERN long long	srv_kill_idle_transaction = 0;
-
-/* How much data manipulation language (DML) statements need to be delayed,
-in microseconds, in order to reduce the lagging of the purge thread. */
-UNIV_INTERN ulint	srv_dml_needed_delay = 0;
-
-UNIV_INTERN ibool	srv_lock_timeout_active = FALSE;
-UNIV_INTERN ibool	srv_monitor_active = FALSE;
-UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
-
-UNIV_INTERN const char*	srv_main_thread_op_info = "";
-
-/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
-UNIV_INTERN const char	srv_mysql50_table_name_prefix[9] = "#mysql50#";
-
-/* Server parameters which are read from the initfile */
-
-/* The following three are dir paths which are catenated before file
-names, where the file name itself may also contain a path */
-
-UNIV_INTERN char*	srv_data_home	= NULL;
-#ifdef UNIV_LOG_ARCHIVE
-UNIV_INTERN char*	srv_arch_dir	= NULL;
-#endif /* UNIV_LOG_ARCHIVE */
-
-/** store to its own file each table created by an user; data
-dictionary tables are in the system tablespace 0 */
-UNIV_INTERN my_bool	srv_file_per_table;
-/** The file format to use on new *.ibd files. */
-UNIV_INTERN ulint	srv_file_format = 0;
-/** Whether to check file format during startup.  A value of
-DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
-set it to the highest format we support. */
-UNIV_INTERN ulint	srv_max_file_format_at_startup = DICT_TF_FORMAT_MAX;
-
-#if DICT_TF_FORMAT_51
-# error "DICT_TF_FORMAT_51 must be 0!"
-#endif
-/** Place locks to records only i.e. do not use next-key locking except
-on duplicate key checking and foreign key checking */
-UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
-
-/* If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads.
-Currently we support native aio on windows and linux */
-UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
-
-#ifdef __WIN__
-/* Windows native condition variables. We use runtime loading / function
-pointers, because they are not available on Windows Server 2003 and
-Windows XP/2000.
-
-We use condition for events on Windows if possible, even if os_event
-resembles Windows kernel event object well API-wise. The reason is
-performance, kernel objects are heavyweights and WaitForSingleObject() is a
-performance killer causing calling thread to context switch. Besides, Innodb
-is preallocating large number (often millions) of os_events. With kernel event
-objects it takes a big chunk out of non-paged pool, which is better suited
-for tasks like IO than for storing idle event objects. */
-UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
-#endif /* __WIN__ */
-
-UNIV_INTERN ulint	srv_n_data_files = 0;
-UNIV_INTERN char**	srv_data_file_names = NULL;
-/* size in database pages */
-UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
-
-UNIV_INTERN char*	srv_doublewrite_file = NULL;
-
-UNIV_INTERN ibool	srv_recovery_stats = FALSE;
-
-UNIV_INTERN my_bool	srv_track_changed_pages = FALSE;
-
-UNIV_INTERN ib_uint64_t	srv_max_bitmap_file_size = 100 * 1024 * 1024;
-
-UNIV_INTERN ulonglong	srv_max_changed_pages = 0;
-
-/** When TRUE, fake change transcations take S rather than X row locks.
-    When FALSE, row locks are not taken at all. */
-UNIV_INTERN my_bool	srv_fake_changes_locks = TRUE;
-
-/* if TRUE, then we auto-extend the last data file */
-UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
-/* if != 0, this tells the max size auto-extending may increase the
-last data file size */
-UNIV_INTERN ulint	srv_last_file_size_max	= 0;
-/* If the last data file is auto-extended, we add this
-many pages to it at a time */
-UNIV_INTERN ulong	srv_auto_extend_increment = 8;
-UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
-
-/* If the following is TRUE we do not allow inserts etc. This protects
-the user from forgetting the 'newraw' keyword to my.cnf */
-
-UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
-
-UNIV_INTERN char**	srv_log_group_home_dirs = NULL;
-
-UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
-/* size in database pages */
-UNIV_INTERN ulint	srv_log_file_size	= ULINT_MAX;
-/* size in database pages */
-UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
-//UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;
-UNIV_INTERN char	srv_use_global_flush_log_at_trx_commit	= TRUE;
-
-/* Try to flush dirty pages so as to avoid IO bursts at
-the checkpoints. */
-UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
-
-UNIV_INTERN ulong	srv_show_locks_held	= 10;
-UNIV_INTERN ulong	srv_show_verbose_locks	= 0;
-
-/** Maximum number of times allowed to conditionally acquire
-mutex before switching to blocking wait on the mutex */
-#define MAX_MUTEX_NOWAIT	20
-
-/** Check whether the number of failed nonblocking mutex
-acquisition attempts exceeds maximum allowed value. If so,
-srv_printf_innodb_monitor() will request mutex acquisition
-with mutex_enter(), which will wait until it gets the mutex. */
-#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)
-
-/** The sort order table of the MySQL latin1_swedish_ci character set
-collation */
-UNIV_INTERN const byte*	srv_latin1_ordering;
-
-/* use os/external memory allocator */
-UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
-/* requested size in kilobytes */
-UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
-/* force virtual page preallocation (prefault) */
-UNIV_INTERN my_bool	srv_buf_pool_populate	= FALSE;
-/* requested number of buffer pool instances */
-UNIV_INTERN ulint       srv_buf_pool_instances  = 1;
-/* previously requested size */
-UNIV_INTERN ulint	srv_buf_pool_old_size;
-/* current size in kilobytes */
-UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
-/* size in bytes */
-UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
-UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
-
-/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
-instead. */
-UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;
-
-/* Switch to enable random read ahead. */
-UNIV_INTERN my_bool	srv_random_read_ahead	= FALSE;
-
-/* The universal page size of the database */
-UNIV_INTERN ulint	srv_page_size_shift	= 0;
-UNIV_INTERN ulint	srv_page_size		= 0;
-
-/* The log block size */
-UNIV_INTERN ulint	srv_log_block_size	= 0;
-
-/* User settable value of the number of pages that must be present
-in the buffer cache and accessed sequentially for InnoDB to trigger a
-readahead request. */
-UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
-
-#ifdef UNIV_LOG_ARCHIVE
-UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
-UNIV_INTERN ibool		srv_archive_recovery	= 0;
-UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
-#endif /* UNIV_LOG_ARCHIVE */
-
-/* This parameter is used to throttle the number of insert buffers that are
-merged in a batch. By increasing this parameter on a faster disk you can
-possibly reduce the number of I/O operations performed to complete the
-merge operation. The value of this parameter is used as is by the
-background loop when the system is idle (low load), on a busy system
-the parameter is scaled down by a factor of 4, this is to avoid putting
-a heavier load on the I/O sub system. */
-
-UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
-
-UNIV_INTERN char*	srv_file_flush_method_str = NULL;
-UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
-UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-
-UNIV_INTERN ulint	srv_max_n_open_files	  = 300;
-
-/* Number of IO operations per second the server can do */
-UNIV_INTERN ulong	srv_io_capacity         = 200;
-
-/* The InnoDB main thread tries to keep the ratio of modified pages
-in the buffer pool to all database pages in the buffer pool smaller than
-the following number. But it is not guaranteed that the value stays below
-that during a time of heavy update/insert activity. */
-
-UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;
-
-/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
-UNIV_INTERN ulong srv_n_purge_threads = 0;
-
-/* the number of pages to purge in one batch */
-UNIV_INTERN ulong srv_purge_batch_size = 20;
-
-/* the number of rollback segments to use */
-UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS;
-
-/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
-NULL value when collecting statistics. By default, it is set to
-SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
-UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
-
-/** Time in seconds between automatic buffer pool dumps */
-UNIV_INTERN uint srv_auto_lru_dump = 0;
-
-/** Whether startup should be blocked until buffer pool is fully restored */
-UNIV_INTERN ibool srv_blocking_lru_restore;
-
-/* structure to pass status variables to MySQL */
-UNIV_INTERN export_struc export_vars;
-
-/* If the following is != 0 we do not allow inserts etc. This protects
-the user from forgetting the innodb_force_recovery keyword to my.cnf */
-
-UNIV_INTERN ulint	srv_force_recovery	= 0;
-/*-----------------------*/
-/* We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
-value. */
-
-UNIV_INTERN ulint	srv_max_n_threads	= 0;
-
-/* The following controls how many threads we let inside InnoDB concurrently:
-threads waiting for locks are not counted into the number because otherwise
-we could get a deadlock. MySQL creates a thread for each user session, and
-semaphore contention and convoy problems can occur withput this restriction.
-Value 10 should be good if there are less than 4 processors + 4 disks in the
-computer. Bigger computers need bigger values. Value 0 will disable the
-concurrency check. */
-
-UNIV_INTERN ibool	srv_thread_concurrency_timer_based = FALSE;
-UNIV_INTERN ulong	srv_thread_concurrency	= 0;
-
-/* this mutex protects srv_conc data structures */
-UNIV_INTERN os_fast_mutex_t	srv_conc_mutex;
-/* number of transactions that have declared_to_be_inside_innodb set.
-It used to be a non-error for this value to drop below zero temporarily.
-This is no longer true. We'll, however, keep the lint datatype to add
-assertions to catch any corner cases that we may have missed. */
-UNIV_INTERN lint	srv_conc_n_threads	= 0;
-/* number of OS threads waiting in the FIFO for a permission to enter
-InnoDB */
-UNIV_INTERN ulint	srv_conc_n_waiting_threads = 0;
-
-/* print all user-level transactions deadlocks to mysqld stderr */
-UNIV_INTERN my_bool	srv_print_all_deadlocks = FALSE;
-
-/* Produce a stacktrace on long semaphore wait */
-UNIV_INTERN my_bool     srv_use_stacktrace = FALSE;
-
-typedef struct srv_conc_slot_struct	srv_conc_slot_t;
-struct srv_conc_slot_struct{
-	os_event_t			event;		/*!< event to wait */
-	ibool				reserved;	/*!< TRUE if slot
-							reserved */
-	ibool				wait_ended;	/*!< TRUE when another
-							thread has already set
-							the event and the
-							thread in this slot is
-							free to proceed; but
-							reserved may still be
-							TRUE at that point */
-	UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_queue;	/*!< queue node */
-};
-
-/* queue of threads waiting to get in */
-UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue;
-/* array of wait slots */
-UNIV_INTERN srv_conc_slot_t* srv_conc_slots;
-
-/* Number of times a thread is allowed to enter InnoDB within the same
-SQL query after it has once got the ticket at srv_conc_enter_innodb */
-#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
-#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
-/*-----------------------*/
-/* If the following is set to 1 then we do not run purge and insert buffer
-merge to completion before shutdown. If it is set to 2, do not even flush the
-buffer pool to data files at the shutdown: we effectively 'crash'
-InnoDB (but lose no committed transactions). */
-UNIV_INTERN ulint	srv_fast_shutdown	= 0;
-
-/* Generate a innodb_status.<pid> file */
-UNIV_INTERN ibool	srv_innodb_status	= FALSE;
-
-/* When estimating number of different key values in an index, sample
-this many index pages */
-UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;
-UNIV_INTERN ulint	srv_stats_auto_update = 1;
-UNIV_INTERN ulint	srv_stats_update_need_lock = 1;
-UNIV_INTERN ibool	srv_use_sys_stats_table = FALSE;
-#ifdef UNIV_DEBUG
-UNIV_INTERN ulong	srv_sys_stats_root_page = 0;
-#endif
-
-UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
-UNIV_INTERN ibool       srv_use_atomic_writes = FALSE;
-#ifdef HAVE_POSIX_FALLOCATE
-UNIV_INTERN ibool       srv_use_posix_fallocate = FALSE;
-#endif
-
-UNIV_INTERN ibool	srv_use_checksums = TRUE;
-UNIV_INTERN ibool	srv_fast_checksum = FALSE;
-
-UNIV_INTERN ulong	srv_replication_delay		= 0;
-
-UNIV_INTERN long long	srv_ibuf_max_size = 0;
-UNIV_INTERN ulong	srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
-UNIV_INTERN ulong	srv_ibuf_accel_rate = 100;
-#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
-
-UNIV_INTERN ulint	srv_checkpoint_age_target = 0;
-UNIV_INTERN ulong	srv_flush_neighbor_pages = 1; /* 0:disable 1:area 2:contiguous */
-
-UNIV_INTERN ulint	srv_deprecated_enable_unsafe_group_commit = 0;
-UNIV_INTERN ulong	srv_read_ahead = 3; /* 1: random  2: linear  3: Both */
-UNIV_INTERN ulong	srv_adaptive_flushing_method = 0; /* 0: native  1: estimate  2: keep_average */
-
-UNIV_INTERN ulong	srv_expand_import = 0; /* 0:disable 1:enable */
-UNIV_INTERN ulong	srv_pass_corrupt_table = 0; /* 0:disable 1:enable */
-
-UNIV_INTERN ulint	srv_dict_size_limit = 0;
-/*-------------------------------------------*/
-UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
-UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
-UNIV_INTERN ulong	srv_thread_sleep_delay = 10000;
-UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
-UNIV_INTERN ibool	srv_priority_boost	= TRUE;
-
-#ifdef UNIV_DEBUG
-UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
-UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
-UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
-UNIV_INTERN ibool	srv_print_log_io		= FALSE;
-UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
-#endif /* UNIV_DEBUG */
-
-static ulint	srv_n_rows_inserted_old		= 0;
-static ulint	srv_n_rows_updated_old		= 0;
-static ulint	srv_n_rows_deleted_old		= 0;
-static ulint	srv_n_rows_read_old		= 0;
-
-/* Ensure counters are on separate cache lines */
-
-#define CACHE_LINE_SIZE 64
-#define CACHE_ALIGNED __attribute__ ((aligned (CACHE_LINE_SIZE)))
-
-UNIV_INTERN byte
-counters_pad_start[CACHE_LINE_SIZE] __attribute__((unused)) = {0};
-
-UNIV_INTERN ulint		srv_n_rows_inserted CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_n_rows_updated CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_n_rows_deleted CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_n_rows_read CACHE_ALIGNED		= 0;
-
-UNIV_INTERN ulint		srv_read_views_memory CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_descriptors_memory CACHE_ALIGNED	= 0;
-
-UNIV_INTERN ulint		srv_n_lock_deadlock_count CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_n_lock_wait_count CACHE_ALIGNED	= 0;
-UNIV_INTERN ulint		srv_n_lock_wait_current_count CACHE_ALIGNED = 0;
-UNIV_INTERN ib_int64_t	srv_n_lock_wait_time CACHE_ALIGNED		= 0;
-UNIV_INTERN ulint		srv_n_lock_max_wait_time CACHE_ALIGNED	= 0;
-
-UNIV_INTERN ulint		srv_truncated_status_writes CACHE_ALIGNED = 0;
-
-/* variable counts amount of data read in total (in bytes) */
-UNIV_INTERN ulint srv_data_read CACHE_ALIGNED			= 0;
-
-/* here we count the amount of data written in total (in bytes) */
-UNIV_INTERN ulint srv_data_written CACHE_ALIGNED		= 0;
-
-/* the number of the log write requests done */
-UNIV_INTERN ulint srv_log_write_requests CACHE_ALIGNED		= 0;
-
-/* the number of physical writes to the log performed */
-UNIV_INTERN ulint srv_log_writes CACHE_ALIGNED			= 0;
-
-/* amount of data written to the log files in bytes */
-UNIV_INTERN ulint srv_os_log_written CACHE_ALIGNED		= 0;
-
-/* amount of writes being done to the log files */
-UNIV_INTERN ulint srv_os_log_pending_writes CACHE_ALIGNED	= 0;
-
-/* we increase this counter, when there we don't have enough space in the
-log buffer and have to flush it */
-UNIV_INTERN ulint srv_log_waits CACHE_ALIGNED			= 0;
-
-/* this variable counts the amount of times, when the doublewrite buffer
-was flushed */
-UNIV_INTERN ulint srv_dblwr_writes CACHE_ALIGNED		= 0;
-
-/* here we store the number of pages that have been flushed to the
-doublewrite buffer */
-UNIV_INTERN ulint srv_dblwr_pages_written CACHE_ALIGNED		= 0;
-
-/* in this variable we store the number of write requests issued */
-UNIV_INTERN ulint srv_buf_pool_write_requests CACHE_ALIGNED	= 0;
-
-/* here we store the number of times when we had to wait for a free page
-in the buffer pool. It happens when the buffer pool is full and we need
-to make a flush, in order to be able to read or create a page. */
-UNIV_INTERN ulint srv_buf_pool_wait_free CACHE_ALIGNED		= 0;
-
-/** Number of buffer pool reads that led to the
-reading of a disk page */
-UNIV_INTERN ulint srv_buf_pool_reads CACHE_ALIGNED		= 0;
-
-/* variable to count the number of pages that were written from buffer
-pool to the disk */
-UNIV_INTERN ulint srv_buf_pool_flushed CACHE_ALIGNED		= 0;
-
-/* variable to count the number of LRU flushed pages */
-UNIV_INTERN ulint buf_lru_flush_page_count CACHE_ALIGNED	= 0;
-
-UNIV_INTERN byte
-counters_pad_end[CACHE_LINE_SIZE] __attribute__((unused)) = {0};
-
-/*
-  Set the following to 0 if you want InnoDB to write messages on
-  stderr on startup/shutdown
-*/
-UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
-UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
-UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
-UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
-UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
-
-/* Array of English strings describing the current state of an
-i/o handler thread */
-
-UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
-UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
-
-UNIV_INTERN time_t	srv_last_monitor_time;
-
-UNIV_INTERN mutex_t	srv_innodb_monitor_mutex;
-
-/* Mutex for locking srv_monitor_file */
-UNIV_INTERN mutex_t	srv_monitor_file_mutex;
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register kernel_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	kernel_mutex_key;
-/* Key to register srv_innodb_monitor_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
-/* Key to register srv_monitor_file_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_monitor_file_mutex_key;
-/* Key to register srv_dict_tmpfile_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-/* Temporary file for innodb monitor output */
-UNIV_INTERN FILE*	srv_monitor_file;
-/* Mutex for locking srv_dict_tmpfile.
-This mutex has a very high rank; threads reserving it should not
-be holding any InnoDB latches. */
-UNIV_INTERN mutex_t	srv_dict_tmpfile_mutex;
-/* Temporary file for output from the data dictionary */
-UNIV_INTERN FILE*	srv_dict_tmpfile;
-/* Mutex for locking srv_misc_tmpfile.
-This mutex has a very low rank; threads reserving it should not
-acquire any further latches or sleep before releasing this one. */
-UNIV_INTERN mutex_t	srv_misc_tmpfile_mutex;
-/* Temporary file for miscellanous diagnostic output */
-UNIV_INTERN FILE*	srv_misc_tmpfile;
-
-UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
-UNIV_INTERN ulint	srv_main_thread_id		= 0;
-
-/* The following count work done by srv_master_thread. */
-
-/* Iterations by the 'once per second' loop. */
-static ulint   srv_main_1_second_loops		= 0;
-/* Calls to sleep by the 'once per second' loop. */
-static ulint   srv_main_sleeps			= 0;
-/* Iterations by the 'once per 10 seconds' loop. */
-static ulint   srv_main_10_second_loops		= 0;
-/* Iterations of the loop bounded by the 'background_loop' label. */
-static ulint   srv_main_background_loops	= 0;
-/* Iterations of the loop bounded by the 'flush_loop' label. */
-static ulint   srv_main_flush_loops		= 0;
-/* Log writes involving flush. */
-static ulint   srv_log_writes_and_flush		= 0;
-
-/* This is only ever touched by the master thread. It records the
-time when the last flush of log file has happened. The master
-thread ensures that we flush the log files at least once per
-second. */
-static time_t	srv_last_log_flush_time;
-
-/* The master thread performs various tasks based on the current
-state of IO activity and the level of IO utilization is past
-intervals. Following macros define thresholds for these conditions. */
-#define SRV_PEND_IO_THRESHOLD	(PCT_IO(3))
-#define SRV_RECENT_IO_ACTIVITY	(PCT_IO(5))
-#define SRV_PAST_IO_ACTIVITY	(PCT_IO(200))
-
-/*
-	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
-	=========================================
-
-There is the following analogue between this database
-server and an operating system kernel:
-
-DB concept			equivalent OS concept
-----------			---------------------
-transaction		--	process;
-
-query thread		--	thread;
-
-lock			--	semaphore;
-
-transaction set to
-the rollback state	--	kill signal delivered to a process;
-
-kernel			--	kernel;
-
-query thread execution:
-(a) without kernel mutex
-reserved		--	process executing in user mode;
-(b) with kernel mutex reserved
-			--	process executing in kernel mode;
-
-The server is controlled by a master thread which runs at
-a priority higher than normal, that is, higher than user threads.
-It sleeps most of the time, and wakes up, say, every 300 milliseconds,
-to check whether there is anything happening in the server which
-requires intervention of the master thread. Such situations may be,
-for example, when flushing of dirty blocks is needed in the buffer
-pool or old version of database rows have to be cleaned away.
-
-The threads which we call user threads serve the queries of
-the clients and input from the console of the server.
-They run at normal priority. The server may have several
-communications endpoints. A dedicated set of user threads waits
-at each of these endpoints ready to receive a client request.
-Each request is taken by a single user thread, which then starts
-processing and, when the result is ready, sends it to the client
-and returns to wait at the same endpoint the thread started from.
-
-So, we do not have dedicated communication threads listening at
-the endpoints and dealing the jobs to dedicated worker threads.
-Our architecture saves one thread swithch per request, compared
-to the solution with dedicated communication threads
-which amounts to 15 microseconds on 100 MHz Pentium
-running NT. If the client
-is communicating over a network, this saving is negligible, but
-if the client resides in the same machine, maybe in an SMP machine
-on a different processor from the server thread, the saving
-can be important as the threads can communicate over shared
-memory with an overhead of a few microseconds.
-
-We may later implement a dedicated communication thread solution
-for those endpoints which communicate over a network.
-
-Our solution with user threads has two problems: for each endpoint
-there has to be a number of listening threads. If there are many
-communication endpoints, it may be difficult to set the right number
-of concurrent threads in the system, as many of the threads
-may always be waiting at less busy endpoints. Another problem
-is queuing of the messages, as the server internally does not
-offer any queue for jobs.
-
-Another group of user threads is intended for splitting the
-queries and processing them in parallel. Let us call these
-parallel communication threads. These threads are waiting for
-parallelized tasks, suspended on event semaphores.
-
-A single user thread waits for input from the console,
-like a command to shut the database.
-
-Utility threads are a different group of threads which takes
-care of the buffer pool flushing and other, mainly background
-operations, in the server.
-Some of these utility threads always run at a lower than normal
-priority, so that they are always in background. Some of them
-may dynamically boost their priority by the pri_adjust function,
-even to higher than normal priority, if their task becomes urgent.
-The running of utilities is controlled by high- and low-water marks
-of urgency. The urgency may be measured by the number of dirty blocks
-in the buffer pool, in the case of the flush thread, for example.
-When the high-water mark is exceeded, an utility starts running, until
-the urgency drops under the low-water mark. Then the utility thread
-suspend itself to wait for an event. The master thread is
-responsible of signaling this event when the utility thread is
-again needed.
-
-For each individual type of utility, some threads always remain
-at lower than normal priority. This is because pri_adjust is implemented
-so that the threads at normal or higher priority control their
-share of running time by calling sleep. Thus, if the load of the
-system sudenly drops, these threads cannot necessarily utilize
-the system fully. The background priority threads make up for this,
-starting to run when the load drops.
-
-When there is no activity in the system, also the master thread
-suspends itself to wait for an event making
-the server totally silent. The responsibility to signal this
-event is on the user thread which again receives a message
-from a client.
-
-There is still one complication in our server design. If a
-background utility thread obtains a resource (e.g., mutex) needed by a user
-thread, and there is also some other user activity in the system,
-the user thread may have to wait indefinitely long for the
-resource, as the OS does not schedule a background thread if
-there is some other runnable user thread. This problem is called
-priority inversion in real-time programming.
-
-One solution to the priority inversion problem would be to
-keep record of which thread owns which resource and
-in the above case boost the priority of the background thread
-so that it will be scheduled and it can release the resource.
-This solution is called priority inheritance in real-time programming.
-A drawback of this solution is that the overhead of acquiring a mutex
-increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
-the thread has to call os_thread_get_curr_id.
-This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
-pair. Note that the thread
-cannot store the information in the resource, say mutex, itself,
-because competing threads could wipe out the information if it is
-stored before acquiring the mutex, and if it stored afterwards,
-the information is outdated for the time of one machine instruction,
-at least. (To be precise, the information could be stored to
-lock_word in mutex if the machine supports atomic swap.)
-
-The above solution with priority inheritance may become actual in the
-future, but at the moment we plan to implement a more coarse solution,
-which could be called a global priority inheritance. If a thread
-has to wait for a long time, say 300 milliseconds, for a resource,
-we just guess that it may be waiting for a resource owned by a background
-thread, and boost the priority of all runnable background threads
-to the normal level. The background threads then themselves adjust
-their fixed priority back to background after releasing all resources
-they had (or, at some fixed points in their program code).
-
-What is the performance of the global priority inheritance solution?
-We may weigh the length of the wait time 300 milliseconds, during
-which the system processes some other thread
-to the cost of boosting the priority of each runnable background
-thread, rescheduling it, and lowering the priority again.
-On 100 MHz Pentium + NT this overhead may be of the order 100
-microseconds per thread. So, if the number of runnable background
-threads is not very big, say < 100, the cost is tolerable.
-Utility threads probably will access resources used by
-user threads not very often, so collisions of user threads
-to preempted utility threads should not happen very often.
-
-The thread table contains
-information of the current status of each thread existing in the system,
-and also the event semaphores used in suspending the master thread
-and utility and parallel communication threads when they have nothing to do.
-The thread table can be seen as an analogue to the process table
-in a traditional Unix implementation.
-
-The thread table is also used in the global priority inheritance
-scheme. This brings in one additional complication: threads accessing
-the thread table must have at least normal fixed priority,
-because the priority inheritance solution does not work if a background
-thread is preempted while possessing the mutex protecting the thread table.
-So, if a thread accesses the thread table, its priority has to be
-boosted at least to normal. This priority requirement can be seen similar to
-the privileged mode used when processing the kernel calls in traditional
-Unix.*/
-
-/* Thread slot in the thread table */
-struct srv_slot_struct{
-	unsigned	type:1;		/*!< thread type: user, utility etc. */
-	unsigned	in_use:1;	/*!< TRUE if this slot is in use */
-	unsigned	suspended:1;	/*!< TRUE if the thread is waiting
-					for the event of this slot */
-	ib_time_t	suspend_time;	/*!< time when the thread was
-					suspended */
-	os_event_t	event;		/*!< event used in suspending the
-					thread when it has nothing to do */
-	que_thr_t*	thr;		/*!< suspended query thread (only
-					used for MySQL threads) */
-};
-
-/* Table for MySQL threads where they will be suspended to wait for locks */
-UNIV_INTERN srv_slot_t*	srv_mysql_table = NULL;
-
-UNIV_INTERN os_event_t	srv_timeout_event;
-
-UNIV_INTERN os_event_t	srv_monitor_event;
-
-UNIV_INTERN os_event_t	srv_error_event;
-
-UNIV_INTERN os_event_t	srv_lock_timeout_thread_event;
-
-UNIV_INTERN os_event_t	srv_shutdown_event;
-
-UNIV_INTERN os_event_t	srv_checkpoint_completed_event;
-
-UNIV_INTERN os_event_t	srv_redo_log_thread_finished_event;
-
-UNIV_INTERN srv_sys_t*	srv_sys	= NULL;
-
-/* padding to prevent other memory update hotspots from residing on
-the same memory cache line */
-UNIV_INTERN byte	srv_pad1[64];
-/* mutex protecting the server, trx structs, query threads, and lock table */
-UNIV_INTERN mutex_t*	kernel_mutex_temp;
-/* padding to prevent other memory update hotspots from residing on
-the same memory cache line */
-UNIV_INTERN byte	srv_pad2[64];
-
-#if 0
-/* The following three values measure the urgency of the jobs of
-buffer, version, and insert threads. They may vary from 0 - 1000.
-The server mutex protects all these variables. The low-water values
-tell that the server can acquiesce the utility when the value
-drops below this low-water mark. */
-
-static ulint	srv_meter[SRV_MASTER + 1];
-static ulint	srv_meter_low_water[SRV_MASTER + 1];
-static ulint	srv_meter_high_water[SRV_MASTER + 1];
-static ulint	srv_meter_high_water2[SRV_MASTER + 1];
-static ulint	srv_meter_foreground[SRV_MASTER + 1];
-#endif
-
-/* The following values give info about the activity going on in
-the database. They are protected by the server mutex. The arrays
-are indexed by the type of the thread. */
-
-UNIV_INTERN ulint	srv_n_threads_active[SRV_MASTER + 1];
-UNIV_INTERN ulint	srv_n_threads[SRV_MASTER + 1];
-
-/*********************************************************************//**
-Asynchronous purge thread.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused))); /*!< in: a dummy parameter
-					      required by os_thread_create */
-
-/***********************************************************************
-Prints counters for work done by srv_master_thread. */
-static
-void
-srv_print_master_thread_info(
-/*=========================*/
-	FILE  *file)    /* in: output stream */
-{
-	fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
-		"%lu 10_second, %lu background, %lu flush\n",
-		srv_main_1_second_loops, srv_main_sleeps,
-		srv_main_10_second_loops, srv_main_background_loops,
-		srv_main_flush_loops);
-	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
-		      srv_log_writes_and_flush);
-}
-
-/*********************************************************************//**
-Sets the info describing an i/o thread current state. */
-UNIV_INTERN
-void
-srv_set_io_thread_op_info(
-/*======================*/
-	ulint		i,	/*!< in: the 'segment' of the i/o thread */
-	const char*	str)	/*!< in: constant char string describing the
-				state */
-{
-	ut_a(i < SRV_MAX_N_IO_THREADS);
-
-	srv_io_thread_op_info[i] = str;
-}
-
-/*********************************************************************//**
-Accessor function to get pointer to n'th slot in the server thread
-table.
-@return	pointer to the slot */
-static
-srv_slot_t*
-srv_table_get_nth_slot(
-/*===================*/
-	ulint	index)		/*!< in: index of the slot */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_a(index < OS_THREAD_MAX_N);
-
-	return(srv_sys->threads + index);
-}
-
-/*********************************************************************//**
-Gets the number of threads in the system.
-@return	sum of srv_n_threads[] */
-UNIV_INTERN
-ulint
-srv_get_n_threads(void)
-/*===================*/
-{
-	ulint	i;
-	ulint	n_threads	= 0;
-
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i < SRV_MASTER + 1; i++) {
-
-		n_threads += srv_n_threads[i];
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(n_threads);
-}
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Validates the type of a thread table slot.
-@return TRUE if ok */
-static
-ibool
-srv_thread_type_validate(
-/*=====================*/
-	enum srv_thread_type	type)	/*!< in: thread type */
-{
-	switch (type) {
-	case SRV_WORKER:
-	case SRV_MASTER:
-		return(TRUE);
-	}
-	ut_error;
-	return(FALSE);
-}
-#endif /* UNIV_DEBUG */
-
-/*********************************************************************//**
-Gets the type of a thread table slot.
-@return thread type */
-static
-enum srv_thread_type
-srv_slot_get_type(
-/*==============*/
-	const srv_slot_t*	slot)	/*!< in: thread slot */
-{
-	enum srv_thread_type	type	= (enum srv_thread_type) slot->type;
-	ut_ad(srv_thread_type_validate(type));
-	return(type);
-}
-
-/*********************************************************************//**
-Reserves a slot in the thread table for the current thread.
-NOTE! The server mutex has to be reserved by the caller!
-@return	reserved slot */
-static
-srv_slot_t*
-srv_table_reserve_slot(
-/*===================*/
-	enum srv_thread_type	type)	/*!< in: type of the thread */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(srv_thread_type_validate(type));
-	ut_ad(mutex_own(&kernel_mutex));
-
-	i = 0;
-	slot = srv_table_get_nth_slot(i);
-
-	while (slot->in_use) {
-		i++;
-		slot = srv_table_get_nth_slot(i);
-	}
-
-	slot->in_use = TRUE;
-	slot->suspended = FALSE;
-	slot->type = type;
-	ut_ad(srv_slot_get_type(slot) == type);
-
-	return(slot);
-}
-
-/*********************************************************************//**
-Suspends the calling thread to wait for the event in its thread slot.
-NOTE! The server mutex has to be reserved by the caller! */
-static
-void
-srv_suspend_thread(
-/*===============*/
-	srv_slot_t*	slot)	/*!< in/out: thread slot */
-{
-	enum srv_thread_type	type;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(slot->in_use);
-	ut_ad(!slot->suspended);
-
-	if (srv_print_thread_releases) {
-		fprintf(stderr,
-			"Suspending thread %lu to slot %lu\n",
-			(ulong) os_thread_get_curr_id(),
-			(ulong) (slot - srv_sys->threads));
-	}
-
-	type = srv_slot_get_type(slot);
-
-	slot->suspended = TRUE;
-
-	ut_ad(srv_n_threads_active[type] > 0);
-
-	srv_n_threads_active[type]--;
-
-	os_event_reset(slot->event);
-}
-
-/*********************************************************************//**
-Releases threads of the type given from suspension in the thread table.
-NOTE! The server mutex has to be reserved by the caller!
-@return number of threads released: this may be less than n if not
-enough threads were suspended at the moment */
-UNIV_INTERN
-ulint
-srv_release_threads(
-/*================*/
-	enum srv_thread_type	type,	/*!< in: thread type */
-	ulint			n)	/*!< in: number of threads to release */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-	ulint		count	= 0;
-
-	ut_ad(srv_thread_type_validate(type));
-	ut_ad(n > 0);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_table_get_nth_slot(i);
-
-		if (slot->in_use && slot->suspended
-		    && srv_slot_get_type(slot) == type) {
-
-			slot->suspended = FALSE;
-
-			srv_n_threads_active[type]++;
-
-			os_event_set(slot->event);
-
-			if (srv_print_thread_releases) {
-				fprintf(stderr,
-					"Releasing thread type %lu"
-					" from slot %lu\n",
-					(ulong) type, (ulong) i);
-			}
-
-			count++;
-
-			if (count == n) {
-				break;
-			}
-		}
-	}
-
-	return(count);
-}
-
-/*********************************************************************//**
-Check whether thread type has reserved a slot. Return the first slot that
-is found. This works because we currently have only 1 thread of each type.
-@return	slot number or ULINT_UNDEFINED if not found*/
-UNIV_INTERN
-ulint
-srv_thread_has_reserved_slot(
-/*=========================*/
-	enum srv_thread_type	type)	/*!< in: thread type to check */
-{
-	ulint			i;
-	ulint			slot_no = ULINT_UNDEFINED;
-
-	ut_ad(srv_thread_type_validate(type));
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		srv_slot_t*	slot;
-
-		slot = srv_table_get_nth_slot(i);
-
-		if (slot->in_use && slot->type == type) {
-			slot_no = i;
-			break;
-		}
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(slot_no);
-}
-
-/*********************************************************************//**
-Initializes the server. */
-UNIV_INTERN
-void
-srv_init(void)
-/*==========*/
-{
-	srv_conc_slot_t*	conc_slot;
-	srv_slot_t*		slot;
-	ulint			i;
-
-	srv_sys = mem_alloc(sizeof(srv_sys_t));
-
-	kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
-	mutex_create(kernel_mutex_key, &kernel_mutex, SYNC_KERNEL);
-
-	mutex_create(srv_innodb_monitor_mutex_key,
-		     &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
-
-	srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_sys->threads + i;
-		slot->event = os_event_create(NULL);
-		ut_a(slot->event);
-	}
-
-	srv_mysql_table = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_mysql_table + i;
-		slot->event = os_event_create(NULL);
-		ut_a(slot->event);
-	}
-
-	srv_error_event = os_event_create(NULL);
-
-	srv_timeout_event = os_event_create(NULL);
-
-	srv_monitor_event = os_event_create(NULL);
-
-	srv_lock_timeout_thread_event = os_event_create(NULL);
-	srv_shutdown_event = os_event_create(NULL);
-
-	srv_checkpoint_completed_event = os_event_create(NULL);
-	srv_redo_log_thread_finished_event = os_event_create(NULL);
-
-	for (i = 0; i < SRV_MASTER + 1; i++) {
-		srv_n_threads_active[i] = 0;
-		srv_n_threads[i] = 0;
-#if 0
-		srv_meter[i] = 30;
-		srv_meter_low_water[i] = 50;
-		srv_meter_high_water[i] = 100;
-		srv_meter_high_water2[i] = 200;
-		srv_meter_foreground[i] = 250;
-#endif
-	}
-
-	UT_LIST_INIT(srv_sys->tasks);
-
-	/* Create dummy indexes for infimum and supremum records */
-
-	dict_ind_init();
-
-	/* Init the server concurrency restriction data structures */
-
-	os_fast_mutex_init(&srv_conc_mutex);
-
-	UT_LIST_INIT(srv_conc_queue);
-
-	srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		conc_slot = srv_conc_slots + i;
-		conc_slot->reserved = FALSE;
-		conc_slot->event = os_event_create(NULL);
-		ut_a(conc_slot->event);
-	}
-
-	/* Initialize some INFORMATION SCHEMA internal structures */
-	trx_i_s_cache_init(trx_i_s_cache);
-}
-
-/*********************************************************************//**
-Frees the data structures created in srv_init(). */
-UNIV_INTERN
-void
-srv_free(void)
-/*==========*/
-{
-	os_fast_mutex_free(&srv_conc_mutex);
-	mem_free(srv_conc_slots);
-	srv_conc_slots = NULL;
-
-	mem_free(srv_sys->threads);
-	mem_free(srv_sys);
-	srv_sys = NULL;
-
-	mem_free(kernel_mutex_temp);
-	kernel_mutex_temp = NULL;
-	mem_free(srv_mysql_table);
-	srv_mysql_table = NULL;
-
-	trx_i_s_cache_free(trx_i_s_cache);
-}
-
-/*********************************************************************//**
-Initializes the synchronization primitives, memory system, and the thread
-local storage. */
-UNIV_INTERN
-void
-srv_general_init(void)
-/*==================*/
-{
-	ut_mem_init();
-	/* Reset the system variables in the recovery module. */
-	recv_sys_var_init();
-	os_sync_init();
-	sync_init();
-	mem_init(srv_mem_pool_size);
-}
-
-/*======================= InnoDB Server FIFO queue =======================*/
-
-/* Maximum allowable purge history length.  <=0 means 'infinite'. */
-UNIV_INTERN ulong	srv_max_purge_lag		= 0;
-
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
-
-#ifdef HAVE_ATOMIC_BUILTINS
-static void
-enter_innodb_with_tickets(trx_t* trx)
-{
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
-	return;
-}
-
-static void
-srv_conc_enter_innodb_timer_based(trx_t* trx)
-{
-	lint	conc_n_threads;
-	ibool	has_yielded = FALSE;
-	ulint	has_slept = 0;
-
-	if (trx->declared_to_be_inside_innodb) {
-		ut_print_timestamp(stderr);
-		fputs(
-"  InnoDB: Error: trying to declare trx to enter InnoDB, but\n"
-"InnoDB: it already is declared.\n", stderr);
-		trx_print(stderr, trx, 0);
-		putc('\n', stderr);
-	}
-retry:
-	if (srv_conc_n_threads < (lint) srv_thread_concurrency) {
-		conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
-		if (conc_n_threads <= (lint) srv_thread_concurrency) {
-			enter_innodb_with_tickets(trx);
-			return;
-		}
-		(void) os_atomic_increment_lint(&srv_conc_n_threads, -1);
-	}
-	if (!has_yielded)
-	{
-		has_yielded = TRUE;
-		os_thread_yield();
-		goto retry;
-	}
-
-	ut_ad(!trx->has_search_latch);
-
-	if (NULL != UT_LIST_GET_FIRST(trx->trx_locks)) {
-
-		conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
-		enter_innodb_with_tickets(trx);
-		return;
-	}
-	if (has_slept < 2)
-	{
-		trx->op_info = "sleeping before entering InnoDB";
-		os_thread_sleep(10000);
-		trx->op_info = "";
-		has_slept++;
-	}
-	conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
-	enter_innodb_with_tickets(trx);
-	return;
-}
-
-static void
-srv_conc_exit_innodb_timer_based(trx_t* trx)
-{
-	(void) os_atomic_increment_lint(&srv_conc_n_threads, -1);
-	trx->declared_to_be_inside_innodb = FALSE;
-	trx->n_tickets_to_enter_innodb = 0;
-	return;
-}
-#endif
-
-UNIV_INTERN
-void
-srv_conc_enter_innodb(
-/*==================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	ibool			has_slept = FALSE;
-	srv_conc_slot_t*	slot	  = NULL;
-	ulint			i;
-	ib_uint64_t             start_time = 0L;
-	ib_uint64_t             finish_time = 0L;
-	ulint                   sec;
-	ulint                   ms;
-
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (trx->mysql_thd != NULL
-	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
-
-		UT_WAIT_FOR(srv_conc_n_threads
-			    < (lint)srv_thread_concurrency,
-			    srv_replication_delay * 1000);
-
-		return;
-	}
-
-	/* If trx has 'free tickets' to enter the engine left, then use one
-	such ticket */
-
-	if (trx->n_tickets_to_enter_innodb > 0) {
-		trx->n_tickets_to_enter_innodb--;
-
-		return;
-	}
-
-#ifdef HAVE_ATOMIC_BUILTINS
-	if (srv_thread_concurrency_timer_based) {
-		srv_conc_enter_innodb_timer_based(trx);
-		return;
-	}
-#endif
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-retry:
-	if (trx->declared_to_be_inside_innodb) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: trying to declare trx"
-		      " to enter InnoDB, but\n"
-		      "InnoDB: it already is declared.\n", stderr);
-		trx_print(stderr, trx, 0);
-		putc('\n', stderr);
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	ut_ad(srv_conc_n_threads >= 0);
-
-	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
-
-		srv_conc_n_threads++;
-		trx->declared_to_be_inside_innodb = TRUE;
-		trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	/* If the transaction is not holding resources, let it sleep
-	for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */
-
-	ut_ad(!trx->has_search_latch);
-
-	if (!has_slept
-	    && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
-
-		has_slept = TRUE; /* We let it sleep only once to avoid
-				  starvation */
-
-		srv_conc_n_waiting_threads++;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		trx->op_info = "sleeping before joining InnoDB queue";
-
-		/* Peter Zaitsev suggested that we take the sleep away
-		altogether. But the sleep may be good in pathological
-		situations of lots of thread switches. Simply put some
-		threads aside for a while to reduce the number of thread
-		switches. */
-		if (SRV_THREAD_SLEEP_DELAY > 0) {
-			os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
-			trx->innodb_que_wait_timer += SRV_THREAD_SLEEP_DELAY;
-		}
-
-		trx->op_info = "";
-
-		os_fast_mutex_lock(&srv_conc_mutex);
-
-		srv_conc_n_waiting_threads--;
-
-		goto retry;
-	}
-
-	/* Too many threads inside: put the current thread to a queue */
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_conc_slots + i;
-
-		if (!slot->reserved) {
-
-			break;
-		}
-	}
-
-	if (i == OS_THREAD_MAX_N) {
-		/* Could not find a free wait slot, we must let the
-		thread enter */
-
-		srv_conc_n_threads++;
-		trx->declared_to_be_inside_innodb = TRUE;
-		trx->n_tickets_to_enter_innodb = 0;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	/* No-op for XtraDB. */
-	trx_search_latch_release_if_reserved(trx);
-
-	/* Add to the queue */
-	slot->reserved = TRUE;
-	slot->wait_ended = FALSE;
-
-	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
-
-	os_event_reset(slot->event);
-
-	srv_conc_n_waiting_threads++;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-
-	/* Go to wait for the event; when a thread leaves InnoDB it will
-	release this thread */
-
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (UNIV_UNLIKELY(trx->take_stats)) {
-		ut_usectime(&sec, &ms);
-		start_time = (ib_uint64_t)sec * 1000000 + ms;
-	} else {
-		start_time = 0;
-	}
-
-	trx->op_info = "waiting in InnoDB queue";
-
-	thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
-	os_event_wait(slot->event);
-	thd_wait_end(trx->mysql_thd);
-
-	trx->op_info = "";
-
-	if (UNIV_UNLIKELY(start_time != 0)) {
-		ut_usectime(&sec, &ms);
-		finish_time = (ib_uint64_t)sec * 1000000 + ms;
-		trx->innodb_que_wait_timer += (ulint)(finish_time - start_time);
-	}
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	srv_conc_n_waiting_threads--;
-
-	/* NOTE that the thread which released this thread already
-	incremented the thread counter on behalf of this thread */
-
-	slot->reserved = FALSE;
-
-	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
-
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-}
-
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
-UNIV_INTERN
-void
-srv_conc_force_enter_innodb(
-/*========================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (UNIV_LIKELY(!srv_thread_concurrency)) {
-
-		return;
-	}
-
-	ut_ad(srv_conc_n_threads >= 0);
-#ifdef HAVE_ATOMIC_BUILTINS
-	if (srv_thread_concurrency_timer_based) {
-		(void) os_atomic_increment_lint(&srv_conc_n_threads, 1);
-		trx->declared_to_be_inside_innodb = TRUE;
-		trx->n_tickets_to_enter_innodb = 1;
-		return;
-	}
-#endif
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	srv_conc_n_threads++;
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = 1;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-}
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
-UNIV_INTERN
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	srv_conc_slot_t*	slot	= NULL;
-
-	if (trx->mysql_thd != NULL
-	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
-
-		return;
-	}
-
-	if (trx->declared_to_be_inside_innodb == FALSE) {
-
-		return;
-	}
-
-#ifdef HAVE_ATOMIC_BUILTINS
-	if (srv_thread_concurrency_timer_based) {
-		srv_conc_exit_innodb_timer_based(trx);
-		return;
-	}
-#endif
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	ut_ad(srv_conc_n_threads > 0);
-	srv_conc_n_threads--;
-	trx->declared_to_be_inside_innodb = FALSE;
-	trx->n_tickets_to_enter_innodb = 0;
-
-	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
-		/* Look for a slot where a thread is waiting and no other
-		thread has yet released the thread */
-
-		slot = UT_LIST_GET_FIRST(srv_conc_queue);
-
-		while (slot && slot->wait_ended == TRUE) {
-			slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
-		}
-
-		if (slot != NULL) {
-			slot->wait_ended = TRUE;
-
-			/* We increment the count on behalf of the released
-			thread */
-
-			srv_conc_n_threads++;
-		}
-	}
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-
-	if (slot != NULL) {
-		os_event_set(slot->event);
-	}
-
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-}
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB. */
-UNIV_INTERN
-void
-srv_conc_exit_innodb(
-/*=================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (trx->n_tickets_to_enter_innodb > 0) {
-		/* We will pretend the thread is still inside InnoDB though it
-		now leaves the InnoDB engine. In this way we save
-		a lot of semaphore operations. srv_conc_force_exit_innodb is
-		used to declare the thread definitely outside InnoDB. It
-		should be called when there is a lock wait or an SQL statement
-		ends. */
-
-		return;
-	}
-
-	srv_conc_force_exit_innodb(trx);
-}
-
-/*========================================================================*/
-
-/*********************************************************************//**
-Normalizes init parameter values to use units we use inside InnoDB.
-@return	DB_SUCCESS or error code */
-static
-ulint
-srv_normalize_init_values(void)
-/*===========================*/
-{
-	ulint	n;
-	ulint	i;
-
-	n = srv_n_data_files;
-
-	for (i = 0; i < n; i++) {
-		srv_data_file_sizes[i] = srv_data_file_sizes[i]
-			* ((1024 * 1024) / UNIV_PAGE_SIZE);
-	}
-
-	srv_last_file_size_max = srv_last_file_size_max
-		* ((1024 * 1024) / UNIV_PAGE_SIZE);
-
-	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
-
-	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
-
-	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Boots the InnoDB server.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-ulint
-srv_boot(void)
-/*==========*/
-{
-	ulint	err;
-
-	/* Transform the init parameter values given by MySQL to
-	use units we use inside InnoDB: */
-
-	err = srv_normalize_init_values();
-
-	if (err != DB_SUCCESS) {
-		return(err);
-	}
-
-	/* Initialize synchronization primitives, memory management, and thread
-	local storage */
-
-	srv_general_init();
-
-	/* Initialize this module */
-
-	srv_init();
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Reserves a slot in the thread table for the current MySQL OS thread.
-NOTE! The kernel mutex has to be reserved by the caller!
-@return	reserved slot */
-static
-srv_slot_t*
-srv_table_reserve_slot_for_mysql(void)
-/*==================================*/
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	i = 0;
-	slot = srv_mysql_table + i;
-
-	while (slot->in_use) {
-		i++;
-
-		if (UNIV_UNLIKELY(i >= OS_THREAD_MAX_N)) {
-
-			ut_print_timestamp(stderr);
-
-			fprintf(stderr,
-				"  InnoDB: There appear to be %lu MySQL"
-				" threads currently waiting\n"
-				"InnoDB: inside InnoDB, which is the"
-				" upper limit. Cannot continue operation.\n"
-				"InnoDB: We intentionally generate"
-				" a seg fault to print a stack trace\n"
-				"InnoDB: on Linux. But first we print"
-				" a list of waiting threads.\n", (ulong) i);
-
-			for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-				slot = srv_mysql_table + i;
-
-				fprintf(stderr,
-					"Slot %lu: thread type %lu,"
-					" in use %lu, susp %lu, time %lu\n",
-					(ulong) i,
-					(ulong) slot->type,
-					(ulong) slot->in_use,
-					(ulong) slot->suspended,
-					(ulong) difftime(ut_time(),
-							 slot->suspend_time));
-			}
-
-			ut_error;
-		}
-
-		slot = srv_mysql_table + i;
-	}
-
-	ut_a(slot->in_use == FALSE);
-
-	slot->in_use = TRUE;
-
-	return(slot);
-}
-
-/***************************************************************//**
-Puts a MySQL OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
-UNIV_INTERN
-void
-srv_suspend_mysql_thread(
-/*=====================*/
-	que_thr_t*	thr)	/*!< in: query thread associated with the MySQL
-				OS thread */
-{
-	srv_slot_t*	slot;
-	os_event_t	event;
-	double		wait_time;
-	trx_t*		trx;
-	ulint		had_dict_lock;
-	ibool		was_declared_inside_innodb	= FALSE;
-	ib_int64_t	start_time			= 0;
-	ib_int64_t	finish_time;
-	ulint		diff_time;
-	ulint		sec;
-	ulint		ms;
-	ulong		lock_wait_timeout;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	trx = thr_get_trx(thr);
-
-	if (trx->mysql_thd != 0) {
-		DEBUG_SYNC_C("srv_suspend_mysql_thread_enter");
-	}
-
-	os_event_set(srv_lock_timeout_thread_event);
-
-	mutex_enter(&kernel_mutex);
-
-	trx->error_state = DB_SUCCESS;
-
-	if (thr->state == QUE_THR_RUNNING) {
-
-		ut_ad(thr->is_active == TRUE);
-
-		/* The lock has already been released or this transaction
-		was chosen as a deadlock victim: no need to suspend */
-
-		if (trx->was_chosen_as_deadlock_victim) {
-
-			trx->error_state = DB_DEADLOCK;
-			trx->was_chosen_as_deadlock_victim = FALSE;
-		}
-
-		mutex_exit(&kernel_mutex);
-
-		return;
-	}
-
-	ut_ad(thr->is_active == FALSE);
-
-	slot = srv_table_reserve_slot_for_mysql();
-
-	event = slot->event;
-
-	slot->thr = thr;
-
-	os_event_reset(event);
-
-	slot->suspend_time = ut_time();
-
-	if (thr->lock_state == QUE_THR_LOCK_ROW) {
-		srv_n_lock_wait_count++;
-		srv_n_lock_wait_current_count++;
-
-		if (ut_usectime(&sec, &ms) == -1) {
-			start_time = -1;
-		} else {
-			start_time = (ib_int64_t) sec * 1000000 + ms;
-		}
-	}
-	/* Wake the lock timeout monitor thread, if it is suspended */
-
-	os_event_set(srv_lock_timeout_thread_event);
-
-	mutex_exit(&kernel_mutex);
-
-	had_dict_lock = trx->dict_operation_lock_mode;
-
-	switch (had_dict_lock) {
-	case RW_S_LATCH:
-		/* Release foreign key check latch */
-		row_mysql_unfreeze_data_dictionary(trx);
-		break;
-	case RW_X_LATCH:
-		/* There should never be a lock wait when the
-		dictionary latch is reserved in X mode.  Dictionary
-		transactions should only acquire locks on dictionary
-		tables, not other tables. All access to dictionary
-		tables should be covered by dictionary
-		transactions. */
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: dict X latch held in "
-		      "srv_suspend_mysql_thread\n", stderr);
-		/* This should never occur. This incorrect handling
-		was added in the early development of
-		ha_innobase::add_index() in InnoDB Plugin 1.0. */
-		/* Release fast index creation latch */
-		row_mysql_unlock_data_dictionary(trx);
-		break;
-	}
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->declared_to_be_inside_innodb) {
-
-		was_declared_inside_innodb = TRUE;
-
-		/* We must declare this OS thread to exit InnoDB, since a
-		possible other thread holding a lock which this thread waits
-		for must be allowed to enter, sooner or later */
-
-		srv_conc_force_exit_innodb(trx);
-	}
-
-	/* Suspend this thread and wait for the event. */
-
-	thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
-	os_event_wait(event);
-	thd_wait_end(trx->mysql_thd);
-
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (was_declared_inside_innodb) {
-
-		/* Return back inside InnoDB */
-
-		srv_conc_force_enter_innodb(trx);
-	}
-
-	/* After resuming, reacquire the data dictionary latch if
-	necessary. */
-
-	switch (had_dict_lock) {
-	case RW_S_LATCH:
-		row_mysql_freeze_data_dictionary(trx);
-		break;
-	case RW_X_LATCH:
-		/* This should never occur. This incorrect handling
-		was added in the early development of
-		ha_innobase::add_index() in InnoDB Plugin 1.0. */
-		row_mysql_lock_data_dictionary(trx);
-		break;
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	/* Release the slot for others to use */
-
-	slot->in_use = FALSE;
-
-	wait_time = ut_difftime(ut_time(), slot->suspend_time);
-
-	if (thr->lock_state == QUE_THR_LOCK_ROW) {
-		if (ut_usectime(&sec, &ms) == -1) {
-			finish_time = -1;
-		} else {
-			finish_time = (ib_int64_t) sec * 1000000 + ms;
-		}
-
-		diff_time = (finish_time > start_time) ?
-			    (ulint) (finish_time - start_time) : 0;
-
-		srv_n_lock_wait_current_count--;
-		srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
-		if (diff_time > srv_n_lock_max_wait_time &&
-		    /* only update the variable if we successfully
-		    retrieved the start and finish times. See Bug#36819. */
-		    start_time != -1 && finish_time != -1) {
-			srv_n_lock_max_wait_time = diff_time;
-		}
-
-		/* Record the lock wait time for this thread */
-		thd_set_lock_wait_time(trx->mysql_thd, diff_time);
-	}
-
-	if (trx->was_chosen_as_deadlock_victim) {
-
-		trx->error_state = DB_DEADLOCK;
-		trx->was_chosen_as_deadlock_victim = FALSE;
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	/* InnoDB system transactions (such as the purge, and
-	incomplete transactions that are being rolled back after crash
-	recovery) will use the global value of
-	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
-	lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
-
-	if (lock_wait_timeout < 100000000
-	    && wait_time > (double) lock_wait_timeout) {
-
-		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
-	}
-
-	if (trx_is_interrupted(trx)) {
-
-		trx->error_state = DB_INTERRUPTED;
-	}
-}
-
-/********************************************************************//**
-Releases a MySQL OS thread waiting for a lock to be released, if the
-thread is already suspended. */
-UNIV_INTERN
-void
-srv_release_mysql_thread_if_suspended(
-/*==================================*/
-	que_thr_t*	thr)	/*!< in: query thread associated with the
-				MySQL OS thread	 */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_mysql_table + i;
-
-		if (slot->in_use && slot->thr == thr) {
-			/* Found */
-
-			os_event_set(slot->event);
-
-			return;
-		}
-	}
-
-	/* not found */
-}
-
-/******************************************************************//**
-Refreshes the values used to calculate per-second averages. */
-static
-void
-srv_refresh_innodb_monitor_stats(void)
-/*==================================*/
-{
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	srv_last_monitor_time = time(NULL);
-
-	os_aio_refresh_stats();
-
-	btr_cur_n_sea_old = btr_cur_n_sea;
-	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
-
-	log_refresh_stats();
-
-	buf_refresh_io_stats_all();
-
-	srv_n_rows_inserted_old = srv_n_rows_inserted;
-	srv_n_rows_updated_old = srv_n_rows_updated;
-	srv_n_rows_deleted_old = srv_n_rows_deleted;
-	srv_n_rows_read_old = srv_n_rows_read;
-
-	mutex_exit(&srv_innodb_monitor_mutex);
-}
-
-/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor.
-@return FALSE if not all information printed
-due to failure to obtain necessary mutex */
-UNIV_INTERN
-ibool
-srv_printf_innodb_monitor(
-/*======================*/
-	FILE*	file,		/*!< in: output stream */
-	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
-	ulint*	trx_start,	/*!< out: file position of the start of
-				the list of active transactions */
-	ulint*	trx_end)	/*!< out: file position of the end of
-				the list of active transactions */
-{
-	double	time_elapsed;
-	time_t	current_time;
-	ulint	n_reserved;
-	ibool	ret;
-
-	ulong	btr_search_sys_constant;
-	ulong	btr_search_sys_variable;
-	ulint	lock_sys_subtotal;
-	ulint	recv_sys_subtotal;
-
-	ulint	i;
-	trx_t*	trx;
-
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	current_time = time(NULL);
-
-	/* We add 0.001 seconds to time_elapsed to prevent division
-	by zero if two users happen to call SHOW INNODB STATUS at the same
-	time */
-
-	time_elapsed = difftime(current_time, srv_last_monitor_time)
-		+ 0.001;
-
-	srv_last_monitor_time = time(NULL);
-
-	fputs("\n=====================================\n", file);
-
-	ut_print_timestamp(file);
-	fprintf(file,
-		" INNODB MONITOR OUTPUT\n"
-		"=====================================\n"
-		"Per second averages calculated from the last %lu seconds\n",
-		(ulong)time_elapsed);
-
-	fputs("-----------------\n"
-	      "BACKGROUND THREAD\n"
-	      "-----------------\n", file);
-	srv_print_master_thread_info(file);
-
-	fputs("----------\n"
-	      "SEMAPHORES\n"
-	      "----------\n", file);
-	sync_print(file);
-
-	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
-	order level in sync0sync.h, while dict_foreign_err_mutex has a very
-	low level 135. Therefore we can reserve the latter mutex here without
-	a danger of a deadlock of threads. */
-
-	mutex_enter(&dict_foreign_err_mutex);
-
-	if (ftell(dict_foreign_err_file) != 0L) {
-		fputs("------------------------\n"
-		      "LATEST FOREIGN KEY ERROR\n"
-		      "------------------------\n", file);
-		ut_copy_file(file, dict_foreign_err_file);
-	}
-
-	mutex_exit(&dict_foreign_err_mutex);
-
-	fputs("--------\n"
-	      "FILE I/O\n"
-	      "--------\n", file);
-	os_aio_print(file);
-
-	fputs("-------------------------------------\n"
-	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
-	      "-------------------------------------\n", file);
-	ibuf_print(file);
-
-	for (i = 0; i < btr_search_index_num; i++) {
-		ha_print_info(file, btr_search_sys->hash_tables[i]);
-	}
-
-	fprintf(file,
-		"%.2f hash searches/s, %.2f non-hash searches/s\n",
-		(btr_cur_n_sea - btr_cur_n_sea_old)
-		/ time_elapsed,
-		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
-		/ time_elapsed);
-	btr_cur_n_sea_old = btr_cur_n_sea;
-	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
-
-	fputs("---\n"
-	      "LOG\n"
-	      "---\n", file);
-	log_print(file);
-
-	fputs("----------------------\n"
-	      "BUFFER POOL AND MEMORY\n"
-	      "----------------------\n", file);
-	fprintf(file,
-			"Total memory allocated " ULINTPF
-			"; in additional pool allocated " ULINTPF "\n",
-			ut_total_allocated_memory,
-			mem_pool_get_reserved(mem_comm_pool));
-	fprintf(file,
-		"Total memory allocated by read views " ULINTPF "\n",
-		srv_read_views_memory);
-
-	/* Calculate AHI constant and variable memory allocations */
-
-	btr_search_sys_constant = 0;
-	btr_search_sys_variable = 0;
-
-	ut_ad(btr_search_sys->hash_tables);
-
-	for (i = 0; i < btr_search_index_num; i++) {
-		hash_table_t* ht = btr_search_sys->hash_tables[i];
-
-		ut_ad(ht);
-		ut_ad(ht->heap);
-
-		/* Multiple mutexes/heaps are currently never used for adaptive
-		hash index tables. */
-		ut_ad(!ht->n_mutexes);
-		ut_ad(!ht->heaps);
-
-		btr_search_sys_variable += mem_heap_get_size(ht->heap);
-		btr_search_sys_constant += ht->n_cells * sizeof(hash_cell_t);
-	}
-
-	lock_sys_subtotal = 0;
-	if (trx_sys) {
-		mutex_enter(&kernel_mutex);
-		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
-		while (trx) {
-			lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
-			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
-		}
-		mutex_exit(&kernel_mutex);
-	}
-
-	recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
-			? mem_heap_get_size(recv_sys->heap) : 0);
-
-	fprintf(file,
-			"Internal hash tables (constant factor + variable factor)\n"
-			"    Adaptive hash index %lu \t(%lu + %lu)\n"
-			"    Page hash           %lu (buffer pool 0 only)\n"
-			"    Dictionary cache    %lu \t(%lu + %lu)\n"
-			"    File system         %lu \t(%lu + %lu)\n"
-			"    Lock system         %lu \t(%lu + %lu)\n"
-			"    Recovery system     %lu \t(%lu + %lu)\n",
-
-			btr_search_sys_constant + btr_search_sys_variable,
-			btr_search_sys_constant,
-			btr_search_sys_variable,
-
-			(ulong) (buf_pool_from_array(0)->page_hash->n_cells * sizeof(hash_cell_t)),
-
-			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
-						+ dict_sys->table_id_hash->n_cells
-						) * sizeof(hash_cell_t)
-					+ dict_sys->size) : 0),
-			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
-							+ dict_sys->table_id_hash->n_cells
-							) * sizeof(hash_cell_t)) : 0),
-			(ulong) (dict_sys ? (dict_sys->size) : 0),
-
-			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
-					+ fil_system_hash_nodes()),
-			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
-			(ulong) fil_system_hash_nodes(),
-
-			(ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
-					+ lock_sys_subtotal),
-			(ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
-			(ulong) lock_sys_subtotal,
-
-			(ulong) (((recv_sys && recv_sys->addr_hash)
-						? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
-					+ recv_sys_subtotal),
-			(ulong) ((recv_sys && recv_sys->addr_hash)
-					? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
-			(ulong) recv_sys_subtotal);
-
-	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
-		dict_sys->size);
-
-	buf_print_io(file);
-
-	fputs("--------------\n"
-	      "ROW OPERATIONS\n"
-	      "--------------\n", file);
-	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
-		(long) srv_conc_n_threads,
-		(ulong) srv_conc_n_waiting_threads);
-
-	mutex_enter(&kernel_mutex);
-
-	fprintf(file, "%lu read views open inside InnoDB\n",
-		UT_LIST_GET_LEN(trx_sys->view_list));
-
-	fprintf(file, "%lu transactions active inside InnoDB\n",
-		UT_LIST_GET_LEN(trx_sys->trx_list));
-
-	fprintf(file, "%lu out of %lu descriptors used\n",
-		trx_sys->descr_n_used, trx_sys->descr_n_max);
-
-	if (UT_LIST_GET_LEN(trx_sys->view_list)) {
-		read_view_t*	view = UT_LIST_GET_LAST(trx_sys->view_list);
-
-		if (view) {
-			fprintf(file, "---OLDEST VIEW---\n");
-			read_view_print(file, view);
-			fprintf(file, "-----------------\n");
-		}
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	n_reserved = fil_space_get_n_reserved_extents(0);
-	if (n_reserved > 0) {
-		fprintf(file,
-			"%lu tablespace extents now reserved for"
-			" B-tree split operations\n",
-			(ulong) n_reserved);
-	}
-
-#ifdef UNIV_LINUX
-	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
-		(ulong) srv_main_thread_process_no,
-		(ulong) srv_main_thread_id,
-		srv_main_thread_op_info);
-#else
-	fprintf(file, "Main thread id %lu, state: %s\n",
-		(ulong) srv_main_thread_id,
-		srv_main_thread_op_info);
-#endif
-	fprintf(file,
-		"Number of rows inserted " ULINTPF
-		", updated " ULINTPF ", deleted " ULINTPF
-		", read " ULINTPF "\n",
-		srv_n_rows_inserted,
-		srv_n_rows_updated,
-		srv_n_rows_deleted,
-		srv_n_rows_read);
-	fprintf(file,
-		"%.2f inserts/s, %.2f updates/s,"
-		" %.2f deletes/s, %.2f reads/s\n",
-		(srv_n_rows_inserted - srv_n_rows_inserted_old)
-		/ time_elapsed,
-		(srv_n_rows_updated - srv_n_rows_updated_old)
-		/ time_elapsed,
-		(srv_n_rows_deleted - srv_n_rows_deleted_old)
-		/ time_elapsed,
-		(srv_n_rows_read - srv_n_rows_read_old)
-		/ time_elapsed);
-
-	srv_n_rows_inserted_old = srv_n_rows_inserted;
-	srv_n_rows_updated_old = srv_n_rows_updated;
-	srv_n_rows_deleted_old = srv_n_rows_deleted;
-	srv_n_rows_read_old = srv_n_rows_read;
-
-	/* Only if lock_print_info_summary proceeds correctly,
-	before we call the lock_print_info_all_transactions
-	to print all the lock information. */
-	ret = lock_print_info_summary(file, nowait);
-
-	if (ret) {
-		if (trx_start) {
-			long	t = ftell(file);
-			if (t < 0) {
-				*trx_start = ULINT_UNDEFINED;
-			} else {
-				*trx_start = (ulint) t;
-			}
-		}
-		lock_print_info_all_transactions(file);
-		if (trx_end) {
-			long	t = ftell(file);
-			if (t < 0) {
-				*trx_end = ULINT_UNDEFINED;
-			} else {
-				*trx_end = (ulint) t;
-			}
-		}
-	}
-
-	fputs("----------------------------\n"
-	      "END OF INNODB MONITOR OUTPUT\n"
-	      "============================\n", file);
-	mutex_exit(&srv_innodb_monitor_mutex);
-	fflush(file);
-
-	return(ret);
-}
-
-/******************************************************************//**
-Function to pass InnoDB status variables to MySQL */
-UNIV_INTERN
-void
-srv_export_innodb_status(void)
-/*==========================*/
-{
-	buf_pool_stat_t		stat;
-	buf_pools_list_size_t	buf_pools_list_size;
-	ulint			LRU_len;
-	ulint			free_len;
-	ulint			flush_list_len;
-	ulint			mem_adaptive_hash, mem_dictionary;
-	read_view_t*		oldest_view;
-	ulint			i;
-
-	buf_get_total_stat(&stat);
-	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-	buf_get_total_list_size_in_bytes(&buf_pools_list_size);
-
-	mem_adaptive_hash = 0;
-
-	ut_ad(btr_search_sys->hash_tables);
-
-	for (i = 0; i < btr_search_index_num; i++) {
-		hash_table_t*	ht = btr_search_sys->hash_tables[i];
-
-		ut_ad(ht);
-		ut_ad(ht->heap);
-		/* Multiple mutexes/heaps are currently never used for adaptive
-		hash index tables. */
-		ut_ad(!ht->n_mutexes);
-		ut_ad(!ht->heaps);
-
-		mem_adaptive_hash += mem_heap_get_size(ht->heap);
-		mem_adaptive_hash += ht->n_cells * sizeof(hash_cell_t);
-	}
-
-	mem_dictionary = (dict_sys ? ((dict_sys->table_hash->n_cells
-					+ dict_sys->table_id_hash->n_cells
-				      ) * sizeof(hash_cell_t)
-				+ dict_sys->size) : 0);
-
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	export_vars.innodb_adaptive_hash_cells = 0;
-	export_vars.innodb_adaptive_hash_heap_buffers = 0;
-	for (i = 0; i < btr_search_index_num; i++) {
-		hash_table_t*	table = btr_search_sys->hash_tables[i];
-
-		export_vars.innodb_adaptive_hash_cells
-			+= hash_get_n_cells(table);
-		export_vars.innodb_adaptive_hash_heap_buffers
-			+= (UT_LIST_GET_LEN(table->heap->base) - 1);
-	}
-	export_vars.innodb_adaptive_hash_hash_searches
-		= btr_cur_n_sea;
-	export_vars.innodb_adaptive_hash_non_hash_searches
-		= btr_cur_n_non_sea;
-	export_vars.innodb_background_log_sync
-		= srv_log_writes_and_flush;
-	export_vars.innodb_data_pending_reads
-		= os_n_pending_reads;
-	export_vars.innodb_data_pending_writes
-		= os_n_pending_writes;
-	export_vars.innodb_data_pending_fsyncs
-		= fil_n_pending_log_flushes
-		+ fil_n_pending_tablespace_flushes;
-	export_vars.innodb_data_fsyncs = os_n_fsyncs;
-	export_vars.innodb_data_read = srv_data_read;
-	export_vars.innodb_data_reads = os_n_file_reads;
-	export_vars.innodb_data_writes = os_n_file_writes;
-	export_vars.innodb_data_written = srv_data_written;
-	export_vars.innodb_dict_tables= (dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0);
-	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
-	export_vars.innodb_buffer_pool_write_requests
-		= srv_buf_pool_write_requests;
-	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
-	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
-	export_vars.innodb_buffer_pool_pages_LRU_flushed = buf_lru_flush_page_count;
-	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
-	export_vars.innodb_buffer_pool_read_ahead_rnd
-		= stat.n_ra_pages_read_rnd;
-	export_vars.innodb_buffer_pool_read_ahead
-		= stat.n_ra_pages_read;
-	export_vars.innodb_buffer_pool_read_ahead_evicted
-		= stat.n_ra_pages_evicted;
-	export_vars.innodb_buffer_pool_pages_data = LRU_len;
-	export_vars.innodb_buffer_pool_bytes_data =
-		buf_pools_list_size.LRU_bytes
-		+ buf_pools_list_size.unzip_LRU_bytes;
-	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
-	export_vars.innodb_buffer_pool_bytes_dirty =
-		buf_pools_list_size.flush_list_bytes;
-	export_vars.innodb_buffer_pool_pages_free = free_len;
-	export_vars.innodb_deadlocks = srv_n_lock_deadlock_count;
-#ifdef UNIV_DEBUG
-	export_vars.innodb_buffer_pool_pages_latched
-		= buf_get_latched_pages_number();
-#endif /* UNIV_DEBUG */
-	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
-
-	export_vars.innodb_buffer_pool_pages_misc
-	       	= buf_pool_get_n_pages() - LRU_len - free_len;
-
-	export_vars.innodb_buffer_pool_pages_made_young
-		= stat.n_pages_made_young;
-	export_vars.innodb_buffer_pool_pages_made_not_young
-		= stat.n_pages_not_made_young;
-	export_vars.innodb_buffer_pool_pages_old = 0;
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool = buf_pool_from_array(i);
-		export_vars.innodb_buffer_pool_pages_old
-			+= buf_pool->LRU_old_len;
-	}
-	export_vars.innodb_checkpoint_age
-		= (log_sys->lsn - log_sys->last_checkpoint_lsn);
-	export_vars.innodb_checkpoint_max_age
-		= log_sys->max_checkpoint_age;
-	export_vars.innodb_checkpoint_target_age
-		= srv_checkpoint_age_target
-		  ? ut_min(log_sys->max_checkpoint_age_async, srv_checkpoint_age_target)
-		  : log_sys->max_checkpoint_age_async;
-	export_vars.innodb_history_list_length
-		= trx_sys->rseg_history_len;
-	ibuf_export_ibuf_status(
-			&export_vars.innodb_ibuf_size,
-			&export_vars.innodb_ibuf_free_list,
-			&export_vars.innodb_ibuf_segment_size,
-			&export_vars.innodb_ibuf_merges,
-			&export_vars.innodb_ibuf_merged_inserts,
-			&export_vars.innodb_ibuf_merged_delete_marks,
-			&export_vars.innodb_ibuf_merged_deletes,
-			&export_vars.innodb_ibuf_discarded_inserts,
-			&export_vars.innodb_ibuf_discarded_delete_marks,
-			&export_vars.innodb_ibuf_discarded_deletes);
-	export_vars.innodb_lsn_current
-		= log_sys->lsn;
-	export_vars.innodb_lsn_flushed
-		= log_sys->flushed_to_disk_lsn;
-	export_vars.innodb_lsn_last_checkpoint
-		= log_sys->last_checkpoint_lsn;
-	export_vars.innodb_master_thread_1_second_loops
-		= srv_main_1_second_loops;
-	export_vars.innodb_master_thread_10_second_loops
-		= srv_main_10_second_loops;
-	export_vars.innodb_master_thread_background_loops
-		= srv_main_background_loops;
-	export_vars.innodb_master_thread_main_flush_loops
-		= srv_main_flush_loops;
-	export_vars.innodb_master_thread_sleeps
-		= srv_main_sleeps;
-	export_vars.innodb_max_trx_id
-		= trx_sys->max_trx_id;
-	export_vars.innodb_mem_adaptive_hash
-		= mem_adaptive_hash;
-	export_vars.innodb_mem_dictionary
-		= mem_dictionary;
-	export_vars.innodb_mem_total
-		= ut_total_allocated_memory;
-	export_vars.innodb_mutex_os_waits
-		= mutex_os_wait_count;
-	export_vars.innodb_mutex_spin_rounds
-		= mutex_spin_round_count;
-	export_vars.innodb_mutex_spin_waits
-		= mutex_spin_wait_count;
-	export_vars.innodb_s_lock_os_waits
-		= rw_s_os_wait_count;
-	export_vars.innodb_s_lock_spin_rounds
-		= rw_s_spin_round_count;
-	export_vars.innodb_s_lock_spin_waits
-		= rw_s_spin_wait_count;
-	export_vars.innodb_x_lock_os_waits
-		= rw_x_os_wait_count;
-	export_vars.innodb_x_lock_spin_rounds
-		= rw_x_spin_round_count;
-	export_vars.innodb_x_lock_spin_waits
-		= rw_x_spin_wait_count;
-
-	oldest_view = UT_LIST_GET_LAST(trx_sys->view_list);
-	export_vars.innodb_oldest_view_low_limit_trx_id
-		= oldest_view ? oldest_view->low_limit_id : 0;
-
-	export_vars.innodb_purge_trx_id
-		= purge_sys->purge_trx_no;
-	export_vars.innodb_purge_undo_no
-		= purge_sys->purge_undo_no;
-	export_vars.innodb_current_row_locks
-		= lock_sys->rec_num;
-
-#ifdef HAVE_ATOMIC_BUILTINS
-	export_vars.innodb_have_atomic_builtins = 1;
-#else
-	export_vars.innodb_have_atomic_builtins = 0;
-#endif
-	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
-	export_vars.innodb_log_waits = srv_log_waits;
-	export_vars.innodb_os_log_written = srv_os_log_written;
-	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
-	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
-	export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
-	export_vars.innodb_log_write_requests = srv_log_write_requests;
-	export_vars.innodb_log_writes = srv_log_writes;
-	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
-	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
-	export_vars.innodb_pages_created = stat.n_pages_created;
-	export_vars.innodb_pages_read = stat.n_pages_read;
-	export_vars.innodb_pages_written = stat.n_pages_written;
-	export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
-	export_vars.innodb_row_lock_current_waits
-		= srv_n_lock_wait_current_count;
-	export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
-	if (srv_n_lock_wait_count > 0) {
-		export_vars.innodb_row_lock_time_avg = (ulint)
-			(srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count);
-	} else {
-		export_vars.innodb_row_lock_time_avg = 0;
-	}
-	export_vars.innodb_row_lock_time_max
-		= srv_n_lock_max_wait_time / 1000;
-	export_vars.innodb_rows_read = srv_n_rows_read;
-	export_vars.innodb_rows_inserted = srv_n_rows_inserted;
-	export_vars.innodb_rows_updated = srv_n_rows_updated;
-	export_vars.innodb_rows_deleted = srv_n_rows_deleted;
-	export_vars.innodb_truncated_status_writes = srv_truncated_status_writes;
-	export_vars.innodb_read_views_memory = srv_read_views_memory;
-	export_vars.innodb_descriptors_memory = srv_descriptors_memory;
-
-#ifdef UNIV_DEBUG
-	{
-		trx_id_t	done_trx_no;
-		trx_id_t	up_limit_id;
-
-		rw_lock_s_lock(&purge_sys->latch);
-		done_trx_no	= purge_sys->done_trx_no;
-		up_limit_id	= purge_sys->view
-			? purge_sys->view->up_limit_id
-			: 0;
-		rw_lock_s_unlock(&purge_sys->latch);
-
-		if (trx_sys->max_trx_id < done_trx_no) {
-			export_vars.innodb_purge_trx_id_age = 0;
-		} else {
-			export_vars.innodb_purge_trx_id_age =
-				trx_sys->max_trx_id - done_trx_no;
-		}
-
-		if (!up_limit_id
-		    || trx_sys->max_trx_id < up_limit_id) {
-			export_vars.innodb_purge_view_trx_id_age = 0;
-		} else {
-			export_vars.innodb_purge_view_trx_id_age =
-				trx_sys->max_trx_id - up_limit_id;
-		}
-	}
-#endif /* UNIV_DEBUG */
-
-	mutex_exit(&srv_innodb_monitor_mutex);
-}
-
-/*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_monitor_thread(
-/*===============*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	ib_int64_t	sig_count;
-	double		time_elapsed;
-	time_t		current_time;
-	time_t		last_table_monitor_time;
-	time_t		last_tablespace_monitor_time;
-	time_t		last_monitor_time;
-	ulint		mutex_skipped;
-	ibool		last_srv_print_monitor;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Lock timeout thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_monitor_thread_key);
-#endif
-
-	UT_NOT_USED(arg);
-	srv_last_monitor_time = ut_time();
-	last_table_monitor_time = ut_time();
-	last_tablespace_monitor_time = ut_time();
-	last_monitor_time = ut_time();
-	mutex_skipped = 0;
-	last_srv_print_monitor = srv_print_innodb_monitor;
-loop:
-	srv_monitor_active = TRUE;
-
-	/* Wake up every 5 seconds to see if we need to print
-	monitor information or if signalled at shutdown. */
-
-	sig_count = os_event_reset(srv_monitor_event);
-
-	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
-
-	current_time = ut_time();
-
-	time_elapsed = difftime(current_time, last_monitor_time);
-
-	if (time_elapsed > 15) {
-		last_monitor_time = ut_time();
-
-		if (srv_print_innodb_monitor) {
-			/* Reset mutex_skipped counter everytime
-			srv_print_innodb_monitor changes. This is to
-			ensure we will not be blocked by kernel_mutex
-			for short duration information printing,
-			such as requested by sync_array_print_long_waits() */
-			if (!last_srv_print_monitor) {
-				mutex_skipped = 0;
-				last_srv_print_monitor = TRUE;
-			}
-
-			if (!srv_printf_innodb_monitor(stderr,
-						MUTEX_NOWAIT(mutex_skipped),
-						NULL, NULL)) {
-				mutex_skipped++;
-			} else {
-				/* Reset the counter */
-				mutex_skipped = 0;
-			}
-		} else {
-			last_srv_print_monitor = FALSE;
-		}
-
-
-		if (srv_innodb_status) {
-			mutex_enter(&srv_monitor_file_mutex);
-			rewind(srv_monitor_file);
-			if (!srv_printf_innodb_monitor(srv_monitor_file,
-						MUTEX_NOWAIT(mutex_skipped),
-						NULL, NULL)) {
-				mutex_skipped++;
-			} else {
-				mutex_skipped = 0;
-			}
-
-			os_file_set_eof(srv_monitor_file);
-			mutex_exit(&srv_monitor_file_mutex);
-		}
-
-		if (srv_print_innodb_tablespace_monitor
-		    && difftime(current_time,
-				last_tablespace_monitor_time) > 60) {
-			last_tablespace_monitor_time = ut_time();
-
-			fputs("========================"
-			      "========================\n",
-			      stderr);
-
-			ut_print_timestamp(stderr);
-
-			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
-			      "========================"
-			      "========================\n",
-			      stderr);
-
-			fsp_print(0);
-			fputs("Validating tablespace\n", stderr);
-			fsp_validate(0);
-			fputs("Validation ok\n"
-			      "---------------------------------------\n"
-			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
-			      "=======================================\n",
-			      stderr);
-		}
-
-		if (srv_print_innodb_table_monitor
-		    && difftime(current_time, last_table_monitor_time) > 60) {
-
-			last_table_monitor_time = ut_time();
-
-			fputs("===========================================\n",
-			      stderr);
-
-			ut_print_timestamp(stderr);
-
-			fputs(" INNODB TABLE MONITOR OUTPUT\n"
-			      "===========================================\n",
-			      stderr);
-			dict_print();
-
-			fputs("-----------------------------------\n"
-			      "END OF INNODB TABLE MONITOR OUTPUT\n"
-			      "==================================\n",
-			      stderr);
-		}
-	}
-
-	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-		goto exit_func;
-	}
-
-	if (srv_print_innodb_monitor
-	    || srv_print_innodb_lock_monitor
-	    || srv_print_innodb_tablespace_monitor
-	    || srv_print_innodb_table_monitor) {
-		goto loop;
-	}
-
-	srv_monitor_active = FALSE;
-
-	goto loop;
-
-exit_func:
-	srv_monitor_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_lock_timeout_thread(
-/*====================*/
-	void*	arg __attribute__((unused)))
-			/* in: a dummy parameter required by
-			os_thread_create */
-{
-	srv_slot_t*	slot;
-	ibool		some_waits;
-	double		wait_time;
-	ulint		i;
-	ib_int64_t	sig_count;
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_lock_timeout_thread_key);
-#endif
-
-loop:
-
-	/* When someone is waiting for a lock, we wake up every second
-	and check if a timeout has passed for a lock wait */
-
-	sig_count = os_event_reset(srv_timeout_event);
-
-	os_event_wait_time_low(srv_timeout_event, 1000000, sig_count);
-
-	srv_lock_timeout_active = TRUE;
-
-	mutex_enter(&kernel_mutex);
-
-	some_waits = FALSE;
-
-	/* Check of all slots if a thread is waiting there, and if it
-	has exceeded the time limit */
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_mysql_table + i;
-
-		if (slot->in_use) {
-			trx_t*	trx;
-			ulong	lock_wait_timeout;
-
-			some_waits = TRUE;
-
-			wait_time = ut_difftime(ut_time(), slot->suspend_time);
-
-			trx = thr_get_trx(slot->thr);
-			lock_wait_timeout = thd_lock_wait_timeout(
-				trx->mysql_thd);
-
-			if (trx_is_interrupted(trx)
-			    || (lock_wait_timeout < 100000000
-				&& (wait_time > (double) lock_wait_timeout
-				    || wait_time < 0))) {
-
-				/* Timeout exceeded or a wrap-around in system
-				time counter: cancel the lock request queued
-				by the transaction and release possible
-				other transactions waiting behind; it is
-				possible that the lock has already been
-				granted: in that case do nothing */
-
-				if (trx->wait_lock) {
-					lock_cancel_waiting_and_release(
-						trx->wait_lock);
-				}
-			}
-		}
-	}
-
-	os_event_reset(srv_lock_timeout_thread_event);
-
-	mutex_exit(&kernel_mutex);
-
-	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-		goto exit_func;
-	}
-
-	if (some_waits) {
-		goto loop;
-	}
-
-	srv_lock_timeout_active = FALSE;
-
-#if 0
-	/* The following synchronisation is disabled, since
-	the InnoDB monitor output is to be updated every 15 seconds. */
-	os_event_wait(srv_lock_timeout_thread_event);
-#endif
-	goto loop;
-
-exit_func:
-	srv_lock_timeout_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-A thread which prints warnings about semaphore waits which have lasted
-too long. These can be used to track bugs which cause hangs.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_error_monitor_thread(
-/*=====================*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	/* number of successive fatal timeouts observed */
-	ulint		fatal_cnt	= 0;
-	ib_uint64_t	old_lsn;
-	ib_uint64_t	new_lsn;
-	ib_int64_t	sig_count;
-	/* longest waiting thread for a semaphore */
-	os_thread_id_t	waiter		= os_thread_get_curr_id();
-	os_thread_id_t	old_waiter	= waiter;
-	/* the semaphore that is being waited for */
-	const void*	sema		= NULL;
-	const void*	old_sema	= NULL;
-
-	old_lsn = srv_start_lsn;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Error monitor thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_error_monitor_thread_key);
-#endif
-
-loop:
-	srv_error_monitor_active = TRUE;
-
-	/* Try to track a strange bug reported by Harald Fuchs and others,
-	where the lsn seems to decrease at times */
-
-	new_lsn = log_get_lsn();
-
-	if (new_lsn < old_lsn) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: old log sequence number %llu"
-			" was greater\n"
-			"InnoDB: than the new log sequence number %llu!\n"
-			"InnoDB: Please submit a bug report"
-			" to http://bugs.mysql.com\n",
-			old_lsn, new_lsn);
-		ut_ad(0);
-	}
-
-	old_lsn = new_lsn;
-
-	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
-		/* We referesh InnoDB Monitor values so that averages are
-		printed from at most 60 last seconds */
-
-		srv_refresh_innodb_monitor_stats();
-	}
-
-	/* Update the statistics collected for deciding LRU
-	eviction policy. */
-	buf_LRU_stat_update();
-
-	/* Update the statistics collected for flush rate policy. */
-	buf_flush_stat_update();
-
-	/* In case mutex_exit is not a memory barrier, it is
-	theoretically possible some threads are left waiting though
-	the semaphore is already released. Wake up those threads: */
-
-	sync_arr_wake_threads_if_sema_free();
-
-	if (sync_array_print_long_waits(&waiter, &sema)
-	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
-		fatal_cnt++;
-		if (fatal_cnt > 10) {
-
-			fprintf(stderr,
-				"InnoDB: Error: semaphore wait has lasted"
-				" > %lu seconds\n"
-				"InnoDB: We intentionally crash the server,"
-				" because it appears to be hung.\n",
-				(ulong) srv_fatal_semaphore_wait_threshold);
-
-			ut_error;
-		}
-	} else {
-		fatal_cnt = 0;
-		old_waiter = waiter;
-		old_sema = sema;
-	}
-
-	if (srv_kill_idle_transaction && trx_sys) {
-		trx_t*	trx;
-		time_t	now;
-rescan_idle:
-		now = time(NULL);
-		mutex_enter(&kernel_mutex);
-		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
-		while (trx) {
-			if (trx->state == TRX_ACTIVE
-			    && trx->mysql_thd
-			    && innobase_thd_is_idle(trx->mysql_thd)) {
-				ib_int64_t	start_time = innobase_thd_get_start_time(trx->mysql_thd);
-				ulong		thd_id = innobase_thd_get_thread_id(trx->mysql_thd);
-
-				if (trx->last_stmt_start != start_time) {
-					trx->idle_start = now;
-					trx->last_stmt_start = start_time;
-				} else if (difftime(now, trx->idle_start)
-					   > srv_kill_idle_transaction) {
-					/* kill the session */
-					mutex_exit(&kernel_mutex);
-					innobase_thd_kill(thd_id);
-					goto rescan_idle;
-				}
-			}
-			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
-		}
-		mutex_exit(&kernel_mutex);
-	}
-
-	/* Flush stderr so that a database user gets the output
-	to possible MySQL error file */
-
-	fflush(stderr);
-
-	sig_count = os_event_reset(srv_error_event);
-
-	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
-
-	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
-
-		goto loop;
-	}
-
-	srv_error_monitor_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-A thread which restores the buffer pool from a dump file on startup and does
-periodic buffer pool dumps.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_LRU_dump_restore_thread(
-/*====================*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	uint	auto_lru_dump;
-	time_t	last_dump_time;
-	time_t	time_elapsed;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "The LRU dump/restore thread has started, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-	/* If srv_blocking_lru_restore is TRUE, restore will be done
-	synchronously on startup. */
-	if (srv_auto_lru_dump && !srv_blocking_lru_restore)
-		buf_LRU_file_restore();
-
-	last_dump_time = time(NULL);
-
-loop:
-	os_event_wait_time_low(srv_shutdown_event, 5000000, 0);
-
-	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-		goto exit_func;
-	}
-
-	time_elapsed = time(NULL) - last_dump_time;
-	auto_lru_dump = srv_auto_lru_dump;
-	if (auto_lru_dump > 0 && (time_t) auto_lru_dump < time_elapsed) {
-		last_dump_time = time(NULL);
-		buf_LRU_file_dump();
-	}
-
-	goto loop;
-exit_func:
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/**********************************************************************//**
-Check whether any background thread is active. If so return the thread
-type
-@return ULINT_UNDEFINED if all are suspended or have exited, thread
-type if any are still active. */
-UNIV_INTERN
-ulint
-srv_get_active_thread_type(void)
-/*============================*/
-{
-	ulint	i;
-	ibool	ret = ULINT_UNDEFINED;
-
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i <= SRV_MASTER; ++i) {
-		if (srv_n_threads_active[i] != 0) {
-			ret = i;
-			break;
-		}
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(ret);
-}
-
-/*********************************************************************//**
-This function prints progress message every 60 seconds during server
-shutdown, for any activities that master thread is pending on. */
-static
-void
-srv_shutdown_print_master_pending(
-/*==============================*/
-	ib_time_t*	last_print_time,	/*!< last time the function
-						print the message */
-	ulint		n_tables_to_drop,	/*!< number of tables to
-						be dropped */
-	ulint		n_bytes_merged,		/*!< number of change buffer
-						just merged */
-	ulint		n_pages_flushed)	/*!< number of pages flushed */
-{
-	ib_time_t	current_time;
-	double		time_elapsed;
-
-	current_time = ut_time();
-	time_elapsed = ut_difftime(current_time, *last_print_time);
-
-	if (time_elapsed > 60) {
-		*last_print_time = ut_time();
-
-		if (n_tables_to_drop) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for "
-				"%lu table(s) to be dropped\n",
-				(ulong) n_tables_to_drop);
-		}
-
-		/* Check change buffer merge, we only wait for change buffer
-		merge if it is a slow shutdown */
-		if (!srv_fast_shutdown && n_bytes_merged) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for change "
-				"buffer merge to complete\n"
-				"  InnoDB: number of bytes of change buffer "
-				"just merged:  %lu\n",
-				n_bytes_merged);
-		}
-
-		if (n_pages_flushed) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for "
-				"%lu pages to be flushed\n",
-				(ulong) n_pages_flushed);
-		}
-        }
-}
-
-/******************************************************************//**
-A thread which follows the redo log and outputs the changed page bitmap.
-@return a dummy value */
-os_thread_ret_t
-srv_redo_log_follow_thread(
-/*=======================*/
-	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
-						     required by
-						     os_thread_create */
-{
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Redo log follower thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_log_tracking_thread_key);
-#endif
-
-	my_thread_init();
-
-	do {
-		os_event_wait(srv_checkpoint_completed_event);
-		os_event_reset(srv_checkpoint_completed_event);
-
-#ifdef UNIV_DEBUG
-		if (!srv_track_changed_pages) {
-			continue;
-		}
-#endif
-
-		if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
-			if (!log_online_follow_redo_log()) {
-				/* TODO: sync with I_S log tracking status? */
-				fprintf(stderr,
-					"InnoDB: Error: log tracking bitmap "
-					"write failed, stopping log tracking "
-					"thread!\n");
-				break;
-			}
-		}
-
-	} while (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE);
-
-	srv_track_changed_pages = FALSE;
-	log_online_read_shutdown();
-	os_event_set(srv_redo_log_thread_finished_event);
-
-	my_thread_end();
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*******************************************************************//**
-Tells the InnoDB server that there has been activity in the database
-and wakes up the master thread if it is suspended (not sleeping). Used
-in the MySQL interface. Note that there is a small chance that the master
-thread stays suspended (we do not protect our operation with the
-srv_sys_t->mutex, for performance reasons). */
-UNIV_INTERN
-void
-srv_active_wake_master_thread(void)
-/*===============================*/
-{
-	srv_activity_count++;
-
-	if (srv_n_threads_active[SRV_MASTER] == 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_MASTER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/*******************************************************************//**
-Tells the purge thread that there has been activity in the database
-and wakes up the purge thread if it is suspended (not sleeping).  Note
-that there is a small chance that the purge thread stays suspended
-(we do not protect our operation with the kernel mutex, for
-performace reasons). */
-UNIV_INTERN
-void
-srv_wake_purge_thread_if_not_active(void)
-/*=====================================*/
-{
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	if (srv_n_purge_threads > 0
-	    && srv_n_threads_active[SRV_WORKER] == 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_WORKER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/*******************************************************************//**
-Wakes up the master thread if it is suspended or being suspended. */
-UNIV_INTERN
-void
-srv_wake_master_thread(void)
-/*========================*/
-{
-	srv_activity_count++;
-
-	mutex_enter(&kernel_mutex);
-
-	srv_release_threads(SRV_MASTER, 1);
-
-	mutex_exit(&kernel_mutex);
-}
-
-/*******************************************************************//**
-Wakes up the purge thread if it's not already awake. */
-UNIV_INTERN
-void
-srv_wake_purge_thread(void)
-/*=======================*/
-{
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	if (srv_n_purge_threads > 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_WORKER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/**********************************************************************
-The master thread is tasked to ensure that flush of log file happens
-once every second in the background. This is to ensure that not more
-than one second of trxs are lost in case of crash when
-innodb_flush_logs_at_trx_commit != 1 */
-static
-void
-srv_sync_log_buffer_in_background(void)
-/*===================================*/
-{
-	time_t	current_time = time(NULL);
-
-	srv_main_thread_op_info = "flushing log";
-	if (difftime(current_time, srv_last_log_flush_time) >= 1) {
-		log_buffer_sync_in_background(TRUE);
-		srv_last_log_flush_time = current_time;
-		srv_log_writes_and_flush++;
-	}
-}
-
-/********************************************************************//**
-Do a full purge, reconfigure the purge sub-system if a dynamic
-change is detected. */
-static
-void
-srv_master_do_purge(void)
-/*=====================*/
-{
-	ulint	n_pages_purged;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	ut_a(srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0));
-
-	do {
-		/* Check for shutdown and change in purge config. */
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-			/* Nothing to purge. */
-			n_pages_purged = 0;
-		} else {
-			n_pages_purged = trx_purge(srv_purge_batch_size);
-		}
-
-		srv_sync_log_buffer_in_background();
-
-	} while (n_pages_purged > 0);
-}
-
-/*********************************************************************//**
-The master thread controlling the server.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_master_thread(
-/*==============*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	buf_pool_stat_t buf_stat;
-	srv_slot_t*	slot;
-	ulint		old_activity_count;
-	ulint		n_pages_purged	= 0;
-	ulint		n_bytes_merged;
-	ulint		n_pages_flushed;
-	ulint		n_pages_flushed_prev = 0;
-	ulint		n_bytes_archived;
-	ulint		n_tables_to_drop;
-	ulint		n_ios;
-	ulint		n_ios_old;
-	ulint		n_ios_very_old;
-	ulint		n_pend_ios;
-	ulint		next_itr_time;
-	ulint		prev_adaptive_flushing_method = ULINT_UNDEFINED;
-	ulint		inner_loop = 0;
-	ibool		skip_sleep	= FALSE;
-	ulint		i;
-	struct t_prev_flush_info_struct {
-		ulint		count;
-		unsigned	space:32;
-		unsigned	offset:32;
-		ib_uint64_t	oldest_modification;
-	} prev_flush_info[MAX_BUFFER_POOLS];
-
-	ib_uint64_t	lsn_old;
-
-	ib_uint64_t	oldest_lsn;
-	ib_time_t	last_print_time;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Master thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_master_thread_key);
-#endif
-
-	srv_main_thread_process_no = os_proc_get_number();
-	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
-
-        memset(&prev_flush_info, 0, sizeof(prev_flush_info));
-	mutex_enter(&kernel_mutex);
-
-	slot = srv_table_reserve_slot(SRV_MASTER);
-
-	srv_n_threads_active[SRV_MASTER]++;
-
-	mutex_exit(&kernel_mutex);
-
-	mutex_enter(&(log_sys->mutex));
-	lsn_old = log_sys->lsn;
-	mutex_exit(&(log_sys->mutex));
-
-	last_print_time = ut_time();
-
-loop:
-	/*****************************************************************/
-	/* ---- When there is database activity by users, we cycle in this
-	loop */
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	buf_get_total_stat(&buf_stat);
-	n_ios_very_old = log_sys->n_log_ios + buf_stat.n_pages_read
-		+ buf_stat.n_pages_written;
-        n_pages_flushed= 0;
-
-	mutex_enter(&kernel_mutex);
-
-	/* Store the user activity counter at the start of this loop */
-	old_activity_count = srv_activity_count;
-
-	mutex_exit(&kernel_mutex);
-
-	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
-
-		goto suspend_thread;
-	}
-
-	/* ---- We run the following loop approximately once per second
-	when there is database activity */
-
-	srv_last_log_flush_time = time(NULL);
-
-	/* Sleep for 1 second on entrying the for loop below the first time. */
-	next_itr_time = ut_time_ms() + 1000;
-
-	skip_sleep = FALSE;
-
-	for (i = 0; i < 10; i++) {
-		ulint	cur_time = ut_time_ms();
-
-#ifdef UNIV_DEBUG
-		if (btr_cur_limit_optimistic_insert_debug
-		    && srv_n_purge_threads == 0) {
-			/* If btr_cur_limit_optimistic_insert_debug is enabled
-			and no purge_threads, purge opportunity is increased
-			by x100 (1purge/100msec), to speed up debug scripts
-			which should wait for purged. */
-			next_itr_time -= 900;
-
-			srv_main_thread_op_info = "master purging";
-
-			srv_master_do_purge();
-
-			if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-				goto background_loop;
-			}
-		}
-#endif /* UNIV_DEBUG */
-
-		n_pages_flushed = 0; /* initialize */
-
-		/* ALTER TABLE in MySQL requires on Unix that the table handler
-		can drop tables lazily after there no longer are SELECT
-		queries to them. */
-
-		srv_main_thread_op_info = "doing background drop tables";
-
-		row_drop_tables_for_mysql_in_background();
-
-		srv_main_thread_op_info = "";
-
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			goto background_loop;
-		}
-
-		buf_get_total_stat(&buf_stat);
-
-		n_ios_old = log_sys->n_log_ios + buf_stat.n_pages_read
-			+ buf_stat.n_pages_written;
-
-		srv_main_thread_op_info = "sleeping";
-		srv_main_1_second_loops++;
-
-		if (!skip_sleep) {
-		if (next_itr_time > cur_time
-		    && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-
-			/* Get sleep interval in micro seconds. We use
-			ut_min() to avoid long sleep in case of
-			wrap around. */
-			os_event_wait_time_low(srv_shutdown_event,
-					       ut_min(1000000,
-						      (next_itr_time - cur_time)
-						      * 1000),
-					       0);
-			srv_main_sleeps++;
-
-			/*
-			mutex_enter(&(log_sys->mutex));
-			oldest_lsn = buf_pool_get_oldest_modification();
-			ib_uint64_t	lsn = log_sys->lsn;
-			mutex_exit(&(log_sys->mutex));
-
-			if(oldest_lsn)
-			fprintf(stderr,
-				"InnoDB flush: age pct: %lu, lsn progress: %lu\n",
-				(lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
-				lsn - lsn_old);
-			*/
-		}
-
-		/* Each iteration should happen at 1 second interval. */
-		next_itr_time = ut_time_ms() + 1000;
-		} /* if (!skip_sleep) */
-
-		skip_sleep = FALSE;
-
-		/* Flush logs if needed */
-		srv_sync_log_buffer_in_background();
-
-		srv_main_thread_op_info = "making checkpoint";
-		log_free_check();
-
-		/* If i/os during one second sleep were less than 5% of
-		capacity, we assume that there is free disk i/o capacity
-		available, and it makes sense to do an insert buffer merge. */
-
-		buf_get_total_stat(&buf_stat);
-		n_pend_ios = buf_get_n_pending_ios()
-			+ log_sys->n_pending_writes;
-		n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
-			+ buf_stat.n_pages_written;
-		if (n_pend_ios < SRV_PEND_IO_THRESHOLD
-		    && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
-			srv_main_thread_op_info = "doing insert buffer merge";
-			ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
-
-			/* Flush logs if needed */
-			srv_sync_log_buffer_in_background();
-		}
-
-		if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
-				  > srv_max_buf_pool_modified_pct)) {
-
-			/* Try to keep the number of modified pages in the
-			buffer pool under the limit wished by the user */
-
-			srv_main_thread_op_info =
-				"flushing buffer pool pages";
-			n_pages_flushed = buf_flush_list(
-				PCT_IO(100), IB_ULONGLONG_MAX);
-
-			mutex_enter(&(log_sys->mutex));
-			lsn_old = log_sys->lsn;
-			mutex_exit(&(log_sys->mutex));
-			prev_adaptive_flushing_method = ULINT_UNDEFINED;
-		} else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 0) {
-
-			/* Try to keep the rate of flushing of dirty
-			pages such that redo log generation does not
-			produce bursts of IO at checkpoint time. */
-			ulint n_flush = buf_flush_get_desired_flush_rate();
-
-			if (n_flush) {
-				srv_main_thread_op_info =
-					"flushing buffer pool pages";
-				n_flush = ut_min(PCT_IO(100), n_flush);
-				n_pages_flushed =
-					buf_flush_list(
-						n_flush,
-						IB_ULONGLONG_MAX);
-			}
-
-			mutex_enter(&(log_sys->mutex));
-			lsn_old = log_sys->lsn;
-			mutex_exit(&(log_sys->mutex));
-			prev_adaptive_flushing_method = ULINT_UNDEFINED;
-		} else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 1) {
-
-			/* Try to keep modified age not to exceed
-			max_checkpoint_age * 7/8 line */
-
-			mutex_enter(&(log_sys->mutex));
-
-			oldest_lsn = buf_pool_get_oldest_modification();
-			if (oldest_lsn == 0) {
-				lsn_old = log_sys->lsn;
-				mutex_exit(&(log_sys->mutex));
-
-			} else {
-				if ((log_sys->lsn - oldest_lsn)
-				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
-					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
-					/* We should not flush from here. */
-					lsn_old = log_sys->lsn;
-					mutex_exit(&(log_sys->mutex));
-				} else if ((log_sys->lsn - oldest_lsn)
-					   > (log_sys->max_checkpoint_age)/4 ) {
-
-					/* defence line (max_checkpoint_age * 1/2) */
-					ib_uint64_t	lsn = log_sys->lsn;
-
-					ib_uint64_t	level, bpl;
-					buf_page_t*	bpage;
-					ulint		j;
-
-					mutex_exit(&(log_sys->mutex));
-
-					bpl = 0;
-
-					for (j = 0; j < srv_buf_pool_instances; j++) {
-						buf_pool_t*	buf_pool;
-						ulint		n_blocks;
-
-						buf_pool = buf_pool_from_array(j);
-
-						buf_flush_list_mutex_enter(buf_pool);
-						level = 0;
-						n_blocks = 0;
-						bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-						while (bpage != NULL) {
-							ib_uint64_t	oldest_modification = bpage->oldest_modification;
-							if (oldest_modification != 0) {
-								level += log_sys->max_checkpoint_age
-									 - (lsn - oldest_modification);
-							}
-							bpage = UT_LIST_GET_NEXT(flush_list, bpage);
-							n_blocks++;
-						}
-						buf_flush_list_mutex_exit(buf_pool);
-
-						if (level) {
-							bpl += ((ib_uint64_t) n_blocks * n_blocks
-								* (lsn - lsn_old)) / level;
-						}
-
-					}
-
-					if (!srv_use_doublewrite_buf) {
-						/* flush is faster than when doublewrite */
-						bpl = (bpl * 7) / 8;
-					}
-
-					if (bpl) {
-retry_flush_batch:
-						n_pages_flushed = buf_flush_list(bpl,
-									oldest_lsn + (lsn - lsn_old));
-						if (n_pages_flushed == ULINT_UNDEFINED) {
-							os_thread_sleep(5000);
-							goto retry_flush_batch;
-						}
-					}
-
-					lsn_old = lsn;
-					/*
-					fprintf(stderr,
-						"InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
-						(lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
-						lsn - lsn_old, bpl);
-					*/
-				} else {
-					lsn_old = log_sys->lsn;
-					mutex_exit(&(log_sys->mutex));
-				}
-			}
-			prev_adaptive_flushing_method = 1;
-		} else if (srv_adaptive_flushing && srv_adaptive_flushing_method == 2) {
-			buf_pool_t*	buf_pool;
-			buf_page_t*	bpage;
-			ib_uint64_t	lsn;
-			ulint		j;
-
-			mutex_enter(&(log_sys->mutex));
-			oldest_lsn = buf_pool_get_oldest_modification();
-			lsn = log_sys->lsn;
-			mutex_exit(&(log_sys->mutex));
-
-			/* upper loop/sec. (x10) */
-			next_itr_time -= 900; /* 1000 - 900 == 100 */
-			inner_loop++;
-			if (inner_loop < 10) {
-				i--;
-			} else {
-				inner_loop = 0;
-			}
-
-			if (prev_adaptive_flushing_method == 2) {
-				lint	n_flush;
-				lint	blocks_sum;
-				ulint	new_blocks_sum, flushed_blocks_sum;
-
-				blocks_sum = new_blocks_sum = flushed_blocks_sum = 0;
-
-				/* prev_flush_info[j] should be the previous loop's */
-				for (j = 0; j < srv_buf_pool_instances; j++) {
-					lint	blocks_num, new_blocks_num = 0;
-					lint	flushed_blocks_num;
-
-					buf_pool = buf_pool_from_array(j);
-					buf_flush_list_mutex_enter(buf_pool);
-
-					blocks_num = UT_LIST_GET_LEN(buf_pool->flush_list);
-					bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-					while (bpage != NULL) {
-						if (prev_flush_info[j].space == bpage->space
-						    && prev_flush_info[j].offset == bpage->offset
-						    && prev_flush_info[j].oldest_modification
-								== bpage->oldest_modification) {
-							break;
-						}
-						bpage = UT_LIST_GET_NEXT(flush_list, bpage);
-						new_blocks_num++;
-					}
-
-					flushed_blocks_num = new_blocks_num + prev_flush_info[j].count
-								- blocks_num;
-					if (flushed_blocks_num < 0) {
-						flushed_blocks_num = 0;
-					}
-
-					bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-					prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
-					if (bpage) {
-						prev_flush_info[j].space = bpage->space;
-						prev_flush_info[j].offset = bpage->offset;
-						prev_flush_info[j].oldest_modification = bpage->oldest_modification;
-						buf_flush_list_mutex_exit(buf_pool);
-					} else {
-						buf_flush_list_mutex_exit(buf_pool);
-						prev_flush_info[j].space = 0;
-						prev_flush_info[j].offset = 0;
-						prev_flush_info[j].oldest_modification = 0;
-					}
-
-					new_blocks_sum += new_blocks_num;
-					flushed_blocks_sum += flushed_blocks_num;
-					blocks_sum += blocks_num;
-				}
-
-				n_flush = (lint) (blocks_sum * (lsn - lsn_old) / log_sys->max_modified_age_async);
-				if ((ulint) flushed_blocks_sum > n_pages_flushed_prev) {
-					n_flush -= (flushed_blocks_sum - n_pages_flushed_prev);
-				}
-
-				if (n_flush > 0) {
-					n_flush++;
-					n_pages_flushed = buf_flush_list(n_flush, oldest_lsn + (lsn - lsn_old));
-				} else {
-					n_pages_flushed = 0;
-				}					
-			} else {
-				/* store previous first pages of the flush_list */
-				for (j = 0; j < srv_buf_pool_instances; j++) {
-					buf_pool = buf_pool_from_array(j);
-					buf_flush_list_mutex_enter(buf_pool);
-
-					bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
-					prev_flush_info[j].count = UT_LIST_GET_LEN(buf_pool->flush_list);
-					if (bpage) {
-						prev_flush_info[j].space = bpage->space;
-						prev_flush_info[j].offset = bpage->offset;
-						prev_flush_info[j].oldest_modification = bpage->oldest_modification;
-						buf_flush_list_mutex_exit(buf_pool);
-					} else {
-						buf_flush_list_mutex_exit(buf_pool);
-						prev_flush_info[j].space = 0;
-						prev_flush_info[j].offset = 0;
-						prev_flush_info[j].oldest_modification = 0;
-					}
-				}
-				n_pages_flushed = 0;
-			}
-
-			lsn_old = lsn;
-			prev_adaptive_flushing_method = 2;
-		} else {
-			mutex_enter(&(log_sys->mutex));
-			lsn_old = log_sys->lsn;
-			mutex_exit(&(log_sys->mutex));
-			prev_adaptive_flushing_method = ULINT_UNDEFINED;
-		}
-
-		if (n_pages_flushed == ULINT_UNDEFINED) {
-			n_pages_flushed_prev = 0;
-		} else {
-			n_pages_flushed_prev = n_pages_flushed;
-		}
-
-		if (srv_activity_count == old_activity_count) {
-
-			/* There is no user activity at the moment, go to
-			the background loop */
-
-			goto background_loop;
-		}
-	}
-
-	/* ---- We perform the following code approximately once per
-	10 seconds when there is database activity */
-
-#ifdef MEM_PERIODIC_CHECK
-	/* Check magic numbers of every allocated mem block once in 10
-	seconds */
-	mem_validate_all_blocks();
-#endif
-	/* If i/os during the 10 second period were less than 200% of
-	capacity, we assume that there is free disk i/o capacity
-	available, and it makes sense to flush srv_io_capacity pages.
-
-	Note that this is done regardless of the fraction of dirty
-	pages relative to the max requested by the user. The one second
-	loop above requests writes for that case. The writes done here
-	are not required, and may be disabled. */
-
-	buf_get_total_stat(&buf_stat);
-	n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
-	n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
-		+ buf_stat.n_pages_written;
-
-	srv_main_10_second_loops++;
-	if (n_pend_ios < SRV_PEND_IO_THRESHOLD
-	    && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) {
-
-		srv_main_thread_op_info = "flushing buffer pool pages";
-		buf_flush_list(PCT_IO(100), IB_ULONGLONG_MAX);
-
-		/* Flush logs if needed */
-		srv_sync_log_buffer_in_background();
-	}
-
-	/* We run a batch of insert buffer merge every 10 seconds,
-	even if the server were active */
-
-	srv_main_thread_op_info = "doing insert buffer merge";
-	ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
-
-	/* Flush logs if needed */
-	srv_sync_log_buffer_in_background();
-
-	if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
-		srv_main_thread_op_info = "master purging";
-
-		srv_master_do_purge();
-
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			goto background_loop;
-		}
-	}
-
-	srv_main_thread_op_info = "flushing buffer pool pages";
-
-	/* Flush a few oldest pages to make a new checkpoint younger */
-
-	if (buf_get_modified_ratio_pct() > 70) {
-
-		/* If there are lots of modified pages in the buffer pool
-		(> 70 %), we assume we can afford reserving the disk(s) for
-		the time it requires to flush 100 pages */
-
-		n_pages_flushed = buf_flush_list(
-			PCT_IO(100), IB_ULONGLONG_MAX);
-	} else {
-		/* Otherwise, we only flush a small number of pages so that
-		we do not unnecessarily use much disk i/o capacity from
-		other work */
-
-		n_pages_flushed = buf_flush_list(
-			  PCT_IO(10), IB_ULONGLONG_MAX);
-	}
-
-	srv_main_thread_op_info = "making checkpoint";
-
-	/* Make a new checkpoint about once in 10 seconds */
-
-	log_checkpoint(TRUE, FALSE, TRUE);
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-
-	/* ---- When there is database activity, we jump from here back to
-	the start of loop */
-
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	/* If the database is quiet, we enter the background loop */
-
-	/*****************************************************************/
-background_loop:
-	/* ---- In this loop we run background operations when the server
-	is quiet from user activity. Also in the case of a shutdown, we
-	loop here, flushing the buffer pool to the data files. */
-
-	/* The server has been quiet for a while: start running background
-	operations */
-	srv_main_background_loops++;
-	srv_main_thread_op_info = "doing background drop tables";
-
-	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
-
-	if (n_tables_to_drop > 0) {
-		/* Do not monopolize the CPU even if there are tables waiting
-		in the background drop queue. (It is essentially a bug if
-		MySQL tries to drop a table while there are still open handles
-		to it and we had to put it to the background drop queue.) */
-
-		if (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-			os_thread_sleep(100000);
-		}
-	}
-
-	if (srv_n_purge_threads == 0 || (srv_shutdown_state > 0 && srv_n_threads_active[SRV_WORKER] == 0)) {
-		srv_main_thread_op_info = "master purging";
-
-		srv_master_do_purge();
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-	srv_main_thread_op_info = "doing insert buffer merge";
-
-	if (srv_fast_shutdown && srv_shutdown_state > 0) {
-		n_bytes_merged = 0;
-	} else {
-		/* This should do an amount of IO similar to the number of
-		dirty pages that will be flushed in the call to
-		buf_flush_list below. Otherwise, the system favors
-		clean pages over cleanup throughput. */
-		n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
-							   PCT_IBUF_IO(100));
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-flush_loop:
-	srv_main_thread_op_info = "flushing buffer pool pages";
-	srv_main_flush_loops++;
-	if (srv_fast_shutdown < 2 || srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		n_pages_flushed = buf_flush_list(
-			  PCT_IO(100), IB_ULONGLONG_MAX);
-	} else {
-		/* In the fastest shutdown we do not flush the buffer pool
-		to data files: we set n_pages_flushed to 0 artificially. */
-		ut_ad(srv_fast_shutdown == 2);
-		ut_ad(srv_shutdown_state > 0);
-
-		n_pages_flushed = 0;
-
-		DBUG_PRINT("master", ("doing very fast shutdown"));
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-	srv_main_thread_op_info = "waiting for buffer pool flush to end";
-	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-
-	/* Flush logs if needed */
-	srv_sync_log_buffer_in_background();
-
-	srv_main_thread_op_info = "making checkpoint";
-
-	log_checkpoint(TRUE, FALSE, TRUE);
-
-	if (!(srv_fast_shutdown == 2 && srv_shutdown_state > 0)
-	    && (buf_get_modified_ratio_pct()
-		> srv_max_buf_pool_modified_pct)) {
-
-		/* If the server is doing a very fast shutdown, then
-		we will not come here. */
-
-		/* Try to keep the number of modified pages in the
-		buffer pool under the limit wished by the user */
-
-		goto flush_loop;
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-	/*
-	srv_main_thread_op_info = "archiving log (if log archive is on)";
-
-	log_archive_do(FALSE, &n_bytes_archived);
-	*/
-	n_bytes_archived = 0;
-
-	/* Print progress message every 60 seconds during shutdown */
-	if (srv_shutdown_state > 0 && srv_print_verbose_log) {
-		srv_shutdown_print_master_pending(&last_print_time,
-						  n_tables_to_drop,
-						  n_bytes_merged,
-						  n_pages_flushed);
-	}
-
-	/* Keep looping in the background loop if still work to do */
-
-	if (srv_fast_shutdown && srv_shutdown_state > 0) {
-		if (n_tables_to_drop + n_pages_flushed
-		    + n_bytes_archived != 0) {
-
-			/* If we are doing a fast shutdown (= the default)
-			we do not do purge or insert buffer merge. But we
-			flush the buffer pool completely to disk.
-			In a 'very fast' shutdown we do not flush the buffer
-			pool to data files: we have set n_pages_flushed to
-			0 artificially. */
-
-			goto background_loop;
-		}
-	} else if (n_tables_to_drop
-		   + n_pages_purged + n_bytes_merged + n_pages_flushed
-		   + n_bytes_archived != 0) {
-
-		/* In a 'slow' shutdown we run purge and the insert buffer
-		merge to completion */
-
-		goto background_loop;
-	}
-
-	/* There is no work for background operations either: suspend
-	master thread to wait for more server activity */
-
-suspend_thread:
-	srv_main_thread_op_info = "suspending";
-
-	mutex_enter(&kernel_mutex);
-
-	if (row_get_background_drop_list_len_low() > 0) {
-		mutex_exit(&kernel_mutex);
-
-		goto loop;
-	}
-
-	srv_suspend_thread(slot);
-
-	mutex_exit(&kernel_mutex);
-
-	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
-	waits for database activity to die down when converting < 4.1.x
-	databases, and relies on this string being exactly as it is. InnoDB
-	manual also mentions this string in several places. */
-	srv_main_thread_op_info = "waiting for server activity";
-
-	os_event_wait(slot->event);
-
-	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-		os_thread_exit(NULL);
-	}
-
-	/* When there is user activity, InnoDB will set the event and the
-	main thread goes back to loop. */
-
-	goto loop;
-}
-
-/*********************************************************************//**
-Asynchronous purge thread.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
-						required by os_thread_create */
-{
-	srv_slot_t*	slot;
-	ulint		retries = 0;
-	ulint		n_total_purged = ULINT_UNDEFINED;
-	ulint		next_itr_time;
-	ib_int64_t	sig_count;
-
-	ut_a(srv_n_purge_threads == 1);
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_purge_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "InnoDB: Purge thread running, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	mutex_enter(&kernel_mutex);
-
-	slot = srv_table_reserve_slot(SRV_WORKER);
-
-	++srv_n_threads_active[SRV_WORKER];
-
-	mutex_exit(&kernel_mutex);
-
-	next_itr_time = ut_time_ms();
-
-	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
-
-		ulint	n_pages_purged = 0;
-		ulint	cur_time;
-
-		/* If there are very few records to purge or the last
-		purge didn't purge any records then wait for activity.
-	        We peek at the history len without holding any mutex
-		because in the worst case we will end up waiting for
-		the next purge event. */
-		if (trx_sys->rseg_history_len < srv_purge_batch_size
-		    || (n_total_purged == 0
-			&& retries >= TRX_SYS_N_RSEGS)) {
-
-			mutex_enter(&kernel_mutex);
-
-			srv_suspend_thread(slot);
-
-			mutex_exit(&kernel_mutex);
-
-			os_event_wait(slot->event);
-
-			retries = 0;
-		}
-
-		/* Check for shutdown and whether we should do purge at all. */
-		if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND
-		    || srv_shutdown_state != 0
-		    || srv_fast_shutdown) {
-
-			break;
-		}
-
-		if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) {
-			++retries;
-		} else if (n_total_purged > 0) {
-			retries = 0;
-			n_total_purged = 0;
-		}
-
-		/* Purge until there are no more records to purge and there is
-		no change in configuration or server state. */
-		do {
-			n_pages_purged = trx_purge(srv_purge_batch_size);
-
-			n_total_purged += n_pages_purged;
-
-		} while (n_pages_purged > 0 && !srv_fast_shutdown);
-
-		srv_sync_log_buffer_in_background();
-
-		cur_time = ut_time_ms();
-		sig_count = os_event_reset(srv_shutdown_event);
-		if (next_itr_time > cur_time) {
-			os_event_wait_time_low(srv_shutdown_event,
-					       ut_min(1000000,
-						      (next_itr_time - cur_time)
-						      * 1000),
-					       sig_count);
-			next_itr_time = ut_time_ms() + 1000;
-		} else {
-			next_itr_time = cur_time + 1000;
-		}
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	/* Decrement the active count. */
-	srv_suspend_thread(slot);
-
-	slot->in_use = FALSE;
-
-	mutex_exit(&kernel_mutex);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "InnoDB: Purge thread exiting, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
-}
-
-/**********************************************************************//**
-Enqueues a task to server task queue and releases a worker thread, if there
-is a suspended one. */
-UNIV_INTERN
-void
-srv_que_task_enqueue_low(
-/*=====================*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	ut_ad(thr);
-
-	mutex_enter(&kernel_mutex);
-
-	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
-
-	srv_release_threads(SRV_WORKER, 1);
-
-	mutex_exit(&kernel_mutex);
-}
diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc
new file mode 100644
index 00000000000..953bbba11f7
--- /dev/null
+++ b/storage/xtradb/srv/srv0srv.cc
@@ -0,0 +1,3508 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.cc
+The database server main program
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "que0que.h"
+#include "log0online.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "dict0stats_bg.h" /* dict_stats_event */
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#include "srv0mon.h"
+#include "ut0crc32.h"
+#include "os0file.h"
+
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */
+ibool		innobase_thd_is_idle(const void* thd);
+ib_int64_t	innobase_thd_get_start_time(const void* thd);
+void		innobase_thd_kill(ulong thd_id);
+ulong		innobase_thd_get_thread_id(const void* thd);
+
+/* prototypes for new functions added to ha_innodb.cc */
+ibool	innobase_get_slow_log();
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
+
+/**/
+UNIV_INTERN long long	srv_kill_idle_transaction = 0;
+
+/* How much data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+UNIV_INTERN ulint	srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool	srv_monitor_active = FALSE;
+UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
+
+UNIV_INTERN ibool	srv_buf_dump_thread_active = FALSE;
+
+UNIV_INTERN ibool	srv_dict_stats_thread_active = FALSE;
+
+UNIV_INTERN const char*	srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+const char		srv_mysql50_table_name_prefix[10] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char*	srv_data_home	= NULL;
+
+/** Rollback files directory, can be absolute. */
+UNIV_INTERN char*	srv_undo_dir = NULL;
+
+/** The number of tablespaces to use for rollback segments. */
+UNIV_INTERN ulong	srv_undo_tablespaces = 8;
+
+/** The number of UNDO tablespaces that are open and ready to use. */
+UNIV_INTERN ulint	srv_undo_tablespaces_open = 8;
+
+/* The number of rollback segments to use */
+UNIV_INTERN ulong	srv_undo_logs = 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char*	srv_arch_dir	= NULL;
+UNIV_INTERN ulong	srv_log_arch_expire_sec	= 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+UNIV_INTERN my_bool	srv_read_only_mode;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool	srv_file_per_table;
+/** The file format to use on new *.ibd files. */
+UNIV_INTERN ulint	srv_file_format = 0;
+/** Whether to check file format during startup.  A value of
+UNIV_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint	srv_max_file_format_at_startup = UNIV_FORMAT_MAX;
+
+#if UNIV_FORMAT_A
+# error "UNIV_FORMAT_A must be 0!"
+#endif
+
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
+/** Sort buffer size in index creation */
+UNIV_INTERN ulong	srv_sort_buf_size = 1048576;
+/** Maximum modification log file size for online index creation */
+UNIV_INTERN unsigned long long	srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads.
+Currently we support native aio on Windows and Linux */
+UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
+
+#ifdef __WIN__
+/* Windows native condition variables. We use runtime loading / function
+pointers, because they are not available on Windows Server 2003 and
+Windows XP/2000.
+
+We use condition for events on Windows if possible, even if os_event
+resembles Windows kernel event object well API-wise. The reason is
+performance, kernel objects are heavyweights and WaitForSingleObject() is a
+performance killer causing calling thread to context switch. Besides, Innodb
+is preallocating large number (often millions) of os_events. With kernel event
+objects it takes a big chunk out of non-paged pool, which is better suited
+for tasks like IO than for storing idle event objects. */
+UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
+#endif /* __WIN__ */
+
+UNIV_INTERN ulint	srv_n_data_files = 0;
+UNIV_INTERN char**	srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
+
+UNIV_INTERN my_bool	srv_track_changed_pages = FALSE;
+
+UNIV_INTERN ulonglong	srv_max_bitmap_file_size = 100 * 1024 * 1024;
+
+UNIV_INTERN ulonglong	srv_max_changed_pages = 0;
+
+/** When TRUE, fake change transactions take S rather than X row locks.
+    When FALSE, row locks are not taken at all. */
+UNIV_INTERN my_bool	srv_fake_changes_locks = TRUE;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint	srv_last_file_size_max	= 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong	srv_auto_extend_increment = 8;
+UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
+
+UNIV_INTERN char*	srv_log_group_home_dir	= NULL;
+
+UNIV_INTERN ulong	srv_n_log_files		= SRV_N_LOG_FILES_MAX;
+/* size in database pages */
+UNIV_INTERN ib_uint64_t	srv_log_file_size	= IB_UINT64_MAX;
+UNIV_INTERN ib_uint64_t	srv_log_file_size_requested;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
+UNIV_INTERN uint	srv_flush_log_at_timeout = 1;
+UNIV_INTERN ulong	srv_page_size		= UNIV_PAGE_SIZE_DEF;
+UNIV_INTERN ulong	srv_page_size_shift	= UNIV_PAGE_SIZE_SHIFT_DEF;
+UNIV_INTERN char	srv_use_global_flush_log_at_trx_commit	= TRUE;
+
+/* Try to flush dirty pages so as to avoid IO bursts at
+the checkpoints. */
+UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
+
+UNIV_INTERN ulong	srv_show_locks_held	= 10;
+UNIV_INTERN ulong	srv_show_verbose_locks	= 0;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT	20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte*	srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
+/* force virtual page preallocation (prefault) */
+UNIV_INTERN my_bool	srv_buf_pool_populate	= FALSE;
+/* requested number of buffer pool instances */
+UNIV_INTERN ulint       srv_buf_pool_instances  = 1;
+/* number of locks to protect buf_pool->page_hash */
+UNIV_INTERN ulong	srv_n_page_hash_locks = 16;
+/** Scan depth for LRU flush batch, i.e. the number of blocks scanned */
+UNIV_INTERN ulong	srv_LRU_scan_depth	= 1024;
+/** whether or not to flush neighbors of a block */
+UNIV_INTERN ulong	srv_flush_neighbors	= 1;
+/* previously requested size */
+UNIV_INTERN ulint	srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
+/* size in bytes */
+UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
+UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
+
+/** Query thread preflush algorithm */
+UNIV_INTERN ulong	srv_foreground_preflush
+	= SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF;
+
+/** The maximum time limit for a single LRU tail flush iteration by the page
+cleaner thread */
+UNIV_INTERN ulint	srv_cleaner_max_lru_time = 1000;
+
+/** The maximum time limit for a single flush list flush iteration by the page
+cleaner thread */
+UNIV_INTERN ulint	srv_cleaner_max_flush_time = 1000;
+
+/** Page cleaner flush list flush batches are further divided into this chunk
+size  */
+UNIV_INTERN ulint	srv_cleaner_flush_chunk_size = 100;
+
+/** Page cleaner LRU list flush batches are further divided into this chunk
+size  */
+UNIV_INTERN ulint	srv_cleaner_lru_chunk_size = 100;
+
+/** If free list length is lower than this percentage of srv_LRU_scan_depth,
+page cleaner LRU flushes will issue flush batches to the same instance in a
+row  */
+UNIV_INTERN ulint	srv_cleaner_free_list_lwm = 10;
+
+/** If TRUE, the page cleaner bases its heuristics on evicted page counts
+instead of flushed page counts */
+UNIV_INTERN my_bool	srv_cleaner_eviction_factor = FALSE;
+
+/** Page cleaner LSN age factor formula option */
+UNIV_INTERN ulong	srv_cleaner_lsn_age_factor
+	= SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT;
+
+/** Algorithm option for handling an empty free list in a query thread */
+UNIV_INTERN ulong	srv_empty_free_list_algorithm
+	= SRV_EMPTY_FREE_LIST_BACKOFF;
+
+/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
+instead. */
+UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;
+
+/* Switch to enable random read ahead. */
+UNIV_INTERN my_bool	srv_random_read_ahead	= FALSE;
+
+/* The log block size */
+UNIV_INTERN ulint	srv_log_block_size	= 0;
+
+/* User settable value of the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
+UNIV_INTERN ibool		srv_archive_recovery	= 0;
+UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load), on a busy system
+the parameter is scaled down by a factor of 4, this is to avoid putting
+a heavier load on the I/O sub system. */
+
+UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char*	srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint	srv_max_n_open_files	  = 300;
+
+/* Number of IO operations per second the server can do */
+UNIV_INTERN ulong	srv_io_capacity         = 200;
+UNIV_INTERN ulong	srv_max_io_capacity     = 400;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;
+UNIV_INTERN ulong	srv_max_dirty_pages_pct_lwm	= 50;
+
+/* This is the percentage of log capacity at which adaptive flushing,
+if enabled, will kick in. */
+UNIV_INTERN ulong	srv_adaptive_flushing_lwm	= 10;
+
+/* Number of iterations over which adaptive flushing is averaged. */
+UNIV_INTERN ulong	srv_flushing_avg_loops		= 30;
+
+/* The tid of the cleaner thread */
+UNIV_INTERN os_tid_t	srv_cleaner_tid;
+
+/* The tids of the purge threads */
+UNIV_INTERN os_tid_t	srv_purge_tids[SRV_MAX_N_PURGE_THREADS];
+
+/* The tids of the I/O threads */
+UNIV_INTERN os_tid_t	srv_io_tids[SRV_MAX_N_IO_THREADS];
+
+/* The tid of the master thread */
+UNIV_INTERN os_tid_t	srv_master_tid;
+
+/* The relative scheduling priority of the cleaner thread */
+UNIV_INTERN ulint	srv_sched_priority_cleaner	= 19;
+
+/* The relative scheduling priority of the purge threads */
+UNIV_INTERN ulint	srv_sched_priority_purge	= 19;
+
+/* The relative scheduling priority of the I/O threads */
+UNIV_INTERN ulint	srv_sched_priority_io		= 19;
+
+/* The relative scheduling priority of the master thread */
+UNIV_INTERN ulint	srv_sched_priority_master	= 19;
+
+/* The relative priority of the current thread.  If 0, low priority; if 1, high
+priority.  */
+UNIV_INTERN UNIV_THREAD_LOCAL ulint srv_current_thread_priority = 0;
+
+/* The relative priority of the purge coordinator and worker threads.  */
+UNIV_INTERN my_bool	srv_purge_thread_priority	= FALSE;
+
+/* The relative priority of the I/O threads.  */
+UNIV_INTERN my_bool	srv_io_thread_priority		= FALSE;
+
+/* The relative priority of the cleaner thread.  */
+UNIV_INTERN my_bool	srv_cleaner_thread_priority	= FALSE;
+
+/* The relative priority of the master thread.  */
+UNIV_INTERN my_bool	srv_master_thread_priority	= FALSE;
+
+/* The number of purge threads to use.*/
+UNIV_INTERN ulong	srv_n_purge_threads = 1;
+
+/* the number of pages to purge in one batch */
+UNIV_INTERN ulong	srv_purge_batch_size = 20;
+
+/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
+NULL values when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), i.e. all NULL values are treated as equal */
+UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
+
+UNIV_INTERN srv_stats_t	srv_stats;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_var_t export_vars;
+
+/** Normally 0. When nonzero, skip some phases of crash recovery,
+starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered
+by SELECT or mysqldump. When this is nonzero, we do not allow any user
+modifications to the data. */
+UNIV_INTERN ulong	srv_force_recovery;
+#ifndef DBUG_OFF
+/** Inject a crash at different steps of the recovery process.
+This is for testing and debugging only. */
+UNIV_INTERN ulong	srv_force_recovery_crash;
+#endif /* !DBUG_OFF */
+
+/** Print all user-level transactions deadlocks to mysqld stderr */
+
+UNIV_INTERN my_bool	srv_print_all_deadlocks = FALSE;
+
+/* Produce a stacktrace on long semaphore wait */
+UNIV_INTERN my_bool     srv_use_stacktrace = FALSE;
+
+/** Enable INFORMATION_SCHEMA.innodb_cmp_per_index */
+UNIV_INTERN my_bool	srv_cmp_per_index_enabled = FALSE;
+
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, do not even flush the
+buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint	srv_fast_shutdown	= 0;
+
+/* Generate an innodb_status.<pid> file */
+UNIV_INTERN ibool	srv_innodb_status	= FALSE;
+
+/* When estimating number of different key values in an index, sample
+this many index pages, there are 2 ways to calculate statistics:
+* persistent stats that are calculated by ANALYZE TABLE and saved
+  in the innodb database.
+* quick transient stats, that are used if persistent stats for the given
+  table/index are not found in the innodb database */
+UNIV_INTERN unsigned long long	srv_stats_transient_sample_pages = 8;
+UNIV_INTERN my_bool		srv_stats_persistent = TRUE;
+UNIV_INTERN unsigned long long	srv_stats_persistent_sample_pages = 20;
+UNIV_INTERN my_bool		srv_stats_auto_recalc = TRUE;
+
+UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
+UNIV_INTERN ibool       srv_use_atomic_writes = FALSE;
+#ifdef HAVE_POSIX_FALLOCATE
+UNIV_INTERN ibool       srv_use_posix_fallocate = FALSE;
+#endif
+
+/** doublewrite buffer is 1MB in size i.e.: it can hold 128 16K pages.
+The following parameter is the size of the buffer that is used for
+batch flushing i.e.: LRU flushing and flush_list flushing. The rest
+of the pages are used for single page flushing. */
+UNIV_INTERN ulong	srv_doublewrite_batch_size	= 120;
+
+UNIV_INTERN ulong	srv_replication_delay		= 0;
+
+UNIV_INTERN ulong	srv_pass_corrupt_table = 0; /* 0:disable 1:enable */
+
+UNIV_INTERN ulong	srv_log_checksum_algorithm =
+	SRV_CHECKSUM_ALGORITHM_INNODB;
+
+/*-------------------------------------------*/
+UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
+UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
+UNIV_INTERN ibool	srv_priority_boost	= TRUE;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
+UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
+UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
+UNIV_INTERN ibool	srv_print_log_io		= FALSE;
+UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
+#endif /* UNIV_DEBUG */
+
+static ulint		srv_n_rows_inserted_old		= 0;
+static ulint		srv_n_rows_updated_old		= 0;
+static ulint		srv_n_rows_deleted_old		= 0;
+static ulint		srv_n_rows_read_old		= 0;
+
+UNIV_INTERN ulint	srv_truncated_status_writes	= 0;
+UNIV_INTERN ulint	srv_available_undo_logs         = 0;
+
+/* Ensure status variables are on separate cache lines */
+
+#define CACHE_LINE_SIZE 64
+#define CACHE_ALIGNED __attribute__ ((aligned (CACHE_LINE_SIZE)))
+
+UNIV_INTERN byte
+counters_pad_start[CACHE_LINE_SIZE] __attribute__((unused)) = {0};
+
+UNIV_INTERN ulint		srv_read_views_memory CACHE_ALIGNED	= 0;
+UNIV_INTERN ulint		srv_descriptors_memory CACHE_ALIGNED	= 0;
+
+UNIV_INTERN byte
+counters_pad_end[CACHE_LINE_SIZE] __attribute__((unused)) = {0};
+
+/* Set the following to 0 if you want InnoDB to write messages on
+stderr on startup/shutdown. */
+UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
+UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
+UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+UNIV_INTERN time_t	srv_last_monitor_time;
+
+UNIV_INTERN ib_mutex_t	srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+UNIV_INTERN ib_mutex_t	srv_monitor_file_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+# ifndef HAVE_ATOMIC_BUILTINS
+/* Key to register server_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
+/** Key to register srv_innodb_monitor_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
+/** Key to register srv_monitor_file_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_monitor_file_mutex_key;
+/** Key to register srv_dict_tmpfile_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
+/** Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
+/** Key to register srv_sys_t::mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_sys_mutex_key;
+/** Key to register srv_sys_t::tasks_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_sys_tasks_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** Temporary file for innodb monitor output */
+UNIV_INTERN FILE*	srv_monitor_file;
+/** Mutex for locking srv_dict_tmpfile. Not created if srv_read_only_mode.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+UNIV_INTERN ib_mutex_t	srv_dict_tmpfile_mutex;
+/** Temporary file for output from the data dictionary */
+UNIV_INTERN FILE*	srv_dict_tmpfile;
+/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+UNIV_INTERN ib_mutex_t	srv_misc_tmpfile_mutex;
+/** Temporary file for miscellaneous diagnostic output */
+UNIV_INTERN FILE*	srv_misc_tmpfile;
+
+UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
+UNIV_INTERN ulint	srv_main_thread_id		= 0;
+
+/* The following counts are used by the srv_master_thread. */
+
+/** Iterations of the loop bounded by 'srv_active' label. */
+static ulint		srv_main_active_loops		= 0;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+static ulint		srv_main_idle_loops		= 0;
+/** Iterations of the loop bounded by the 'srv_shutdown' label. */
+static ulint		srv_main_shutdown_loops		= 0;
+/** Log writes involving flush. */
+static ulint		srv_log_writes_and_flush	= 0;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of log file has happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t	srv_last_log_flush_time;
+
+/* Interval in seconds at which various tasks are performed by the
+master thread when server is active. In order to balance the workload,
+we should try to keep intervals such that they are not multiples of
+each other. For example, if we have intervals for various tasks
+defined as 5, 10, 15, 60 then all tasks will be performed when
+current_time % 60 == 0 and no tasks will be performed when
+current_time % 5 != 0. */
+
+# define	SRV_MASTER_CHECKPOINT_INTERVAL		(7)
+# define	SRV_MASTER_PURGE_INTERVAL		(10)
+#ifdef MEM_PERIODIC_CHECK
+# define	SRV_MASTER_MEM_VALIDATE_INTERVAL	(13)
+#endif /* MEM_PERIODIC_CHECK */
+# define	SRV_MASTER_DICT_LRU_INTERVAL		(47)
+
+/** Acquire srv_sys->mutex. */
+#define srv_sys_mutex_enter() do {			\
+	mutex_enter(&srv_sys->mutex);			\
+} while (0)
+
+/** Test if srv_sys->mutex is owned; always false if srv_read_only_mode. */
+#define srv_sys_mutex_own() (mutex_own(&srv_sys->mutex)	\
+			     && !srv_read_only_mode)
+
+/** Release srv_sys->mutex. */
+#define srv_sys_mutex_exit() do {			\
+	mutex_exit(&srv_sys->mutex);			\
+} while (0)
+
+#define fetch_lock_wait_timeout(trx)			\
+	((trx)->lock.allowed_to_wait			\
+	 ? thd_lock_wait_timeout((trx)->mysql_thd)	\
+	 : 0)
+
+/*
+	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+	=========================================
+
+There is the following analogue between this database
+server and an operating system kernel:
+
+DB concept			equivalent OS concept
+----------			---------------------
+transaction		--	process;
+
+query thread		--	thread;
+
+lock			--	semaphore;
+
+kernel			--	kernel;
+
+query thread execution:
+(a) without lock mutex
+reserved		--	process executing in user mode;
+(b) with lock mutex reserved
+			--	process executing in kernel mode;
+
+The server has several background threads all running at the same
+priority as user threads. It periodically checks if there is anything
+happening in the server which requires intervention of the master
+thread. Such situations may be, for example, when flushing of dirty
+blocks is needed in the buffer pool or old version of database rows
+have to be cleaned away (purged). The user can configure a separate
+dedicated purge thread(s) too, in which case the master thread does not
+do any purging.
+
+The threads which we call user threads serve the queries of the MySQL
+server. They run at normal priority.
+
+When there is no activity in the system, also the master thread
+suspends itself to wait for an event making the server totally silent.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to keep record
+of which thread owns which resource and in the above case boost the
+priority of the background thread so that it will be scheduled and it
+can release the resource.  This solution is called priority inheritance
+in real-time programming.  A drawback of this solution is that the overhead
+of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100
+MHz Pentium, because the thread has to call os_thread_get_curr_id.  This may
+be compared to 0.5 microsecond overhead for a mutex lock-unlock pair. Note
+that the thread cannot store the information in the resource, say a mutex,
+itself, because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards, the
+information is outdated for the time of one machine instruction, at least.
+(To be precise, the information could be stored to lock_word in mutex if
+the machine supports atomic swap.)
+
+The above solution with priority inheritance may become actual in the
+future, currently we do not implement any priority twiddling solution.
+Our general aim is to reduce the contention of all mutexes by making
+them more fine grained.
+
+The thread table contains information of the current status of each
+thread existing in the system, and also the event semaphores used in
+suspending the master thread and utility threads when they have nothing
+to do.  The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation. */
+
+/** The server system struct */
+struct srv_sys_t{
+	ib_mutex_t	tasks_mutex;		/*!< mutex protecting the
+						tasks queue */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			tasks;			/*!< task queue */
+
+	ib_mutex_t	mutex;			/*!< mutex protecting the
+						fields below. */
+	ulint		n_sys_threads;		/*!< size of the sys_threads
+						array */
+
+	srv_slot_t*	sys_threads;		/*!< server thread table */
+
+	ulint		n_threads_active[SRV_MASTER + 1];
+						/*!< number of threads active,
+						indexed by srv_thread_type */
+
+	srv_stats_t::ulint_ctr_1_t
+			activity_count;		/*!< For tracking server
+						activity */
+};
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+UNIV_INTERN ib_mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+static srv_sys_t*	srv_sys	= NULL;
+
+/** Event to signal the monitor thread. */
+UNIV_INTERN os_event_t	srv_monitor_event;
+
+/** Event to signal the error thread */
+UNIV_INTERN os_event_t	srv_error_event;
+
+/** Event to signal the buffer pool dump/load thread */
+UNIV_INTERN os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+UNIV_INTERN char*	srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+UNIV_INTERN char	srv_buffer_pool_dump_at_shutdown = FALSE;
+UNIV_INTERN char	srv_buffer_pool_load_at_startup = FALSE;
+
+/** Slot index in the srv_sys->sys_threads array for the purge thread. */
+static const ulint	SRV_PURGE_SLOT	= 1;
+
+/** Slot index in the srv_sys->sys_threads array for the master thread. */
+static const ulint	SRV_MASTER_SLOT = 0;
+
+UNIV_INTERN os_event_t	srv_checkpoint_completed_event;
+
+UNIV_INTERN os_event_t	srv_redo_log_thread_finished_event;
+
+/*********************************************************************//**
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+	FILE  *file)    /*!< in: output stream */
+{
+	/* The counters are ulint, so print them with ULINTPF, not %lu. */
+	fprintf(file, "srv_master_thread loops: " ULINTPF " srv_active, "
+		ULINTPF " srv_shutdown, " ULINTPF " srv_idle\n",
+		srv_main_active_loops, srv_main_shutdown_loops,
+		srv_main_idle_loops);
+	fprintf(file, "srv_master_thread log flush and writes: " ULINTPF "\n",
+		srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Sets the state description of an i/o thread; i must be < SRV_MAX_N_IO_THREADS. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str)	/*!< in: constant char string describing the
+				state */
+{
+	ut_a(i < SRV_MAX_N_IO_THREADS);
+
+	srv_io_thread_op_info[i] = str;
+}
+
+/*********************************************************************//**
+Resets every entry of srv_io_thread_op_info[] to "not started yet". */
+UNIV_INTERN
+void
+srv_reset_io_thread_op_info()
+/*=========================*/
+{
+	for (ulint i = 0; i < UT_ARR_SIZE(srv_io_thread_op_info); ++i) {
+		srv_io_thread_op_info[i] = "not started yet";
+	}
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks that a thread type is one that a thread table slot may carry.
+@return TRUE for a worker, purge or master type; otherwise hits ut_error */
+static
+ibool
+srv_thread_type_validate(
+/*=====================*/
+	srv_thread_type	type)	/*!< in: thread type */
+{
+	switch (type) {
+	case SRV_MASTER:
+	case SRV_PURGE:
+	case SRV_WORKER:
+		return(TRUE);
+	case SRV_NONE:
+		break;
+	}
+	ut_error;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the type of a thread table slot; debug builds also validate the type.
+@return thread type */
+static
+srv_thread_type
+srv_slot_get_type(
+/*==============*/
+	const srv_slot_t*	slot)	/*!< in: thread slot */
+{
+	srv_thread_type	type = slot->type;
+	ut_ad(srv_thread_type_validate(type));
+	return(type);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current thread.
+@return	reserved slot */
+static
+srv_slot_t*
+srv_reserve_slot(
+/*=============*/
+	srv_thread_type	type)	/*!< in: type of the thread */
+{
+	srv_slot_t*	slot = NULL;
+
+	srv_sys_mutex_enter();
+
+	ut_ad(srv_thread_type_validate(type));
+
+	switch (type) {
+	case SRV_MASTER:
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+		break;
+
+	case SRV_PURGE:
+		slot = &srv_sys->sys_threads[SRV_PURGE_SLOT];
+		break;
+
+	case SRV_WORKER:
+		/* Find an empty slot, skip the master and purge slots.
+		Assert the bound before reading a slot, not after. */
+		for (slot = &srv_sys->sys_threads[2]; ; ++slot) {
+			ut_a(slot < &srv_sys->sys_threads[
+			     srv_sys->n_sys_threads]);
+			if (!slot->in_use) {
+				break;
+			}
+		}
+		break;
+
+	case SRV_NONE:
+		ut_error;
+	}
+
+	ut_a(!slot->in_use);
+
+	slot->in_use = TRUE;
+	slot->suspended = FALSE;
+	slot->type = type;
+	ut_ad(srv_slot_get_type(slot) == type);
+
+	++srv_sys->n_threads_active[type];
+
+	srv_sys_mutex_exit();
+
+	return(slot);
+}
+
+/*********************************************************************//**
+Marks the slot suspended and resets its event; srv_sys->mutex must be held.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread_low(
+/*===================*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(srv_sys_mutex_own());
+
+	ut_ad(slot->in_use);
+
+	srv_thread_type	type = srv_slot_get_type(slot);
+
+	switch (type) {
+	case SRV_NONE:
+		ut_error;
+
+	case SRV_MASTER:
+		/* We have only one master thread and it
+		should be the first entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_PURGE:
+		/* We have only one purge coordinator thread
+		and it should be the second entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_WORKER:
+		ut_a(srv_n_purge_threads > 1);
+		ut_a(srv_sys->n_threads_active[type] > 0);
+		break;
+	}
+
+	ut_a(!slot->suspended);
+	slot->suspended = TRUE;
+
+	ut_a(srv_sys->n_threads_active[type] > 0);
+
+	srv_sys->n_threads_active[type]--;
+
+	return(os_event_reset(slot->event));
+}
+
+/*********************************************************************//**
+Acquires srv_sys->mutex and calls srv_suspend_thread_low() on the slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread(
+/*===============*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	srv_sys_mutex_enter();
+
+	ib_int64_t	sig_count = srv_suspend_thread_low(slot);
+
+	srv_sys_mutex_exit();
+
+	return(sig_count);
+}
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! This function acquires and releases srv_sys->mutex itself.
+@return number of threads released: this may be less than n if not
+        enough threads were suspended at the moment. */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+	srv_thread_type	type,	/*!< in: thread type */
+	ulint		n)	/*!< in: number of threads to release */
+{
+	ulint		i;
+	ulint		count	= 0;
+
+	ut_ad(srv_thread_type_validate(type));
+	ut_ad(n > 0);
+
+	srv_sys_mutex_enter();
+
+	for (i = 0; i < srv_sys->n_sys_threads; i++) {
+		srv_slot_t*	slot;
+
+		slot = &srv_sys->sys_threads[i];
+
+		if (slot->in_use
+		    && srv_slot_get_type(slot) == type
+		    && slot->suspended) {
+
+			switch (type) {
+			case SRV_NONE:
+				ut_error;
+
+			case SRV_MASTER:
+				/* We have only one master thread and it
+				should be the first entry always. */
+				ut_a(n == 1);
+				ut_a(i == SRV_MASTER_SLOT);
+				ut_a(srv_sys->n_threads_active[type] == 0);
+				break;
+
+			case SRV_PURGE:
+				/* We have only one purge coordinator thread
+				and it should be the second entry always. */
+				ut_a(n == 1);
+				ut_a(i == SRV_PURGE_SLOT);
+				ut_a(srv_n_purge_threads > 0);
+				ut_a(srv_sys->n_threads_active[type] == 0);
+				break;
+
+			case SRV_WORKER:
+				ut_a(srv_n_purge_threads > 1);
+				ut_a(srv_sys->n_threads_active[type]
+				     < srv_n_purge_threads - 1);
+				break;
+			}
+
+			slot->suspended = FALSE;
+
+			++srv_sys->n_threads_active[type];
+
+			os_event_set(slot->event);
+
+			if (++count == n) {
+				break;
+			}
+		}
+	}
+
+	srv_sys_mutex_exit();
+
+	return(count);
+}
+
+/*********************************************************************//**
+Marks the slot's thread inactive unless already suspended, then frees the slot. */
+static
+void
+srv_free_slot(
+/*==========*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	srv_sys_mutex_enter();
+
+	if (!slot->suspended) {
+		/* Mark the thread as inactive. */
+		srv_suspend_thread_low(slot);
+	}
+
+	/* Free the slot for reuse. */
+	ut_ad(slot->in_use);
+	slot->in_use = FALSE;
+
+	srv_sys_mutex_exit();
+}
+
+/*********************************************************************//**
+Initializes the server: allocates srv_sys and creates its mutexes and events. */
+UNIV_INTERN
+void
+srv_init(void)
+/*==========*/
+{
+	ulint	n_sys_threads = 0;
+	ulint	srv_sys_sz = sizeof(*srv_sys);
+
+#ifndef HAVE_ATOMIC_BUILTINS
+	mutex_create(server_mutex_key, &server_mutex, SYNC_ANY_LATCH);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+	mutex_create(srv_innodb_monitor_mutex_key,
+		     &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+
+	if (!srv_read_only_mode) {
+
+		/* One slot per purge thread plus one for the master thread */
+		n_sys_threads = srv_n_purge_threads + 1;
+
+		srv_sys_sz += n_sys_threads * sizeof(*srv_sys->sys_threads);
+	}
+
+	srv_sys = static_cast<srv_sys_t*>(mem_zalloc(srv_sys_sz));
+
+	srv_sys->n_sys_threads = n_sys_threads;
+
+	if (!srv_read_only_mode) {
+
+		mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS);
+
+		mutex_create(srv_sys_tasks_mutex_key,
+			     &srv_sys->tasks_mutex, SYNC_ANY_LATCH);
+
+		srv_sys->sys_threads = (srv_slot_t*) &srv_sys[1];
+
+		for (ulint i = 0; i < srv_sys->n_sys_threads; ++i) {
+			srv_slot_t*	slot = &srv_sys->sys_threads[i];
+
+			slot->event = os_event_create();
+
+			ut_a(slot->event);
+		}
+
+		srv_error_event = os_event_create();
+
+		srv_monitor_event = os_event_create();
+
+		srv_buf_dump_event = os_event_create();
+
+		srv_checkpoint_completed_event = os_event_create();
+
+		srv_redo_log_thread_finished_event = os_event_create();
+
+		UT_LIST_INIT(srv_sys->tasks);
+	}
+
+	/* page_zip_stat_per_index_mutex is acquired from:
+	1. page_zip_compress() (after SYNC_FSP)
+	2. page_zip_decompress()
+	3. i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired)
+	4. innodb_cmp_per_index_update(), no other latches
+	since we do not acquire any other latches while holding this mutex,
+	it can have very low level. We pick SYNC_ANY_LATCH for it. */
+
+	mutex_create(
+		page_zip_stat_per_index_mutex_key,
+		&page_zip_stat_per_index_mutex, SYNC_ANY_LATCH);
+
+	/* Create dummy indexes for infimum and supremum records */
+
+	dict_ind_init();
+
+	srv_conc_init();
+
+	/* Initialize some INFORMATION SCHEMA internal structures */
+	trx_i_s_cache_init(trx_i_s_cache);
+
+	ut_crc32_init();
+}
+
+/*********************************************************************//**
+Frees srv_init() data: srv_conc, srv_sys, trx_i_s_cache, srv_buf_dump_event. */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+	srv_conc_free();
+
+	/* The mutexes srv_sys->mutex and srv_sys->tasks_mutex should have
+	been freed by sync_close() already. */
+	mem_free(srv_sys);
+	srv_sys = NULL;
+
+	trx_i_s_cache_free(trx_i_s_cache);
+
+	if (!srv_read_only_mode) {
+		os_event_free(srv_buf_dump_event);
+		srv_buf_dump_event = NULL;
+	}
+}
+
+/*********************************************************************//**
+Initializes the memory and synchronization subsystems, resets the recovery
+system variables, and initializes the que and row_mysql modules. */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+	ut_mem_init();
+	/* Reset the system variables in the recovery module. */
+	recv_sys_var_init();
+	os_sync_init();
+	sync_init();
+	mem_init(srv_mem_pool_size);
+	que_init();
+	row_mysql_init();
+}
+
+/*********************************************************************//**
+Converts the startup size parameters into the page units used inside InnoDB. */
+static
+void
+srv_normalize_init_values(void)
+/*===========================*/
+{
+	/* Number of pages per megabyte. */
+	const ulint	pages_per_mb = (1024 * 1024) / UNIV_PAGE_SIZE;
+	const ulint	n_files = srv_n_data_files;
+
+	/* Scale these sizes by the number of pages per megabyte. */
+	for (ulint file_no = 0; file_no < n_files; file_no++) {
+		srv_data_file_sizes[file_no] *= pages_per_mb;
+	}
+
+	srv_last_file_size_max *= pages_per_mb;
+
+	/* Divide these sizes by the page size. */
+	srv_log_file_size /= UNIV_PAGE_SIZE;
+
+	srv_log_buffer_size /= UNIV_PAGE_SIZE;
+
+	/* Five times the buffer pool size expressed in pages. */
+	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+}
+
+/*********************************************************************//**
+Boots the InnoDB server: normalizes parameters, then initializes subsystems. */
+UNIV_INTERN
+void
+srv_boot(void)
+/*==========*/
+{
+	/* Transform the init parameter values given by MySQL to
+	use units we use inside InnoDB: */
+
+	srv_normalize_init_values();
+
+	/* Initialize synchronization primitives, memory management, and thread
+	local storage */
+
+	srv_general_init();
+
+	/* Initialize this module */
+
+	srv_init();
+	srv_mon_create();
+}
+
+/******************************************************************//**
+Resets the baselines for per-second averages under srv_innodb_monitor_mutex. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	srv_last_monitor_time = time(NULL);
+
+	os_aio_refresh_stats();
+
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	log_refresh_stats();
+
+	buf_refresh_io_stats_all();
+
+	srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+	srv_n_rows_updated_old = srv_stats.n_rows_updated;
+	srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+	srv_n_rows_read_old = srv_stats.n_rows_read;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t:: mutex */
+	ulint*	trx_start_pos,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end)	/*!< out: file position of the end of
+				the list of active transactions */
+{
+	double	time_elapsed;
+	time_t	current_time;
+	ulint	n_reserved;
+	ibool	ret;
+
+	ulong	btr_search_sys_constant;
+	ulong	btr_search_sys_variable;
+	ulint	lock_sys_subtotal;
+	ulint	recv_sys_subtotal;
+
+	ulint	i;
+	trx_t*	trx;
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	current_time = time(NULL);
+
+	/* We add 0.001 seconds to time_elapsed to prevent division
+	by zero if two users happen to call SHOW ENGINE INNODB STATUS at the
+	same time */
+
+	time_elapsed = difftime(current_time, srv_last_monitor_time)
+		+ 0.001;
+
+	srv_last_monitor_time = time(NULL);
+
+	fputs("\n=====================================\n", file);
+
+	ut_print_timestamp(file);
+	fprintf(file,
+		" INNODB MONITOR OUTPUT\n"
+		"=====================================\n"
+		"Per second averages calculated from the last %lu seconds\n",
+		(ulong) time_elapsed);
+
+	fputs("-----------------\n"
+	      "BACKGROUND THREAD\n"
+	      "-----------------\n", file);
+	srv_print_master_thread_info(file);
+
+	fputs("----------\n"
+	      "SEMAPHORES\n"
+	      "----------\n", file);
+	sync_print(file);
+
+	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
+	order level in sync0sync.h, while dict_foreign_err_mutex has a very
+	low level 135. Therefore we can reserve the latter mutex here without
+	a danger of a deadlock of threads. */
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) {
+		fputs("------------------------\n"
+		      "LATEST FOREIGN KEY ERROR\n"
+		      "------------------------\n", file);
+		ut_copy_file(file, dict_foreign_err_file);
+	}
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	/* Only if lock_print_info_summary proceeds correctly,
+	before we call the lock_print_info_all_transactions
+	to print all the lock information. IMPORTANT NOTE: This
+	function acquires the lock mutex on success. */
+	ret = lock_print_info_summary(file, nowait);
+
+	if (ret) {
+		if (trx_start_pos) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_start_pos = ULINT_UNDEFINED;
+			} else {
+				*trx_start_pos = (ulint) t;
+			}
+		}
+
+		/* NOTE: If we get here then we have the lock mutex. This
+		function will release the lock mutex that we acquired when
+		we called the lock_print_info_summary() function earlier. */
+
+		lock_print_info_all_transactions(file);
+
+		if (trx_end) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_end = ULINT_UNDEFINED;
+			} else {
+				*trx_end = (ulint) t;
+			}
+		}
+	}
+
+	fputs("--------\n"
+	      "FILE I/O\n"
+	      "--------\n", file);
+	os_aio_print(file);
+
+	fputs("-------------------------------------\n"
+	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+	      "-------------------------------------\n", file);
+	ibuf_print(file);
+
+
+	fprintf(file,
+		"%.2f hash searches/s, %.2f non-hash searches/s\n",
+		(btr_cur_n_sea - btr_cur_n_sea_old)
+		/ time_elapsed,
+		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+		/ time_elapsed);
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	fputs("---\n"
+	      "LOG\n"
+	      "---\n", file);
+	log_print(file);
+
+	fputs("----------------------\n"
+	      "BUFFER POOL AND MEMORY\n"
+	      "----------------------\n", file);
+	fprintf(file,
+			"Total memory allocated " ULINTPF
+			"; in additional pool allocated " ULINTPF "\n",
+			ut_total_allocated_memory,
+			mem_pool_get_reserved(mem_comm_pool));
+
+	fprintf(file,
+		"Total memory allocated by read views " ULINTPF "\n",
+		os_atomic_increment_ulint(&srv_read_views_memory, 0));
+
+	/* Calculate AHI constant and variable memory allocations */
+
+	btr_search_sys_constant = 0;
+	btr_search_sys_variable = 0;
+
+	ut_ad(btr_search_sys->hash_tables);
+
+	for (i = 0; i < btr_search_index_num; i++) {
+		hash_table_t* ht = btr_search_sys->hash_tables[i];
+
+		ut_ad(ht);
+		ut_ad(ht->heap);
+
+		/* Multiple mutexes/heaps are currently never used for adaptive
+		hash index tables. */
+		ut_ad(!ht->n_sync_obj);
+		ut_ad(!ht->heaps);
+
+		btr_search_sys_variable += mem_heap_get_size(ht->heap);
+		btr_search_sys_constant += ht->n_cells * sizeof(hash_cell_t);
+	}
+
+	lock_sys_subtotal = 0;
+	if (trx_sys) {
+		mutex_enter(&trx_sys->mutex);
+		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+		while (trx) {
+			lock_sys_subtotal
+				+= ((trx->lock.lock_heap)
+				    ? mem_heap_get_size(trx->lock.lock_heap)
+				    : 0);
+			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+		}
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
+			? mem_heap_get_size(recv_sys->heap) : 0);
+
+	fprintf(file,
+			"Internal hash tables (constant factor + variable factor)\n"
+			"    Adaptive hash index %lu \t(%lu + " ULINTPF ")\n"
+			"    Page hash           %lu (buffer pool 0 only)\n"
+			"    Dictionary cache    %lu \t(%lu + " ULINTPF ")\n"
+			"    File system         %lu \t(%lu + " ULINTPF ")\n"
+			"    Lock system         %lu \t(%lu + " ULINTPF ")\n"
+			"    Recovery system     %lu \t(%lu + " ULINTPF ")\n",
+
+			btr_search_sys_constant + btr_search_sys_variable,
+			btr_search_sys_constant,
+			btr_search_sys_variable,
+
+			(ulong) (buf_pool_from_array(0)->page_hash->n_cells * sizeof(hash_cell_t)),
+
+			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+						+ dict_sys->table_id_hash->n_cells
+						) * sizeof(hash_cell_t)
+					+ dict_sys->size) : 0),
+			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+							+ dict_sys->table_id_hash->n_cells
+							) * sizeof(hash_cell_t)) : 0),
+			dict_sys ? (dict_sys->size) : 0,
+
+			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
+					+ fil_system_hash_nodes()),
+			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
+			fil_system_hash_nodes(),
+
+			(ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
+					+ lock_sys_subtotal),
+			(ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
+			lock_sys_subtotal,
+
+			(ulong) (((recv_sys && recv_sys->addr_hash)
+						? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
+					+ recv_sys_subtotal),
+			(ulong) ((recv_sys && recv_sys->addr_hash)
+					? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
+			recv_sys_subtotal);
+
+	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
+		dict_sys->size);
+
+	buf_print_io(file);
+
+	fputs("--------------\n"
+	      "ROW OPERATIONS\n"
+	      "--------------\n", file);
+	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+		(long) srv_conc_get_active_threads(),
+		srv_conc_get_waiting_threads());
+
+	mutex_enter(&trx_sys->mutex);
+
+	fprintf(file, "%lu read views open inside InnoDB\n",
+		UT_LIST_GET_LEN(trx_sys->view_list));
+
+	fprintf(file, "%lu RW transactions active inside InnoDB\n",
+		UT_LIST_GET_LEN(trx_sys->rw_trx_list));
+
+	fprintf(file, "%lu RO transactions active inside InnoDB\n",
+		UT_LIST_GET_LEN(trx_sys->ro_trx_list));
+
+	fprintf(file, "%lu out of %lu descriptors used\n",
+		trx_sys->descr_n_used, trx_sys->descr_n_max);
+
+	if (UT_LIST_GET_LEN(trx_sys->view_list)) {
+		read_view_t*	view = UT_LIST_GET_LAST(trx_sys->view_list);
+
+		if (view) {
+			fprintf(file, "---OLDEST VIEW---\n");
+			read_view_print(file, view);
+			fprintf(file, "-----------------\n");
+		}
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	n_reserved = fil_space_get_n_reserved_extents(0);
+	if (n_reserved > 0) {
+		fprintf(file,
+			"%lu tablespace extents now reserved for"
+			" B-tree split operations\n",
+			(ulong) n_reserved);
+	}
+
+#ifdef UNIV_LINUX
+	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+		(ulong) srv_main_thread_process_no,
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#else
+	fprintf(file, "Main thread id %lu, state: %s\n",
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#endif
+	fprintf(file,
+		"Number of rows inserted " ULINTPF
+		", updated " ULINTPF ", deleted " ULINTPF
+		", read " ULINTPF "\n",
+		(ulint) srv_stats.n_rows_inserted,
+		(ulint) srv_stats.n_rows_updated,
+		(ulint) srv_stats.n_rows_deleted,
+		(ulint) srv_stats.n_rows_read);
+	fprintf(file,
+		"%.2f inserts/s, %.2f updates/s,"
+		" %.2f deletes/s, %.2f reads/s\n",
+		((ulint) srv_stats.n_rows_inserted - srv_n_rows_inserted_old)
+		/ time_elapsed,
+		((ulint) srv_stats.n_rows_updated - srv_n_rows_updated_old)
+		/ time_elapsed,
+		((ulint) srv_stats.n_rows_deleted - srv_n_rows_deleted_old)
+		/ time_elapsed,
+		((ulint) srv_stats.n_rows_read - srv_n_rows_read_old)
+		/ time_elapsed);
+
+	srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+	srv_n_rows_updated_old = srv_stats.n_rows_updated;
+	srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+	srv_n_rows_read_old = srv_stats.n_rows_read;
+
+	/* Only if lock_print_info_summary proceeds correctly,
+	before we call the lock_print_info_all_transactions
+	to print all the lock information. */
+	ret = lock_print_info_summary(file, nowait);
+
+	if (ret) {
+		lock_print_info_all_transactions(file);
+	}
+
+	fputs("----------------------------\n"
+	      "END OF INNODB MONITOR OUTPUT\n"
+	      "============================\n", file);
+	mutex_exit(&srv_innodb_monitor_mutex);
+	fflush(file);
+
+	return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL. Collects buffer pool
+totals, adaptive hash index and dictionary memory sizes, and then copies
+these together with the global I/O, log, lock and row counters into
+export_vars. All stores to export_vars are made while holding
+srv_innodb_monitor_mutex. */
+UNIV_INTERN
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+	buf_pool_stat_t		stat;
+	buf_pools_list_size_t	buf_pools_list_size;
+	ulint			LRU_len;
+	ulint			free_len;
+	ulint			flush_list_len;
+	ulint			mem_adaptive_hash, mem_dictionary;
+	read_view_t*		oldest_view;
+	ulint			i;
+
+	/* Aggregate the statistics over all buffer pool instances */
+	buf_get_total_stat(&stat);
+	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+	buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+
+	/* Adaptive hash index memory: heap size plus the cell array of
+	every AHI hash table */
+	mem_adaptive_hash = 0;
+
+	ut_ad(btr_search_sys->hash_tables);
+
+	for (i = 0; i < btr_search_index_num; i++) {
+		hash_table_t*	ht = btr_search_sys->hash_tables[i];
+
+		ut_ad(ht);
+		ut_ad(ht->heap);
+		/* Multiple mutexes/heaps are currently never used for adaptive
+		hash index tables. */
+		ut_ad(!ht->n_sync_obj);
+		ut_ad(!ht->heaps);
+
+		mem_adaptive_hash += mem_heap_get_size(ht->heap);
+		mem_adaptive_hash += ht->n_cells * sizeof(hash_cell_t);
+	}
+
+	/* Dictionary memory: both dictionary hash cell arrays plus
+	dict_sys->size; 0 if the dictionary system does not exist */
+	mem_dictionary = (dict_sys ? ((dict_sys->table_hash->n_cells
+					+ dict_sys->table_id_hash->n_cells
+				      ) * sizeof(hash_cell_t)
+				+ dict_sys->size) : 0);
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	export_vars.innodb_data_pending_reads =
+		os_n_pending_reads;
+
+	export_vars.innodb_data_pending_writes =
+		os_n_pending_writes;
+
+	export_vars.innodb_data_pending_fsyncs =
+		fil_n_pending_log_flushes
+		+ fil_n_pending_tablespace_flushes;
+	export_vars.innodb_adaptive_hash_hash_searches
+		= btr_cur_n_sea;
+	export_vars.innodb_adaptive_hash_non_hash_searches
+		= btr_cur_n_non_sea;
+	export_vars.innodb_background_log_sync
+		= srv_log_writes_and_flush;
+
+	export_vars.innodb_data_fsyncs = os_n_fsyncs;
+
+	export_vars.innodb_data_read = srv_stats.data_read;
+
+	export_vars.innodb_data_reads = os_n_file_reads;
+
+	export_vars.innodb_data_writes = os_n_file_writes;
+
+	export_vars.innodb_data_written = srv_stats.data_written;
+
+	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
+
+	export_vars.innodb_buffer_pool_write_requests =
+		srv_stats.buf_pool_write_requests;
+
+	export_vars.innodb_buffer_pool_wait_free =
+		srv_stats.buf_pool_wait_free;
+
+	export_vars.innodb_buffer_pool_pages_flushed =
+		srv_stats.buf_pool_flushed;
+
+	export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
+
+	export_vars.innodb_buffer_pool_read_ahead_rnd =
+		stat.n_ra_pages_read_rnd;
+
+	export_vars.innodb_buffer_pool_read_ahead =
+		stat.n_ra_pages_read;
+
+	export_vars.innodb_buffer_pool_read_ahead_evicted =
+		stat.n_ra_pages_evicted;
+
+	export_vars.innodb_buffer_pool_pages_LRU_flushed =
+		stat.buf_lru_flush_page_count;
+
+	export_vars.innodb_buffer_pool_pages_data = LRU_len;
+
+	export_vars.innodb_buffer_pool_bytes_data =
+		buf_pools_list_size.LRU_bytes
+		+ buf_pools_list_size.unzip_LRU_bytes;
+
+	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
+
+	export_vars.innodb_buffer_pool_bytes_dirty =
+		buf_pools_list_size.flush_list_bytes;
+
+	export_vars.innodb_buffer_pool_pages_free = free_len;
+
+	export_vars.innodb_deadlocks = srv_stats.lock_deadlock_count;
+
+#ifdef UNIV_DEBUG
+	export_vars.innodb_buffer_pool_pages_latched =
+		buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
+
+	/* Pages that are neither on the LRU list nor on the free list */
+	export_vars.innodb_buffer_pool_pages_misc =
+		buf_pool_get_n_pages() - LRU_len - free_len;
+
+	export_vars.innodb_buffer_pool_pages_made_young
+		= stat.n_pages_made_young;
+	export_vars.innodb_buffer_pool_pages_made_not_young
+		= stat.n_pages_not_made_young;
+	export_vars.innodb_buffer_pool_pages_old = 0;
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool = buf_pool_from_array(i);
+		export_vars.innodb_buffer_pool_pages_old
+			+= buf_pool->LRU_old_len;
+	}
+	export_vars.innodb_checkpoint_age
+		= (log_sys->lsn - log_sys->last_checkpoint_lsn);
+	export_vars.innodb_checkpoint_max_age
+		= log_sys->max_checkpoint_age;
+	export_vars.innodb_history_list_length
+		= trx_sys->rseg_history_len;
+	ibuf_export_ibuf_status(
+			&export_vars.innodb_ibuf_size,
+			&export_vars.innodb_ibuf_free_list,
+			&export_vars.innodb_ibuf_segment_size,
+			&export_vars.innodb_ibuf_merges,
+			&export_vars.innodb_ibuf_merged_inserts,
+			&export_vars.innodb_ibuf_merged_delete_marks,
+			&export_vars.innodb_ibuf_merged_deletes,
+			&export_vars.innodb_ibuf_discarded_inserts,
+			&export_vars.innodb_ibuf_discarded_delete_marks,
+			&export_vars.innodb_ibuf_discarded_deletes);
+	export_vars.innodb_lsn_current
+		= log_sys->lsn;
+	export_vars.innodb_lsn_flushed
+		= log_sys->flushed_to_disk_lsn;
+	export_vars.innodb_lsn_last_checkpoint
+		= log_sys->last_checkpoint_lsn;
+	export_vars.innodb_master_thread_active_loops
+		= srv_main_active_loops;
+	export_vars.innodb_master_thread_idle_loops
+		= srv_main_idle_loops;
+	export_vars.innodb_max_trx_id
+		= trx_sys->max_trx_id;
+	export_vars.innodb_mem_adaptive_hash
+		= mem_adaptive_hash;
+	export_vars.innodb_mem_dictionary
+		= mem_dictionary;
+	export_vars.innodb_mem_total
+		= ut_total_allocated_memory;
+	export_vars.innodb_mutex_os_waits
+		= mutex_os_wait_count;
+	export_vars.innodb_mutex_spin_rounds
+		= mutex_spin_round_count;
+	export_vars.innodb_mutex_spin_waits
+		= mutex_spin_wait_count;
+	export_vars.innodb_s_lock_os_waits
+		= rw_lock_stats.rw_s_os_wait_count;
+	export_vars.innodb_s_lock_spin_rounds
+		= rw_lock_stats.rw_s_spin_round_count;
+	export_vars.innodb_s_lock_spin_waits
+		= rw_lock_stats.rw_s_spin_wait_count;
+	export_vars.innodb_x_lock_os_waits
+		= rw_lock_stats.rw_x_os_wait_count;
+	export_vars.innodb_x_lock_spin_rounds
+		= rw_lock_stats.rw_x_spin_round_count;
+	export_vars.innodb_x_lock_spin_waits
+		= rw_lock_stats.rw_x_spin_wait_count;
+
+	/* The monitor output code in this file walks trx_sys->view_list
+	only while holding trx_sys->mutex. Do the same here, so that the
+	oldest view cannot be removed between looking it up and reading
+	its low_limit_id. */
+	mutex_enter(&trx_sys->mutex);
+	oldest_view = UT_LIST_GET_LAST(trx_sys->view_list);
+	export_vars.innodb_oldest_view_low_limit_trx_id
+		= oldest_view ? oldest_view->low_limit_id : 0;
+	mutex_exit(&trx_sys->mutex);
+
+	export_vars.innodb_purge_trx_id = purge_sys->limit.trx_no;
+	export_vars.innodb_purge_undo_no = purge_sys->limit.undo_no;
+	export_vars.innodb_current_row_locks
+		= lock_sys->rec_num;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	export_vars.innodb_have_atomic_builtins = 1;
+#else
+	export_vars.innodb_have_atomic_builtins = 0;
+#endif
+	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
+
+	export_vars.innodb_log_waits = srv_stats.log_waits;
+
+	export_vars.innodb_os_log_written = srv_stats.os_log_written;
+
+	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+
+	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+
+	export_vars.innodb_os_log_pending_writes =
+		srv_stats.os_log_pending_writes;
+
+	export_vars.innodb_log_write_requests = srv_stats.log_write_requests;
+
+	export_vars.innodb_log_writes = srv_stats.log_writes;
+
+	export_vars.innodb_dblwr_pages_written =
+		srv_stats.dblwr_pages_written;
+
+	export_vars.innodb_dblwr_writes = srv_stats.dblwr_writes;
+
+	export_vars.innodb_pages_created = stat.n_pages_created;
+
+	export_vars.innodb_pages_read = stat.n_pages_read;
+
+	export_vars.innodb_pages_written = stat.n_pages_written;
+
+	export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count;
+
+	export_vars.innodb_row_lock_current_waits =
+		srv_stats.n_lock_wait_current_count;
+
+	export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000;
+
+	/* Guard the average against a division by zero */
+	if (srv_stats.n_lock_wait_count > 0) {
+
+		export_vars.innodb_row_lock_time_avg = (ulint)
+			(srv_stats.n_lock_wait_time
+			 / 1000 / srv_stats.n_lock_wait_count);
+
+	} else {
+		export_vars.innodb_row_lock_time_avg = 0;
+	}
+
+	export_vars.innodb_row_lock_time_max =
+		lock_sys->n_lock_max_wait_time / 1000;
+
+	export_vars.innodb_rows_read = srv_stats.n_rows_read;
+
+	export_vars.innodb_rows_inserted = srv_stats.n_rows_inserted;
+
+	export_vars.innodb_rows_updated = srv_stats.n_rows_updated;
+
+	export_vars.innodb_rows_deleted = srv_stats.n_rows_deleted;
+
+	export_vars.innodb_num_open_files = fil_n_file_opened;
+
+	export_vars.innodb_truncated_status_writes =
+		srv_truncated_status_writes;
+
+	export_vars.innodb_available_undo_logs = srv_available_undo_logs;
+	/* Adding 0 atomically is used here to read the current value */
+	export_vars.innodb_read_views_memory
+		= os_atomic_increment_ulint(&srv_read_views_memory, 0);
+	export_vars.innodb_descriptors_memory
+		= os_atomic_increment_ulint(&srv_descriptors_memory, 0);
+
+#ifdef UNIV_DEBUG
+	rw_lock_s_lock(&purge_sys->latch);
+	trx_id_t	done_trx_no	= purge_sys->done.trx_no;
+	trx_id_t	up_limit_id	= purge_sys->view
+		? purge_sys->view->up_limit_id
+		: 0;
+	rw_lock_s_unlock(&purge_sys->latch);
+
+	mutex_enter(&trx_sys->mutex);
+	trx_id_t	max_trx_id	= trx_sys->rw_max_trx_id;
+	mutex_exit(&trx_sys->mutex);
+
+	/* Report an age of 0 when purge has not processed anything yet or
+	when the subtraction below would underflow */
+	if (!done_trx_no || max_trx_id < done_trx_no - 1) {
+		export_vars.innodb_purge_trx_id_age = 0;
+	} else {
+		export_vars.innodb_purge_trx_id_age =
+			(ulint) (max_trx_id - done_trx_no + 1);
+	}
+
+	if (!up_limit_id
+	    || max_trx_id < up_limit_id) {
+		export_vars.innodb_purge_view_trx_id_age = 0;
+	} else {
+		export_vars.innodb_purge_view_trx_id_age =
+			(ulint) (max_trx_id - up_limit_id);
+	}
+#endif /* UNIV_DEBUG */
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+It wakes up on srv_monitor_event or after a timeout, and once more than
+15 seconds have passed since the last round it prints whichever monitors
+are enabled: the InnoDB monitor to stderr and/or srv_monitor_file, and the
+tablespace and table monitors at most once per 60 seconds. The thread
+exits when srv_shutdown_state reaches SRV_SHUTDOWN_CLEANUP.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ib_int64_t	sig_count;
+	double		time_elapsed;
+	time_t		current_time;
+	time_t		last_table_monitor_time;
+	time_t		last_tablespace_monitor_time;
+	time_t		last_monitor_time;
+	ulint		mutex_skipped;
+	ibool		last_srv_print_monitor;
+
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+	srv_monitor_active = TRUE;
+
+	UT_NOT_USED(arg);
+	srv_last_monitor_time = ut_time();
+	last_table_monitor_time = ut_time();
+	last_tablespace_monitor_time = ut_time();
+	last_monitor_time = ut_time();
+	mutex_skipped = 0;
+	last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+	/* Wake up every 5 seconds to see if we need to print
+	monitor information or if signalled at shutdown. */
+
+	sig_count = os_event_reset(srv_monitor_event);
+
+	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
+
+	current_time = ut_time();
+
+	time_elapsed = difftime(current_time, last_monitor_time);
+
+	if (time_elapsed > 15) {
+		last_monitor_time = ut_time();
+
+		if (srv_print_innodb_monitor) {
+			/* Reset mutex_skipped counter everytime
+			srv_print_innodb_monitor changes. This is to
+			ensure we will not be blocked by lock_sys->mutex
+			for short duration information printing,
+			such as requested by sync_array_print_long_waits() */
+			if (!last_srv_print_monitor) {
+				mutex_skipped = 0;
+				last_srv_print_monitor = TRUE;
+			}
+
+			if (!srv_printf_innodb_monitor(stderr,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				/* Reset the counter */
+				mutex_skipped = 0;
+			}
+		} else {
+			last_srv_print_monitor = FALSE;
+		}
+
+
+		/* We don't create the temp files or associated
+		mutexes in read-only-mode */
+
+		if (!srv_read_only_mode && srv_innodb_status) {
+			mutex_enter(&srv_monitor_file_mutex);
+			/* Overwrite the status file from its beginning and
+			cut it off after the newly written output */
+			rewind(srv_monitor_file);
+			if (!srv_printf_innodb_monitor(srv_monitor_file,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				mutex_skipped = 0;
+			}
+
+			os_file_set_eof(srv_monitor_file);
+			mutex_exit(&srv_monitor_file_mutex);
+		}
+
+		if (srv_print_innodb_tablespace_monitor
+		    && difftime(current_time,
+				last_tablespace_monitor_time) > 60) {
+			last_tablespace_monitor_time = ut_time();
+
+			fputs("========================"
+			      "========================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "========================"
+			      "========================\n",
+			      stderr);
+
+			fsp_print(0);
+			fputs("Validating tablespace\n", stderr);
+			fsp_validate(0);
+			fputs("Validation ok\n"
+			      "---------------------------------------\n"
+			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "=======================================\n",
+			      stderr);
+		}
+
+		if (srv_print_innodb_table_monitor
+		    && difftime(current_time, last_table_monitor_time) > 60) {
+
+			last_table_monitor_time = ut_time();
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+
+			fputs("===========================================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLE MONITOR OUTPUT\n"
+			      "===========================================\n",
+			      stderr);
+			dict_print();
+
+			fputs("-----------------------------------\n"
+			      "END OF INNODB TABLE MONITOR OUTPUT\n"
+			      "==================================\n",
+			      stderr);
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+		}
+	}
+
+	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+		goto exit_func;
+	}
+
+	/* Keep polling whether or not any monitor is currently enabled;
+	the thread only leaves the loop at shutdown. */
+	goto loop;
+
+exit_func:
+	srv_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+On every iteration it also checks that the log sequence number has not
+decreased, refreshes the InnoDB Monitor statistics when they are older
+than 60 seconds, updates the LRU eviction statistics and, when
+srv_kill_idle_transaction is set, kills sessions whose active transaction
+has been idle for longer than that setting. The loop ends when
+srv_shutdown_state reaches SRV_SHUTDOWN_CLEANUP.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	/* number of successive fatal timeouts observed */
+	ulint		fatal_cnt	= 0;
+	lsn_t		old_lsn;
+	lsn_t		new_lsn;
+	ib_int64_t	sig_count;
+	/* longest waiting thread for a semaphore */
+	os_thread_id_t	waiter		= os_thread_get_curr_id();
+	os_thread_id_t	old_waiter	= waiter;
+	/* the semaphore that is being waited for */
+	const void*	sema		= NULL;
+	const void*	old_sema	= NULL;
+
+	ut_ad(!srv_read_only_mode);
+
+	old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Error monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_error_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+	srv_error_monitor_active = TRUE;
+
+loop:
+	/* Try to track a strange bug reported by Harald Fuchs and others,
+	where the lsn seems to decrease at times */
+
+	new_lsn = log_get_lsn();
+
+	if (new_lsn < old_lsn) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: old log sequence number " LSN_PF
+			" was greater\n"
+			"InnoDB: than the new log sequence number " LSN_PF "!\n"
+			"InnoDB: Please submit a bug report"
+			" to http://bugs.mysql.com\n",
+			old_lsn, new_lsn);
+		ut_ad(0);
+	}
+
+	old_lsn = new_lsn;
+
+	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+		/* We refresh InnoDB Monitor values so that averages are
+		printed from at most 60 last seconds */
+
+		srv_refresh_innodb_monitor_stats();
+	}
+
+	/* Update the statistics collected for deciding LRU
+	eviction policy. */
+	buf_LRU_stat_update();
+
+	/* In case mutex_exit is not a memory barrier, it is
+	theoretically possible some threads are left waiting though
+	the semaphore is already released. Wake up those threads: */
+
+	sync_arr_wake_threads_if_sema_free();
+
+	/* Count consecutive iterations in which the same thread is
+	reported as waiting for the same semaphore; any other outcome
+	restarts the count and remembers the newly reported pair. */
+	if (sync_array_print_long_waits(&waiter, &sema)
+	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+		fatal_cnt++;
+		if (fatal_cnt > 10) {
+
+			fprintf(stderr,
+				"InnoDB: Error: semaphore wait has lasted"
+				" > %lu seconds\n"
+				"InnoDB: We intentionally crash the server,"
+				" because it appears to be hung.\n",
+				(ulong) srv_fatal_semaphore_wait_threshold);
+
+			ut_error;
+		}
+	} else {
+		fatal_cnt = 0;
+		old_waiter = waiter;
+		old_sema = sema;
+	}
+
+	if (srv_kill_idle_transaction && trx_sys) {
+		trx_t*	trx;
+		time_t	now;
+rescan_idle:
+		now = time(NULL);
+		mutex_enter(&trx_sys->mutex);
+		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+		while (trx) {
+			if (!trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+			    && trx_state_eq(trx, TRX_STATE_ACTIVE)
+			    && trx->mysql_thd
+			    && innobase_thd_is_idle(trx->mysql_thd)) {
+				ib_int64_t	start_time = innobase_thd_get_start_time(trx->mysql_thd);
+				ulong		thd_id = innobase_thd_get_thread_id(trx->mysql_thd);
+
+				if (trx->last_stmt_start != start_time) {
+					/* The start time reported for the
+					session changed since the last check:
+					restart the idle clock */
+					trx->idle_start = now;
+					trx->last_stmt_start = start_time;
+				} else if (difftime(now, trx->idle_start)
+					   > srv_kill_idle_transaction) {
+					/* kill the session */
+					/* trx_sys->mutex is released before
+					the kill, so the list position is no
+					longer valid: start the scan over */
+					mutex_exit(&trx_sys->mutex);
+					innobase_thd_kill(thd_id);
+					goto rescan_idle;
+				}
+			}
+			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+		}
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	/* Flush stderr so that a database user gets the output
+	to possible MySQL error file */
+
+	fflush(stderr);
+
+	/* Sleep until srv_error_event is signalled or the timeout of
+	1000000 expires (presumably microseconds, i.e. one second; the
+	monitor thread in this file documents 5000000 as 5 seconds) */
+	sig_count = os_event_reset(srv_error_event);
+
+	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
+
+	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+		goto loop;
+	}
+
+	srv_error_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/******************************************************************//**
+Increment the server activity count, i.e. the activity_count counter
+kept in srv_sys. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void)
+/*========================*/
+{
+	srv_sys->activity_count.inc();
+}
+
+/**********************************************************************//**
+Reports which kind of background thread, if any, is still running.
+The per-type active counters in srv_sys are inspected under the srv_sys
+mutex, from SRV_WORKER up to SRV_MASTER. When none of them is set and a
+shutdown is in progress, the purge state is consulted as well.
+@return SRV_NONE if all are suspended or have exited (always in
+read-only mode), otherwise the type of a thread that is still active */
+UNIV_INTERN
+srv_thread_type
+srv_get_active_thread_type(void)
+/*============================*/
+{
+	if (srv_read_only_mode) {
+		return(SRV_NONE);
+	}
+
+	srv_thread_type	active_type = SRV_NONE;
+
+	srv_sys_mutex_enter();
+
+	ulint	type = SRV_WORKER;
+
+	while (type <= SRV_MASTER) {
+		if (srv_sys->n_threads_active[type] != 0) {
+			/* The first non-zero counter decides the result */
+			active_type = static_cast<srv_thread_type>(type);
+			break;
+		}
+
+		++type;
+	}
+
+	srv_sys_mutex_exit();
+
+	if (active_type != SRV_NONE) {
+		return(active_type);
+	}
+
+	/* Check only on shutdown. */
+	if (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		return(SRV_NONE);
+	}
+
+	/* Purge counts as active unless it is disabled or has exited */
+	if (trx_purge_state() == PURGE_STATE_DISABLED) {
+		return(SRV_NONE);
+	}
+
+	if (trx_purge_state() == PURGE_STATE_EXIT) {
+		return(SRV_NONE);
+	}
+
+	return(SRV_PURGE);
+}
+
+/**********************************************************************//**
+Finds out whether one of the auxiliary background threads is still
+running and signals the wakeup events of all of them. The activity flags
+are tested in a fixed order and the first one found set decides the
+result. In read-only mode nothing is tested and no event is signalled.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void)
+/*=======================================*/
+{
+	if (srv_read_only_mode) {
+		return(NULL);
+	}
+
+	const char*	active_name;
+
+	if (srv_error_monitor_active) {
+		active_name = "srv_error_monitor_thread";
+	} else if (lock_sys->timeout_thread_active) {
+		active_name = "srv_lock_timeout thread";
+	} else if (srv_monitor_active) {
+		active_name = "srv_monitor_thread";
+	} else if (srv_buf_dump_thread_active) {
+		active_name = "buf_dump_thread";
+	} else if (srv_dict_stats_thread_active) {
+		active_name = "dict_stats_thread";
+	} else {
+		active_name = NULL;
+	}
+
+	/* Signal every thread's event, whichever thread was reported */
+	os_event_set(srv_error_event);
+	os_event_set(srv_monitor_event);
+	os_event_set(srv_buf_dump_event);
+	os_event_set(lock_sys->timeout_event);
+	os_event_set(dict_stats_event);
+
+	return(active_name);
+}
+
+/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+Each time srv_checkpoint_completed_event is signalled it calls
+log_online_follow_redo_log(), until either that call fails or
+srv_shutdown_state reaches SRV_SHUTDOWN_LAST_PHASE. On the way out it
+turns srv_track_changed_pages off and signals
+srv_redo_log_thread_finished_event.
+@return a dummy value */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_redo_log_follow_thread)(
+/*=======================================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						     required by
+						     os_thread_create */
+{
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Redo log follower thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_log_tracking_thread_key);
+#endif
+
+	my_thread_init();
+
+	do {
+		/* Block until a checkpoint completion is signalled, then
+		re-arm the event for the next one */
+		os_event_wait(srv_checkpoint_completed_event);
+		os_event_reset(srv_checkpoint_completed_event);
+
+#ifdef UNIV_DEBUG
+		/* Debug builds only: skip this round while tracking is
+		switched off; "continue" still evaluates the loop condition */
+		if (!srv_track_changed_pages) {
+			continue;
+		}
+#endif
+
+		if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
+			if (!log_online_follow_redo_log()) {
+				/* TODO: sync with I_S log tracking status? */
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"log tracking bitmap write failed, "
+					"stopping log tracking thread!\n");
+				break;
+			}
+		}
+
+	} while (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE);
+
+	/* Reached both on shutdown and after a failed bitmap write */
+	srv_track_changed_pages = FALSE;
+	log_online_read_shutdown();
+	os_event_set(srv_redo_log_thread_finished_event);
+
+	my_thread_end();
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*************************************************************//**
+Removes old archived transaction log files from srv_arch_dir.
+Only one of the two parameters is honoured per call: when before_no is
+non-zero the files are selected by the number in their name, otherwise
+they are selected by modification time using before_date.
+Files that the log system still uses, or is currently archiving into,
+are left alone.
+@return DB_SUCCESS if the directory was scanned to its end; DB_ERROR if
+no archive directory is configured, its name does not fit into a path
+buffer, it cannot be opened, or a selected file could not be deleted */
+dberr_t
+purge_archived_logs(
+	time_t	before_date,		/*!< in: all files modified
+					before timestamp should be removed */
+	lsn_t	before_no)		/*!< in: files with this number in name
+					and earlier should be removed */
+{
+	log_group_t*	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	os_file_dir_t	dir;
+	os_file_stat_t	fileinfo;
+	char		archived_log_filename[OS_FILE_MAX_PATH];
+	char		namegen[OS_FILE_MAX_PATH];
+	ulint		dirnamelen;
+
+	if (!srv_arch_dir) {
+		/* log archive directory is not specified */
+		return(DB_ERROR);
+	}
+
+	dirnamelen = strlen(srv_arch_dir);
+
+	/* The directory name, a path separator that may have to be
+	appended and the terminating NUL must all fit into the buffer;
+	otherwise the copy below would write past its end. */
+	if (dirnamelen + 2 > sizeof(archived_log_filename)) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"archived log directory name %s is too long. "
+			"Purge archived logs are not available\n",
+			srv_arch_dir);
+		return(DB_ERROR);
+	}
+
+	dir = os_file_opendir(srv_arch_dir, FALSE);
+	if (!dir) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"opening archived log directory %s failed. "
+			"Purge archived logs are not available\n",
+			srv_arch_dir);
+		/* failed to open directory */
+		return(DB_ERROR);
+	}
+
+	/* Build the "<dir><separator>" prefix once; the file name is
+	written right behind it for every directory entry. */
+	memcpy(archived_log_filename, srv_arch_dir, dirnamelen);
+	if (dirnamelen &&
+		archived_log_filename[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+		archived_log_filename[dirnamelen++] = SRV_PATH_SEPARATOR;
+	}
+
+	memset(&fileinfo, 0, sizeof(fileinfo));
+	while(!os_file_readdir_next_file(srv_arch_dir, dir,
+				&fileinfo) ) {
+		/* Skip everything that is not named like an archived log */
+		if (strncmp(fileinfo.name,
+			IB_ARCHIVED_LOGS_PREFIX, IB_ARCHIVED_LOGS_PREFIX_LEN)) {
+			continue;
+		}
+		if (dirnamelen + strlen(fileinfo.name) + 2 > OS_FILE_MAX_PATH) {
+			continue;
+		}
+
+		/* Bound the write by the space left behind the prefix,
+		not by the size of the whole buffer */
+		snprintf(archived_log_filename + dirnamelen,
+			 sizeof(archived_log_filename) - dirnamelen,
+			 "%s", fileinfo.name);
+
+		if (before_no) {
+			/* Select by the number that follows the prefix in
+			the file name */
+			ib_uint64_t log_file_no = strtoull(fileinfo.name +
+					IB_ARCHIVED_LOGS_PREFIX_LEN,
+					NULL, 10);
+			if (log_file_no == 0 || before_no <= log_file_no) {
+				continue;
+			}
+		} else {
+			/* Select by modification time */
+			fileinfo.mtime = 0;
+			if (os_file_get_status(archived_log_filename,
+					&fileinfo, false) != DB_SUCCESS ||
+					fileinfo.mtime == 0) {
+				continue;
+			}
+
+			if (before_date == 0 || fileinfo.mtime > before_date) {
+				continue;
+			}
+		}
+
+		/* We are going to delete archived file. Acquire log_sys->mutex
+		to make sure that we are the only one trying to delete the
+		file. This also prevents log system from using this file. Do
+		not delete the file if it is currently being written or has
+		pending IO. This is enforced by checking:
+		  1. fil_space_contains_node.
+		  2. group->archived_offset % group->file_size != 0, i.e.
+		     there is archive in progress and we are going to delete it.
+		This covers 3 cases:
+		  a. Usual case when we have one archive in progress,
+		     both 1 and 2 are TRUE
+		  b. When we have more than 1 archive in fil_space,
+		     this can happen when flushed LSN range crosses file
+		     boundary
+		  c. When we have empty fil_space, but existing file will be
+		     opened once archiving operation is requested. This usually
+		     happens on startup.
+		*/
+
+		mutex_enter(&log_sys->mutex);
+
+		log_archived_file_name_gen(namegen, sizeof(namegen),
+					   group->id, group->archived_file_no);
+
+		if (fil_space_contains_node(group->archive_space_id,
+					    archived_log_filename) ||
+		    (group->archived_offset % group->file_size != 0 &&
+		     strcmp(namegen, archived_log_filename) == 0)) {
+
+			mutex_exit(&log_sys->mutex);
+			continue;
+		}
+
+		if (!os_file_delete_if_exists(innodb_file_data_key,
+					     archived_log_filename)) {
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"can't delete archived log file %s.\n",
+				archived_log_filename);
+
+			mutex_exit(&log_sys->mutex);
+			os_file_closedir(dir);
+
+			return(DB_ERROR);
+		}
+
+		mutex_exit(&log_sys->mutex);
+	}
+
+	os_file_closedir(dir);
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Records database activity and wakes up the master thread if its slot is
+in use and suspended (not sleeping). Used in the MySQL interface. Does
+nothing in read-only mode. Note that there is a small chance that the
+master thread stays suspended: the active-thread counter is first read
+without holding the srv_sys mutex, for performance reasons. */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	ut_ad(!srv_sys_mutex_own());
+
+	srv_inc_activity_count();
+
+	if (srv_sys->n_threads_active[SRV_MASTER] == 0) {
+		srv_slot_t*	slot;
+
+		srv_sys_mutex_enter();
+
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+
+		/* Only if the master thread has been started. */
+
+		if (slot->in_use) {
+			ut_a(srv_slot_get_type(slot) == SRV_MASTER);
+
+			if (slot->suspended) {
+				/* Mark the slot as running, then signal. */
+				slot->suspended = FALSE;
+
+				++srv_sys->n_threads_active[SRV_MASTER];
+
+				os_event_set(slot->event);
+			}
+		}
+
+		srv_sys_mutex_exit();
+	}
+}
+
+/*******************************************************************//**
+Wakes up the purge coordinator when purge is in the running state and no
+purge thread is counted as active. The check is made without holding the
+srv_sys_t mutex or purge_sys->latch (for performance reasons), so there
+is a small chance that the purge thread stays suspended. */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void)
+/*=====================================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	if (purge_sys->state != PURGE_STATE_RUN
+	    || srv_sys->n_threads_active[SRV_PURGE] != 0) {
+		return;
+	}
+
+	srv_release_threads(SRV_PURGE, 1);
+}
+
+/*******************************************************************//**
+Records activity, then wakes up the master thread if it is suspended or
+being suspended. The caller must not own the srv_sys mutex. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+	srv_inc_activity_count();
+
+	srv_release_threads(SRV_MASTER, 1);
+}
+
+/*******************************************************************//**
+Get current server activity count. The counter is read without holding
+srv_sys::mutex because the value is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void)
+/*========================*/
+{
+	return(srv_sys->activity_count);
+}
+
+/*******************************************************************//**
+Check if there has been any activity since old_activity_count was read.
+@return TRUE if the activity counter differs, FALSE if it is unchanged */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count)	/*!< in: old activity count */
+{
+	return(srv_sys->activity_count != old_activity_count);
+}
+
+/********************************************************************//**
+Flushes the log buffer in the background when at least
+srv_flush_log_at_timeout seconds have passed since the last such flush.
+The master thread calls this so that only a bounded amount of trxs is
+lost in case of a crash when innodb_flush_log_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+	time_t	current_time = time(NULL);
+
+	srv_main_thread_op_info = "flushing log";
+	if (difftime(current_time, srv_last_log_flush_time)
+	    >= srv_flush_log_at_timeout) {
+		log_buffer_sync_in_background(TRUE);
+		srv_last_log_flush_time = current_time;
+		srv_log_writes_and_flush++;
+	}
+}
+
+/********************************************************************//**
+Make room in the table cache by evicting unused tables.
+@return number of tables evicted. */
+static
+ulint
+srv_master_evict_from_table_cache(
+/*==============================*/
+	ulint	pct_check)	/*!< in: max percent to check */
+{
+	ulint	evicted;
+
+	/* Latch order: dict_operation_lock first, then the dict mutex. */
+	rw_lock_x_lock(&dict_operation_lock);
+	dict_mutex_enter_for_mysql();
+
+	evicted = dict_make_room_in_cache(
+		innobase_get_table_cache_size(), pct_check);
+
+	/* Release in the reverse order of acquisition. */
+	dict_mutex_exit_for_mysql();
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	return(evicted);
+}
+
+/*********************************************************************//**
+Prints a progress message to stderr, at most once every 60 seconds during
+server shutdown, for the activities that the master thread is waiting on. */
+static
+void
+srv_shutdown_print_master_pending(
+/*==============================*/
+	ib_time_t*	last_print_time,	/*!< in/out: last time this
+						function printed a message */
+	ulint		n_tables_to_drop,	/*!< in: number of tables to
+						be dropped */
+	ulint		n_bytes_merged)		/*!< in: number of change
+						buffer bytes just merged */
+{
+	ib_time_t	current_time;
+	double		time_elapsed;
+
+	current_time = ut_time();
+	time_elapsed = ut_difftime(current_time, *last_print_time);
+
+	if (time_elapsed > 60) {
+		*last_print_time = ut_time();
+
+		if (n_tables_to_drop) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for "
+				"%lu table(s) to be dropped\n",
+				(ulong) n_tables_to_drop);
+		}
+
+		/* We only wait for the change buffer merge in a slow
+		shutdown. The ulint is cast to ulong to match "%lu". */
+		if (!srv_fast_shutdown && n_bytes_merged) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for change "
+				"buffer merge to complete\n"
+				"  InnoDB: number of bytes of change buffer "
+				"just merged:  %lu\n",
+				(ulong) n_bytes_merged);
+		}
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do when the
+server is active. There are two types of tasks. The first category is
+of such tasks which are performed at each invocation of this function.
+We assume that this function is called roughly every second when the
+server is active. The second category is of such tasks which are
+performed at some interval e.g.: dict LRU cleanup, checkpoint etc. */
+static
+void
+srv_master_do_active_tasks(void)
+/*============================*/
+{
+	ib_time_t	cur_time = ut_time();
+	ullint		counter_time = ut_time_us(NULL);
+
+	/* First do the tasks that we are supposed to do at each
+	invocation of this function. */
+
+	++srv_main_active_loops;
+
+	MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	srv_main_thread_op_info = "doing background drop tables";
+	row_drop_tables_for_mysql_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* Do an ibuf merge */
+	srv_main_thread_op_info = "doing insert buffer merge";
+	counter_time = ut_time_us(NULL);
+	ibuf_contract_in_background(0, FALSE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+	/* Flush logs if needed */
+	srv_main_thread_op_info = "flushing log";
+	srv_sync_log_buffer_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+	/* Now see if various tasks that are performed at defined
+	intervals need to be performed. */
+
+#ifdef MEM_PERIODIC_CHECK
+	/* Check magic numbers of every allocated mem block once in
+	SRV_MASTER_MEM_VALIDATE_INTERVAL seconds */
+	if (cur_time % SRV_MASTER_MEM_VALIDATE_INTERVAL == 0) {
+		mem_validate_all_blocks();
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_MEM_VALIDATE_MICROSECOND, counter_time);
+	}
+#endif
+
+	/* The interval-based tasks below are not started once shutdown
+	has begun (srv_shutdown_state > 0). */
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	if (cur_time % SRV_MASTER_DICT_LRU_INTERVAL == 0) {
+		srv_main_thread_op_info = "enforcing dict cache limit";
+		srv_master_evict_from_table_cache(50);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+	}
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* Make a new checkpoint */
+	if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) {
+		srv_main_thread_op_info = "making checkpoint";
+		log_checkpoint(TRUE, FALSE, TRUE);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time);
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do whenever the
+server is idle. We do check for the server state during this function
+and if the server has entered the shutdown phase we may return from
+the function without completing the required tasks.
+Note that the server can move to active state when we are executing this
+function but we don't check for that as we are supposed to perform more
+or less the same tasks when the server is active. */
+static
+void
+srv_master_do_idle_tasks(void)
+/*==========================*/
+{
+	ullint	counter_time;
+
+	++srv_main_idle_loops;
+
+	MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
+
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	counter_time = ut_time_us(NULL);
+	srv_main_thread_op_info = "doing background drop tables";
+	row_drop_tables_for_mysql_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+			 counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* Do an ibuf merge */
+	counter_time = ut_time_us(NULL);
+	srv_main_thread_op_info = "doing insert buffer merge";
+	ibuf_contract_in_background(0, TRUE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+	/* When idle, pass 100 as the max percent of the cache to check. */
+	srv_main_thread_op_info = "enforcing dict cache limit";
+	srv_master_evict_from_table_cache(100);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* Make a new checkpoint */
+	srv_main_thread_op_info = "making checkpoint";
+	log_checkpoint(TRUE, FALSE, TRUE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND,
+				       counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+	/* Archived-log expiry is on only if the setting is nonzero. */
+	if (srv_log_arch_expire_sec) {
+		srv_main_thread_op_info = "purging archived logs";
+		purge_archived_logs(ut_time() - srv_log_arch_expire_sec,
+				0);
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks during shutdown. The tasks that we do at shutdown
+depend on srv_fast_shutdown:
+2 => very fast shutdown => do no book keeping
+1 => normal shutdown => clear drop table queue and make checkpoint
+0 => slow shutdown => in addition to above do an ibuf merge and a
+background log flush
+@return TRUE if tables remain to be dropped or ibuf bytes were merged */
+static
+ibool
+srv_master_do_shutdown_tasks(
+/*=========================*/
+	ib_time_t*	last_print_time)/*!< in/out: last time the progress
+					message was printed */
+{
+	ulint		n_bytes_merged = 0;
+	ulint		n_tables_to_drop = 0;
+
+	ut_ad(!srv_read_only_mode);
+
+	++srv_main_shutdown_loops;
+
+	ut_a(srv_shutdown_state > 0);
+
+	/* In very fast shutdown none of the following is necessary */
+	if (srv_fast_shutdown == 2) {
+		return(FALSE);
+	}
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	srv_main_thread_op_info = "doing background drop tables";
+	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* In case of normal shutdown we skip the ibuf merge and log flush */
+	if (srv_fast_shutdown == 1) {
+		goto func_exit;
+	}
+
+	/* Do an ibuf merge */
+	srv_main_thread_op_info = "doing insert buffer merge";
+	n_bytes_merged = ibuf_contract_in_background(0, TRUE);
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+
+func_exit:
+	/* Make a new checkpoint on every call */
+	srv_main_thread_op_info = "making checkpoint";
+	log_checkpoint(TRUE, FALSE, FALSE);
+
+	/* Print progress message every 60 seconds during shutdown */
+	if (srv_shutdown_state > 0 && srv_print_verbose_log) {
+		srv_shutdown_print_master_pending(
+			last_print_time, n_tables_to_drop, n_bytes_merged);
+	}
+
+	return(n_bytes_merged || n_tables_to_drop);
+}
+
+/*********************************************************************//**
+Puts the master thread to sleep for one second; it polls the server state
+between sleeps. srv_main_thread_op_info is set to "sleeping" while asleep
+and cleared afterwards. */
+static
+void
+srv_master_sleep(void)
+/*==================*/
+{
+	srv_main_thread_op_info = "sleeping";
+	os_thread_sleep(1000000);	/* the one second described above */
+	srv_main_thread_op_info = "";
+}
+
+/*********************************************************************//**
+The master thread controlling the server: it polls for background work.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	srv_slot_t*	slot;
+	ulint		old_activity_count = srv_get_activity_count();
+	ib_time_t	last_print_time;
+
+	ut_ad(!srv_read_only_mode);
+
+	srv_master_tid = os_thread_get_tid();
+
+	os_thread_set_priority(srv_master_tid, srv_sched_priority_master);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Master thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_master_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+	srv_main_thread_process_no = os_proc_get_number();
+	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+	/* The master thread is expected to get the first slot. */
+	slot = srv_reserve_slot(SRV_MASTER);
+	ut_a(slot == srv_sys->sys_threads);
+
+	last_print_time = ut_time();
+loop:
+	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+		goto suspend_thread;
+	}
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		/* One polling round: sleep, then active or idle tasks. */
+		srv_master_sleep();
+
+		MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+
+		srv_current_thread_priority = srv_master_thread_priority;
+
+		if (srv_check_activity(old_activity_count)) {
+			old_activity_count = srv_get_activity_count();
+			srv_master_do_active_tasks();
+		} else {
+			srv_master_do_idle_tasks();
+		}
+	}
+	/* Shutdown has started: repeat while a pass still reports work. */
+	while (srv_master_do_shutdown_tasks(&last_print_time)) {
+
+		/* Shouldn't loop here in case of very fast shutdown */
+		ut_ad(srv_fast_shutdown < 2);
+	}
+
+suspend_thread:
+	srv_main_thread_op_info = "suspending";
+
+	srv_suspend_thread(slot);
+
+	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
+	waits for database activity to die down when converting < 4.1.x
+	databases, and relies on this string being exactly as it is. InnoDB
+	manual also mentions this string in several places. */
+	srv_main_thread_op_info = "waiting for server activity";
+
+	os_event_wait(slot->event);
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
+	}
+	/* Woken up without an exit request: go back to the main loop. */
+	goto loop;
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Decides whether purge should stop because of the server shutdown state.
+@return true if purge should exit */
+static
+bool
+srv_purge_should_exit(
+/*==============*/
+	ulint		n_purged)	/*!< in: pages purged in last batch */
+{
+	bool	stop = false;
+
+	switch (srv_shutdown_state) {
+	case SRV_SHUTDOWN_NONE:
+		/* No shutdown in progress: keep purging. */
+		break;
+	case SRV_SHUTDOWN_CLEANUP:
+	case SRV_SHUTDOWN_EXIT_THREADS:
+		/* Continue only for a slow shutdown with work left. */
+		stop = (n_purged == 0 || srv_fast_shutdown != 0);
+		break;
+	case SRV_SHUTDOWN_LAST_PHASE:
+	case SRV_SHUTDOWN_FLUSH_PHASE:
+		ut_error;
+	}
+	return(stop);
+}
+
+/*********************************************************************//**
+Fetch the first task from the work queue, if any, and execute it.
+@return	true if a task was executed */
+static
+bool
+srv_task_execute(void)
+/*==================*/
+{
+	que_thr_t*	thr = NULL;
+
+	ut_ad(!srv_read_only_mode);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+	/* Dequeue under the tasks mutex. */
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	if (UT_LIST_GET_LEN(srv_sys->tasks) > 0) {
+
+		thr = UT_LIST_GET_FIRST(srv_sys->tasks);
+		/* Only purge tasks are expected in this queue. */
+		ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
+
+		UT_LIST_REMOVE(queue, srv_sys->tasks, thr);
+	}
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	if (thr != NULL) {
+		/* Run the task after releasing the tasks mutex. */
+		que_run_threads(thr);
+
+		os_atomic_inc_ulint(
+			&purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+
+		srv_inc_activity_count();
+	}
+
+	return(thr != NULL);
+}
+
+static ulint purge_tid_i = 0;	/* counter used to index srv_purge_tids[] */
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	srv_slot_t*	slot;
+	ulint		tid_i = os_atomic_increment_ulint(&purge_tid_i, 1);
+
+	ut_ad(tid_i < srv_n_purge_threads);
+	ut_ad(!srv_read_only_mode);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+	/* Entry 0 of srv_purge_tids[] is used by the coordinator. */
+	srv_purge_tids[tid_i] = os_thread_get_tid();
+	os_thread_set_priority(srv_purge_tids[tid_i],
+			       srv_sched_priority_purge);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: worker thread starting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	slot = srv_reserve_slot(SRV_WORKER);
+
+	ut_a(srv_n_purge_threads > 1);
+
+	srv_sys_mutex_enter();
+
+	ut_a(srv_sys->n_threads_active[SRV_WORKER] < srv_n_purge_threads);
+
+	srv_sys_mutex_exit();
+
+	/* We need to ensure that the worker threads exit after the
+	purge coordinator thread. Otherwise the purge coordinator can
+	end up waiting forever in trx_purge_wait_for_workers_to_complete() */
+
+	do {
+		srv_suspend_thread(slot);
+
+		os_event_wait(slot->event);
+
+		srv_current_thread_priority = srv_purge_thread_priority;
+
+		if (srv_task_execute()) {
+
+			/* A task was executed: wake up the purge
+			coordinator thread if it is not active. */
+
+			srv_wake_purge_thread_if_not_active();
+		}
+
+		/* Note: we are checking the state without holding the
+		purge_sys->latch here. */
+	} while (purge_sys->state != PURGE_STATE_EXIT);
+
+	srv_free_slot(slot);
+
+	rw_lock_x_lock(&purge_sys->latch);
+
+	ut_a(!purge_sys->running);
+	ut_a(purge_sys->state == PURGE_STATE_EXIT);
+	ut_a(srv_shutdown_state > SRV_SHUTDOWN_NONE);
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge worker thread exiting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Do the actual purge operation, in batches, adapting the thread count.
+@return length of history list before the last purge batch. */
+static
+ulint
+srv_do_purge(
+/*=========*/
+	ulint		n_threads,	/*!< in: number of threads to use */
+	ulint*		n_total_purged)	/*!< in/out: total pages purged */
+{
+	ulint		n_pages_purged;
+	/* The three statics below keep their values between calls. */
+	static ulint	count = 0;
+	static ulint	n_use_threads = 0;
+	static ulint	rseg_history_len = 0;
+	ulint		old_activity_count = srv_get_activity_count();
+
+	ut_a(n_threads > 0);
+	ut_ad(!srv_read_only_mode);
+
+	/* Purge until there are no more records to purge and there is
+	no change in configuration or server state. If the user has
+	configured more than one purge thread then we treat that as a
+	pool of threads and only use the extra threads if purge can't
+	keep up with updates. */
+
+	if (n_use_threads == 0) {
+		n_use_threads = n_threads;
+	}
+
+	do {
+		srv_current_thread_priority = srv_purge_thread_priority;
+
+		if (trx_sys->rseg_history_len > rseg_history_len
+		    || (srv_max_purge_lag > 0
+			&& rseg_history_len > srv_max_purge_lag)) {
+
+			/* History list grew since the last snapshot, or the
+			snapshot exceeds srv_max_purge_lag. Use more threads. */
+
+			if (n_use_threads < n_threads) {
+				++n_use_threads;
+			}
+
+		} else if (srv_check_activity(old_activity_count)
+			   && n_use_threads > 1) {
+
+			/* History list did not grow and there has been
+			server activity: use fewer threads. */
+
+			--n_use_threads;
+
+			old_activity_count = srv_get_activity_count();
+		}
+
+		/* Ensure that at least one and at most the configured
+		number of purge threads is used. */
+
+		ut_a(n_use_threads > 0);
+		ut_a(n_use_threads <= n_threads);
+
+		/* Take a snapshot of the history list before purge. */
+		if ((rseg_history_len = trx_sys->rseg_history_len) == 0) {
+			break;
+		}
+
+		n_pages_purged = trx_purge(
+			n_use_threads, srv_purge_batch_size, false);
+
+		if (!(count++ % TRX_SYS_N_RSEGS)) {
+			/* Force a truncate of the history list. */
+			n_pages_purged += trx_purge(
+				1, srv_purge_batch_size, true);
+		}
+
+		*n_total_purged += n_pages_purged;
+
+	} while (!srv_purge_should_exit(n_pages_purged) && n_pages_purged > 0);
+
+	return(rseg_history_len);
+}
+
+/*********************************************************************//**
+Suspend the purge coordinator thread. It first waits on the slot event
+with a timeout; if purge has been stopped, or the wait timed out with an
+unchanged and short history list, it then waits without a timeout. */
+static
+void
+srv_purge_coordinator_suspend(
+/*==========================*/
+	srv_slot_t*	slot,			/*!< in/out: Purge coordinator
+						thread slot */
+	ulint		rseg_history_len)	/*!< in: history list length
+						before last purge */
+{
+	ut_ad(!srv_read_only_mode);
+	ut_a(slot->type == SRV_PURGE);
+
+	bool		stop = false;
+
+	/** Maximum wait time on the purge event, in micro-seconds. */
+	static const ulint SRV_PURGE_MAX_TIMEOUT = 10000;
+
+	ib_int64_t	sig_count = srv_suspend_thread(slot);
+
+	do {
+		ulint		ret;
+
+		rw_lock_x_lock(&purge_sys->latch);
+
+		purge_sys->running = false;
+
+		rw_lock_x_unlock(&purge_sys->latch);
+
+		/* We don't wait right away on the non-timed wait because
+		we want to signal the thread that wants to suspend purge. */
+
+		if (stop) {
+			os_event_wait_low(slot->event, sig_count);
+			ret = 0;
+		} else if (rseg_history_len <= trx_sys->rseg_history_len) {
+			ret = os_event_wait_time_low(
+				slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count);
+		} else {
+			/* Don't waste time waiting. NOTE(review): this is
+			reached when the history list is now SHORTER than
+			rseg_history_len; confirm the intended condition. */
+			ret = 0;
+		}
+
+		srv_sys_mutex_enter();
+
+		/* The thread can be in state !suspended after the timeout
+		but before this check if another thread sent a wakeup signal. */
+
+		if (slot->suspended) {
+			slot->suspended = FALSE;
+			++srv_sys->n_threads_active[slot->type];
+			ut_a(srv_sys->n_threads_active[slot->type] == 1);
+		}
+
+		srv_sys_mutex_exit();
+
+		sig_count = srv_suspend_thread(slot);
+
+		rw_lock_x_lock(&purge_sys->latch);
+
+		stop = (purge_sys->state == PURGE_STATE_STOP);
+
+		if (!stop) {
+			ut_a(purge_sys->n_stop == 0);
+			purge_sys->running = true;
+		} else {
+			ut_a(purge_sys->n_stop > 0);
+
+			/* Signal that we are suspended. */
+			os_event_set(purge_sys->event);
+		}
+
+		rw_lock_x_unlock(&purge_sys->latch);
+
+		if (ret == OS_SYNC_TIME_EXCEEDED) {
+			/* No new records added since wait started then simply
+			wait for new records. The magic number 5000 is an
+			approximation for the case where we have cached UNDO
+			log records which prevent truncate of the UNDO
+			segments. */
+
+			if (rseg_history_len == trx_sys->rseg_history_len
+			    && trx_sys->rseg_history_len < 5000) {
+				stop = true;
+			}
+		}
+
+	} while (stop);
+
+	srv_sys_mutex_enter();
+
+	if (slot->suspended) {
+		slot->suspended = FALSE;
+		++srv_sys->n_threads_active[slot->type];
+		ut_a(srv_sys->n_threads_active[slot->type] == 1);
+	}
+
+	srv_sys_mutex_exit();
+}
+
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks. On shutdown it
+does the final purge/truncate, sets PURGE_STATE_EXIT and releases the
+worker threads. @return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	srv_slot_t*	slot;
+	ulint           n_total_purged = ULINT_UNDEFINED;
+
+	ut_ad(!srv_read_only_mode);
+	ut_a(srv_n_purge_threads >= 1);
+	ut_a(trx_purge_state() == PURGE_STATE_INIT);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+	/* The coordinator uses the first entry of srv_purge_tids[]. */
+	srv_purge_tids[0] = os_thread_get_tid();
+	os_thread_set_priority(srv_purge_tids[0], srv_sched_priority_purge);
+
+	rw_lock_x_lock(&purge_sys->latch);
+
+	purge_sys->running = true;
+	purge_sys->state = PURGE_STATE_RUN;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_purge_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge coordinator thread created, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	slot = srv_reserve_slot(SRV_PURGE);
+
+	ulint	rseg_history_len = trx_sys->rseg_history_len;
+
+	do {
+		/* If purge has been stopped or the last
+		purge didn't purge any records then wait for activity. */
+
+		if (purge_sys->state == PURGE_STATE_STOP
+		    || n_total_purged == 0) {
+			srv_purge_coordinator_suspend(slot, rseg_history_len);
+		}
+
+		if (srv_purge_should_exit(n_total_purged)) {
+			ut_a(!slot->suspended);
+			break;
+		}
+
+		n_total_purged = 0;
+
+		srv_current_thread_priority = srv_purge_thread_priority;
+
+		rseg_history_len = srv_do_purge(
+			srv_n_purge_threads, &n_total_purged);
+
+		srv_inc_activity_count();
+
+	} while (!srv_purge_should_exit(n_total_purged));
+
+	/* Ensure that we don't jump out of the loop unless the
+	exit condition is satisfied. */
+
+	ut_a(srv_purge_should_exit(n_total_purged));
+
+	ulint	n_pages_purged = ULINT_MAX;
+
+	/* Ensure that all records are purged if it is not a fast shutdown.
+	This covers the case where a record can be added after we exit the
+	loop above. */
+	while (srv_fast_shutdown == 0 && n_pages_purged > 0) {
+		n_pages_purged = trx_purge(1, srv_purge_batch_size, false);
+	}
+
+	/* Force a truncate of the history list. */
+	n_pages_purged = trx_purge(1, srv_purge_batch_size, true);
+	ut_a(n_pages_purged == 0 || srv_fast_shutdown != 0);
+
+	/* The task queue should always be empty, independent of fast
+	shutdown state. */
+	ut_a(srv_get_task_queue_length() == 0);
+
+	srv_free_slot(slot);
+
+	/* Note that we are shutting down. */
+	rw_lock_x_lock(&purge_sys->latch);
+
+	purge_sys->state = PURGE_STATE_EXIT;
+
+	purge_sys->running = false;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge coordinator exiting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	/* Ensure that all the worker threads quit. */
+	if (srv_n_purge_threads > 1) {
+		srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1);
+	}
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/**********************************************************************//**
+Appends a task to the server task queue under tasks_mutex and then
+releases a worker thread, if there is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(!srv_read_only_mode);
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	srv_release_threads(SRV_WORKER, 1);
+}
+
+/**********************************************************************//**
+Get count of tasks in the queue. The length is read under tasks_mutex.
+@return number of tasks in queue  */
+UNIV_INTERN
+ulint
+srv_get_task_queue_length(void)
+/*===========================*/
+{
+	ulint	n_tasks;
+
+	ut_ad(!srv_read_only_mode);
+
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	n_tasks = UT_LIST_GET_LEN(srv_sys->tasks);
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	return(n_tasks);
+}
+
+/**********************************************************************//**
+Wakes up the purge coordinator and, if configured, the purge workers. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void)
+/*==================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* Nothing is released at this force-recovery level. */
+	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+		return;
+	}
+
+	srv_release_threads(SRV_PURGE, 1);
+	/* Every purge thread beyond the first one is a worker. */
+	if (srv_n_purge_threads > 1) {
+		srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1);
+	}
+}
+
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
deleted file mode 100644
index f4567c49ca0..00000000000
--- a/storage/xtradb/srv/srv0start.c
+++ /dev/null
@@ -1,2552 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
-Copyright (c) 2008, Google Inc.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/********************************************************************//**
-@file srv/srv0start.c
-Starts the InnoDB database server
-
-Created 2/16/1996 Heikki Tuuri
-*************************************************************************/
-
-#include "ut0mem.h"
-#include "mem0mem.h"
-#include "data0data.h"
-#include "data0type.h"
-#include "dict0dict.h"
-#include "buf0buf.h"
-#include "os0file.h"
-#include "os0thread.h"
-#include "fil0fil.h"
-#include "fsp0fsp.h"
-#include "rem0rec.h"
-#include "mtr0mtr.h"
-#include "log0log.h"
-#include "log0online.h"
-#include "log0recv.h"
-#include "page0page.h"
-#include "page0cur.h"
-#include "trx0trx.h"
-#include "trx0sys.h"
-#include "btr0btr.h"
-#include "btr0cur.h"
-#include "rem0rec.h"
-#include "ibuf0ibuf.h"
-#include "srv0start.h"
-#include "srv0srv.h"
-#ifndef UNIV_HOTBACKUP
-# include "os0proc.h"
-# include "sync0sync.h"
-# include "buf0flu.h"
-# include "buf0rea.h"
-# include "dict0boot.h"
-# include "dict0load.h"
-# include "que0que.h"
-# include "usr0sess.h"
-# include "lock0lock.h"
-# include "trx0roll.h"
-# include "trx0purge.h"
-# include "lock0lock.h"
-# include "pars0pars.h"
-# include "btr0sea.h"
-# include "rem0cmp.h"
-# include "dict0crea.h"
-# include "row0ins.h"
-# include "row0sel.h"
-# include "row0upd.h"
-# include "row0row.h"
-# include "row0mysql.h"
-# include "btr0pcur.h"
-# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
-# include "zlib.h" /* for ZLIB_VERSION */
-# include "buf0lru.h" /* for buf_LRU_file_restore() */
-# include "os0stacktrace.h"
-
-/** Log sequence number immediately after startup */
-UNIV_INTERN ib_uint64_t	srv_start_lsn;
-/** Log sequence number at shutdown */
-UNIV_INTERN ib_uint64_t	srv_shutdown_lsn;
-
-#ifdef HAVE_DARWIN_THREADS
-# include <sys/utsname.h>
-/** TRUE if the F_FULLFSYNC option is available */
-UNIV_INTERN ibool	srv_have_fullfsync = FALSE;
-#endif
-
-/** TRUE if a raw partition is in use */
-UNIV_INTERN ibool	srv_start_raw_disk_in_use = FALSE;
-
-/** TRUE if the server is being started, before rolling back any
-incomplete transactions */
-UNIV_INTERN ibool	srv_startup_is_before_trx_rollback_phase = FALSE;
-/** TRUE if the server is being started */
-UNIV_INTERN ibool	srv_is_being_started = FALSE;
-/** TRUE if the server was successfully started */
-UNIV_INTERN ibool	srv_was_started = FALSE;
-/** TRUE if innobase_start_or_create_for_mysql() has been called */
-static ibool	srv_start_has_been_called = FALSE;
-
-/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
-SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
-UNIV_INTERN enum srv_shutdown_state	srv_shutdown_state = SRV_SHUTDOWN_NONE;
-
-/** Files comprising the system tablespace */
-static os_file_t	files[1000];
-
-/** io_handler_thread parameters for thread identification */
-static ulint		n[SRV_MAX_N_IO_THREADS + 8];
-/** io_handler_thread identifiers */
-static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 8];
-
-/** We use this mutex to test the return value of pthread_mutex_trylock
-   on successful locking. HP-UX does NOT return 0, though Linux et al do. */
-static os_fast_mutex_t	srv_os_test_mutex;
-
-/** Name of srv_monitor_file */
-static char*	srv_monitor_file_name;
-#endif /* !UNIV_HOTBACKUP */
-
-/** */
-#define SRV_N_PENDING_IOS_PER_THREAD	OS_AIO_N_PENDING_IOS_PER_THREAD
-#define SRV_MAX_N_PENDING_SYNC_IOS	100
-
-#ifdef UNIV_PFS_THREAD
-/* Keys to register InnoDB threads with performance schema */
-UNIV_INTERN mysql_pfs_key_t	io_handler_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_lock_timeout_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_error_monitor_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_monitor_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_master_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_purge_thread_key;
-UNIV_INTERN mysql_pfs_key_t	srv_log_tracking_thread_key;
-#endif /* UNIV_PFS_THREAD */
-
-/*********************************************************************//**
-Convert a numeric string that optionally ends in G or M, to a number
-containing megabytes.
-@return	next character in string */
-static
-char*
-srv_parse_megabytes(
-/*================*/
-	char*	str,	/*!< in: string containing a quantity in bytes */
-	ulint*	megs)	/*!< out: the number in megabytes */
-{
-	char*	endp;
-	ulint	size;
-
-	size = strtoul(str, &endp, 10);
-
-	str = endp;
-
-	switch (*str) {
-	case 'G': case 'g':
-		size *= 1024;
-		/* fall through */
-	case 'M': case 'm':
-		str++;
-		break;
-	default:
-		size /= 1024 * 1024;
-		break;
-	}
-
-	*megs = size;
-	return(str);
-}
-
-/*********************************************************************//**
-Reads the data files and their sizes from a character string given in
-the .cnf file.
-@return	TRUE if ok, FALSE on parse error */
-UNIV_INTERN
-ibool
-srv_parse_data_file_paths_and_sizes(
-/*================================*/
-	char*	str)	/*!< in/out: the data file path string */
-{
-	char*	input_str;
-	char*	path;
-	ulint	size;
-	ulint	i	= 0;
-
-	srv_auto_extend_last_data_file = FALSE;
-	srv_last_file_size_max = 0;
-	srv_data_file_names = NULL;
-	srv_data_file_sizes = NULL;
-	srv_data_file_is_raw_partition = NULL;
-
-	input_str = str;
-
-	/* First calculate the number of data files and check syntax:
-	path:size[M | G];path:size[M | G]... . Note that a Windows path may
-	contain a drive name and a ':'. */
-
-	while (*str != '\0') {
-		path = str;
-
-		while ((*str != ':' && *str != '\0')
-		       || (*str == ':'
-			   && (*(str + 1) == '\\' || *(str + 1) == '/'
-			       || *(str + 1) == ':'))) {
-			str++;
-		}
-
-		if (*str == '\0') {
-			return(FALSE);
-		}
-
-		str++;
-
-		str = srv_parse_megabytes(str, &size);
-
-		if (0 == strncmp(str, ":autoextend",
-				 (sizeof ":autoextend") - 1)) {
-
-			str += (sizeof ":autoextend") - 1;
-
-			if (0 == strncmp(str, ":max:",
-					 (sizeof ":max:") - 1)) {
-
-				str += (sizeof ":max:") - 1;
-
-				str = srv_parse_megabytes(str, &size);
-			}
-
-			if (*str != '\0') {
-
-				return(FALSE);
-			}
-		}
-
-		if (strlen(str) >= 6
-		    && *str == 'n'
-		    && *(str + 1) == 'e'
-		    && *(str + 2) == 'w') {
-			str += 3;
-		}
-
-		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
-			str += 3;
-		}
-
-		if (size == 0) {
-			return(FALSE);
-		}
-
-		i++;
-
-		if (*str == ';') {
-			str++;
-		} else if (*str != '\0') {
-
-			return(FALSE);
-		}
-	}
-
-	if (i == 0) {
-		/* If innodb_data_file_path was defined it must contain
-		at least one data file definition */
-
-		return(FALSE);
-	}
-
-	srv_data_file_names = malloc(i * sizeof *srv_data_file_names);
-	srv_data_file_sizes = malloc(i * sizeof *srv_data_file_sizes);
-	srv_data_file_is_raw_partition = malloc(
-		i * sizeof *srv_data_file_is_raw_partition);
-
-	srv_n_data_files = i;
-
-	/* Then store the actual values to our arrays */
-
-	str = input_str;
-	i = 0;
-
-	while (*str != '\0') {
-		path = str;
-
-		/* Note that we must step over the ':' in a Windows path;
-		a Windows path normally looks like C:\ibdata\ibdata1:1G, but
-		a Windows raw partition may have a specification like
-		\\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
-
-		while ((*str != ':' && *str != '\0')
-		       || (*str == ':'
-			   && (*(str + 1) == '\\' || *(str + 1) == '/'
-			       || *(str + 1) == ':'))) {
-			str++;
-		}
-
-		if (*str == ':') {
-			/* Make path a null-terminated string */
-			*str = '\0';
-			str++;
-		}
-
-		str = srv_parse_megabytes(str, &size);
-
-		srv_data_file_names[i] = path;
-		srv_data_file_sizes[i] = size;
-
-		if (0 == strncmp(str, ":autoextend",
-				 (sizeof ":autoextend") - 1)) {
-
-			srv_auto_extend_last_data_file = TRUE;
-
-			str += (sizeof ":autoextend") - 1;
-
-			if (0 == strncmp(str, ":max:",
-					 (sizeof ":max:") - 1)) {
-
-				str += (sizeof ":max:") - 1;
-
-				str = srv_parse_megabytes(
-					str, &srv_last_file_size_max);
-			}
-
-			if (*str != '\0') {
-
-				return(FALSE);
-			}
-		}
-
-		(srv_data_file_is_raw_partition)[i] = 0;
-
-		if (strlen(str) >= 6
-		    && *str == 'n'
-		    && *(str + 1) == 'e'
-		    && *(str + 2) == 'w') {
-			str += 3;
-			(srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW;
-		}
-
-		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
-			str += 3;
-
-			if ((srv_data_file_is_raw_partition)[i] == 0) {
-				(srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW;
-			}
-		}
-
-		i++;
-
-		if (*str == ';') {
-			str++;
-		}
-	}
-
-	return(TRUE);
-}
-
-/*********************************************************************//**
-Reads log group home directories from a character string given in
-the .cnf file.
-@return	TRUE if ok, FALSE on parse error */
-UNIV_INTERN
-ibool
-srv_parse_log_group_home_dirs(
-/*==========================*/
-	char*	str)	/*!< in/out: character string */
-{
-	char*	input_str;
-	char*	path;
-	ulint	i	= 0;
-
-	srv_log_group_home_dirs = NULL;
-
-	input_str = str;
-
-	/* First calculate the number of directories and check syntax:
-	path;path;... */
-
-	while (*str != '\0') {
-		path = str;
-
-		while (*str != ';' && *str != '\0') {
-			str++;
-		}
-
-		i++;
-
-		if (*str == ';') {
-			str++;
-		} else if (*str != '\0') {
-
-			return(FALSE);
-		}
-	}
-
-	if (i != 1) {
-		/* If innodb_log_group_home_dir was defined it must
-		contain exactly one path definition under current MySQL */
-
-		return(FALSE);
-	}
-
-	srv_log_group_home_dirs = malloc(i * sizeof *srv_log_group_home_dirs);
-
-	/* Then store the actual values to our array */
-
-	str = input_str;
-	i = 0;
-
-	while (*str != '\0') {
-		path = str;
-
-		while (*str != ';' && *str != '\0') {
-			str++;
-		}
-
-		if (*str == ';') {
-			*str = '\0';
-			str++;
-		}
-
-		srv_log_group_home_dirs[i] = path;
-
-		i++;
-	}
-
-	return(TRUE);
-}
-
-/*********************************************************************//**
-Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
-and srv_parse_log_group_home_dirs(). */
-UNIV_INTERN
-void
-srv_free_paths_and_sizes(void)
-/*==========================*/
-{
-	free(srv_data_file_names);
-	srv_data_file_names = NULL;
-	free(srv_data_file_sizes);
-	srv_data_file_sizes = NULL;
-	free(srv_data_file_is_raw_partition);
-	srv_data_file_is_raw_partition = NULL;
-	free(srv_log_group_home_dirs);
-	srv_log_group_home_dirs = NULL;
-}
-
-#ifndef UNIV_HOTBACKUP
-/********************************************************************//**
-I/o-handler thread function.
-@return	OS_THREAD_DUMMY_RETURN */
-static
-os_thread_ret_t
-io_handler_thread(
-/*==============*/
-	void*	arg)	/*!< in: pointer to the number of the segment in
-			the aio array */
-{
-	ulint	segment;
-
-	segment = *((ulint*)arg);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(io_handler_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
-		fil_aio_wait(segment);
-	}
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit.
-	The thread actually never comes here because it is exited in an
-	os_event_wait(). */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-#endif /* !UNIV_HOTBACKUP */
-
-/*********************************************************************//**
-Normalizes a directory path for Windows: converts slashes to backslashes. */
-UNIV_INTERN
-void
-srv_normalize_path_for_win(
-/*=======================*/
-	char*	str __attribute__((unused)))	/*!< in/out: null-terminated
-						character string */
-{
-#ifdef __WIN__
-	for (; *str; str++) {
-
-		if (*str == '/') {
-			*str = '\\';
-		}
-	}
-#endif
-}
-
-#ifndef UNIV_HOTBACKUP
-/*********************************************************************//**
-Calculates the low 32 bits when a file size which is given as a number
-database pages is converted to the number of bytes.
-@return	low 32 bytes of file size when expressed in bytes */
-static
-ulint
-srv_calc_low32(
-/*===========*/
-	ulint	file_size)	/*!< in: file size in database pages */
-{
-	return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT));
-}
-
-/*********************************************************************//**
-Calculates the high 32 bits when a file size which is given as a number
-database pages is converted to the number of bytes.
-@return	high 32 bytes of file size when expressed in bytes */
-static
-ulint
-srv_calc_high32(
-/*============*/
-	ulint	file_size)	/*!< in: file size in database pages */
-{
-	return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT));
-}
-
-/*********************************************************************//**
-Creates or opens the log files and closes them.
-@return	DB_SUCCESS or error code */
-static
-ulint
-open_or_create_log_file(
-/*====================*/
-	ibool	create_new_db,		/*!< in: TRUE if we should create a
-					new database */
-	ibool*	log_file_created,	/*!< out: TRUE if new log file
-					created */
-	ibool	log_file_has_been_opened,/*!< in: TRUE if a log file has been
-					opened before: then it is an error
-					to try to create another log file */
-	ulint	k,			/*!< in: log group number */
-	ulint	i)			/*!< in: log file number in group */
-{
-	ibool	ret;
-	ulint	size;
-	ulint	size_high;
-	char	name[10000];
-	ulint	dirnamelen;
-
-	UT_NOT_USED(create_new_db);
-
-	*log_file_created = FALSE;
-
-	srv_normalize_path_for_win(srv_log_group_home_dirs[k]);
-
-	dirnamelen = strlen(srv_log_group_home_dirs[k]);
-	ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
-	memcpy(name, srv_log_group_home_dirs[k], dirnamelen);
-
-	/* Add a path separator if needed. */
-	if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
-		name[dirnamelen++] = SRV_PATH_SEPARATOR;
-	}
-
-	sprintf(name + dirnamelen, "%s%lu", "ib_logfile", (ulong) i);
-
-	files[i] = os_file_create(innodb_file_log_key, name,
-				  OS_FILE_CREATE, OS_FILE_NORMAL,
-				  OS_LOG_FILE, &ret);
-	if (ret == FALSE) {
-		if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS
-#ifdef UNIV_AIX
-		    /* AIX 5.1 after security patch ML7 may have errno set
-		    to 0 here, which causes our function to return 100;
-		    work around that AIX problem */
-		    && os_file_get_last_error(FALSE) != 100
-#endif
-		    ) {
-			fprintf(stderr,
-				"InnoDB: Error in creating"
-				" or opening %s\n", name);
-
-			return(DB_ERROR);
-		}
-
-		files[i] = os_file_create(innodb_file_log_key, name,
-					  OS_FILE_OPEN, OS_FILE_AIO,
-					  OS_LOG_FILE, &ret);
-		if (!ret) {
-			fprintf(stderr,
-				"InnoDB: Error in opening %s\n", name);
-
-			return(DB_ERROR);
-		}
-
-		ret = os_file_get_size(files[i], &size, &size_high);
-		ut_a(ret);
-
-		if (size != srv_calc_low32(srv_log_file_size)
-		    || size_high != srv_calc_high32(srv_log_file_size)) {
-
-			fprintf(stderr,
-				"InnoDB: Warning: log file %s is"
-				" of different size %lu %lu bytes\n"
-				"InnoDB: than specified in the .cnf"
-				" file %lu %lu bytes!\n",
-				name, (ulong) size_high, (ulong) size,
-				(ulong) srv_calc_high32(srv_log_file_size),
-				(ulong) srv_calc_low32(srv_log_file_size));
-
-                        srv_log_file_size= ((size +
-                                             (((longlong) size_high) << 32)) /
-                                            UNIV_PAGE_SIZE);
-		}
-	} else {
-		*log_file_created = TRUE;
-
-		ut_print_timestamp(stderr);
-
-		fprintf(stderr,
-			"  InnoDB: Log file %s did not exist:"
-			" new to be created\n",
-			name);
-		if (log_file_has_been_opened) {
-
-			return(DB_ERROR);
-		}
-
-		fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n",
-			name, (ulong) srv_log_file_size
-			>> (20 - UNIV_PAGE_SIZE_SHIFT));
-
-		fprintf(stderr,
-			"InnoDB: Database physically writes the file"
-			" full: wait...\n");
-
-		ret = os_file_set_size(name, files[i],
-				       srv_calc_low32(srv_log_file_size),
-				       srv_calc_high32(srv_log_file_size));
-		if (!ret) {
-			fprintf(stderr,
-				"InnoDB: Error in creating %s:"
-				" probably out of disk space\n",
-				name);
-
-			return(DB_ERROR);
-		}
-	}
-
-	ret = os_file_close(files[i]);
-	ut_a(ret);
-
-	if (i == 0) {
-		/* Create in memory the file space object
-		which is for this log group */
-
-		fil_space_create(name,
-				 2 * k + SRV_LOG_SPACE_FIRST_ID, 0, FIL_LOG);
-	}
-
-	ut_a(fil_validate());
-
-	fil_node_create(name, srv_log_file_size,
-			2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE);
-#ifdef UNIV_LOG_ARCHIVE
-	/* If this is the first log group, create the file space object
-	for archived logs.
-	Under MySQL, no archiving ever done. */
-
-	if (k == 0 && i == 0) {
-		arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID;
-
-		fil_space_create("arch_log_space", arch_space_id, 0, FIL_LOG);
-	} else {
-		arch_space_id = ULINT_UNDEFINED;
-	}
-#endif /* UNIV_LOG_ARCHIVE */
-	if (i == 0) {
-		log_group_init(k, srv_n_log_files,
-			       srv_log_file_size * UNIV_PAGE_SIZE,
-			       2 * k + SRV_LOG_SPACE_FIRST_ID,
-			       SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch
-							    space id */
-	}
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Creates or opens database data files and closes them.
-@return	DB_SUCCESS or error code */
-static
-ulint
-open_or_create_data_files(
-/*======================*/
-	ibool*		create_new_db,	/*!< out: TRUE if new database should be
-					created */
-	ibool*		create_new_doublewrite_file,
-#ifdef UNIV_LOG_ARCHIVE
-	ulint*		min_arch_log_no,/*!< out: min of archived log
-					numbers in data files */
-	ulint*		max_arch_log_no,/*!< out: max of archived log
-					numbers in data files */
-#endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,/*!< out: min of flushed lsn
-					values in data files */
-	ib_uint64_t*	max_flushed_lsn,/*!< out: max of flushed lsn
-					values in data files */
-	ulint*		sum_of_new_sizes)/*!< out: sum of sizes of the
-					new files added */
-{
-	ibool	ret;
-	ulint	i;
-	ibool	one_opened	= FALSE;
-	ibool	one_created	= FALSE;
-	ulint	size;
-	ulint	size_high;
-	ulint	flags;
-	ulint	rounded_size_pages;
-	char	name[10000];
-
-	if (srv_n_data_files >= 1000) {
-		fprintf(stderr, "InnoDB: can only have < 1000 data files\n"
-			"InnoDB: you have defined %lu\n",
-			(ulong) srv_n_data_files);
-		return(DB_ERROR);
-	}
-
-	*sum_of_new_sizes = 0;
-
-	*create_new_db = FALSE;
-	*create_new_doublewrite_file = FALSE;
-
-	srv_normalize_path_for_win(srv_data_home);
-
-	for (i = 0; i < srv_n_data_files; i++) {
-		ulint	dirnamelen;
-
-		srv_normalize_path_for_win(srv_data_file_names[i]);
-		dirnamelen = strlen(srv_data_home);
-
-		ut_a(dirnamelen + strlen(srv_data_file_names[i])
-		     < (sizeof name) - 1);
-		memcpy(name, srv_data_home, dirnamelen);
-		/* Add a path separator if needed. */
-		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
-			name[dirnamelen++] = SRV_PATH_SEPARATOR;
-		}
-
-		strcpy(name + dirnamelen, srv_data_file_names[i]);
-
-		if (srv_data_file_is_raw_partition[i] == 0) {
-
-			/* First we try to create the file: if it already
-			exists, ret will get value FALSE */
-
-			files[i] = os_file_create(innodb_file_data_key,
-						  name, OS_FILE_CREATE,
-						  OS_FILE_NORMAL,
-						  OS_DATA_FILE, &ret);
-
-			if (ret == FALSE && os_file_get_last_error(FALSE)
-			    != OS_FILE_ALREADY_EXISTS
-#ifdef UNIV_AIX
-			    /* AIX 5.1 after security patch ML7 may have
-			    errno set to 0 here, which causes our function
-			    to return 100; work around that AIX problem */
-			    && os_file_get_last_error(FALSE) != 100
-#endif
-			    ) {
-				fprintf(stderr,
-					"InnoDB: Error in creating"
-					" or opening %s\n",
-					name);
-
-				return(DB_ERROR);
-			}
-		} else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
-			/* The partition is opened, not created; then it is
-			written over */
-
-			srv_start_raw_disk_in_use = TRUE;
-			srv_created_new_raw = TRUE;
-
-			files[i] = os_file_create(innodb_file_data_key,
-						  name, OS_FILE_OPEN_RAW,
-						  OS_FILE_NORMAL,
-						  OS_DATA_FILE, &ret);
-			if (!ret) {
-				fprintf(stderr,
-					"InnoDB: Error in opening %s\n", name);
-
-				return(DB_ERROR);
-			}
-		} else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
-			srv_start_raw_disk_in_use = TRUE;
-
-			ret = FALSE;
-		} else {
-			ut_a(0);
-		}
-
-		if (ret == FALSE) {
-			const char* check_msg;
-			/* We open the data file */
-
-			if (one_created) {
-				fprintf(stderr,
-					"InnoDB: Error: data files can only"
-					" be added at the end\n");
-				fprintf(stderr,
-					"InnoDB: of a tablespace, but"
-					" data file %s existed beforehand.\n",
-					name);
-				return(DB_ERROR);
-			}
-
-			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
-				files[i] = os_file_create(
-					innodb_file_data_key,
-					name, OS_FILE_OPEN_RAW,
-					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
-			} else if (i == 0) {
-				files[i] = os_file_create(
-					innodb_file_data_key,
-					name, OS_FILE_OPEN_RETRY,
-					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
-			} else {
-				files[i] = os_file_create(
-					innodb_file_data_key,
-					name, OS_FILE_OPEN, OS_FILE_NORMAL,
-					OS_DATA_FILE, &ret);
-			}
-
-			if (!ret) {
-				fprintf(stderr,
-					"InnoDB: Error in opening %s\n", name);
-				os_file_get_last_error(TRUE);
-
-				return(DB_ERROR);
-			}
-
-			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
-
-				goto skip_size_check;
-			}
-
-			ret = os_file_get_size(files[i], &size, &size_high);
-			ut_a(ret);
-			/* Round size downward to megabytes */
-
-			rounded_size_pages
-				= (size / (1024 * 1024) + 4096 * size_high)
-					<< (20 - UNIV_PAGE_SIZE_SHIFT);
-
-			if (i == srv_n_data_files - 1
-			    && srv_auto_extend_last_data_file) {
-
-				if (srv_data_file_sizes[i] > rounded_size_pages
-				    || (srv_last_file_size_max > 0
-					&& srv_last_file_size_max
-					< rounded_size_pages)) {
-
-					fprintf(stderr,
-						"InnoDB: Error: auto-extending"
-						" data file %s is"
-						" of a different size\n"
-						"InnoDB: %lu pages (rounded"
-						" down to MB) than specified"
-						" in the .cnf file:\n"
-						"InnoDB: initial %lu pages,"
-						" max %lu (relevant if"
-						" non-zero) pages!\n",
-						name,
-						(ulong) rounded_size_pages,
-						(ulong) srv_data_file_sizes[i],
-						(ulong)
-						srv_last_file_size_max);
-
-					return(DB_ERROR);
-				}
-
-				srv_data_file_sizes[i] = rounded_size_pages;
-			}
-
-			if (rounded_size_pages != srv_data_file_sizes[i]) {
-
-				fprintf(stderr,
-					"InnoDB: Error: data file %s"
-					" is of a different size\n"
-					"InnoDB: %lu pages"
-					" (rounded down to MB)\n"
-					"InnoDB: than specified"
-					" in the .cnf file %lu pages!\n",
-					name,
-					(ulong) rounded_size_pages,
-					(ulong) srv_data_file_sizes[i]);
-
-				return(DB_ERROR);
-			}
-skip_size_check:
-			check_msg = fil_read_first_page(
-				files[i], one_opened, &flags,
-#ifdef UNIV_LOG_ARCHIVE
-				min_arch_log_no, max_arch_log_no,
-#endif /* UNIV_LOG_ARCHIVE */
-				min_flushed_lsn, max_flushed_lsn);
-
-			if (check_msg) {
-				fprintf(stderr,
-					"InnoDB: Error: %s in data file %s\n",
-					check_msg, name);
-				return(DB_ERROR);
-			}
-
-			if (!one_opened
-			    && UNIV_PAGE_SIZE
-			       != fsp_flags_get_page_size(flags)) {
-
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					" InnoDB: Error: data file %s"
-					" uses page size %lu,\n",
-					name,
-					fsp_flags_get_page_size(flags));
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					" InnoDB: but the only supported"
-					" page size in this release is=%lu\n",
-					(ulong) UNIV_PAGE_SIZE);
-
-				return(DB_ERROR);
-			}
-
-			one_opened = TRUE;
-		} else {
-			/* We created the data file and now write it full of
-			zeros */
-
-			one_created = TRUE;
-
-			if (i > 0) {
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: Data file %s did not"
-					" exist: new to be created\n",
-					name);
-			} else {
-				fprintf(stderr,
-					"InnoDB: The first specified"
-					" data file %s did not exist:\n"
-					"InnoDB: a new database"
-					" to be created!\n", name);
-				*create_new_db = TRUE;
-			}
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Setting file %s size to %lu MB\n",
-				name,
-				(ulong) (srv_data_file_sizes[i]
-					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
-
-			fprintf(stderr,
-				"InnoDB: Database physically writes the"
-				" file full: wait...\n");
-
-			ret = os_file_set_size(
-				name, files[i],
-				srv_calc_low32(srv_data_file_sizes[i]),
-				srv_calc_high32(srv_data_file_sizes[i]));
-
-			if (!ret) {
-				fprintf(stderr,
-					"InnoDB: Error in creating %s:"
-					" probably out of disk space\n", name);
-
-				return(DB_ERROR);
-			}
-
-			*sum_of_new_sizes = *sum_of_new_sizes
-				+ srv_data_file_sizes[i];
-		}
-
-		ret = os_file_close(files[i]);
-		ut_a(ret);
-
-		if (i == 0) {
-			fil_space_create(name, 0, 0, FIL_TABLESPACE);
-		}
-
-		ut_a(fil_validate());
-
-		fil_node_create(name, srv_data_file_sizes[i], 0,
-				srv_data_file_is_raw_partition[i] != 0);
-	}
-
-	/* special file for doublewrite buffer */
-	if (srv_doublewrite_file)
-	{
-		srv_normalize_path_for_win(srv_doublewrite_file);
-
-		fprintf(stderr,
-			"InnoDB: Note: The innodb_doublewrite_file option has been specified.\n"
-			"InnoDB: This option is for experts only. Don't use it unless you understand WELL what it is.\n"
-			"InnoDB: ### Don't specify a file older than the last checkpoint. ###\n"
-			"InnoDB: Otherwise, the older doublewrite buffer will break your data during recovery!\n");
-
-		strcpy(name, srv_doublewrite_file);
-
-		/* First we try to create the file: if it already
-		exists, ret will get value FALSE */
-
-		files[i] = os_file_create(innodb_file_data_key, name, OS_FILE_CREATE,
-					  OS_FILE_NORMAL,
-					  OS_DATA_FILE, &ret);
-
-		if (ret == FALSE && os_file_get_last_error(FALSE)
-		    != OS_FILE_ALREADY_EXISTS
-#ifdef UNIV_AIX
-		    /* AIX 5.1 after security patch ML7 may have
-		    errno set to 0 here, which causes our function
-		    to return 100; work around that AIX problem */
-		    && os_file_get_last_error(FALSE) != 100
-#endif
-		    ) {
-			fprintf(stderr,
-				"InnoDB: Error in creating"
-				" or opening %s\n",
-				name);
-
-			return(DB_ERROR);
-		}
-
-		if (ret == FALSE) {
-
-			const char* check_msg;
-
-			/* We open the data file */
-
-			files[i] = os_file_create(innodb_file_data_key,
-				name, OS_FILE_OPEN, OS_FILE_NORMAL,
-				OS_DATA_FILE, &ret);
-
-			if (!ret) {
-				fprintf(stderr,
-					"InnoDB: Error in opening %s\n", name);
-				os_file_get_last_error(TRUE);
-
-				return(DB_ERROR);
-			}
-
-			ret = os_file_get_size(files[i], &size, &size_high);
-			ut_a(ret);
-			/* Round size downward to megabytes */
-
-			rounded_size_pages
-				= (size / (1024 * 1024) + 4096 * size_high)
-					<< (20 - UNIV_PAGE_SIZE_SHIFT);
-
-			if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) {
-
-				fprintf(stderr,
-					"InnoDB: Warning: doublewrite buffer file %s"
-					" is of a different size\n"
-					"InnoDB: %lu pages"
-					" (rounded down to MB)\n"
-					"InnoDB: than intended size"
-					" %lu pages...\n",
-					name,
-					(ulong) rounded_size_pages,
-					(ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9);
-			}
-
-			check_msg = fil_read_first_page(
-				files[i], one_opened, &flags,
-#ifdef UNIV_LOG_ARCHIVE
-				min_arch_log_no, max_arch_log_no,
-#endif /* UNIV_LOG_ARCHIVE */
-				min_flushed_lsn, max_flushed_lsn);
-
-			if (check_msg) {
-				fprintf(stderr,
-					"InnoDB: Error: %s in doublewrite "
-					"buffer file %s\n", check_msg, name);
-				return(DB_ERROR);
-			}
-
-			one_opened = TRUE;
-		} else {
-			/* We created the data file and now write it full of
-			zeros */
-
-			*create_new_doublewrite_file = TRUE;
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Doublewrite buffer file %s did not"
-				" exist. It will be be created.\n",
-				name);
-
-			if (*create_new_db == FALSE) {
-				fprintf(stderr,
-					"InnoDB: Notice: Previous version's ibdata files may cause crash.\n"
-					"        If you use that, please use the ibdata files of this version.\n");
-			}
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Setting file %s size to %lu MB\n",
-				name,
-				(ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)
-					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
-
-			fprintf(stderr,
-				"InnoDB: Database physically writes the"
-				" file full: wait...\n");
-
-			ret = os_file_set_size(
-				name, files[i],
-				srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9),
-				srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9));
-
-			if (!ret) {
-				fprintf(stderr,
-					"InnoDB: Error in creating %s:"
-					" probably out of disk space\n", name);
-
-				return(DB_ERROR);
-			}
-		}
-
-		ret = os_file_close(files[i]);
-		ut_a(ret);
-
-		fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE);
-
-		ut_a(fil_validate());
-
-		fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE);
-
-		i++;
-	}
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Initializes the log tracking subsystem and starts its thread.  */
-static
-void
-init_log_online(void)
-/*=================*/
-{
-	if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
-		srv_track_changed_pages = FALSE;
-		return;
-	}
-
-	if (srv_track_changed_pages) {
-
-		log_online_read_init();
-
-		/* Create the thread that follows the redo log to output the
-		   changed page bitmap */
-		os_thread_create(&srv_redo_log_follow_thread, NULL,
-				 thread_ids + 5 + SRV_MAX_N_IO_THREADS);
-	}
-}
-
-/********************************************************************
-Starts InnoDB and creates a new database if database files
-are not found and the user wants.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-int
-innobase_start_or_create_for_mysql(void)
-/*====================================*/
-{
-	ibool		create_new_db;
-	ibool		create_new_doublewrite_file;
-	ibool		log_file_created;
-	ibool		log_created	= FALSE;
-	ibool		log_opened	= FALSE;
-	ib_uint64_t	min_flushed_lsn;
-	ib_uint64_t	max_flushed_lsn;
-#ifdef UNIV_LOG_ARCHIVE
-	ulint		min_arch_log_no;
-	ulint		max_arch_log_no;
-#endif /* UNIV_LOG_ARCHIVE */
-	ulint		sum_of_new_sizes;
-	ulint		sum_of_data_file_sizes;
-	ulint		tablespace_size_in_header;
-	ulint		err;
-	ulint		i;
-	ulint		io_limit;
-	my_bool		srv_file_per_table_original_value
-		= srv_file_per_table;
-	mtr_t		mtr;
-#ifdef HAVE_DARWIN_THREADS
-# ifdef F_FULLFSYNC
-	/* This executable has been compiled on Mac OS X 10.3 or later.
-	Assume that F_FULLFSYNC is available at run-time. */
-	srv_have_fullfsync = TRUE;
-# else /* F_FULLFSYNC */
-	/* This executable has been compiled on Mac OS X 10.2
-	or earlier.  Determine if the executable is running
-	on Mac OS X 10.3 or later. */
-	struct utsname utsname;
-	if (uname(&utsname)) {
-		ut_print_timestamp(stderr);
-		fputs(" InnoDB: cannot determine Mac OS X version!\n", stderr);
-	} else {
-		srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0;
-	}
-	if (!srv_have_fullfsync) {
-		ut_print_timestamp(stderr);
-		fputs(" InnoDB: On Mac OS X, fsync() may be "
-		      "broken on internal drives,\n", stderr);
-		ut_print_timestamp(stderr);
-		fputs(" InnoDB: making transactions unsafe!\n", stderr);
-	}
-# endif /* F_FULLFSYNC */
-#endif /* HAVE_DARWIN_THREADS */
-
-	if (sizeof(ulint) != sizeof(void*)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: size of InnoDB's ulint is %lu, "
-			"but size of void*\n", (ulong) sizeof(ulint));
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: is %lu. The sizes should be the same "
-			"so that on a 64-bit\n",
-			(ulong) sizeof(void*));
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: platforms you can allocate more than 4 GB "
-			"of memory.\n");
-	}
-
-	/* If stacktrace is used we set up signal handler for SIGUSR2 signal
-	here. If signal handler set fails we report that and disable
-	stacktrace feature. */
-
-	if (srv_use_stacktrace) {
-#ifdef __linux__
-		 struct sigaction sigact;
-
-		 sigact.sa_sigaction = os_stacktrace_print;
-		 sigact.sa_flags = SA_RESTART | SA_SIGINFO;
-
-		 if (sigaction(SIGUSR2, &sigact, (struct sigaction *)NULL) != 0)
-		 {
-			 fprintf(stderr, " InnoDB:error setting signal handler for %d (%s)\n",
-				 SIGUSR2, strsignal(SIGUSR2));
-			 srv_use_stacktrace = FALSE;
-
-		 }
-#endif /* __linux__ */
-	}
-
-
-	/* System tables are created in tablespace 0.  Thus, we must
-	temporarily clear srv_file_per_table.  This is ok, because the
-	server will not accept connections (which could modify
-	innodb_file_per_table) until this function has returned. */
-	srv_file_per_table = FALSE;
-#ifdef UNIV_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
-#endif
-
-#ifdef UNIV_IBUF_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n");
-# ifdef UNIV_IBUF_COUNT_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on "
-		"!!!!!!!!!\n");
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n");
-# endif
-#endif
-
-#ifdef UNIV_BLOB_DEBUG
-	fprintf(stderr,
-		"InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
-		"InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
-#endif /* UNIV_BLOB_DEBUG */
-
-#ifdef UNIV_SYNC_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
-#endif
-
-#ifdef UNIV_SEARCH_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
-#endif
-
-#ifdef UNIV_LOG_LSN_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
-#endif /* UNIV_LOG_LSN_DEBUG */
-#ifdef UNIV_MEM_DEBUG
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
-#endif
-
-	if (UNIV_LIKELY(srv_use_sys_malloc)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: The InnoDB memory heap is disabled\n");
-	}
-
-	ut_print_timestamp(stderr);
-	fputs(" InnoDB: " IB_ATOMICS_STARTUP_MSG "\n", stderr);
-
-	ut_print_timestamp(stderr);
-	fputs(" InnoDB: Compressed tables use zlib " ZLIB_VERSION
-#ifdef UNIV_ZIP_DEBUG
-	      " with validation"
-#endif /* UNIV_ZIP_DEBUG */
-	      "\n" , stderr);
-#ifdef UNIV_ZIP_COPY
-	ut_print_timestamp(stderr);
-	fputs(" InnoDB: and extra copying\n", stderr);
-#endif /* UNIV_ZIP_COPY */
-
-	/* Since InnoDB does not currently clean up all its internal data
-	structures in MySQL Embedded Server Library server_end(), we
-	print an error message if someone tries to start up InnoDB a
-	second time during the process lifetime. */
-
-	if (srv_start_has_been_called) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Error: startup called second time "
-			"during the process\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: lifetime. In the MySQL Embedded "
-			"Server Library you\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: cannot call server_init() more "
-			"than once during the\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: process lifetime.\n");
-	}
-
-	srv_start_has_been_called = TRUE;
-
-#ifdef UNIV_DEBUG
-	log_do_write = TRUE;
-#endif /* UNIV_DEBUG */
-	/*	yydebug = TRUE; */
-
-	srv_is_being_started = TRUE;
-	srv_startup_is_before_trx_rollback_phase = TRUE;
-
-#ifdef __WIN__
-	switch (os_get_os_version()) {
-	case OS_WIN95:
-	case OS_WIN31:
-	case OS_WINNT:
-		srv_use_native_conditions = FALSE;
-		/* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
-		and NT use simulated aio. In NT Windows provides async i/o,
-		but when run in conjunction with InnoDB Hot Backup, it seemed
-		to corrupt the data files. */
-
-		srv_use_native_aio = FALSE;
-		break;
-
-	case OS_WIN2000:
-	case OS_WINXP:
-		/* On 2000 and XP, async IO is available, but no condition variables. */
-		srv_use_native_aio = TRUE;
-		srv_use_native_conditions = FALSE;
- 		break;
-
-	default:
-		/* Vista and later have both async IO and condition variables */
-		srv_use_native_aio = TRUE;
-		srv_use_native_conditions = TRUE;
-		break;
-	}
-
-#elif defined(LINUX_NATIVE_AIO)
-
-	if (srv_use_native_aio) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Using Linux native AIO\n");
-	}
-#else
-	/* Currently native AIO is supported only on windows and linux
-	and that also when the support is compiled in. In all other
-	cases, we ignore the setting of innodb_use_native_aio. */
-	srv_use_native_aio = FALSE;
-
-#endif
-
-	if (srv_file_flush_method_str == NULL) {
-		/* These are the default options */
-
-		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
-
-		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
-		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
-		srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
-		srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
-		srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
-		srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
-		srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
-#ifdef _WIN32
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
-		srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
-		srv_use_native_aio = FALSE;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
-		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-		srv_use_native_aio = FALSE;
-
-	} else if (0 == ut_strcmp(srv_file_flush_method_str,
-				  "async_unbuffered")) {
-		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-		srv_use_native_aio = TRUE;
-#endif
-	} else {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Unrecognized value %s for"
-			" innodb_flush_method\n",
-			srv_file_flush_method_str);
-		return(DB_ERROR);
-	}
-
-	/* Note that the call srv_boot() also changes the values of
-	some variables to the units used by InnoDB internally */
-
-	/* Set the maximum number of threads which can wait for a semaphore
-	inside InnoDB: this is the 'sync wait array' size, as well as the
-	maximum number of threads that can wait in the 'srv_conc array' for
-	their time to enter InnoDB. */
-
-	if (srv_buf_pool_size >= 1000 * 1024 * 1024) {
-		/* If buffer pool is less than 1000 MB,
-		assume fewer threads. Also use only one
-		buffer pool instance */
-		srv_max_n_threads = 50000;
-
-	} else if (srv_buf_pool_size >= 8 * 1024 * 1024) {
-
-		srv_buf_pool_instances = 1;
-		srv_max_n_threads = 10000;
-	} else {
-		srv_buf_pool_instances = 1;
-		srv_max_n_threads = 1000;	/* saves several MB of memory,
-						especially in 64-bit
-						computers */
-	}
-
-	err = srv_boot();
-
-	if (err != DB_SUCCESS) {
-
-		return((int) err);
-	}
-
-	mutex_create(srv_monitor_file_mutex_key,
-		     &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
-
-	if (srv_innodb_status) {
-		srv_monitor_file_name = mem_alloc(
-			strlen(fil_path_to_mysql_datadir)
-			+ 20 + sizeof "/innodb_status.");
-		sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
-			fil_path_to_mysql_datadir, os_proc_get_number());
-		srv_monitor_file = fopen(srv_monitor_file_name, "w+");
-		if (!srv_monitor_file) {
-			fprintf(stderr, "InnoDB: unable to create %s: %s\n",
-				srv_monitor_file_name, strerror(errno));
-			return(DB_ERROR);
-		}
-	} else {
-		srv_monitor_file_name = NULL;
-		srv_monitor_file = os_file_create_tmpfile();
-		if (!srv_monitor_file) {
-			return(DB_ERROR);
-		}
-	}
-
-	mutex_create(srv_dict_tmpfile_mutex_key,
-		     &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
-
-	srv_dict_tmpfile = os_file_create_tmpfile();
-	if (!srv_dict_tmpfile) {
-		return(DB_ERROR);
-	}
-
-	mutex_create(srv_misc_tmpfile_mutex_key,
-		     &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
-
-	srv_misc_tmpfile = os_file_create_tmpfile();
-	if (!srv_misc_tmpfile) {
-		return(DB_ERROR);
-	}
-
-	/* If user has set the value of innodb_file_io_threads then
-	we'll emit a message telling the user that this parameter
-	is now deprecated. */
-	if (srv_n_file_io_threads != 4) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Warning:"
-			" innodb_file_io_threads is deprecated."
-			" Please use innodb_read_io_threads and"
-			" innodb_write_io_threads instead\n");
-	}
-
-	/* Now overwrite the value on srv_n_file_io_threads */
-	srv_n_file_io_threads = 2 + srv_n_read_io_threads
-				+ srv_n_write_io_threads;
-
-	ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
-
-	io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
-
-	/* On Windows when using native aio the number of aio requests
-	that a thread can handle at a given time is limited to 32
-	i.e.: SRV_N_PENDING_IOS_PER_THREAD */
-# ifdef __WIN__
-	if (srv_use_native_aio) {
-		io_limit = SRV_N_PENDING_IOS_PER_THREAD;
-	}
-# endif /* __WIN__ */
-
-	if (!os_aio_init(io_limit,
-                         srv_n_read_io_threads,
-                         srv_n_write_io_threads,
-                         SRV_MAX_N_PENDING_SYNC_IOS)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Fatal error: cannot initialize AIO"
-			" sub-system\n");
-#if defined(LINUX_NATIVE_AIO)
-                fprintf(stderr, "You can try increasing system fs.aio-max-nr to 1048576 or larger or setting innodb_use_native_aio = 0 in my.cnf\n");
-#endif
-                return(DB_ERROR);
-        }
-
-	fil_init(srv_file_per_table ? 50000 : 5000,
-		 srv_max_n_open_files);
-
-	/* Print time to initialize the buffer pool */
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: Initializing buffer pool, size =");
-
-	if (srv_buf_pool_size >= 1024 * 1024 * 1024) {
-		fprintf(stderr,
-			" %.1fG\n",
-			((double) srv_buf_pool_size) / (1024 * 1024 * 1024));
-	} else {
-		fprintf(stderr,
-			" %.1fM\n",
-			((double) srv_buf_pool_size) / (1024 * 1024));
-	}
-
-	err = buf_pool_init(srv_buf_pool_size, (ibool) srv_buf_pool_populate,
-			    srv_buf_pool_instances);
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: Completed initialization of buffer pool\n");
-
-	if (err != DB_SUCCESS) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Fatal error: cannot allocate memory"
-			" for the buffer pool\n");
-
-		return(DB_ERROR);
-	}
-
-#ifdef UNIV_DEBUG
-	/* We have observed deadlocks with a 5MB buffer pool but
-	the actual lower limit could very well be a little higher. */
-
-	if (srv_buf_pool_size <= 5 * 1024 * 1024) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Warning: Small buffer pool size "
-			"(%luM), the flst_validate() debug function "
-			"can cause a deadlock if the buffer pool fills up.\n",
-			srv_buf_pool_size / 1024 / 1024);
-	}
-#endif
-
-	fsp_init();
-	log_init();
-
-	lock_sys_create(srv_lock_table_size);
-
-	/* Create i/o-handler threads: */
-
-	for (i = 0; i < srv_n_file_io_threads; i++) {
-		n[i] = i;
-
-		os_thread_create(io_handler_thread, n + i, thread_ids + i);
-	}
-
-#ifdef UNIV_LOG_ARCHIVE
-	if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: Error: you must set the log group home dir in my.cnf\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr, " InnoDB: the same as log arch dir.\n");
-
-		return(DB_ERROR);
-	}
-#endif /* UNIV_LOG_ARCHIVE */
-
-	if (sizeof(ulint) == 4
-	    && srv_n_log_files * srv_log_file_size
-	       >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: combined size of log files"
-			" must be < 4 GB on 32-bit systems\n");
-
-		return(DB_ERROR);
-	}
-
-	sum_of_new_sizes = 0;
-
-	for (i = 0; i < srv_n_data_files; i++) {
-#ifndef __WIN__
-		if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Error: file size must be < 4 GB"
-				" with this MySQL binary\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: and operating system combination,"
-				" in some OS's < 2 GB\n");
-
-			return(DB_ERROR);
-		}
-#endif
-		sum_of_new_sizes += srv_data_file_sizes[i];
-	}
-
-	if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: tablespace size must be"
-			" at least 10 MB\n");
-
-		return(DB_ERROR);
-	}
-
-	err = open_or_create_data_files(&create_new_db,
-					&create_new_doublewrite_file,
-#ifdef UNIV_LOG_ARCHIVE
-					&min_arch_log_no, &max_arch_log_no,
-#endif /* UNIV_LOG_ARCHIVE */
-					&min_flushed_lsn, &max_flushed_lsn,
-					&sum_of_new_sizes);
-	if (err != DB_SUCCESS) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Could not open or create data files.\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: If you tried to add new data files,"
-			" and it failed here,\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: you should now edit innodb_data_file_path"
-			" in my.cnf back\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: to what it was, and remove the"
-			" new ibdata files InnoDB created\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: in this failed attempt. InnoDB only wrote"
-			" those files full of\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: zeros, but did not yet use them in any way."
-			" But be careful: do not\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: remove old data files"
-			" which contain your precious data!\n");
-
-		return((int) err);
-	}
-
-#ifdef UNIV_LOG_ARCHIVE
-	srv_normalize_path_for_win(srv_arch_dir);
-	srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
-#endif /* UNIV_LOG_ARCHIVE */
-
-	for (i = 0; i < srv_n_log_files; i++) {
-		err = open_or_create_log_file(create_new_db, &log_file_created,
-					      log_opened, 0, i);
-		if (err != DB_SUCCESS) {
-
-			return((int) err);
-		}
-
-		if (log_file_created) {
-			log_created = TRUE;
-		} else {
-			log_opened = TRUE;
-		}
-		if ((log_opened && create_new_db)
-		    || (log_opened && log_created)) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Error: all log files must be"
-				" created at the same time.\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: All log files must be"
-				" created also in database creation.\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: If you want bigger or smaller"
-				" log files, shut down the\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: database and make sure there"
-				" were no errors in shutdown.\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Then delete the existing log files."
-				" Edit the .cnf file\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: and start the database again.\n");
-
-			return(DB_ERROR);
-		}
-	}
-
-	/* Open all log files and data files in the system tablespace: we
-	keep them open until database shutdown */
-
-	fil_open_log_and_system_tablespace_files();
-
-	if (log_created && !create_new_db
-#ifdef UNIV_LOG_ARCHIVE
-	    && !srv_archive_recovery
-#endif /* UNIV_LOG_ARCHIVE */
-	    ) {
-		if (max_flushed_lsn != min_flushed_lsn
-#ifdef UNIV_LOG_ARCHIVE
-		    || max_arch_log_no != min_arch_log_no
-#endif /* UNIV_LOG_ARCHIVE */
-		    ) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Cannot initialize created"
-				" log files because\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: data files were not in sync"
-				" with each other\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: or the data files are corrupt.\n");
-
-			return(DB_ERROR);
-		}
-
-		if (max_flushed_lsn < (ib_uint64_t) 1000) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Cannot initialize created"
-				" log files because\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: data files are corrupt,"
-				" or new data files were\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: created when the database"
-				" was started previous\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: time but the database"
-				" was not shut down\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: normally after that.\n");
-
-			return(DB_ERROR);
-		}
-
-		mutex_enter(&(log_sys->mutex));
-
-#ifdef UNIV_LOG_ARCHIVE
-		/* Do not + 1 arch_log_no because we do not use log
-		archiving */
-		recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE);
-#else
-		recv_reset_logs(max_flushed_lsn, TRUE);
-#endif /* UNIV_LOG_ARCHIVE */
-
-		mutex_exit(&(log_sys->mutex));
-	}
-
-	trx_sys_file_format_init();
-
-	if (create_new_db) {
-		init_log_online();
-
-		mtr_start(&mtr);
-
-		fsp_header_init(0, sum_of_new_sizes, &mtr);
-
-		mtr_commit(&mtr);
-
-		/* To maintain backward compatibility we create only
-		the first rollback segment before the double write buffer.
-		All the remaining rollback segments will be created later,
-		after the double write buffer has been created. */
-		trx_sys_create();
-
-		if (create_new_doublewrite_file) {
-			mtr_start(&mtr);
-			fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
-			mtr_commit(&mtr);
-
-			trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
-		}
-
-		dict_create();
-
-		srv_startup_is_before_trx_rollback_phase = FALSE;
-
-#ifdef UNIV_LOG_ARCHIVE
-	} else if (srv_archive_recovery) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Starting archive"
-			" recovery from a backup...\n");
-		err = recv_recovery_from_archive_start(
-			min_flushed_lsn, srv_archive_recovery_limit_lsn,
-			min_arch_log_no);
-		if (err != DB_SUCCESS) {
-
-			return(DB_ERROR);
-		}
-		/* Since ibuf init is in dict_boot, and ibuf is needed
-		in any disk i/o, first call dict_boot */
-
-		dict_boot();
-
-		trx_sys_init_at_db_start();
-
-		srv_startup_is_before_trx_rollback_phase = FALSE;
-
-		/* Initialize the fsp free limit global variable in the log
-		system */
-		fsp_header_get_free_limit();
-
-		recv_recovery_from_archive_finish();
-#endif /* UNIV_LOG_ARCHIVE */
-	} else {
-		char*	save_srv_doublewrite_file = NULL;
-
-		if (create_new_doublewrite_file) {
-			/* doublewrite_file cannot be used for recovery yet. */
-			save_srv_doublewrite_file = srv_doublewrite_file;
-			srv_doublewrite_file = NULL;
-		}
-
-		/* Check if we support the max format that is stamped
-		on the system tablespace. 
-		Note:  We are NOT allowed to make any modifications to
-		the TRX_SYS_PAGE_NO page before recovery  because this
-		page also contains the max_trx_id etc. important system
-		variables that are required for recovery.  We need to
-		ensure that we return the system to a state where normal
-		recovery is guaranteed to work. We do this by
-		invalidating the buffer cache, this will force the
-		reread of the page and restoration to its last known
-		consistent state, this is REQUIRED for the recovery
-		process to work. */
-		err = trx_sys_file_format_max_check(
-			srv_max_file_format_at_startup);
-
-		if (err != DB_SUCCESS) {
-			return(err);
-		}
-
-		/* Invalidate the buffer pool to ensure that we reread
-		the page that we read above, during recovery.
-		Note that this is not as heavy weight as it seems. At
-		this point there will be only ONE page in the buf_LRU
-		and there must be no page in the buf_flush list. */
-		buf_pool_invalidate();
-
-		/* We always try to do a recovery, even if the database had
-		been shut down normally: this is the normal startup path */
-
-		err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
-							  IB_ULONGLONG_MAX,
-							  min_flushed_lsn,
-							  max_flushed_lsn);
-		if (err != DB_SUCCESS) {
-
-			return(DB_ERROR);
-		}
-
-		init_log_online();
-
-		/* Since the insert buffer init is in dict_boot, and the
-		insert buffer is needed in any disk i/o, first we call
-		dict_boot(). Note that trx_sys_init_at_db_start() only needs
-		to access space 0, and the insert buffer at this stage already
-		works for space 0. */
-
-		dict_boot();
-		trx_sys_init_at_db_start();
-
-		/* Initialize the fsp free limit global variable in the log
-		system */
-		fsp_header_get_free_limit();
-
-		/* recv_recovery_from_checkpoint_finish needs trx lists which
-		are initialized in trx_sys_init_at_db_start(). */
-
-		recv_recovery_from_checkpoint_finish();
-		if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
-			/* The following call is necessary for the insert
-			buffer to work with multiple tablespaces. We must
-			know the mapping between space id's and .ibd file
-			names.
-
-			In a crash recovery, we check that the info in data
-			dictionary is consistent with what we already know
-			about space id's from the call of
-			fil_load_single_table_tablespaces().
-
-			In a normal startup, we create the space objects for
-			every table in the InnoDB data dictionary that has
-			an .ibd file.
-
-			We also determine the maximum tablespace id used. */
-
-			dict_check_tablespaces_and_store_max_id(
-				recv_needed_recovery);
-		}
-
-		srv_startup_is_before_trx_rollback_phase = FALSE;
-		recv_recovery_rollback_active();
-
-		/* It is possible that file_format tag has never
-		been set. In this case we initialize it to minimum
-		value.  Important to note that we can do it ONLY after
-		we have finished the recovery process so that the
-		image of TRX_SYS_PAGE_NO is not stale. */
-		trx_sys_file_format_tag_init();
-
-		if (create_new_doublewrite_file) {
-			/* restore the value */
-			srv_doublewrite_file = save_srv_doublewrite_file;
-
-			mtr_start(&mtr);
-			fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
-			mtr_commit(&mtr);
-
-			trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
-		}
-
-		if (UNIV_UNLIKELY(!dict_verify_xtradb_sys_stats())) {
-			fprintf(stderr, "InnoDB: Warning: "
-				"SYS_STATS table corrupted, recreating\n");
-			dict_recreate_xtradb_sys_stats();
-		}
-	}
-
-	if (!create_new_db && sum_of_new_sizes > 0) {
-		/* New data file(s) were added */
-		mtr_start(&mtr);
-
-		fsp_header_inc_size(0, sum_of_new_sizes, &mtr);
-
-		mtr_commit(&mtr);
-
-		/* Immediately write the log record about increased tablespace
-		size to disk, so that it is durable even if mysqld would crash
-		quickly */
-
-		log_buffer_flush_to_disk();
-	}
-
-#ifdef UNIV_LOG_ARCHIVE
-	/* Archiving is always off under MySQL */
-	if (!srv_log_archive_on) {
-		ut_a(DB_SUCCESS == log_archive_noarchivelog());
-	} else {
-		mutex_enter(&(log_sys->mutex));
-
-		start_archive = FALSE;
-
-		if (log_sys->archiving_state == LOG_ARCH_OFF) {
-			start_archive = TRUE;
-		}
-
-		mutex_exit(&(log_sys->mutex));
-
-		if (start_archive) {
-			ut_a(DB_SUCCESS == log_archive_archivelog());
-		}
-	}
-#endif /* UNIV_LOG_ARCHIVE */
-
-	/* fprintf(stderr, "Max allowed record size %lu\n",
-	page_get_free_space_of_empty() / 2); */
-
-	if (trx_doublewrite == NULL) {
-		/* Create the doublewrite buffer to a new tablespace */
-
-		trx_sys_create_doublewrite_buf();
-	}
-
-	/* Here the double write buffer has already been created and so
-	any new rollback segments will be allocated after the double
-	write buffer. The default segment should already exist.
-	We create the new segments only if it's a new database or
-	the database was shutdown cleanly. */
-
-	/* Note: When creating the extra rollback segments during an upgrade
-	we violate the latching order, even if the change buffer is empty.
-	We make an exception in sync0sync.c and check srv_is_being_started
-	for that violation. It cannot create a deadlock because we are still
-	running in single threaded mode essentially. Only the IO threads
-	should be running at this stage. */
-
-	trx_sys_create_rsegs(TRX_SYS_N_RSEGS - 1);
-
-	/* Create the thread which watches the timeouts for lock waits */
-	os_thread_create(&srv_lock_timeout_thread, NULL,
-			 thread_ids + 2 + SRV_MAX_N_IO_THREADS);
-
-	/* Create the thread which warns of long semaphore waits */
-	os_thread_create(&srv_error_monitor_thread, NULL,
-			 thread_ids + 3 + SRV_MAX_N_IO_THREADS);
-
-	/* Create the thread which prints InnoDB monitor info */
-	os_thread_create(&srv_monitor_thread, NULL,
-			 thread_ids + 4 + SRV_MAX_N_IO_THREADS);
-
-	/* Create the thread which automaticaly dumps/restore buffer pool */
-	os_thread_create(&srv_LRU_dump_restore_thread, NULL,
-			 thread_ids + 5 + SRV_MAX_N_IO_THREADS);
-
-	/* If srv_blocking_lru_restore is TRUE, load buffer pool contents
-	synchronously */
-	if (srv_auto_lru_dump && srv_blocking_lru_restore)
-		buf_LRU_file_restore();
-
-	srv_is_being_started = FALSE;
-
-	err = dict_create_or_check_foreign_constraint_tables();
-
-	if (err != DB_SUCCESS) {
-		return((int)DB_ERROR);
-	}
-
-	/* Create the master thread which does purge and other utility
-	operations */
-
-	os_thread_create(&srv_master_thread, NULL, thread_ids
-			 + (1 + SRV_MAX_N_IO_THREADS));
-
-	/* Currently we allow only a single purge thread. */
-	ut_a(srv_n_purge_threads == 0 || srv_n_purge_threads == 1);
-
-	/* If the user has requested a separate purge thread then
-	start the purge thread. */
-	if (srv_n_purge_threads == 1) {
-		os_thread_create(&srv_purge_thread, NULL, NULL);
-	}
-
-	/* Wait for the purge and master thread to startup. */
-
-	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		if (srv_thread_has_reserved_slot(SRV_MASTER) == ULINT_UNDEFINED
-		    || (srv_n_purge_threads == 1
-			&& srv_thread_has_reserved_slot(SRV_WORKER)
-			== ULINT_UNDEFINED)) {
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: "
-				"Waiting for the background threads to "
-				"start\n");
-			os_thread_sleep(1000000);
-		} else {
-			break;
-		}
-	}
-
-#ifdef UNIV_DEBUG
-	/* buf_debug_prints = TRUE; */
-#endif /* UNIV_DEBUG */
-	sum_of_data_file_sizes = 0;
-
-	for (i = 0; i < srv_n_data_files; i++) {
-		sum_of_data_file_sizes += srv_data_file_sizes[i];
-	}
-
-	tablespace_size_in_header = fsp_header_get_tablespace_size();
-
-	if (!srv_auto_extend_last_data_file
-	    && sum_of_data_file_sizes != tablespace_size_in_header) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: tablespace size"
-			" stored in header is %lu pages, but\n",
-			(ulong) tablespace_size_in_header);
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"InnoDB: the sum of data file sizes is %lu pages\n",
-			(ulong) sum_of_data_file_sizes);
-
-		if (srv_force_recovery == 0
-		    && sum_of_data_file_sizes < tablespace_size_in_header) {
-			/* This is a fatal error, the tail of a tablespace is
-			missing */
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Cannot start InnoDB."
-				" The tail of the system tablespace is\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: missing. Have you edited"
-				" innodb_data_file_path in my.cnf in an\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: inappropriate way, removing"
-				" ibdata files from there?\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: You can set innodb_force_recovery=1"
-				" in my.cnf to force\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: a startup if you are trying"
-				" to recover a badly corrupt database.\n");
-
-			return(DB_ERROR);
-		}
-	}
-
-	if (srv_auto_extend_last_data_file
-	    && sum_of_data_file_sizes < tablespace_size_in_header) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: tablespace size stored in header"
-			" is %lu pages, but\n",
-			(ulong) tablespace_size_in_header);
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: the sum of data file sizes"
-			" is only %lu pages\n",
-			(ulong) sum_of_data_file_sizes);
-
-		if (srv_force_recovery == 0) {
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: Cannot start InnoDB. The tail of"
-				" the system tablespace is\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: missing. Have you edited"
-				" innodb_data_file_path in my.cnf in an\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: inappropriate way, removing"
-				" ibdata files from there?\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: You can set innodb_force_recovery=1"
-				" in my.cnf to force\n");
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				" InnoDB: a startup if you are trying to"
-				" recover a badly corrupt database.\n");
-
-			return(DB_ERROR);
-		}
-	}
-
-	/* Check that os_fast_mutexes work as expected */
-	os_fast_mutex_init(&srv_os_test_mutex);
-
-	if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Error: pthread_mutex_trylock returns"
-			" an unexpected value on\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: success! Cannot continue.\n");
-		exit(1);
-	}
-
-	os_fast_mutex_unlock(&srv_os_test_mutex);
-
-	os_fast_mutex_lock(&srv_os_test_mutex);
-
-	os_fast_mutex_unlock(&srv_os_test_mutex);
-
-	os_fast_mutex_free(&srv_os_test_mutex);
-
-	if (!srv_file_per_table_original_value
-	    && srv_pass_corrupt_table) {
-		fprintf(stderr, "InnoDB: Warning:"
-			" The option innodb_file_per_table is disabled,"
-			" so using the option innodb_pass_corrupt_table doesn't make sense.\n");
-	}
-
-	if (srv_print_verbose_log) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" Percona XtraDB (http://www.percona.com) %s started; "
-			"log sequence number %llu\n",
-			INNODB_VERSION_STR, srv_start_lsn);
-	}
-
-	if (srv_force_recovery > 0) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: !!! innodb_force_recovery"
-			" is set to %lu !!!\n",
-			(ulong) srv_force_recovery);
-	}
-
-	fflush(stderr);
-
-	if (trx_doublewrite_must_reset_space_ids) {
-		/* Actually, we did not change the undo log format between
-		4.0 and 4.1.1, and we would not need to run purge to
-		completion. Note also that the purge algorithm in 4.1.1
-		can process the history list again even after a full
-		purge, because our algorithm does not cut the end of the
-		history list in all cases so that it would become empty
-		after a full purge. That mean that we may purge 4.0 type
-		undo log even after this phase.
-
-		The insert buffer record format changed between 4.0 and
-		4.1.1. It is essential that the insert buffer is emptied
-		here! */
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: You are upgrading to an"
-			" InnoDB version which allows multiple\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: tablespaces. Wait that purge"
-			" and insert buffer merge run to\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: completion...\n");
-		for (;;) {
-			os_thread_sleep(1000000);
-
-			if (0 == strcmp(srv_main_thread_op_info,
-					"waiting for server activity")) {
-
-				ut_a(ibuf_is_empty());
-
-				break;
-			}
-		}
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Full purge and insert buffer merge"
-			" completed.\n");
-
-		trx_sys_mark_upgraded_to_multiple_tablespaces();
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: You have now successfully upgraded"
-			" to the multiple tablespaces\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: format. You should NOT DOWNGRADE"
-			" to an earlier version of\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: InnoDB! But if you absolutely need to"
-			" downgrade, see\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: " REFMAN "multiple-tablespaces.html\n"
-			" InnoDB: for instructions.\n");
-	}
-
-	if (srv_force_recovery == 0) {
-		/* In the insert buffer we may have even bigger tablespace
-		id's, because we may have dropped those tablespaces, but
-		insert buffer merge has not had time to clean the records from
-		the ibuf tree. */
-
-		ibuf_update_max_tablespace_id();
-	}
-
-	srv_file_per_table = srv_file_per_table_original_value;
-
-	srv_was_started = TRUE;
-
-	return((int) DB_SUCCESS);
-}
-
-/****************************************************************//**
-Shuts down the InnoDB database.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-int
-innobase_shutdown_for_mysql(void)
-/*=============================*/
-{
-	ulint	i;
-	if (!srv_was_started) {
-		if (srv_is_being_started) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Warning: shutting down"
-				" a not properly started\n"
-				"InnoDB: or created database!\n");
-		}
-
-		return(DB_SUCCESS);
-	}
-
-	/* 1. Flush the buffer pool to disk, write the current lsn to
-	the tablespace header(s), and copy all log data to archive.
-	The step 1 is the real InnoDB shutdown. The remaining steps 2 - ...
-	just free data structures after the shutdown. */
-
-	logs_empty_and_mark_files_at_shutdown();
-
-	if (srv_conc_n_threads != 0) {
-		fprintf(stderr,
-			"InnoDB: Warning: query counter shows %ld queries"
-			" still\n"
-			"InnoDB: inside InnoDB at shutdown\n",
-			srv_conc_n_threads);
-	}
-
-	/* 2. Make all threads created by InnoDB to exit */
-
-	srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
-
-	/* All threads end up waiting for certain events. Put those events
-	to the signaled state. Then the threads will exit themselves after
-	os_event_wait(). */
-
-	for (i = 0; i < 1000; i++) {
-		/* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
-		HERE OR EARLIER */
-
-		/* a. Let the lock timeout thread exit */
-		os_event_set(srv_lock_timeout_thread_event);
-
-		/* b. srv error monitor thread exits automatically, no need
-		to do anything here */
-
-		/* c. We wake the master thread so that it exits */
-		srv_wake_master_thread();
-
-		/* d. We wake the purge thread so that it exits */
-		srv_wake_purge_thread();
-
-		/* e. Exit the i/o threads */
-
-		os_aio_wake_all_threads_at_shutdown();
-
-		os_mutex_enter(os_sync_mutex);
-
-		if (os_thread_count == 0) {
-			/* All the threads have exited or are just exiting;
-			NOTE that the threads may not have completed their
-			exit yet. Should we use pthread_join() to make sure
-			they have exited? If we did, we would have to
-			remove the pthread_detach() from
-			os_thread_exit().  Now we just sleep 0.1
-			seconds and hope that is enough! */
-
-			os_mutex_exit(os_sync_mutex);
-
-			os_thread_sleep(100000);
-
-			break;
-		}
-
-		os_mutex_exit(os_sync_mutex);
-
-		os_thread_sleep(100000);
-	}
-
-	if (i == 1000) {
-		fprintf(stderr,
-			"InnoDB: Warning: %lu threads created by InnoDB"
-			" had not exited at shutdown!\n",
-			(ulong) os_thread_count);
-	}
-
-	if (srv_monitor_file) {
-		fclose(srv_monitor_file);
-		srv_monitor_file = 0;
-		if (srv_monitor_file_name) {
-			unlink(srv_monitor_file_name);
-			mem_free(srv_monitor_file_name);
-		}
-	}
-	if (srv_dict_tmpfile) {
-		fclose(srv_dict_tmpfile);
-		srv_dict_tmpfile = 0;
-	}
-
-	if (srv_misc_tmpfile) {
-		fclose(srv_misc_tmpfile);
-		srv_misc_tmpfile = 0;
-	}
-
-	/* This must be disabled before closing the buffer pool
-	and closing the data dictionary.  */
-	btr_search_disable();
-
-	ibuf_close();
-	log_shutdown();
-	lock_sys_close();
-	trx_sys_file_format_close();
-	trx_sys_close();
-
-	mutex_free(&srv_monitor_file_mutex);
-	mutex_free(&srv_dict_tmpfile_mutex);
-	mutex_free(&srv_misc_tmpfile_mutex);
-	dict_close();
-	btr_search_sys_free();
-
-	/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
-	them */
-	os_aio_free();
-	sync_close();
-	srv_free();
-	fil_close();
-
-	/* 4. Free the os_conc_mutex and all os_events and os_mutexes */
-
-	os_sync_free();
-
-	/* 5. Free all allocated memory */
-
-	pars_lexer_close();
-	log_mem_free();
-	buf_pool_free(srv_buf_pool_instances);
-	mem_close();
-
-	/* ut_free_all_mem() frees all allocated memory not freed yet
-	in shutdown, and it will also free the ut_list_mutex, so it
-	should be the last one for all operation */
-	ut_free_all_mem();
-
-	if (os_thread_count != 0
-	    || os_event_count != 0
-	    || os_mutex_count != 0
-	    || os_fast_mutex_count != 0) {
-		fprintf(stderr,
-			"InnoDB: Warning: some resources were not"
-			" cleaned up in shutdown:\n"
-			"InnoDB: threads %lu, events %lu,"
-			" os_mutexes %lu, os_fast_mutexes %lu\n",
-			(ulong) os_thread_count, (ulong) os_event_count,
-			(ulong) os_mutex_count, (ulong) os_fast_mutex_count);
-	}
-
-	if (dict_foreign_err_file) {
-		fclose(dict_foreign_err_file);
-	}
-	if (lock_latest_err_file) {
-		fclose(lock_latest_err_file);
-	}
-
-	if (srv_print_verbose_log) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Shutdown completed;"
-			" log sequence number %llu\n",
-			srv_shutdown_lsn);
-	}
-
-	srv_was_started = FALSE;
-	srv_start_has_been_called = FALSE;
-
-	return((int) DB_SUCCESS);
-}
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc
new file mode 100644
index 00000000000..3ddfd9ab3a4
--- /dev/null
+++ b/storage/xtradb/srv/srv0start.cc
@@ -0,0 +1,3234 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file srv/srv0start.cc
+Starts the InnoDB database server
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "buf0dump.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "log0online.h"
+#include "log0recv.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "rem0rec.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0rseg.h"
+# include "os0proc.h"
+# include "sync0sync.h"
+# include "buf0flu.h"
+# include "buf0rea.h"
+# include "dict0boot.h"
+# include "dict0load.h"
+# include "dict0stats_bg.h"
+# include "que0que.h"
+# include "usr0sess.h"
+# include "lock0lock.h"
+# include "trx0roll.h"
+# include "trx0purge.h"
+# include "lock0lock.h"
+# include "pars0pars.h"
+# include "btr0sea.h"
+# include "rem0cmp.h"
+# include "dict0crea.h"
+# include "row0ins.h"
+# include "row0sel.h"
+# include "row0upd.h"
+# include "row0row.h"
+# include "row0mysql.h"
+# include "btr0pcur.h"
+# include "os0sync.h"
+# include "zlib.h"
+# include "ut0crc32.h"
+# include "os0stacktrace.h"
+
+/** Log sequence number immediately after startup */
+UNIV_INTERN lsn_t	srv_start_lsn;
+/** Log sequence number at shutdown */
+UNIV_INTERN lsn_t	srv_shutdown_lsn;
+
+#ifdef HAVE_DARWIN_THREADS
+# include <sys/utsname.h>
+/** TRUE if the F_FULLFSYNC option is available */
+UNIV_INTERN ibool	srv_have_fullfsync = FALSE;
+#endif
+
+/** TRUE if a raw partition is in use */
+UNIV_INTERN ibool	srv_start_raw_disk_in_use = FALSE;
+
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+UNIV_INTERN ibool	srv_startup_is_before_trx_rollback_phase = FALSE;
+/** TRUE if the server is being started */
+UNIV_INTERN ibool	srv_is_being_started = FALSE;
+/** TRUE if the server was successfully started */
+UNIV_INTERN ibool	srv_was_started = FALSE;
+/** TRUE if innobase_start_or_create_for_mysql() has been called */
+static ibool		srv_start_has_been_called = FALSE;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+UNIV_INTERN enum srv_shutdown_state	srv_shutdown_state = SRV_SHUTDOWN_NONE;
+
+/** Files comprising the system tablespace */
+static os_file_t	files[1000];
+
+/** io_handler_thread parameters for thread identification */
+static ulint		n[SRV_MAX_N_IO_THREADS + 6];
+/** io_handler_thread identifiers, 32 is the maximum number of purge threads  */
+static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 6
+				   + SRV_MAX_N_PURGE_THREADS];
+
+/** We use this mutex to test the return value of pthread_mutex_trylock
+   on successful locking. HP-UX does NOT return 0, though Linux et al do. */
+static os_fast_mutex_t	srv_os_test_mutex;
+
+/** Name of srv_monitor_file */
+static char*	srv_monitor_file_name;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Default undo tablespace size in UNIV_PAGEs count (10MB). */
+static const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES =
+	((1024 * 1024) * 10) / UNIV_PAGE_SIZE_DEF;
+
+/** */
+#define SRV_N_PENDING_IOS_PER_THREAD	OS_AIO_N_PENDING_IOS_PER_THREAD
+#define SRV_MAX_N_PENDING_SYNC_IOS	100
+
+#ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+UNIV_INTERN mysql_pfs_key_t	io_handler_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_lock_timeout_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_error_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_master_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_purge_thread_key;
+UNIV_INTERN mysql_pfs_key_t	srv_log_tracking_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/*********************************************************************//**
+Convert a numeric string that optionally ends in G or M, to a number
+containing megabytes.
+@return	next character in string */
+static
+char*
+srv_parse_megabytes(
+/*================*/
+	char*	str,	/*!< in: string containing a quantity in bytes */
+	ulint*	megs)	/*!< out: the number in megabytes */
+{
+	char*	endp;
+	ulint	size;
+
+	size = strtoul(str, &endp, 10);
+
+	str = endp;
+
+	switch (*str) {
+	case 'G': case 'g':
+		size *= 1024;
+		/* fall through */
+	case 'M': case 'm':
+		str++;
+		break;
+	default:
+		size /= 1024 * 1024;
+		break;
+	}
+
+	*megs = size;
+	return(str);
+}
+
+/*********************************************************************//**
+Check if a file can be opened in read-write mode.
+@return	true if it doesn't exist or can be opened in rw mode. */
+static
+bool
+srv_file_check_mode(
+/*================*/
+	const char*	name)		/*!< in: filename to check */
+{
+	os_file_stat_t	stat;
+
+	memset(&stat, 0x0, sizeof(stat));
+
+	dberr_t		err = os_file_get_status(name, &stat, true);
+
+	if (err == DB_FAIL) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"os_file_get_status() failed on '%s'. Can't determine "
+			"file permissions", name);
+
+		return(false);
+
+	} else if (err == DB_SUCCESS) {
+
+		/* Note: stat.rw_perm is only valid for files */
+
+		if (stat.type == OS_FILE_TYPE_FILE) {
+			if (!stat.rw_perm) {
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"%s can't be opened in %s mode",
+					name,
+					srv_read_only_mode
+					? "read" : "read-write");
+
+				return(false);
+			}
+		} else {
+			/* Not a regular file, bail out. */
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"'%s' not a regular file.", name);
+
+			return(false);
+		}
+	} else {
+
+		/* This is OK. If the file create fails on RO media, there
+		is nothing we can do. */
+
+		ut_a(err == DB_NOT_FOUND);
+	}
+
+	return(true);
+}
+
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return	TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+	char*	str)	/*!< in/out: the data file path string */
+{
+	char*	input_str;
+	char*	path;
+	ulint	size;
+	ulint	i	= 0;
+
+	srv_auto_extend_last_data_file = FALSE;
+	srv_last_file_size_max = 0;
+	srv_data_file_names = NULL;
+	srv_data_file_sizes = NULL;
+	srv_data_file_is_raw_partition = NULL;
+
+	input_str = str;
+
+	/* First calculate the number of data files and check syntax:
+	path:size[M | G];path:size[M | G]... . Note that a Windows path may
+	contain a drive name and a ':'. */
+
+	while (*str != '\0') {
+		path = str;
+
+		while ((*str != ':' && *str != '\0')
+		       || (*str == ':'
+			   && (*(str + 1) == '\\' || *(str + 1) == '/'
+			       || *(str + 1) == ':'))) {
+			str++;
+		}
+
+		if (*str == '\0') {
+			return(FALSE);
+		}
+
+		str++;
+
+		str = srv_parse_megabytes(str, &size);
+
+		if (0 == strncmp(str, ":autoextend",
+				 (sizeof ":autoextend") - 1)) {
+
+			str += (sizeof ":autoextend") - 1;
+
+			if (0 == strncmp(str, ":max:",
+					 (sizeof ":max:") - 1)) {
+
+				str += (sizeof ":max:") - 1;
+
+				str = srv_parse_megabytes(str, &size);
+			}
+
+			if (*str != '\0') {
+
+				return(FALSE);
+			}
+		}
+
+		if (strlen(str) >= 6
+		    && *str == 'n'
+		    && *(str + 1) == 'e'
+		    && *(str + 2) == 'w') {
+			str += 3;
+		}
+
+		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+			str += 3;
+		}
+
+		if (size == 0) {
+			return(FALSE);
+		}
+
+		i++;
+
+		if (*str == ';') {
+			str++;
+		} else if (*str != '\0') {
+
+			return(FALSE);
+		}
+	}
+
+	if (i == 0) {
+		/* If innodb_data_file_path was defined it must contain
+		at least one data file definition */
+
+		return(FALSE);
+	}
+
+	srv_data_file_names = static_cast<char**>(
+		malloc(i * sizeof *srv_data_file_names));
+
+	srv_data_file_sizes = static_cast<ulint*>(
+		malloc(i * sizeof *srv_data_file_sizes));
+
+	srv_data_file_is_raw_partition = static_cast<ulint*>(
+		malloc(i * sizeof *srv_data_file_is_raw_partition));
+
+	srv_n_data_files = i;
+
+	/* Then store the actual values to our arrays */
+
+	str = input_str;
+	i = 0;
+
+	while (*str != '\0') {
+		path = str;
+
+		/* Note that we must step over the ':' in a Windows path;
+		a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+		a Windows raw partition may have a specification like
+		\\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+		while ((*str != ':' && *str != '\0')
+		       || (*str == ':'
+			   && (*(str + 1) == '\\' || *(str + 1) == '/'
+			       || *(str + 1) == ':'))) {
+			str++;
+		}
+
+		if (*str == ':') {
+			/* Make path a null-terminated string */
+			*str = '\0';
+			str++;
+		}
+
+		str = srv_parse_megabytes(str, &size);
+
+		srv_data_file_names[i] = path;
+		srv_data_file_sizes[i] = size;
+
+		if (0 == strncmp(str, ":autoextend",
+				 (sizeof ":autoextend") - 1)) {
+
+			srv_auto_extend_last_data_file = TRUE;
+
+			str += (sizeof ":autoextend") - 1;
+
+			if (0 == strncmp(str, ":max:",
+					 (sizeof ":max:") - 1)) {
+
+				str += (sizeof ":max:") - 1;
+
+				str = srv_parse_megabytes(
+					str, &srv_last_file_size_max);
+			}
+
+			if (*str != '\0') {
+
+				return(FALSE);
+			}
+		}
+
+		(srv_data_file_is_raw_partition)[i] = 0;
+
+		if (strlen(str) >= 6
+		    && *str == 'n'
+		    && *(str + 1) == 'e'
+		    && *(str + 2) == 'w') {
+			str += 3;
+			(srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW;
+		}
+
+		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+			str += 3;
+
+			if ((srv_data_file_is_raw_partition)[i] == 0) {
+				(srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW;
+			}
+		}
+
+		i++;
+
+		if (*str == ';') {
+			str++;
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Frees the arrays allocated by
+srv_parse_data_file_paths_and_sizes(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void)
+/*==========================*/
+{
+	free(srv_data_file_names);
+	srv_data_file_names = NULL;
+	free(srv_data_file_sizes);
+	srv_data_file_sizes = NULL;
+	free(srv_data_file_is_raw_partition);
+	srv_data_file_is_raw_partition = NULL;
+}
+
+#ifndef UNIV_HOTBACKUP
+
+static ulint io_tid_i = 0;
+
+/********************************************************************//**
+I/o-handler thread function.
+@return	OS_THREAD_DUMMY_RETURN */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(io_handler_thread)(
+/*==============================*/
+	void*	arg)	/*!< in: pointer to the number of the segment in
+			the aio array */
+{
+	ulint	segment;
+	ulint	tid_i = os_atomic_increment_ulint(&io_tid_i, 1) - 1;
+
+	ut_ad(tid_i < srv_n_file_io_threads);
+
+	segment = *((ulint*) arg);
+
+	srv_io_tids[tid_i] = os_thread_get_tid();
+	os_thread_set_priority(srv_io_tids[tid_i], srv_sched_priority_io);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(io_handler_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
+		srv_current_thread_priority = srv_io_thread_priority;
+		fil_aio_wait(segment);
+	}
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit.
+	The thread actually never comes here because it is exited in an
+	os_event_wait(). */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+	char*	str __attribute__((unused)))	/*!< in/out: null-terminated
+						character string */
+{
+#ifdef __WIN__
+	for (; *str; str++) {
+
+		if (*str == '/') {
+			*str = '\\';
+		}
+	}
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Creates a log file.
+@return	DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+create_log_file(
+/*============*/
+	os_file_t*	file,	/*!< out: file handle */
+	const char*	name)	/*!< in: log file name */
+{
+	ibool		ret;
+
+	*file = os_file_create(
+		innodb_file_log_key, name,
+		OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
+		OS_LOG_FILE, &ret);
+
+	if (!ret) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name);
+		return(DB_ERROR);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Setting log file %s size to %lu MB",
+		name, (ulong) srv_log_file_size
+		>> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+	ret = os_file_set_size(name, *file,
+			       (os_offset_t) srv_log_file_size
+			       << UNIV_PAGE_SIZE_SHIFT);
+	if (!ret) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "Cannot set log file"
+			" %s to size %lu MB", name, (ulong) srv_log_file_size
+			>> (20 - UNIV_PAGE_SIZE_SHIFT));
+		return(DB_ERROR);
+	}
+
+	ret = os_file_close(*file);
+	ut_a(ret);
+
+	return(DB_SUCCESS);
+}
+
+/** Initial number of the first redo log file */
+#define INIT_LOG_FILE0	(SRV_N_LOG_FILES_MAX + 1)
+
+#ifdef DBUG_OFF
+# define RECOVERY_CRASH(x) do {} while(0)
+#else
+# define RECOVERY_CRASH(x) do {						\
+	if (srv_force_recovery_crash == x) {				\
+		fprintf(stderr, "innodb_force_recovery_crash=%lu\n",	\
+			srv_force_recovery_crash);			\
+		fflush(stderr);						\
+		exit(3);						\
+	}								\
+} while (0)
+#endif
+
+/*********************************************************************//**
+Creates all log files.
+@return	DB_SUCCESS or error code */
+static
+dberr_t
+create_log_files(
+/*=============*/
+	bool	create_new_db,	/*!< in: TRUE if new database is being
+				created */
+	char*	logfilename,	/*!< in/out: buffer for log file name */
+	size_t	dirnamelen,	/*!< in: length of the directory path */
+	lsn_t	lsn,		/*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
+	char*&	logfile0)	/*!< out: name of the first log file */
+{
+	if (srv_read_only_mode) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot create log files in read-only mode");
+		return(DB_READ_ONLY);
+	}
+
+	/* We prevent system tablespace creation with existing files in
+	data directory. So we do not delete log files when creating new system
+	tablespace */
+	if (!create_new_db) {
+		/* Remove any old log files. */
+		for (unsigned i = 0; i <= INIT_LOG_FILE0; i++) {
+			sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
+
+			/* Ignore errors about non-existent files or files
+			that cannot be removed. The create_log_file() will
+			return an error when the file exists. */
+#ifdef __WIN__
+			DeleteFile((LPCTSTR) logfilename);
+#else
+			unlink(logfilename);
+#endif
+			/* Crashing after deleting the first
+			file should be recoverable. The buffer
+			pool was clean, and we can simply create
+			all log files from scratch. */
+			RECOVERY_CRASH(6);
+		}
+	}
+
+	ut_ad(!buf_pool_check_no_pending_io());
+
+	RECOVERY_CRASH(7);
+
+	for (unsigned i = 0; i < srv_n_log_files; i++) {
+		sprintf(logfilename + dirnamelen,
+			"ib_logfile%u", i ? i : INIT_LOG_FILE0);
+
+		dberr_t err = create_log_file(&files[i], logfilename);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	RECOVERY_CRASH(8);
+
+	/* We did not create the first log file initially as
+	ib_logfile0, so that crash recovery cannot find it until it
+	has been completed and renamed. */
+	sprintf(logfilename + dirnamelen, "ib_logfile%u", INIT_LOG_FILE0);
+
+	fil_space_create(
+		logfilename, SRV_LOG_SPACE_FIRST_ID,
+		fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
+		FIL_LOG);
+	ut_a(fil_validate());
+
+	logfile0 = fil_node_create(
+		logfilename, (ulint) srv_log_file_size,
+		SRV_LOG_SPACE_FIRST_ID, FALSE);
+	ut_a(logfile0);
+
+	for (unsigned i = 1; i < srv_n_log_files; i++) {
+		sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
+
+		if (!fil_node_create(
+			    logfilename,
+			    (ulint) srv_log_file_size,
+			    SRV_LOG_SPACE_FIRST_ID, FALSE)) {
+			ut_error;
+		}
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* Create the file space object for archived logs. */
+	fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1,
+			 0, FIL_LOG);
+#endif
+	log_group_init(0, srv_n_log_files,
+		       srv_log_file_size * UNIV_PAGE_SIZE,
+		       SRV_LOG_SPACE_FIRST_ID,
+		       SRV_LOG_SPACE_FIRST_ID + 1);
+
+	fil_open_log_and_system_tablespace_files();
+
+	/* Create a log checkpoint. */
+	mutex_enter(&log_sys->mutex);
+	ut_d(recv_no_log_write = FALSE);
+	recv_reset_logs(
+#ifdef UNIV_LOG_ARCHIVE
+		UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no,
+		TRUE,
+#endif
+		lsn);
+	mutex_exit(&log_sys->mutex);
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Renames the first log file. */
+static
+void
+create_log_files_rename(
+/*====================*/
+	char*	logfilename,	/*!< in/out: buffer for log file name */
+	size_t	dirnamelen,	/*!< in: length of the directory path */
+	lsn_t	lsn,		/*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
+	char*	logfile0)	/*!< in/out: name of the first log file */
+{
+	/* If innodb_flush_method=O_DSYNC,
+	we need to explicitly flush the log buffers. */
+	fil_flush(SRV_LOG_SPACE_FIRST_ID);
+	/* Close the log files, so that we can rename
+	the first one. */
+	fil_close_log_files(false);
+
+	/* Rename the first log file, now that a log
+	checkpoint has been created. */
+	sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
+
+	RECOVERY_CRASH(9);
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Renaming log file %s to %s", logfile0, logfilename);
+
+	mutex_enter(&log_sys->mutex);
+	ut_ad(strlen(logfile0) == 2 + strlen(logfilename));
+	ibool success = os_file_rename(
+		innodb_file_log_key, logfile0, logfilename);
+	ut_a(success);
+
+	RECOVERY_CRASH(10);
+
+	/* Replace the first file with ib_logfile0. */
+	strcpy(logfile0, logfilename);
+	mutex_exit(&log_sys->mutex);
+
+	fil_open_log_and_system_tablespace_files();
+
+	ib_logf(IB_LOG_LEVEL_WARN, "New log files created, LSN=" LSN_PF, lsn);
+}
+
+/*********************************************************************//**
+Opens a log file.
+@return	DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+open_log_file(
+/*==========*/
+	os_file_t*	file,	/*!< out: file handle */
+	const char*	name,	/*!< in: log file name */
+	os_offset_t*	size)	/*!< out: file size */
+{
+	ibool	ret;
+
+	*file = os_file_create(innodb_file_log_key, name,
+			       OS_FILE_OPEN, OS_FILE_AIO,
+			       OS_LOG_FILE, &ret);
+	if (!ret) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name);
+		return(DB_ERROR);
+	}
+
+	*size = os_file_get_size(*file);
+
+	ret = os_file_close(*file);
+	ut_a(ret);
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Creates or opens database data files and closes them.
+@return	DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+open_or_create_data_files(
+/*======================*/
+	ibool*		create_new_db,	/*!< out: TRUE if new database should be
+					created */
+#ifdef UNIV_LOG_ARCHIVE
+	lsn_t*		min_arch_log_no,/*!< out: min of archived log
+					numbers in data files */
+	lsn_t*		max_arch_log_no,/*!< out: max of archived log
+					numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+	lsn_t*		min_flushed_lsn,/*!< out: min of flushed lsn
+					values in data files */
+	lsn_t*		max_flushed_lsn,/*!< out: max of flushed lsn
+					values in data files */
+	ulint*		sum_of_new_sizes)/*!< out: sum of sizes of the
+					new files added */
+{
+	ibool		ret;
+	ulint		i;
+	ibool		one_opened	= FALSE;
+	ibool		one_created	= FALSE;
+	os_offset_t	size;
+	ulint		flags;
+	ulint		space;
+	ulint		rounded_size_pages;
+	char		name[10000];
+
+	if (srv_n_data_files >= 1000) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Can only have < 1000 data files, you have "
+			"defined %lu", (ulong) srv_n_data_files);
+
+		return(DB_ERROR);
+	}
+
+	*sum_of_new_sizes = 0;
+
+	*create_new_db = FALSE;
+
+	srv_normalize_path_for_win(srv_data_home);
+
+	for (i = 0; i < srv_n_data_files; i++) {
+		ulint	dirnamelen;
+
+		srv_normalize_path_for_win(srv_data_file_names[i]);
+		dirnamelen = strlen(srv_data_home);
+
+		ut_a(dirnamelen + strlen(srv_data_file_names[i])
+		     < (sizeof name) - 1);
+
+		memcpy(name, srv_data_home, dirnamelen);
+
+		/* Add a path separator if needed. */
+		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+			name[dirnamelen++] = SRV_PATH_SEPARATOR;
+		}
+
+		strcpy(name + dirnamelen, srv_data_file_names[i]);
+
+		/* Note: It will return true if the file doesn't exist. */
+
+		if (!srv_file_check_mode(name)) {
+
+			return(DB_FAIL);
+
+		} else if (srv_data_file_is_raw_partition[i] == 0) {
+
+			/* First we try to create the file: if it already
+			exists, ret will get value FALSE */
+
+			files[i] = os_file_create(
+				innodb_file_data_key, name, OS_FILE_CREATE,
+				OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+			if (srv_read_only_mode) {
+
+				if (ret) {
+					goto size_check;
+				}
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Opening %s failed!", name);
+
+				return(DB_ERROR);
+
+			} else if (!ret
+				   && os_file_get_last_error(false)
+				   != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+			    	   /* AIX 5.1 after security patch ML7 may have
+			           errno set to 0 here, which causes our
+				   function to return 100; work around that
+				   AIX problem */
+				   && os_file_get_last_error(false) != 100
+#endif /* UNIV_AIX */
+			    ) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Creating or opening %s failed!",
+					name);
+
+				return(DB_ERROR);
+			}
+
+		} else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+
+			ut_a(!srv_read_only_mode);
+
+			/* The partition is opened, not created; then it is
+			written over */
+
+			srv_start_raw_disk_in_use = TRUE;
+			srv_created_new_raw = TRUE;
+
+			files[i] = os_file_create(
+				innodb_file_data_key, name, OS_FILE_OPEN_RAW,
+				OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+			if (!ret) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Error in opening %s", name);
+
+				return(DB_ERROR);
+			}
+		} else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+			srv_start_raw_disk_in_use = TRUE;
+
+			ret = FALSE;
+		} else {
+			ut_a(0);
+		}
+
+		if (ret == FALSE) {
+			const char* check_msg;
+			/* We open the data file */
+
+			if (one_created) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Data files can only be added at "
+					"the end of a tablespace, but "
+					"data file %s existed beforehand.",
+					name);
+				return(DB_ERROR);
+			}
+			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+				ut_a(!srv_read_only_mode);
+				files[i] = os_file_create(
+					innodb_file_data_key,
+					name, OS_FILE_OPEN_RAW,
+					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+			} else if (i == 0) {
+				files[i] = os_file_create(
+					innodb_file_data_key,
+					name, OS_FILE_OPEN_RETRY,
+					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+			} else {
+				files[i] = os_file_create(
+					innodb_file_data_key,
+					name, OS_FILE_OPEN, OS_FILE_NORMAL,
+					OS_DATA_FILE, &ret);
+			}
+
+			if (!ret) {
+
+				os_file_get_last_error(true);
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Can't open '%s'", name);
+
+				return(DB_ERROR);
+			}
+
+			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+
+				goto skip_size_check;
+			}
+
+size_check:
+			size = os_file_get_size(files[i]);
+			ut_a(size != (os_offset_t) -1);
+			/* Round size downward to megabytes */
+
+			rounded_size_pages = (ulint)
+				(size >> UNIV_PAGE_SIZE_SHIFT);
+
+			if (i == srv_n_data_files - 1
+			    && srv_auto_extend_last_data_file) {
+
+				if (srv_data_file_sizes[i] > rounded_size_pages
+				    || (srv_last_file_size_max > 0
+					&& srv_last_file_size_max
+					< rounded_size_pages)) {
+
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"auto-extending "
+						"data file %s is "
+						"of a different size "
+						"%lu pages (rounded "
+						"down to MB) than specified "
+						"in the .cnf file: "
+						"initial %lu pages, "
+						"max %lu (relevant if "
+						"non-zero) pages!",
+						name,
+						(ulong) rounded_size_pages,
+						(ulong) srv_data_file_sizes[i],
+						(ulong)
+						srv_last_file_size_max);
+
+					return(DB_ERROR);
+				}
+
+				srv_data_file_sizes[i] = rounded_size_pages;
+			}
+
+			if (rounded_size_pages != srv_data_file_sizes[i]) {
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Data file %s is of a different "
+					"size %lu pages (rounded down to MB) "
+					"than specified in the .cnf file "
+					"%lu pages!",
+					name,
+					(ulong) rounded_size_pages,
+					(ulong) srv_data_file_sizes[i]);
+
+				return(DB_ERROR);
+			}
+skip_size_check:
+			check_msg = fil_read_first_page(
+				files[i], one_opened, &flags, &space,
+				min_flushed_lsn, max_flushed_lsn);
+
+			if (check_msg) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"%s in data file %s",
+					check_msg, name);
+				return(DB_ERROR);
+			}
+
+			/* The first file of the system tablespace must
+			have space ID = TRX_SYS_SPACE.  The FSP_SPACE_ID
+			field in files greater than ibdata1 are unreliable. */
+			ut_a(one_opened || space == TRX_SYS_SPACE);
+
+			/* Check the flags for the first system tablespace
+			file only. */
+			if (!one_opened
+			    && UNIV_PAGE_SIZE
+			       != fsp_flags_get_page_size(flags)) {
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Data file \"%s\" uses page size %lu,"
+					"but the start-up parameter "
+					"is --innodb-page-size=%lu",
+					name,
+					fsp_flags_get_page_size(flags),
+					UNIV_PAGE_SIZE);
+
+				return(DB_ERROR);
+			}
+
+			one_opened = TRUE;
+		} else if (!srv_read_only_mode) {
+			/* We created the data file and now write it full of
+			zeros */
+
+			one_created = TRUE;
+
+			if (i > 0) {
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"Data file %s did not"
+					" exist: new to be created",
+					name);
+			} else {
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"The first specified "
+					"data file %s did not exist: "
+					"a new database to be created!",
+					name);
+
+				*create_new_db = TRUE;
+			}
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Setting file %s size to %lu MB",
+				name,
+				(ulong) (srv_data_file_sizes[i]
+					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Database physically writes the"
+				" file full: wait...");
+
+			ret = os_file_set_size(
+				name, files[i],
+				(os_offset_t) srv_data_file_sizes[i]
+				<< UNIV_PAGE_SIZE_SHIFT);
+
+			if (!ret) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Error in creating %s: "
+					"probably out of disk space",
+					name);
+
+				return(DB_ERROR);
+			}
+
+			*sum_of_new_sizes += srv_data_file_sizes[i];
+		}
+
+		ret = os_file_close(files[i]);
+		ut_a(ret);
+
+		if (i == 0) {
+			flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+			fil_space_create(name, 0, flags, FIL_TABLESPACE);
+		}
+
+		ut_a(fil_validate());
+
+		if (!fil_node_create(name, srv_data_file_sizes[i], 0,
+				     srv_data_file_is_raw_partition[i] != 0)) {
+			return(DB_ERROR);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Create undo tablespace.
+In normal mode the file is created and then extended to the requested
+size; in read-only mode an existing file is only opened. A failure to
+create/open the file or to extend it is logged and reported as DB_ERROR.
+The file handle is closed again on every path on which the file was
+successfully opened.
+@return	DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespace_create(
+/*=======================*/
+	const char*	name,		/*!< in: tablespace name */
+	ulint		size)		/*!< in: tablespace size in pages */
+{
+	os_file_t	fh;
+	ibool		ret;
+	dberr_t		err = DB_SUCCESS;
+
+	os_file_create_subdirs_if_needed(name);
+
+	/* In read-only mode nothing may be created: only try to open. */
+	fh = os_file_create(
+		innodb_file_data_key,
+		name,
+		srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
+		OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+	if (srv_read_only_mode && ret) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"%s opened in read-only mode", name);
+
+		/* The handle is not used any further; do not leak it. */
+		os_file_close(fh);
+	} else if (ret == FALSE) {
+		if (os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+			/* AIX 5.1 after security patch ML7 may have
+			errno set to 0 here, which causes our function
+			to return 100; work around that AIX problem */
+		    && os_file_get_last_error(false) != 100
+#endif /* UNIV_AIX */
+		) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Can't create UNDO tablespace %s", name);
+		} else {
+			/* The file was already there. */
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Creating system tablespace with"
+				" existing undo tablespaces is not"
+				" supported. Please delete all undo"
+				" tablespaces before creating new"
+				" system tablespace.");
+		}
+		err = DB_ERROR;
+	} else {
+		ut_a(!srv_read_only_mode);
+
+		/* We created the data file and now write it full of zeros */
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Data file %s did not exist: new to be created",
+			name);
+
+		/* Pages -> MB; cast so that the argument matches %lu. */
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Setting file %s size to %lu MB",
+			name, (ulong) (size >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Database physically writes the file full: wait...");
+
+		/* Widen before shifting so that the size in bytes is
+		computed in os_offset_t rather than in ulint. */
+		ret = os_file_set_size(
+			name, fh, (os_offset_t) size << UNIV_PAGE_SIZE_SHIFT);
+
+		if (!ret) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Error in creating %s: probably out of "
+				"disk space", name);
+
+			err = DB_ERROR;
+		}
+
+		os_file_close(fh);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Open an undo tablespace.
+Checks the access mode of the file, opens it to determine its size, closes
+it again and then creates the tablespace and its single data file node in
+the fil subsystem. DB_ERROR is returned when the mode check fails, when the
+file cannot be opened, or when the file node cannot be created.
+@return	DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespace_open(
+/*=====================*/
+	const char*	name,		/*!< in: tablespace name */
+	ulint		space)		/*!< in: tablespace id */
+{
+	os_file_t	fh;
+	dberr_t		err	= DB_ERROR;
+	ibool		ret;
+	ulint		flags;
+
+	if (!srv_file_check_mode(name)) {
+		/* A read-only server only needs to be able to read the
+		file; otherwise it has to be writable as well. */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"UNDO tablespaces must be %s!",
+			srv_read_only_mode ? "readable" : "writable");
+
+		return(DB_ERROR);
+	}
+
+	fh = os_file_create(
+		innodb_file_data_key, name,
+		OS_FILE_OPEN_RETRY
+		| OS_FILE_ON_ERROR_NO_EXIT
+		| OS_FILE_ON_ERROR_SILENT,
+		OS_FILE_NORMAL,
+		OS_DATA_FILE,
+		&ret);
+
+	/* If the file open was successful then load the tablespace. */
+
+	if (ret) {
+		os_offset_t	size;
+
+		size = os_file_get_size(fh);
+		ut_a(size != (os_offset_t) -1);
+
+		/* Only the size was needed from the handle. */
+		ret = os_file_close(fh);
+		ut_a(ret);
+
+		/* Load the tablespace into InnoDB's internal
+		data structures. */
+
+		/* We set the biggest space id to the undo tablespace
+		because InnoDB hasn't opened any other tablespace apart
+		from the system tablespace. */
+
+		fil_set_max_space_id_if_bigger(space);
+
+		/* Set the compressed page size to 0 (non-compressed) */
+		flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+		fil_space_create(name, space, flags, FIL_TABLESPACE);
+
+		ut_a(fil_validate());
+
+		os_offset_t	n_pages = size / UNIV_PAGE_SIZE;
+
+		/* On 64 bit Windows ulint can be 32 bit and os_offset_t
+		is 64 bit. It is OK to cast the n_pages to ulint because
+		the unit has been scaled to pages and they are always
+		32 bit. */
+		if (fil_node_create(name, (ulint) n_pages, space, FALSE)) {
+			err = DB_SUCCESS;
+		}
+	}
+
+	return(err);
+}
+
+/********************************************************************
+Opens the configured number of undo tablespaces.
+When a new database is being created, the configured undo tablespace
+files are created first. Then every undo tablespace that is in use is
+opened (failure is an error), followed by any further contiguous, unused
+undo tablespace files that happen to exist. Finding fewer tablespaces than
+configured is an error; finding more is tolerated. For a new database the
+file space headers of the undo tablespaces are initialized at the end.
+@return	DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespaces_init(
+/*======================*/
+	ibool		create_new_db,		/*!< in: TRUE if new db being
+						created */
+	const ulint	n_conf_tablespaces,	/*!< in: configured undo
+						tablespaces */
+	ulint*		n_opened)		/*!< out: number of UNDO
+						tablespaces successfully
+						discovered and opened */
+{
+	ulint		i;
+	dberr_t		err = DB_SUCCESS;
+	ulint		prev_space_id = 0;
+	ulint		n_undo_tablespaces;
+	ulint		undo_tablespace_ids[TRX_SYS_N_RSEGS + 1];
+
+	*n_opened = 0;
+
+	ut_a(n_conf_tablespaces <= TRX_SYS_N_RSEGS);
+
+	memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids));
+
+	/* Create the undo spaces only if we are creating a new
+	instance. We don't allow creating of new undo tablespaces
+	in an existing instance (yet).  This restriction exists because
+	we check in several places for SYSTEM tablespaces to be less than
+	the min of user defined tablespace ids. Once we implement saving
+	the location of the undo tablespaces and their space ids this
+	restriction will/should be lifted. */
+
+	for (i = 0; create_new_db && i < n_conf_tablespaces; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu",
+			srv_undo_dir, SRV_PATH_SEPARATOR, i + 1);
+
+		/* Undo space ids start from 1. */
+		err = srv_undo_tablespace_create(
+			name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+
+		if (err != DB_SUCCESS) {
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Could not create undo tablespace '%s'.",
+				name);
+
+			return(err);
+		}
+	}
+
+	/* Get the tablespace ids of all the undo segments excluding
+	the system tablespace (0). If we are creating a new instance then
+	we build the undo_tablespace_ids ourselves since they don't
+	already exist. */
+
+	if (!create_new_db) {
+		n_undo_tablespaces = trx_rseg_get_n_undo_tablespaces(
+			undo_tablespace_ids);
+	} else {
+		n_undo_tablespaces = n_conf_tablespaces;
+
+		for (i = 1; i <= n_undo_tablespaces; ++i) {
+			undo_tablespace_ids[i - 1] = i;
+		}
+
+		/* Terminate the list directly after the last id. The loop
+		variable is n_undo_tablespaces + 1 at this point, so using
+		it as the index would skip one slot and, when
+		n_conf_tablespaces == TRX_SYS_N_RSEGS, write past the end
+		of the array. */
+		undo_tablespace_ids[n_undo_tablespaces] = ULINT_UNDEFINED;
+	}
+
+	/* Open all the undo tablespaces that are currently in use. If we
+	fail to open any of these it is a fatal error. The tablespace ids
+	should be contiguous. It is a fatal error because they are required
+	for recovery and are referenced by the UNDO logs (a.k.a RBS). */
+
+	for (i = 0; i < n_undo_tablespaces; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu",
+			srv_undo_dir, SRV_PATH_SEPARATOR,
+			undo_tablespace_ids[i]);
+
+		/* Should be no gaps in undo tablespace ids. */
+		ut_a(prev_space_id + 1 == undo_tablespace_ids[i]);
+
+		/* The system space id should not be in this array. */
+		ut_a(undo_tablespace_ids[i] != 0);
+		ut_a(undo_tablespace_ids[i] != ULINT_UNDEFINED);
+
+		/* Undo space ids start from 1. */
+
+		err = srv_undo_tablespace_open(name, undo_tablespace_ids[i]);
+
+		if (err != DB_SUCCESS) {
+
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to open undo tablespace '%s'.", name);
+
+			return(err);
+		}
+
+		prev_space_id = undo_tablespace_ids[i];
+
+		++*n_opened;
+	}
+
+	/* Open any extra unused undo tablespaces. These must be contiguous.
+	We stop at the first failure. These are undo tablespaces that are
+	not in use and therefore not required by recovery. We only check
+	that there are no gaps. */
+
+	for (i = prev_space_id + 1; i < TRX_SYS_N_RSEGS; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu", srv_undo_dir, SRV_PATH_SEPARATOR, i);
+
+		/* Undo space ids start from 1. */
+		err = srv_undo_tablespace_open(name, i);
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		++n_undo_tablespaces;
+
+		++*n_opened;
+	}
+
+	/* If the user says that there are fewer than what we find we
+	tolerate that discrepancy but not the inverse. Because there could
+	be unused undo tablespaces for future use. */
+
+	if (n_conf_tablespaces > n_undo_tablespaces) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Expected to open %lu undo "
+			"tablespaces but was able\n",
+			n_conf_tablespaces);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: to find only %lu undo "
+			"tablespaces.\n", n_undo_tablespaces);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Set the "
+			"innodb_undo_tablespaces parameter to "
+			"the\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: correct value and retry. Suggested "
+			"value is %lu\n", n_undo_tablespaces);
+
+		/* err still holds the result of the last open attempt
+		in the loop above, if that loop ran. */
+		return(err != DB_SUCCESS ? err : DB_ERROR);
+
+	} else if (n_undo_tablespaces > 0) {
+
+		ib_logf(IB_LOG_LEVEL_INFO, "Opened %lu undo tablespaces",
+			n_undo_tablespaces);
+
+		if (n_conf_tablespaces == 0) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Using the system tablespace for all UNDO "
+				"logging because innodb_undo_tablespaces=0");
+		}
+	}
+
+	if (create_new_db) {
+		mtr_t	mtr;
+
+		mtr_start(&mtr);
+
+		/* The undo log tablespace */
+		for (i = 1; i <= n_undo_tablespaces; ++i) {
+
+			fsp_header_init(
+				i, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+		}
+
+		mtr_commit(&mtr);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************
+Wait for the purge thread(s) to start up.
+Polls the purge state until it leaves PURGE_STATE_INIT, or until shutdown
+has begun or srv_force_recovery forbids background threads. */
+static
+void
+srv_start_wait_for_purge_to_start()
+/*===============================*/
+{
+	/* Wait for the purge coordinator and master thread to startup. */
+
+	purge_state_t	purge_state = trx_purge_state();
+
+	ut_a(purge_state != PURGE_STATE_DISABLED);
+
+	for (;;) {
+		/* Stop polling as soon as there is nothing to wait for. */
+		if (srv_shutdown_state != SRV_SHUTDOWN_NONE
+		    || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND
+		    || purge_state != PURGE_STATE_INIT) {
+
+			break;
+		}
+
+		purge_state = trx_purge_state();
+
+		if (purge_state == PURGE_STATE_INIT) {
+			/* Not started yet: report it and retry shortly. */
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Waiting for purge to start");
+
+			os_thread_sleep(50000);
+
+		} else if (purge_state == PURGE_STATE_EXIT
+			   || purge_state == PURGE_STATE_DISABLED) {
+			/* Purge must not be gone or disabled here. */
+			ut_error;
+		}
+
+		/* PURGE_STATE_RUN and PURGE_STATE_STOP need no action:
+		the check at the top of the loop ends the wait. */
+	}
+}
+
+/*********************************************************************//**
+Initializes the log tracking subsystem and starts its thread.
+Changed page tracking is switched off when srv_force_recovery is set or
+the server runs in read-only mode; otherwise, if tracking is enabled, the
+tracking state is initialized and the redo log follower thread is
+created. */
+static
+void
+init_log_online(void)
+/*=================*/
+{
+	if (UNIV_UNLIKELY(srv_force_recovery > 0 || srv_read_only_mode)) {
+
+		/* No tracking in these modes: force the setting off. */
+		srv_track_changed_pages = FALSE;
+
+	} else if (srv_track_changed_pages) {
+
+		log_online_read_init();
+
+		/* Create the thread that follows the redo log to output the
+		   changed page bitmap */
+		os_thread_create(
+			&srv_redo_log_follow_thread, NULL,
+			thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+	}
+}
+
+/********************************************************************
+Starts InnoDB and creates a new database if database files
+are not found and the user wants.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+innobase_start_or_create_for_mysql(void)
+/*====================================*/
+{
+	ibool		create_new_db;
+	lsn_t		min_flushed_lsn;
+	lsn_t		max_flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+	lsn_t		min_arch_log_no;
+	lsn_t		max_arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+	ulint		sum_of_new_sizes;
+	ulint		sum_of_data_file_sizes;
+	ulint		tablespace_size_in_header;
+	dberr_t		err;
+	unsigned	i;
+	ulint		srv_n_log_files_found = srv_n_log_files;
+	ulint		io_limit;
+	mtr_t		mtr;
+	ib_bh_t*	ib_bh;
+	ulint		n_recovered_trx;
+	char		logfilename[10000];
+	char*		logfile0	= NULL;
+	size_t		dirnamelen;
+
+	if (srv_read_only_mode) {
+		ib_logf(IB_LOG_LEVEL_INFO, "Started in read only mode");
+	}
+
+#ifdef HAVE_DARWIN_THREADS
+# ifdef F_FULLFSYNC
+	/* This executable has been compiled on Mac OS X 10.3 or later.
+	Assume that F_FULLFSYNC is available at run-time. */
+	srv_have_fullfsync = TRUE;
+# else /* F_FULLFSYNC */
+	/* This executable has been compiled on Mac OS X 10.2
+	or earlier.  Determine if the executable is running
+	on Mac OS X 10.3 or later. */
+	struct utsname utsname;
+	if (uname(&utsname)) {
+		ut_print_timestamp(stderr);
+		fputs(" InnoDB: cannot determine Mac OS X version!\n", stderr);
+	} else {
+		srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0;
+	}
+	if (!srv_have_fullfsync) {
+		ut_print_timestamp(stderr);
+		fputs(" InnoDB: On Mac OS X, fsync() may be "
+		      "broken on internal drives,\n", stderr);
+		ut_print_timestamp(stderr);
+		fputs(" InnoDB: making transactions unsafe!\n", stderr);
+	}
+# endif /* F_FULLFSYNC */
+#endif /* HAVE_DARWIN_THREADS */
+
+	if (sizeof(ulint) != sizeof(void*)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: size of InnoDB's ulint is %lu, "
+			"but size of void*\n", (ulong) sizeof(ulint));
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: is %lu. The sizes should be the same "
+			"so that on a 64-bit\n",
+			(ulong) sizeof(void*));
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: platforms you can allocate more than 4 GB "
+			"of memory.\n");
+	}
+
+	/* If stacktrace is used we set up signal handler for SIGUSR2 signal
+	here. If signal handler set fails we report that and disable
+	stacktrace feature. */
+
+	if (srv_use_stacktrace) {
+#ifdef __linux__
+		 struct sigaction sigact;
+
+		 sigact.sa_sigaction = os_stacktrace_print;
+		 sigact.sa_flags = SA_RESTART | SA_SIGINFO;
+
+		 if (sigaction(SIGUSR2, &sigact, (struct sigaction *)NULL) != 0)
+		 {
+			 fprintf(stderr, " InnoDB:error setting signal handler for %d (%s)\n",
+				 SIGUSR2, strsignal(SIGUSR2));
+			 srv_use_stacktrace = FALSE;
+
+		 }
+#endif /* __linux__ */
+	}
+
+#ifdef UNIV_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_IBUF_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n");
+# ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on "
+		"!!!!!!!!!\n");
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n");
+# endif
+#endif
+
+#ifdef UNIV_BLOB_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
+		"InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+#endif /* UNIV_BLOB_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_SEARCH_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_LOG_LSN_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
+#endif /* UNIV_LOG_LSN_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+	if (srv_use_sys_malloc) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"The InnoDB memory heap is disabled");
+	}
+
+#if defined(COMPILER_HINTS_ENABLED)
+	ib_logf(IB_LOG_LEVEL_INFO,
+		" InnoDB: Compiler hints enabled.");
+#endif /* defined(COMPILER_HINTS_ENABLED) */
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"" IB_ATOMICS_STARTUP_MSG "");
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+	      " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+	      );
+#ifdef UNIV_ZIP_COPY
+	ib_logf(IB_LOG_LEVEL_INFO, "and extra copying");
+#endif /* UNIV_ZIP_COPY */
+
+
+	/* Since InnoDB does not currently clean up all its internal data
+	structures in MySQL Embedded Server Library server_end(), we
+	print an error message if someone tries to start up InnoDB a
+	second time during the process lifetime. */
+
+	if (srv_start_has_been_called) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: startup called second time "
+			"during the process\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: lifetime. In the MySQL Embedded "
+			"Server Library you\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: cannot call server_init() more "
+			"than once during the\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: process lifetime.\n");
+	}
+
+	srv_start_has_been_called = TRUE;
+
+#ifdef UNIV_DEBUG
+	log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+	/*	yydebug = TRUE; */
+
+	srv_is_being_started = TRUE;
+	srv_startup_is_before_trx_rollback_phase = TRUE;
+
+#ifdef __WIN__
+	switch (os_get_os_version()) {
+	case OS_WIN95:
+	case OS_WIN31:
+	case OS_WINNT:
+		srv_use_native_conditions = FALSE;
+		/* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
+		and NT use simulated aio. In NT Windows provides async i/o,
+		but when run in conjunction with InnoDB Hot Backup, it seemed
+		to corrupt the data files. */
+
+		srv_use_native_aio = FALSE;
+		break;
+
+	case OS_WIN2000:
+	case OS_WINXP:
+		/* On 2000 and XP, async IO is available, but no condition variables. */
+		srv_use_native_aio = TRUE;
+		srv_use_native_conditions = FALSE;
+ 		break;
+
+	default:
+		/* Vista and later have both async IO and condition variables */
+		srv_use_native_aio = TRUE;
+		srv_use_native_conditions = TRUE;
+		break;
+	}
+
+#elif defined(LINUX_NATIVE_AIO)
+
+	if (srv_use_native_aio) {
+		ib_logf(IB_LOG_LEVEL_INFO, "Using Linux native AIO");
+	}
+#else
+	/* Currently native AIO is supported only on windows and linux
+	and that also when the support is compiled in. In all other
+	cases, we ignore the setting of innodb_use_native_aio. */
+	srv_use_native_aio = FALSE;
+#endif /* __WIN__ */
+
+	if (srv_file_flush_method_str == NULL) {
+		/* These are the default options */
+
+		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
+		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
+		srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT_NO_FSYNC")) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DIRECT_NO_FSYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
+		srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
+		srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+#ifdef _WIN32
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
+		srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
+		srv_use_native_aio = FALSE;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+		srv_use_native_aio = FALSE;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str,
+				  "async_unbuffered")) {
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+		srv_use_native_aio = TRUE;
+#endif /* __WIN__ */
+	} else {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unrecognized value %s for innodb_flush_method",
+			srv_file_flush_method_str);
+		return(DB_ERROR);
+	}
+
+	/* Note that the call srv_boot() also changes the values of
+	some variables to the units used by InnoDB internally */
+
+	/* Set the maximum number of threads which can wait for a semaphore
+	inside InnoDB: this is the 'sync wait array' size, as well as the
+	maximum number of threads that can wait in the 'srv_conc array' for
+	their time to enter InnoDB. */
+
+	if (srv_buf_pool_size >= 1000 * 1024 * 1024) {
+		/* If buffer pool is less than 1000 MB,
+		assume fewer threads. Also use only one
+		buffer pool instance */
+		srv_max_n_threads = 50000;
+
+	} else if (srv_buf_pool_size >= 8 * 1024 * 1024) {
+
+		srv_buf_pool_instances = 1;
+		srv_max_n_threads = 10000;
+	} else {
+		srv_buf_pool_instances = 1;
+
+		/* Saves several MB of memory, especially in
+		64-bit computers */
+
+		srv_max_n_threads = 1000;
+	}
+
+	srv_boot();
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"%s CPU crc32 instructions",
+		ut_crc32_sse2_enabled ? "Using" : "Not using");
+
+	if (!srv_read_only_mode) {
+
+		mutex_create(srv_monitor_file_mutex_key,
+			     &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+
+		if (srv_innodb_status) {
+
+			srv_monitor_file_name = static_cast<char*>(
+				mem_alloc(
+					strlen(fil_path_to_mysql_datadir)
+					+ 20 + sizeof "/innodb_status."));
+
+			sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
+				fil_path_to_mysql_datadir,
+				os_proc_get_number());
+
+			srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+
+			if (!srv_monitor_file) {
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Unable to create %s: %s",
+					srv_monitor_file_name,
+					strerror(errno));
+
+				return(DB_ERROR);
+			}
+		} else {
+			srv_monitor_file_name = NULL;
+			srv_monitor_file = os_file_create_tmpfile();
+
+			if (!srv_monitor_file) {
+				return(DB_ERROR);
+			}
+		}
+
+		mutex_create(srv_dict_tmpfile_mutex_key,
+			     &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+
+		srv_dict_tmpfile = os_file_create_tmpfile();
+
+		if (!srv_dict_tmpfile) {
+			return(DB_ERROR);
+		}
+
+		mutex_create(srv_misc_tmpfile_mutex_key,
+			     &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+
+		srv_misc_tmpfile = os_file_create_tmpfile();
+
+		if (!srv_misc_tmpfile) {
+			return(DB_ERROR);
+		}
+	}
+
+	/* If user has set the value of innodb_file_io_threads then
+	we'll emit a message telling the user that this parameter
+	is now deprecated. */
+	if (srv_n_file_io_threads != 4) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"innodb_file_io_threads is deprecated. Please use "
+			"innodb_read_io_threads and innodb_write_io_threads "
+			"instead");
+	}
+
+	/* Now overwrite the value on srv_n_file_io_threads */
+	srv_n_file_io_threads = srv_n_read_io_threads;
+
+	if (!srv_read_only_mode) {
+		/* Add the log and ibuf IO threads. */
+		srv_n_file_io_threads += 2;
+		srv_n_file_io_threads += srv_n_write_io_threads;
+	} else {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Disabling background IO write threads.");
+
+		srv_n_write_io_threads = 0;
+	}
+
+	ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
+
+	io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
+
+	/* On Windows when using native aio the number of aio requests
+	that a thread can handle at a given time is limited to 32
+	i.e.: SRV_N_PENDING_IOS_PER_THREAD */
+# ifdef __WIN__
+	if (srv_use_native_aio) {
+		io_limit = SRV_N_PENDING_IOS_PER_THREAD;
+	}
+# endif /* __WIN__ */
+
+	if (!os_aio_init(io_limit,
+			 srv_n_read_io_threads,
+			 srv_n_write_io_threads,
+			 SRV_MAX_N_PENDING_SYNC_IOS)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Fatal : Cannot initialize AIO sub-system");
+#if defined(LINUX_NATIVE_AIO)
+		ib_logf(IB_LOG_LEVEL_INFO,
+                        "You can try increasing system fs.aio-max-nr to 1048576 "
+                        "or larger or setting innodb_use_native_aio = 0 in my.cnf");
+#endif
+
+		return(DB_ERROR);
+	}
+
+	fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files);
+
+	double	size;
+	char	unit;
+
+	if (srv_buf_pool_size >= 1024 * 1024 * 1024) {
+		size = ((double) srv_buf_pool_size) / (1024 * 1024 * 1024);
+		unit = 'G';
+	} else {
+		size = ((double) srv_buf_pool_size) / (1024 * 1024);
+		unit = 'M';
+	}
+
+	/* Print time to initialize the buffer pool */
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Initializing buffer pool, size = %.1f%c", size, unit);
+
+	err = buf_pool_init(srv_buf_pool_size, (ibool) srv_buf_pool_populate,
+			    srv_buf_pool_instances);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot allocate memory for the buffer pool");
+
+		return(DB_ERROR);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Completed initialization of buffer pool");
+
+#ifdef UNIV_DEBUG
+	/* We have observed deadlocks with a 5MB buffer pool but
+	the actual lower limit could very well be a little higher. */
+
+	if (srv_buf_pool_size <= 5 * 1024 * 1024) {
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Small buffer pool size (%luM), the flst_validate() "
+			"debug function can cause a deadlock if the "
+			"buffer pool fills up.",
+			srv_buf_pool_size / 1024 / 1024);
+	}
+#endif /* UNIV_DEBUG */
+
+	fsp_init();
+	log_init();
+
+	lock_sys_create(srv_lock_table_size);
+
+	/* Create i/o-handler threads: */
+
+	for (i = 0; i < srv_n_file_io_threads; ++i) {
+
+		n[i] = i;
+
+		os_thread_create(io_handler_thread, n + i, thread_ids + i);
+	}
+
+	if (srv_n_log_files * srv_log_file_size * UNIV_PAGE_SIZE
+	    >= 512ULL * 1024ULL * 1024ULL * 1024ULL) {
+		/* log_block_convert_lsn_to_no() limits the returned block
+		number to 1G and given that OS_FILE_LOG_BLOCK_SIZE is 512
+		bytes, then we have a limit of 512 GB. If that limit is to
+		be raised, then log_block_convert_lsn_to_no() must be
+		modified. */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Combined size of log files must be < 512 GB");
+
+		return(DB_ERROR);
+	}
+
+	if (srv_n_log_files * srv_log_file_size >= ULINT_MAX) {
+		/* fil_io() takes ulint as an argument and we are passing
+		(next_offset / UNIV_PAGE_SIZE) to it in log_group_write_buf().
+		So (next_offset / UNIV_PAGE_SIZE) must be less than ULINT_MAX.
+		So next_offset must be < ULINT_MAX * UNIV_PAGE_SIZE. This
+		means that we are limited to ULINT_MAX * UNIV_PAGE_SIZE which
+		is 64 TB on 32 bit systems. */
+		fprintf(stderr,
+			" InnoDB: Error: combined size of log files"
+			" must be < %lu GB\n",
+			ULINT_MAX / 1073741824 * UNIV_PAGE_SIZE);
+
+		return(DB_ERROR);
+	}
+
+	sum_of_new_sizes = 0;
+
+	for (i = 0; i < srv_n_data_files; i++) {
+#ifndef __WIN__
+		if (sizeof(off_t) < 5
+		    && srv_data_file_sizes[i]
+		    >= (ulint) (1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error: file size must be < 4 GB"
+				" with this MySQL binary\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: and operating system combination,"
+				" in some OS's < 2 GB\n");
+
+			return(DB_ERROR);
+		}
+#endif
+		sum_of_new_sizes += srv_data_file_sizes[i];
+	}
+
+	if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tablespace size must be at least 10 MB");
+
+		return(DB_ERROR);
+	}
+
+	err = open_or_create_data_files(&create_new_db,
+#ifdef UNIV_LOG_ARCHIVE
+					&min_arch_log_no, &max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+					&min_flushed_lsn, &max_flushed_lsn,
+					&sum_of_new_sizes);
+	if (err == DB_FAIL) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"The system tablespace must be writable!");
+
+		return(DB_ERROR);
+
+	} else if (err != DB_SUCCESS) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Could not open or create the system tablespace. If "
+			"you tried to add new data files to the system "
+			"tablespace, and it failed here, you should now "
+			"edit innodb_data_file_path in my.cnf back to what "
+			"it was, and remove the new ibdata files InnoDB "
+			"created in this failed attempt. InnoDB only wrote "
+			"those files full of zeros, but did not yet use "
+			"them in any way. But be careful: do not remove "
+			"old data files which contain your precious data!");
+
+		return(err);
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	srv_normalize_path_for_win(srv_arch_dir);
+#endif /* UNIV_LOG_ARCHIVE */
+
+	dirnamelen = strlen(srv_log_group_home_dir);
+	ut_a(dirnamelen < (sizeof logfilename) - 10 - sizeof "ib_logfile");
+	memcpy(logfilename, srv_log_group_home_dir, dirnamelen);
+
+	/* Add a path separator if needed. */
+	if (dirnamelen && logfilename[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+		logfilename[dirnamelen++] = SRV_PATH_SEPARATOR;
+	}
+
+	srv_log_file_size_requested = srv_log_file_size;
+
+	if (create_new_db) {
+		bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+		ut_a(success);
+
+		min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+		err = create_log_files(create_new_db, logfilename, dirnamelen,
+				       max_flushed_lsn, logfile0);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	} else {
+		for (i = 0; i < SRV_N_LOG_FILES_MAX; i++) {
+			os_offset_t	size;
+			os_file_stat_t	stat_info;
+
+			sprintf(logfilename + dirnamelen,
+				"ib_logfile%u", i);
+
+			err = os_file_get_status(
+				logfilename, &stat_info, false);
+
+			if (err == DB_NOT_FOUND) {
+				if (i == 0) {
+					if (max_flushed_lsn
+					    != min_flushed_lsn) {
+						ib_logf(IB_LOG_LEVEL_ERROR,
+							"Cannot create"
+							" log files because"
+							" data files are"
+							" corrupt or"
+							" not in sync"
+							" with each other");
+						return(DB_ERROR);
+					}
+
+					if (max_flushed_lsn < (lsn_t) 1000) {
+						ib_logf(IB_LOG_LEVEL_ERROR,
+							"Cannot create"
+							" log files because"
+							" data files are"
+							" corrupt or the"
+							" database was not"
+							" shut down cleanly"
+							" after creating"
+							" the data files.");
+						return(DB_ERROR);
+					}
+
+					err = create_log_files(
+						create_new_db, logfilename,
+						dirnamelen, max_flushed_lsn,
+						logfile0);
+
+					if (err != DB_SUCCESS) {
+						return(err);
+					}
+
+					create_log_files_rename(
+						logfilename, dirnamelen,
+						max_flushed_lsn, logfile0);
+
+					/* Suppress the message about
+					crash recovery. */
+					max_flushed_lsn = min_flushed_lsn
+						= log_get_lsn();
+					goto files_checked;
+				} else if (i < 2) {
+					/* must have at least 2 log files */
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Only one log file found.");
+					return(err);
+				}
+
+				/* opened all files */
+				break;
+			}
+
+			if (!srv_file_check_mode(logfilename)) {
+				return(DB_ERROR);
+			}
+
+			err = open_log_file(&files[i], logfilename, &size);
+
+			if (err != DB_SUCCESS) {
+				return(err);
+			}
+
+			ut_a(size != (os_offset_t) -1);
+
+			if (size & ((1 << UNIV_PAGE_SIZE_SHIFT) - 1)) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Log file %s size "
+					UINT64PF " is not a multiple of"
+					" innodb_page_size",
+					logfilename, size);
+				return(DB_ERROR);
+			}
+
+			size >>= UNIV_PAGE_SIZE_SHIFT;
+
+			if (i == 0) {
+				srv_log_file_size = size;
+			} else if (size != srv_log_file_size) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Log file %s is"
+					" of different size "UINT64PF" bytes"
+					" than other log"
+					" files "UINT64PF" bytes!",
+					logfilename,
+					size << UNIV_PAGE_SIZE_SHIFT,
+					(os_offset_t) srv_log_file_size
+					<< UNIV_PAGE_SIZE_SHIFT);
+				return(DB_ERROR);
+			}
+		}
+
+		srv_n_log_files_found = i;
+
+		/* Create the in-memory file space objects. */
+
+		sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
+
+		fil_space_create(logfilename,
+				 SRV_LOG_SPACE_FIRST_ID,
+				 fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
+				 FIL_LOG);
+
+		ut_a(fil_validate());
+
+		/* srv_log_file_size is measured in pages; if page size is 16KB,
+		then we have a limit of 64TB on 32 bit systems */
+		ut_a(srv_log_file_size <= ULINT_MAX);
+
+		for (unsigned j = 0; j < i; j++) {
+			sprintf(logfilename + dirnamelen, "ib_logfile%u", j);
+
+			if (!fil_node_create(logfilename,
+					     (ulint) srv_log_file_size,
+					     SRV_LOG_SPACE_FIRST_ID, FALSE)) {
+				return(DB_ERROR);
+			}
+		}
+
+#ifdef UNIV_LOG_ARCHIVE
+		/* Create the file space object for archived logs. Under
+		MySQL, no archiving ever done. */
+		fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1,
+				 0, FIL_LOG);
+#endif /* UNIV_LOG_ARCHIVE */
+		log_group_init(0, i, srv_log_file_size * UNIV_PAGE_SIZE,
+			       SRV_LOG_SPACE_FIRST_ID,
+			       SRV_LOG_SPACE_FIRST_ID + 1);
+	}
+
+files_checked:
+	/* Open all log files and data files in the system
+	tablespace: we keep them open until database
+	shutdown */
+
+	fil_open_log_and_system_tablespace_files();
+
+	err = srv_undo_tablespaces_init(
+		create_new_db,
+		srv_undo_tablespaces,
+		&srv_undo_tablespaces_open);
+
+	/* If the force recovery is set very high then we carry on regardless
+	of all errors. Basically this is fingers crossed mode. */
+
+	if (err != DB_SUCCESS
+	    && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+		return(err);
+	}
+
+	/* Initialize objects used by dict stats gathering thread, which
+	can also be used by recovery if it tries to drop some table */
+	if (!srv_read_only_mode) {
+		dict_stats_thread_init();
+	}
+
+	trx_sys_file_format_init();
+
+	trx_sys_create();
+
+	if (create_new_db) {
+
+		ut_a(!srv_read_only_mode);
+		init_log_online();
+
+		mtr_start(&mtr);
+
+		fsp_header_init(0, sum_of_new_sizes, &mtr);
+
+		mtr_commit(&mtr);
+
+		/* To maintain backward compatibility we create only
+		the first rollback segment before the double write buffer.
+		All the remaining rollback segments will be created later,
+		after the double write buffer has been created. */
+		trx_sys_create_sys_pages();
+
+		ib_bh = trx_sys_init_at_db_start();
+		n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
+
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+		err = dict_create();
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+
+		bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+		ut_a(success);
+
+		min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+		/* Stamp the LSN to the data files. */
+		fil_write_flushed_lsn_to_data_files(max_flushed_lsn, 0);
+
+		fil_flush_file_spaces(FIL_TABLESPACE);
+
+		create_log_files_rename(logfilename, dirnamelen,
+					max_flushed_lsn, logfile0);
+#ifdef UNIV_LOG_ARCHIVE
+	} else if (srv_archive_recovery) {
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			" Starting archive recovery from a backup...");
+
+		err = recv_recovery_from_archive_start(
+			min_flushed_lsn, srv_archive_recovery_limit_lsn,
+			min_arch_log_no);
+		if (err != DB_SUCCESS) {
+
+			return(DB_ERROR);
+		}
+		/* Since ibuf init is in dict_boot, and ibuf is needed
+		in any disk i/o, first call dict_boot */
+
+		err = dict_boot();
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		ib_bh = trx_sys_init_at_db_start();
+		n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
+
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+
+		recv_recovery_from_archive_finish();
+#endif /* UNIV_LOG_ARCHIVE */
+	} else {
+
+		/* Check if we support the max format that is stamped
+		on the system tablespace.
+		Note:  We are NOT allowed to make any modifications to
+		the TRX_SYS_PAGE_NO page before recovery  because this
+		page also contains the max_trx_id etc. important system
+		variables that are required for recovery.  We need to
+		ensure that we return the system to a state where normal
+		recovery is guaranteed to work. We do this by
+		invalidating the buffer cache, this will force the
+		reread of the page and restoration to its last known
+		consistent state, this is REQUIRED for the recovery
+		process to work. */
+		err = trx_sys_file_format_max_check(
+			srv_max_file_format_at_startup);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		/* Invalidate the buffer pool to ensure that we reread
+		the page that we read above, during recovery.
+		Note that this is not as heavy weight as it seems. At
+		this point there will be only ONE page in the buf_LRU
+		and there must be no page in the buf_flush list. */
+		buf_pool_invalidate();
+
+		/* We always try to do a recovery, even if the database had
+		been shut down normally: this is the normal startup path */
+
+		err = recv_recovery_from_checkpoint_start(
+			LOG_CHECKPOINT, LSN_MAX,
+			min_flushed_lsn, max_flushed_lsn);
+
+		if (err != DB_SUCCESS) {
+
+			return(DB_ERROR);
+		}
+
+		init_log_online();
+
+		/* Since the insert buffer init is in dict_boot, and the
+		insert buffer is needed in any disk i/o, first we call
+		dict_boot(). Note that trx_sys_init_at_db_start() only needs
+		to access space 0, and the insert buffer at this stage already
+		works for space 0. */
+
+		err = dict_boot();
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		ib_bh = trx_sys_init_at_db_start();
+		n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
+
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+		/* recv_recovery_from_checkpoint_finish needs trx lists which
+		are initialized in trx_sys_init_at_db_start(). */
+
+		recv_recovery_from_checkpoint_finish();
+
+		if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
+			/* The following call is necessary for the insert
+			buffer to work with multiple tablespaces. We must
+			know the mapping between space id's and .ibd file
+			names.
+
+			In a crash recovery, we check that the info in data
+			dictionary is consistent with what we already know
+			about space id's from the call of
+			fil_load_single_table_tablespaces().
+
+			In a normal startup, we create the space objects for
+			every table in the InnoDB data dictionary that has
+			an .ibd file.
+
+			We also determine the maximum tablespace id used. */
+			dict_check_t	dict_check;
+
+			if (recv_needed_recovery) {
+				dict_check = DICT_CHECK_ALL_LOADED;
+			} else if (n_recovered_trx) {
+				dict_check = DICT_CHECK_SOME_LOADED;
+			} else {
+				dict_check = DICT_CHECK_NONE_LOADED;
+			}
+
+			dict_check_tablespaces_and_store_max_id(dict_check);
+		}
+
+		if (!srv_force_recovery
+		    && !recv_sys->found_corrupt_log
+		    && (srv_log_file_size_requested != srv_log_file_size
+			|| srv_n_log_files_found != srv_n_log_files)) {
+			/* Prepare to replace the redo log files. */
+
+			if (srv_read_only_mode) {
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"Cannot resize log files "
+					"in read-only mode.");
+				return(DB_READ_ONLY);
+			}
+
+			/* Clean the buffer pool. */
+			bool success = buf_flush_list(
+				ULINT_MAX, LSN_MAX, NULL);
+			ut_a(success);
+
+			RECOVERY_CRASH(1);
+
+			min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Resizing redo log from %u*%u to %u*%u pages"
+				", LSN=" LSN_PF,
+				(unsigned) i,
+				(unsigned) srv_log_file_size,
+				(unsigned) srv_n_log_files,
+				(unsigned) srv_log_file_size_requested,
+				max_flushed_lsn);
+
+			buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+			RECOVERY_CRASH(2);
+
+			/* Flush the old log files. */
+			log_buffer_flush_to_disk();
+			/* If innodb_flush_method=O_DSYNC,
+			we need to explicitly flush the log buffers. */
+			fil_flush(SRV_LOG_SPACE_FIRST_ID);
+
+			ut_ad(max_flushed_lsn == log_get_lsn());
+
+			/* Prohibit redo log writes from any other
+			threads until creating a log checkpoint at the
+			end of create_log_files(). */
+			ut_d(recv_no_log_write = TRUE);
+			ut_ad(!buf_pool_check_no_pending_io());
+
+			RECOVERY_CRASH(3);
+
+			/* Stamp the LSN to the data files. */
+			fil_write_flushed_lsn_to_data_files(
+				max_flushed_lsn, 0);
+
+			fil_flush_file_spaces(FIL_TABLESPACE);
+
+			RECOVERY_CRASH(4);
+
+			/* Close and free the redo log files, so that
+			we can replace them. */
+			fil_close_log_files(true);
+
+			RECOVERY_CRASH(5);
+
+			/* Free the old log file space. */
+			log_group_close_all();
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Starting to delete and rewrite log files.");
+
+			srv_log_file_size = srv_log_file_size_requested;
+
+			err = create_log_files(create_new_db, logfilename,
+					       dirnamelen, max_flushed_lsn,
+					       logfile0);
+
+			if (err != DB_SUCCESS) {
+				return(err);
+			}
+
+			/* create_log_files() can increase system lsn that is
+			why FIL_PAGE_FILE_FLUSH_LSN have to be updated */
+			min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+			fil_write_flushed_lsn_to_data_files(min_flushed_lsn, 0);
+			fil_flush_file_spaces(FIL_TABLESPACE);
+
+			create_log_files_rename(logfilename, dirnamelen,
+						log_get_lsn(), logfile0);
+		}
+
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+		recv_recovery_rollback_active();
+
+		/* It is possible that file_format tag has never
+		been set. In this case we initialize it to minimum
+		value.  Important to note that we can do it ONLY after
+		we have finished the recovery process so that the
+		image of TRX_SYS_PAGE_NO is not stale. */
+		trx_sys_file_format_tag_init();
+	}
+
+	if (!create_new_db && sum_of_new_sizes > 0) {
+		/* New data file(s) were added */
+		mtr_start(&mtr);
+
+		fsp_header_inc_size(0, sum_of_new_sizes, &mtr);
+
+		mtr_commit(&mtr);
+
+		/* Immediately write the log record about increased tablespace
+		size to disk, so that it is durable even if mysqld would crash
+		quickly */
+
+		log_buffer_flush_to_disk();
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* Archiving is always off under MySQL */
+	if (!srv_log_archive_on) {
+		ut_a(DB_SUCCESS == log_archive_noarchivelog());
+	} else {
+		bool	start_archive;
+
+		mutex_enter(&(log_sys->mutex));
+
+		start_archive = FALSE;
+
+		if (log_sys->archiving_state == LOG_ARCH_OFF) {
+			start_archive = TRUE;
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (start_archive) {
+			ut_a(DB_SUCCESS == log_archive_archivelog());
+		}
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/* fprintf(stderr, "Max allowed record size %lu\n",
+	page_get_free_space_of_empty() / 2); */
+
+	if (buf_dblwr == NULL) {
+		/* Create the doublewrite buffer to a new tablespace */
+
+		buf_dblwr_create();
+	}
+
+	/* Here the double write buffer has already been created and so
+	any new rollback segments will be allocated after the double
+	write buffer. The default segment should already exist.
+	We create the new segments only if it's a new database or
+	the database was shutdown cleanly. */
+
+	/* Note: When creating the extra rollback segments during an upgrade
+	we violate the latching order, even if the change buffer is empty.
+	We make an exception in sync0sync.cc and check srv_is_being_started
+	for that violation. It cannot create a deadlock because we are still
+	running in single threaded mode essentially. Only the IO threads
+	should be running at this stage. */
+
+	ut_a(srv_undo_logs > 0);
+	ut_a(srv_undo_logs <= TRX_SYS_N_RSEGS);
+
+	/* The number of rsegs that exist in InnoDB is given by status
+	variable srv_available_undo_logs. The number of rsegs to use can
+	be set using the dynamic global variable srv_undo_logs. */
+
+	srv_available_undo_logs = trx_sys_create_rsegs(
+		srv_undo_tablespaces, srv_undo_logs);
+
+	if (srv_available_undo_logs == ULINT_UNDEFINED) {
+		/* Can only happen if force recovery is set. */
+		ut_a(srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+		     || srv_read_only_mode);
+		srv_undo_logs = ULONG_UNDEFINED;
+	}
+
+	/* Flush the changes made to TRX_SYS_PAGE by trx_sys_create_rsegs()*/
+	if (!srv_force_recovery && !srv_read_only_mode) {
+		bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+		ut_a(success);
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+	}
+
+	if (!srv_read_only_mode) {
+		/* Create the thread which watches the timeouts
+		for lock waits */
+		os_thread_create(
+			lock_wait_timeout_thread,
+			NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+
+		/* Create the thread which warns of long semaphore waits */
+		os_thread_create(
+			srv_error_monitor_thread,
+			NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+		/* Create the thread which prints InnoDB monitor info */
+		os_thread_create(
+			srv_monitor_thread,
+			NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+	}
+
+	/* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */
+	err = dict_create_or_check_foreign_constraint_tables();
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	/* Create the SYS_TABLESPACES system table */
+	err = dict_create_or_check_sys_tablespace();
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	srv_is_being_started = FALSE;
+
+	ut_a(trx_purge_state() == PURGE_STATE_INIT);
+
+	/* Create the master thread which does purge and other utility
+	operations */
+
+	if (!srv_read_only_mode) {
+
+		os_thread_create(
+			srv_master_thread,
+			NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS));
+	}
+
+	if (!srv_read_only_mode
+	    && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+
+		os_thread_create(
+			srv_purge_coordinator_thread,
+			NULL, thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+
+		ut_a(UT_ARR_SIZE(thread_ids)
+		     > 5 + srv_n_purge_threads + SRV_MAX_N_IO_THREADS);
+
+		/* We've already created the purge coordinator thread above. */
+		for (i = 1; i < srv_n_purge_threads; ++i) {
+			os_thread_create(
+				srv_worker_thread, NULL,
+				thread_ids + 5 + i + SRV_MAX_N_IO_THREADS);
+		}
+
+		srv_start_wait_for_purge_to_start();
+
+	} else {
+		purge_sys->state = PURGE_STATE_DISABLED;
+	}
+
+	if (!srv_read_only_mode) {
+		os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
+	}
+
+#ifdef UNIV_DEBUG
+	/* buf_debug_prints = TRUE; */
+#endif /* UNIV_DEBUG */
+	sum_of_data_file_sizes = 0;
+
+	for (i = 0; i < srv_n_data_files; i++) {
+		sum_of_data_file_sizes += srv_data_file_sizes[i];
+	}
+
+	tablespace_size_in_header = fsp_header_get_tablespace_size();
+
+	if (!srv_read_only_mode
+	    && !srv_auto_extend_last_data_file
+	    && sum_of_data_file_sizes != tablespace_size_in_header) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: tablespace size"
+			" stored in header is %lu pages, but\n",
+			(ulong) tablespace_size_in_header);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"InnoDB: the sum of data file sizes is %lu pages\n",
+			(ulong) sum_of_data_file_sizes);
+
+		if (srv_force_recovery == 0
+		    && sum_of_data_file_sizes < tablespace_size_in_header) {
+			/* This is a fatal error, the tail of a tablespace is
+			missing */
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Cannot start InnoDB."
+				" The tail of the system tablespace is\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: missing. Have you edited"
+				" innodb_data_file_path in my.cnf in an\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: inappropriate way, removing"
+				" ibdata files from there?\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: You can set innodb_force_recovery=1"
+				" in my.cnf to force\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: a startup if you are trying"
+				" to recover a badly corrupt database.\n");
+
+			return(DB_ERROR);
+		}
+	}
+
+	if (!srv_read_only_mode
+	    && srv_auto_extend_last_data_file
+	    && sum_of_data_file_sizes < tablespace_size_in_header) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: tablespace size stored in header"
+			" is %lu pages, but\n",
+			(ulong) tablespace_size_in_header);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: the sum of data file sizes"
+			" is only %lu pages\n",
+			(ulong) sum_of_data_file_sizes);
+
+		if (srv_force_recovery == 0) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Cannot start InnoDB. The tail of"
+				" the system tablespace is\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: missing. Have you edited"
+				" innodb_data_file_path in my.cnf in an\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: inappropriate way, removing"
+				" ibdata files from there?\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: You can set innodb_force_recovery=1"
+				" in my.cnf to force\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: a startup if you are trying to"
+				" recover a badly corrupt database.\n");
+
+			return(DB_ERROR);
+		}
+	}
+
+	/* Check that os_fast_mutexes work as expected */
+	os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &srv_os_test_mutex);
+
+	if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: pthread_mutex_trylock returns"
+			" an unexpected value on\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: success! Cannot continue.\n");
+		exit(1);
+	}
+
+	os_fast_mutex_unlock(&srv_os_test_mutex);
+
+	os_fast_mutex_lock(&srv_os_test_mutex);
+
+	os_fast_mutex_unlock(&srv_os_test_mutex);
+
+	os_fast_mutex_free(&srv_os_test_mutex);
+
+	if (!srv_file_per_table && srv_pass_corrupt_table) {
+		fprintf(stderr, "InnoDB: Warning:"
+			" The option innodb_file_per_table is disabled,"
+			" so using the option innodb_pass_corrupt_table doesn't make sense.\n");
+	}
+
+	if (srv_print_verbose_log) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			" Percona XtraDB (http://www.percona.com) %s started; "
+			"log sequence number " LSN_PF "",
+			INNODB_VERSION_STR, srv_start_lsn);
+	}
+
+	if (srv_force_recovery > 0) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"!!! innodb_force_recovery is set to %lu !!!",
+			(ulong) srv_force_recovery);
+	}
+
+	if (srv_force_recovery == 0) {
+		/* In the insert buffer we may have even bigger tablespace
+		id's, because we may have dropped those tablespaces, but
+		insert buffer merge has not had time to clean the records from
+		the ibuf tree. */
+
+		ibuf_update_max_tablespace_id();
+	}
+
+	if (!srv_read_only_mode) {
+		/* Create the buffer pool dump/load thread */
+		os_thread_create(buf_dump_thread, NULL, NULL);
+
+		/* Create the dict stats gathering thread */
+		os_thread_create(dict_stats_thread, NULL, NULL);
+
+		/* Create the thread that will optimize the FTS sub-system. */
+		fts_optimize_init();
+	}
+
+	srv_was_started = TRUE;
+
+	return(DB_SUCCESS);
+}
+
+#if 0
+/********************************************************************
+Sync the FTS cache of each table on both dict_sys table lists before shutdown */
+static
+void
+srv_fts_close(void)
+/*===============*/
+{
+	dict_table_t*	tbl;
+
+	/* Tables on the LRU list. */
+	tbl = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	while (tbl != NULL) {
+		if (tbl->fts != NULL) {
+			fts_sync_table(tbl);
+		}
+		tbl = UT_LIST_GET_NEXT(table_LRU, tbl);
+	}
+
+	/* Tables on the non-LRU list; linked through the same list node. */
+	tbl = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	while (tbl != NULL) {
+		if (tbl->fts != NULL) {
+			fts_sync_table(tbl);
+		}
+		tbl = UT_LIST_GET_NEXT(table_LRU, tbl);
+	}
+}
+#endif
+
+/****************************************************************//**
+Shuts down the InnoDB database.
+@return	DB_SUCCESS (no other code is returned at present) */
+UNIV_INTERN
+dberr_t
+innobase_shutdown_for_mysql(void)
+/*=============================*/
+{
+	ulint	i;
+
+	if (!srv_was_started) {
+		if (srv_is_being_started) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Shutting down an improperly started, "
+				"or created database!");
+		}
+
+		return(DB_SUCCESS);
+	}
+
+	if (!srv_read_only_mode) {
+		/* Shutdown the FTS optimize sub system. */
+		fts_optimize_start_shutdown();
+		fts_optimize_end();
+	}
+
+	/* 1. Flush the buffer pool to disk, write the current lsn to
+	the tablespace header(s), and copy all log data to archive.
+	The step 1 is the real InnoDB shutdown. The remaining steps 2 - ...
+	just free data structures after the shutdown. */
+
+	logs_empty_and_mark_files_at_shutdown();
+
+	if (srv_conc_get_active_threads() != 0) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Query counter shows %lu queries still "
+			"inside InnoDB at shutdown",
+			(ulong) srv_conc_get_active_threads());
+	}
+
+	/* 2. Make all threads created by InnoDB to exit */
+
+	srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
+
+	/* All threads end up waiting for certain events. Put those events
+	to the signaled state. Then the threads will exit themselves after
+	os_event_wait(). */
+
+	for (i = 0; i < 1000; i++) {
+		/* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
+		HERE OR EARLIER */
+
+		if (!srv_read_only_mode) {
+			/* a. Let the lock timeout thread exit */
+			os_event_set(lock_sys->timeout_event);
+
+			/* b. srv error monitor thread exits automatically,
+			no need to do anything here */
+
+			/* c. We wake the master thread so that it exits */
+			srv_wake_master_thread();
+
+			/* d. Wakeup purge threads. */
+			srv_purge_wakeup();
+		}
+
+		/* e. Exit the i/o threads */
+		os_aio_wake_all_threads_at_shutdown();
+
+		/* f. dict_stats_thread is signaled from
+		logs_empty_and_mark_files_at_shutdown() and should have
+		already quit or is quitting right now. */
+
+		os_mutex_enter(os_sync_mutex);
+
+		if (os_thread_count == 0) {
+			/* All the threads have exited or are just exiting;
+			NOTE that the threads may not have completed their
+			exit yet. Should we use pthread_join() to make sure
+			they have exited? If we did, we would have to
+			remove the pthread_detach() from
+			os_thread_exit().  Now we just sleep 0.1
+			seconds and hope that is enough! */
+
+			os_mutex_exit(os_sync_mutex);
+
+			os_thread_sleep(100000);
+
+			break;
+		}
+
+		os_mutex_exit(os_sync_mutex);
+
+		os_thread_sleep(100000);
+	}
+
+	if (i == 1000) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"%lu threads created by InnoDB"
+			" had not exited at shutdown!",
+			(ulong) os_thread_count);
+	}
+
+	if (srv_monitor_file) {
+		fclose(srv_monitor_file);
+		srv_monitor_file = 0;
+		if (srv_monitor_file_name) {
+			unlink(srv_monitor_file_name);
+			mem_free(srv_monitor_file_name);
+			srv_monitor_file_name = NULL;
+		}
+	}
+
+	if (srv_dict_tmpfile) {
+		fclose(srv_dict_tmpfile);
+		srv_dict_tmpfile = 0;
+	}
+
+	if (srv_misc_tmpfile) {
+		fclose(srv_misc_tmpfile);
+		srv_misc_tmpfile = 0;
+	}
+
+	if (!srv_read_only_mode) {
+		dict_stats_thread_deinit();
+	}
+
+	/* This must be disabled before closing the buffer pool
+	and closing the data dictionary.  */
+	btr_search_disable();
+
+	ibuf_close();
+	log_shutdown();
+	lock_sys_close();
+	trx_sys_file_format_close();
+	trx_sys_close();
+
+	/* We don't create these mutexes in RO mode because we don't create
+	the temp files that they cover. */
+	if (!srv_read_only_mode) {
+		mutex_free(&srv_monitor_file_mutex);
+		mutex_free(&srv_dict_tmpfile_mutex);
+		mutex_free(&srv_misc_tmpfile_mutex);
+	}
+
+	dict_close();
+	btr_search_sys_free();
+
+	/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
+	them */
+	os_aio_free();
+	que_close();
+	row_mysql_close();
+	srv_mon_free();
+	sync_close();
+	srv_free();
+	fil_close();
+
+	/* 4. Free the os_conc_mutex and all os_events and os_mutexes */
+
+	os_sync_free();
+
+	/* 5. Free all allocated memory */
+
+	pars_lexer_close();
+	log_mem_free();
+	buf_pool_free(srv_buf_pool_instances);
+	mem_close();
+
+	/* ut_free_all_mem() frees all allocated memory not freed yet
+	in shutdown, and it will also free the ut_list_mutex, so it
+	should be the last one for all operation */
+	ut_free_all_mem();
+
+	if (os_thread_count != 0
+	    || os_event_count != 0
+	    || os_mutex_count != 0
+	    || os_fast_mutex_count != 0) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Some resources were not cleaned up in shutdown: "
+			"threads %lu, events %lu, os_mutexes %lu, "
+			"os_fast_mutexes %lu",
+			(ulong) os_thread_count, (ulong) os_event_count,
+			(ulong) os_mutex_count, (ulong) os_fast_mutex_count);
+	}
+
+	if (dict_foreign_err_file) {
+		fclose(dict_foreign_err_file);
+		dict_foreign_err_file = NULL;
+	}
+
+	if (srv_print_verbose_log) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Shutdown completed; log sequence number " LSN_PF "",
+			srv_shutdown_lsn);
+	}
+
+	srv_was_started = FALSE;
+	srv_start_has_been_called = FALSE;
+
+	return(DB_SUCCESS);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+
+/********************************************************************
+Tell the per-table background threads of every table to stop, then wait
+until each of them has done so. */
+UNIV_INTERN
+void
+srv_shutdown_table_bg_threads(void)
+/*===============================*/
+{
+	dict_table_t*	table;
+	dict_table_t*	next;
+	dict_table_t*	first;
+	dict_table_t*	last = NULL;
+
+	mutex_enter(&dict_sys->mutex);
+
+	/* Pass 1: ask the threads of every table to stop, and remember both
+	ends of the list for the sanity checks in pass 2. */
+	first = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	for (table = first; table != NULL; table = next) {
+		fts_t*	fts = table->fts;
+
+		if (fts != NULL) {
+			fts_start_shutdown(table, fts);
+		}
+
+		next = UT_LIST_GET_NEXT(table_LRU, table);
+
+		/* No successor: this element is the tail of the list. */
+		if (next == NULL) {
+			last = table;
+		}
+	}
+
+	/* dict_sys->mutex has to be dropped before we start waiting. If we
+	kept it during the loop below and one of the background threads
+	tried to acquire it (the FTS thread does so, for example, when it
+	calls que_eval_sql), we would deadlock.
+
+	Walking dict_sys->table_LRU without the mutex is safe here,
+	because:
+
+	 a) MySQL begins the shutdown procedure only after all client
+	 threads have been disconnected and no new ones are accepted, so
+	 tables are neither added nor dropped any more.
+
+	 b) Despite its name, the list is not LRU, and its order stays
+	 fixed.
+
+	As a safeguard in case these assumptions ever stop holding, the
+	first and last elements were saved above and are verified
+	below. */
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* Pass 2: wait for the threads of each table to stop. This is
+	done in a separate loop, because signaling all the threads first
+	lets their shutdown delays overlap instead of being paid one
+	after another. */
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	ut_a(first == table);
+
+	for (; table != NULL; table = next) {
+		fts_t*	fts = table->fts;
+
+		if (fts != NULL) {
+			fts_shutdown(table, fts);
+		}
+
+		next = UT_LIST_GET_NEXT(table_LRU, table);
+
+		/* The list must still end where it ended in pass 1. */
+		if (table == last) {
+			ut_a(!next);
+		}
+	}
+}
+
+/*****************************************************************//**
+Get the NUL-terminated meta-data (.cfg) filename for the table. */
+UNIV_INTERN
+void
+srv_get_meta_data_filename(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	char*			filename,	/*!< out: filename */
+	ulint			max_len)	/*!< in: filename buffer size */
+{
+	ulint			len;
+	char*			path;
+	char*			suffix;
+	static const ulint	suffix_len = strlen(".cfg");
+
+	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+		dict_get_and_save_data_dir_path(table, false);
+		ut_a(table->data_dir_path);
+
+		path = os_file_make_remote_pathname(
+			table->data_dir_path, table->name, "cfg");
+	} else {
+		path = fil_make_ibd_name(table->name, false);
+	}
+
+	ut_a(path);
+	len = ut_strlen(path);
+	/* Both branches below write len + 1 bytes (the name plus NUL). */
+	ut_a(len >= suffix_len && max_len > len);
+	suffix = path + (len - suffix_len);
+	if (strncmp(suffix, ".cfg", suffix_len) == 0) {
+		strcpy(filename, path);
+	} else {
+		ut_ad(strncmp(suffix, ".ibd", suffix_len) == 0);
+		/* Copy the stem, then append the new suffix and the NUL. */
+		memcpy(filename, path, len - suffix_len);
+		suffix = filename + (len - suffix_len);
+		strcpy(suffix, ".cfg");
+	}
+
+	mem_free(path);
+
+	srv_normalize_path_for_win(filename);
+}
diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.cc
index fb5829fb732..3eed5ae47f7 100644
--- a/storage/xtradb/sync/sync0arr.c
+++ b/storage/xtradb/sync/sync0arr.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0arr.c
+@file sync/sync0arr.cc
 The wait array used in synchronization primitives
 
 Created 9/5/1995 Heikki Tuuri
@@ -39,6 +39,7 @@ Created 9/5/1995 Heikki Tuuri
 #include "sync0rw.h"
 #include "os0sync.h"
 #include "os0file.h"
+#include "lock0lock.h"
 #include "srv0srv.h"
 #include "ha_prototypes.h"
 
@@ -78,14 +79,15 @@ any waiting threads who have missed the signal. */
 /** A cell where an individual thread may wait suspended
 until a resource is released. The suspending is implemented
 using an operating system event semaphore. */
-struct sync_cell_struct {
+struct sync_cell_t {
 	void*		wait_object;	/*!< pointer to the object the
 					thread is waiting for; if NULL
 					the cell is free for use */
-	mutex_t*	old_wait_mutex;	/*!< the latest wait mutex in cell */
-	rw_lock_t*	old_wait_rw_lock;
-					/*!< the latest wait rw-lock
-					in cell */
+	void*		old_wait_mutex;	/*!< the latest regular or priority
+					wait mutex in cell */
+	void*		old_wait_rw_lock;
+					/*!< the latest regular or priority
+					wait rw-lock in cell */
 	ulint		request_type;	/*!< lock type requested on the
 					object */
 	const char*	file;		/*!< in debug version file where
@@ -116,33 +118,37 @@ all changes (set or reset) to the state of the event must be made
 while owning the mutex. */
 
 /** Synchronization array */
-struct sync_array_struct {
+struct sync_array_t {
 	ulint		n_reserved;	/*!< number of currently reserved
 					cells in the wait array */
 	ulint		n_cells;	/*!< number of cells in the
 					wait array */
 	sync_cell_t*	array;		/*!< pointer to wait array */
-	ulint		protection;	/*!< this flag tells which
-					mutex protects the data */
-	mutex_t		mutex;		/*!< possible database mutex
+	ib_mutex_t	mutex;		/*!< possible database mutex
 					protecting this data structure */
-	os_mutex_t	os_mutex;	/*!< Possible operating system mutex
+	os_ib_mutex_t	os_mutex;	/*!< Possible operating system mutex
 					protecting the data structure.
 					As this data structure is used in
 					constructing the database mutex,
 					to prevent infinite recursion
 					in implementation, we fall back to
 					an OS mutex. */
-	ulint		sg_count;	/*!< count of how many times an
-					object has been signalled */
 	ulint		res_count;	/*!< count of cell reservations
 					since creation of the array */
 };
 
-#ifdef UNIV_PFS_MUTEX
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	syn_arr_mutex_key;
-#endif
+/** User configured number of sync wait arrays */
+UNIV_INTERN ulong	srv_sync_array_size = 32;
+
+/** Locally stored copy of srv_sync_array_size, set by sync_array_init() */
+static	ulint		sync_array_size;
+
+/** The global wait arrays (sync_array_size of them) used to implement the
+database's own mutexes and read-write locks */
+static	sync_array_t**	sync_wait_array;
+
+/** Count of how many times an object has been signalled (all arrays) */
+static ulint		sg_count;
 
 #ifdef UNIV_SYNC_DEBUG
 /******************************************************************//**
@@ -184,17 +190,7 @@ sync_array_enter(
 /*=============*/
 	sync_array_t*	arr)	/*!< in: sync wait array */
 {
-	ulint	protection;
-
-	protection = arr->protection;
-
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_enter(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_enter(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_enter(arr->os_mutex);
 }
 
 /******************************************************************//**
@@ -205,17 +201,7 @@ sync_array_exit(
 /*============*/
 	sync_array_t*	arr)	/*!< in: sync wait array */
 {
-	ulint	protection;
-
-	protection = arr->protection;
-
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_exit(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_exit(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_exit(arr->os_mutex);
 }
 
 /*******************************************************************//**
@@ -223,15 +209,12 @@ Creates a synchronization wait array. It is protected by a mutex
 which is automatically reserved when the functions operating on it
 are called.
 @return	own: created wait array */
-UNIV_INTERN
+static
 sync_array_t*
 sync_array_create(
 /*==============*/
-	ulint	n_cells,	/*!< in: number of cells in the array
+	ulint	n_cells)	/*!< in: number of cells in the array
 				to create */
-	ulint	protection)	/*!< in: either SYNC_ARRAY_OS_MUTEX or
-				SYNC_ARRAY_MUTEX: determines the type
-				of mutex protecting the data structure */
 {
 	ulint		sz;
 	sync_array_t*	arr;
@@ -239,54 +222,36 @@ sync_array_create(
 	ut_a(n_cells > 0);
 
 	/* Allocate memory for the data structures */
-	arr = ut_malloc(sizeof(sync_array_t));
+	arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr)));
 	memset(arr, 0x0, sizeof(*arr));
 
 	sz = sizeof(sync_cell_t) * n_cells;
-	arr->array = ut_malloc(sz);
+	arr->array = static_cast<sync_cell_t*>(ut_malloc(sz));
 	memset(arr->array, 0x0, sz);
 
 	arr->n_cells = n_cells;
-	arr->protection = protection;
 
 	/* Then create the mutex to protect the wait array complex */
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		arr->os_mutex = os_mutex_create();
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_create(syn_arr_mutex_key,
-			     &arr->mutex, SYNC_NO_ORDER_CHECK);
-	} else {
-		ut_error;
-	}
+	arr->os_mutex = os_mutex_create();
 
 	return(arr);
 }
 
 /******************************************************************//**
 Frees the resources in a wait array. */
-UNIV_INTERN
+static
 void
 sync_array_free(
 /*============*/
 	sync_array_t*	arr)	/*!< in, own: sync wait array */
 {
-	ulint		protection;
-
 	ut_a(arr->n_reserved == 0);
 
 	sync_array_validate(arr);
 
-	protection = arr->protection;
-
 	/* Release the mutex protecting the wait array complex */
 
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_free(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_free(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_free(arr->os_mutex);
 
 	ut_free(arr->array);
 	ut_free(arr);
@@ -330,11 +295,21 @@ sync_cell_get_event(
 	ulint type = cell->request_type;
 
 	if (type == SYNC_MUTEX) {
-		return(((mutex_t *) cell->wait_object)->event);
+		return(((ib_mutex_t*) cell->wait_object)->event);
+	} else if (type == SYNC_PRIO_MUTEX) {
+		return(((ib_prio_mutex_t*) cell->wait_object)
+		       ->high_priority_event);
 	} else if (type == RW_LOCK_WAIT_EX) {
-		return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
+		return(((rw_lock_t*) cell->wait_object)->wait_ex_event);
+	} else if (type == PRIO_RW_LOCK_SHARED) {
+		return(((prio_rw_lock_t *) cell->wait_object)
+		       ->high_priority_s_event);
+	} else if (type == PRIO_RW_LOCK_EX) {
+		return(((prio_rw_lock_t *) cell->wait_object)
+		       ->high_priority_x_event);
 	} else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
-		return(((rw_lock_t *) cell->wait_object)->event);
+		ut_ad(type == RW_LOCK_SHARED || type == RW_LOCK_EX);
+		return(((rw_lock_t*) cell->wait_object)->event);
 	}
 }
 
@@ -372,7 +347,7 @@ sync_array_reserve_cell(
 			cell->waiting = FALSE;
 			cell->wait_object = object;
 
-			if (type == SYNC_MUTEX) {
+			if (type == SYNC_MUTEX || type == SYNC_PRIO_MUTEX) {
 				cell->old_wait_mutex = object;
 			} else {
 				cell->old_wait_rw_lock = object;
@@ -395,7 +370,7 @@ sync_array_reserve_cell(
                         event = sync_cell_get_event(cell);
 			cell->signal_count = os_event_reset(event);
 
-			cell->reservation_time = time(NULL);
+			cell->reservation_time = ut_time();
 
 			cell->thread = os_thread_get_curr_id();
 
@@ -434,7 +409,7 @@ sync_array_wait_event(
 	ut_ad(os_thread_get_curr_id() == cell->thread);
 
 	event = sync_cell_get_event(cell);
-		cell->waiting = TRUE;
+	cell->waiting = TRUE;
 
 #ifdef UNIV_SYNC_DEBUG
 
@@ -469,8 +444,10 @@ sync_array_cell_print(
 	FILE*		file,	/*!< in: file where to print */
 	sync_cell_t*	cell)	/*!< in: sync cell */
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
+	ib_prio_mutex_t*	prio_mutex;
 	rw_lock_t*	rwlock;
+	prio_rw_lock_t*	prio_rwlock	= NULL;
 	ulint		type;
 	ulint		writer;
 
@@ -492,10 +469,19 @@ sync_array_cell_print(
 #endif
 	}
 
-	if (type == SYNC_MUTEX) {
+	if (type == SYNC_MUTEX || type == SYNC_PRIO_MUTEX) {
 		/* We use old_wait_mutex in case the cell has already
 		been freed meanwhile */
-		mutex = cell->old_wait_mutex;
+		if (type == SYNC_MUTEX) {
+
+			mutex = static_cast<ib_mutex_t*>(cell->old_wait_mutex);
+		} else {
+
+			prio_mutex = static_cast<ib_prio_mutex_t*>
+				(cell->old_wait_mutex);
+			mutex = &prio_mutex->base_mutex;
+		}
+
 
 		fprintf(file,
 			"Mutex at %p '%s', lock var %lu\n"
@@ -510,15 +496,38 @@ sync_array_cell_print(
 #endif /* UNIV_SYNC_DEBUG */
 			(ulong) mutex->waiters);
 
+		if (type == SYNC_PRIO_MUTEX) {
+
+			fprintf(file,
+				"high-priority waiters flag %lu\n",
+				(ulong) prio_mutex->high_priority_waiters);
+		}
+
 	} else if (type == RW_LOCK_EX
 		   || type == RW_LOCK_WAIT_EX
-		   || type == RW_LOCK_SHARED) {
+		   || type == RW_LOCK_SHARED
+		   || type == PRIO_RW_LOCK_SHARED
+		   || type == PRIO_RW_LOCK_EX) {
 
-		fputs(type == RW_LOCK_EX ? "X-lock on"
+		fputs((type == RW_LOCK_EX || type == PRIO_RW_LOCK_EX)
+		      ? "X-lock on"
 		      : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
 		      : "S-lock on", file);
 
-		rwlock = cell->old_wait_rw_lock;
+		/* Currently we are unable to tell high priority
+	        RW_LOCK_WAIT_EX waiter from a regular priority one.  Assume
+	        it's a regular one.  */
+		if (type == RW_LOCK_EX || type == RW_LOCK_WAIT_EX
+		    || type == RW_LOCK_SHARED) {
+
+			rwlock = static_cast<rw_lock_t *>
+				(cell->old_wait_rw_lock);
+		} else {
+
+			prio_rwlock = static_cast<prio_rw_lock_t *>
+				(cell->old_wait_rw_lock);
+			rwlock = &prio_rwlock->base_lock;
+		}
 
 		fprintf(file,
 			" RW-latch at %p '%s'\n",
@@ -556,6 +565,15 @@ sync_array_cell_print(
 #endif
 		}
 
+		if (prio_rwlock) {
+			fprintf(file, "high priority S waiters flag %lu, "
+				"high priority X waiters flag %lu, "
+				"wait-exclusive waiter is "
+				"high priority if exists: %lu\n",
+				(ulong) prio_rwlock->high_priority_s_waiters,
+				(ulong) prio_rwlock->high_priority_x_waiters,
+				(ulong) prio_rwlock->high_priority_wait_ex_waiter);
+		}
 	} else {
 		ut_error;
 	}
@@ -608,7 +626,7 @@ sync_array_deadlock_step(
 	ulint		pass,	/*!< in: pass value */
 	ulint		depth)	/*!< in: recursion depth */
 {
-	sync_cell_t*	new;
+	sync_cell_t*	new_cell;
 
 	if (pass != 0) {
 		/* If pass != 0, then we do not know which threads are
@@ -618,17 +636,18 @@ sync_array_deadlock_step(
 		return(FALSE);
 	}
 
-	new = sync_array_find_thread(arr, thread);
+	new_cell = sync_array_find_thread(arr, thread);
 
-	if (UNIV_UNLIKELY(new == start)) {
+	if (new_cell == start) {
 		/* Deadlock */
 		fputs("########################################\n"
 		      "DEADLOCK of threads detected!\n", stderr);
 
 		return(TRUE);
 
-	} else if (new) {
-		return(sync_array_detect_deadlock(arr, start, new, depth + 1));
+	} else if (new_cell) {
+		return(sync_array_detect_deadlock(
+			arr, start, new_cell, depth + 1));
 	}
 	return(FALSE);
 }
@@ -647,7 +666,7 @@ sync_array_detect_deadlock(
 	sync_cell_t*	cell,	/*!< in: cell to search */
 	ulint		depth)	/*!< in: recursion depth */
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 	rw_lock_t*	lock;
 	os_thread_id_t	thread;
 	ibool		ret;
@@ -667,9 +686,15 @@ sync_array_detect_deadlock(
 		return(FALSE); /* No deadlock here */
 	}
 
-	if (cell->request_type == SYNC_MUTEX) {
+	if (cell->request_type == SYNC_MUTEX
+	    || cell->request_type == SYNC_PRIO_MUTEX) {
 
-		mutex = cell->wait_object;
+		if (cell->request_type == SYNC_MUTEX) {
+			mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+		} else {
+			mutex = &(static_cast<ib_prio_mutex_t*>(
+					  cell->wait_object))->base_mutex;
+		}
 
 		if (mutex_get_lock_word(mutex) != 0) {
 
@@ -699,13 +724,14 @@ sync_array_detect_deadlock(
 		return(FALSE); /* No deadlock */
 
 	} else if (cell->request_type == RW_LOCK_EX
+		   || cell->request_type == PRIO_RW_LOCK_EX
 		   || cell->request_type == RW_LOCK_WAIT_EX) {
 
-		lock = cell->wait_object;
-
-		debug = UT_LIST_GET_FIRST(lock->debug_list);
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
-		while (debug != NULL) {
+		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+		     debug != 0;
+		     debug = UT_LIST_GET_NEXT(list, debug)) {
 
 			thread = debug->thread_id;
 
@@ -733,18 +759,18 @@ print:
 					return(TRUE);
 				}
 			}
-
-			debug = UT_LIST_GET_NEXT(list, debug);
 		}
 
 		return(FALSE);
 
-	} else if (cell->request_type == RW_LOCK_SHARED) {
+	} else if (cell->request_type == RW_LOCK_SHARED
+		   || cell->request_type == PRIO_RW_LOCK_SHARED) {
 
-		lock = cell->wait_object;
-		debug = UT_LIST_GET_FIRST(lock->debug_list);
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
-		while (debug != NULL) {
+		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+		     debug != 0;
+		     debug = UT_LIST_GET_NEXT(list, debug)) {
 
 			thread = debug->thread_id;
 
@@ -763,8 +789,6 @@ print:
 					goto print;
 				}
 			}
-
-			debug = UT_LIST_GET_NEXT(list, debug);
 		}
 
 		return(FALSE);
@@ -786,21 +810,28 @@ sync_arr_cell_can_wake_up(
 /*======================*/
 	sync_cell_t*	cell)	/*!< in: cell to search */
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 	rw_lock_t*	lock;
 
-	if (cell->request_type == SYNC_MUTEX) {
+	if (cell->request_type == SYNC_MUTEX
+	    || cell->request_type == SYNC_PRIO_MUTEX) {
 
-		mutex = cell->wait_object;
+		if (cell->request_type == SYNC_MUTEX) {
+			mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+		} else {
+			mutex = &(static_cast<ib_prio_mutex_t*>(
+					  cell->wait_object))->base_mutex;
+		}
 
 		if (mutex_get_lock_word(mutex) == 0) {
 
 			return(TRUE);
 		}
 
-	} else if (cell->request_type == RW_LOCK_EX) {
+	} else if (cell->request_type == RW_LOCK_EX
+		   || cell->request_type == PRIO_RW_LOCK_EX) {
 
-		lock = cell->wait_object;
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
 		if (lock->lock_word > 0) {
 		/* Either unlocked or only read locked. */
@@ -810,21 +841,25 @@ sync_arr_cell_can_wake_up(
 
         } else if (cell->request_type == RW_LOCK_WAIT_EX) {
 
-		lock = cell->wait_object;
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
                 /* lock_word == 0 means all readers have left */
 		if (lock->lock_word == 0) {
 
 			return(TRUE);
 		}
-	} else if (cell->request_type == RW_LOCK_SHARED) {
-		lock = cell->wait_object;
+	} else if (cell->request_type == RW_LOCK_SHARED
+		   || cell->request_type == PRIO_RW_LOCK_SHARED) {
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
                 /* lock_word > 0 means no writer or reserved writer */
 		if (lock->lock_word > 0) {
 
 			return(TRUE);
 		}
+	} else {
+
+		ut_error;
 	}
 
 	return(FALSE);
@@ -862,19 +897,14 @@ sync_array_free_cell(
 Increments the signalled count. */
 UNIV_INTERN
 void
-sync_array_object_signalled(
-/*========================*/
-	sync_array_t*	arr)	/*!< in: wait array */
+sync_array_object_signalled(void)
+/*=============================*/
 {
 #ifdef HAVE_ATOMIC_BUILTINS
-	(void) os_atomic_increment_ulint(&arr->sg_count, 1);
+	(void) os_atomic_increment_ulint(&sg_count, 1);
 #else
-	sync_array_enter(arr);
-
-	arr->sg_count++;
-
-	sync_array_exit(arr);
-#endif
+	++sg_count;
+#endif /* HAVE_ATOMIC_BUILTINS */
 }
 
 /**********************************************************************//**
@@ -885,57 +915,73 @@ function should be called about every 1 second in the server.
 Note that there's a race condition between this thread and mutex_exit
 changing the lock_word and calling signal_object, so sometimes this finds
 threads to wake up even when nothing has gone wrong. */
-UNIV_INTERN
+static
 void
-sync_arr_wake_threads_if_sema_free(void)
-/*====================================*/
+sync_array_wake_threads_if_sema_free_low(
+/*=====================================*/
+	sync_array_t*	arr)		/* in/out: wait array */
 {
-	sync_array_t*	arr	= sync_primary_wait_array;
-	sync_cell_t*	cell;
+	ulint		i = 0;
 	ulint		count;
-	ulint		i;
-	os_event_t      event;
 
 	sync_array_enter(arr);
 
-	i = 0;
-	count = 0;
-
-	while (count < arr->n_reserved) {
+	for (count = 0;  count < arr->n_reserved; ++i) {
+		sync_cell_t*	cell;
 
 		cell = sync_array_get_nth_cell(arr, i);
-		i++;
 
-		if (cell->wait_object == NULL) {
-			continue;
-		}
+		if (cell->wait_object != NULL) {
+
 			count++;
 
 			if (sync_arr_cell_can_wake_up(cell)) {
+				os_event_t      event;
 
-			event = sync_cell_get_event(cell);
+				event = sync_cell_get_event(cell);
 
-			os_event_set(event);
+				os_event_set(event);
+			}
 		}
-
 	}
 
 	sync_array_exit(arr);
 }
 
 /**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+	ulint		idx = 0;
+
+	/* Sweep every wait array instance, one after the other. */
+	while (idx < sync_array_size) {
+		sync_array_t*	arr = sync_wait_array[idx++];
+		sync_array_wake_threads_if_sema_free_low(arr);
+	}
+}
+
+/**********************************************************************//**
 Prints warnings of long semaphore waits to stderr.
 @return	TRUE if fatal semaphore wait threshold was exceeded */
-UNIV_INTERN
+static
 ibool
-sync_array_print_long_waits(
-/*========================*/
+sync_array_print_long_waits_low(
+/*============================*/
+	sync_array_t*	arr,	/*!< in: sync array instance */
 	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
-	const void**	sema)	/*!< out: longest-waited-for semaphore */
+	const void**	sema,	/*!< out: longest-waited-for semaphore */
+	ibool*		noticed)/*!< out: TRUE if long wait noticed */
 {
-	sync_cell_t*	cell;
-	ibool		old_val;
-	ibool		noticed = FALSE;
 	ulint		i;
 	ulint		fatal_timeout = srv_fatal_semaphore_wait_threshold;
 	ibool		fatal = FALSE;
@@ -958,14 +1004,13 @@ sync_array_print_long_waits(
 # define SYNC_ARRAY_TIMEOUT	240
 #endif
 
-	sync_array_enter(sync_primary_wait_array);
-
-	for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+	for (i = 0; i < arr->n_cells; i++) {
 
-		double	diff;
-		void*	wait_object;
+		double		diff;
+		sync_cell_t*	cell;
+		void*		wait_object;
 
-		cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
+		cell = sync_array_get_nth_cell(arr, i);
 
 		wait_object = cell->wait_object;
 
@@ -980,7 +1025,7 @@ sync_array_print_long_waits(
 			fputs("InnoDB: Warning: a long semaphore wait:\n",
 			      stderr);
 			sync_array_cell_print(stderr, cell);
-			noticed = TRUE;
+			*noticed = TRUE;
 		}
 
 		if (diff > fatal_timeout) {
@@ -994,12 +1039,47 @@ sync_array_print_long_waits(
 		}
 	}
 
-	sync_array_exit(sync_primary_wait_array);
+#undef SYNC_ARRAY_TIMEOUT
+
+	return(fatal);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return	TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
+	const void**	sema)	/*!< out: longest-waited-for semaphore */
+{
+	ulint		i;
+	ibool		fatal = FALSE;
+	ibool		noticed = FALSE;
+
+	for (i = 0; i < sync_array_size; ++i) {
+
+		sync_array_t*	arr = sync_wait_array[i];
+
+		sync_array_enter(arr);
+
+		if (sync_array_print_long_waits_low(
+				arr, waiter, sema, &noticed)) {
+
+			fatal = TRUE;
+		}
+
+		sync_array_exit(arr);
+	}
 
 	if (noticed) {
+		ibool	old_val;
+
 		fprintf(stderr,
 			"InnoDB: ###### Starts InnoDB Monitor"
 			" for 30 secs to print diagnostic info:\n");
+
 		old_val = srv_print_innodb_monitor;
 
 		/* If some crucial semaphore is reserved, then also the InnoDB
@@ -1010,11 +1090,11 @@ sync_array_print_long_waits(
 
 		fprintf(stderr,
 			"InnoDB: Pending preads %lu, pwrites %lu\n",
-			(ulong)os_file_n_pending_preads,
-			(ulong)os_file_n_pending_pwrites);
+			(ulong) os_file_n_pending_preads,
+			(ulong) os_file_n_pending_pwrites);
 
 		srv_print_innodb_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(lock_sys->timeout_event);
 
 		os_thread_sleep(30000000);
 
@@ -1024,8 +1104,6 @@ sync_array_print_long_waits(
 			" to the standard error stream\n");
 	}
 
-#undef SYNC_ARRAY_TIMEOUT
-
 	return(fatal);
 }
 
@@ -1033,38 +1111,33 @@ sync_array_print_long_waits(
 Prints info of the wait array. */
 static
 void
-sync_array_output_info(
-/*===================*/
+sync_array_print_info_low(
+/*======================*/
 	FILE*		file,	/*!< in: file where to print */
-	sync_array_t*	arr)	/*!< in: wait array; NOTE! caller must own the
-				mutex */
+	sync_array_t*	arr)	/*!< in: wait array */
 {
-	sync_cell_t*	cell;
-	ulint		count;
 	ulint		i;
+	ulint		count = 0;
 
 	fprintf(file,
-		"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
-						(long) arr->res_count, (long) arr->sg_count);
-	i = 0;
-	count = 0;
+		"OS WAIT ARRAY INFO: reservation count %ld\n",
+		(long) arr->res_count);
 
-	while (count < arr->n_reserved) {
+	for (i = 0; count < arr->n_reserved; ++i) {
+		sync_cell_t*	cell;
 
 		cell = sync_array_get_nth_cell(arr, i);
 
-	if (cell->wait_object != NULL) {
-		count++;
+		if (cell->wait_object != NULL) {
+			count++;
 			sync_array_cell_print(file, cell);
 		}
-
-		i++;
 	}
 }
 
 /**********************************************************************//**
 Prints info of the wait array. */
-UNIV_INTERN
+static
 void
 sync_array_print_info(
 /*==================*/
@@ -1073,7 +1146,95 @@ sync_array_print_info(
 {
 	sync_array_enter(arr);
 
-	sync_array_output_info(file, arr);
+	sync_array_print_info_low(file, arr);
 
 	sync_array_exit(arr);
 }
+
+/**********************************************************************//**
+Allocates the global set of wait arrays; each one is guarded by an OS mutex. */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+	ulint		n_threads)		/*!< in: Number of slots to
+						create in all arrays */
+{
+	ulint		n_slots;
+	ulint		idx = 0;
+
+	ut_a(sync_wait_array == NULL);
+	ut_a(srv_sync_array_size > 0);
+	ut_a(n_threads > srv_sync_array_size);
+
+	/* Take the local copy of the user configured array count. */
+	sync_array_size = srv_sync_array_size;
+	/* Round up so that the arrays together hold >= n_threads cells. */
+	n_slots = 1 + (n_threads - 1) / sync_array_size;
+
+	/* We have to use ut_malloc() because the mutex infrastructure
+	hasn't been initialised yet. It is required by mem_alloc() and
+	the heap functions. */
+	sync_wait_array = static_cast<sync_array_t**>(
+		ut_malloc(sync_array_size * sizeof(*sync_wait_array)));
+
+	while (idx < sync_array_size) {
+		sync_wait_array[idx] = sync_array_create(n_slots);
+		++idx;
+	}
+}
+
+/**********************************************************************//**
+Shuts down the wait array sub-system: frees each array, then the table. */
+UNIV_INTERN
+void
+sync_array_close(void)
+/*==================*/
+{
+	ulint		idx = 0;
+
+	while (idx < sync_array_size) {
+		sync_array_free(sync_wait_array[idx++]);
+	}
+
+	ut_free(sync_wait_array);
+	sync_wait_array = NULL;
+}
+
+/**********************************************************************//**
+Writes a report on every sync wait array, then the shared signal count. */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+	FILE*		file)		/*!< in/out: Print to this stream */
+{
+	ulint		idx = 0;
+
+	while (idx < sync_array_size) {
+		sync_array_print_info(file, sync_wait_array[idx++]);
+	}
+
+	/* sg_count is one global counter, so it is reported only once. */
+	fprintf(file,
+		"OS WAIT ARRAY INFO: signal count %ld\n", (long) sg_count);
+}
+
+/**********************************************************************//**
+Picks one of the sync wait arrays, cycling through them round-robin. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void)
+/*================*/
+{
+	static ulint	count;
+	ulint		ticket;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	ticket = os_atomic_increment_ulint(&count, 1);
+#else
+	ticket = count++;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+	return(sync_wait_array[ticket % sync_array_size]);
+}
diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.cc
index 5068d1679c0..785e877d14c 100644
--- a/storage/xtradb/sync/sync0rw.c
+++ b/storage/xtradb/sync/sync0rw.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0rw.c
+@file sync/sync0rw.cc
 The read-write lock (for thread synchronization)
 
 Created 9/11/1995 Heikki Tuuri
@@ -57,11 +57,11 @@ lock_word == 0:		       Write locked
 			       (-lock_word) is the number of readers
 			       that hold the lock.
 lock_word <= -X_LOCK_DECR:     Recursively write locked. lock_word has been
-			       decremented by X_LOCK_DECR once for each lock,
-			       so the number of locks is:
-			       ((-lock_word) / X_LOCK_DECR) + 1
-When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0:
-other values of lock_word are invalid.
+			       decremented by X_LOCK_DECR for the first lock
+			       and the first recursive lock, then by 1 for
+			       each recursive lock thereafter.
+			       So the number of locks is:
+			       (lock_copy == 0) ? 1 : 2 - (lock_copy + X_LOCK_DECR)
 
 The lock_word is always read and updated atomically and consistently, so that
 it always represents the state of the lock, and the state of the lock changes
@@ -124,50 +124,21 @@ wait_ex_event:	A thread may only wait on the wait_ex_event after it has
 		performed the following actions in order:
 		   (1) Decrement lock_word by X_LOCK_DECR.
 		   (2) Record counter value of wait_ex_event (os_event_reset,
-                       called from sync_array_reserve_cell).
+		       called from sync_array_reserve_cell).
 		   (3) Verify that lock_word < 0.
 		(1) must come first to ensures no other threads become reader
-                or next writer, and notifies unlocker that signal must be sent.
-                (2) must come before (3) to ensure the signal is not missed.
+		or next writer, and notifies unlocker that signal must be sent.
+		(2) must come before (3) to ensure the signal is not missed.
 		These restrictions force the above ordering.
 		Immediately before sending the wake-up signal, we should:
 		   Verify lock_word == 0 (waiting thread holds x_lock)
 */
 
-
-/** number of spin waits on rw-latches,
-resulted during shared (read) locks */
-UNIV_INTERN ib_int64_t	rw_s_spin_wait_count	= 0;
-/** number of spin loop rounds on rw-latches,
-resulted during shared (read) locks */
-UNIV_INTERN ib_int64_t	rw_s_spin_round_count	= 0;
-
-/** number of OS waits on rw-latches,
-resulted during shared (read) locks */
-UNIV_INTERN ib_int64_t	rw_s_os_wait_count	= 0;
-
-/** number of unlocks (that unlock shared locks),
-set only when UNIV_SYNC_PERF_STAT is defined */
-UNIV_INTERN ib_int64_t	rw_s_exit_count		= 0;
-
-/** number of spin waits on rw-latches,
-resulted during exclusive (write) locks */
-UNIV_INTERN ib_int64_t	rw_x_spin_wait_count	= 0;
-/** number of spin loop rounds on rw-latches,
-resulted during exclusive (write) locks */
-UNIV_INTERN ib_int64_t	rw_x_spin_round_count	= 0;
-
-/** number of OS waits on rw-latches,
-resulted during exclusive (write) locks */
-UNIV_INTERN ib_int64_t	rw_x_os_wait_count	= 0;
-
-/** number of unlocks (that unlock exclusive locks),
-set only when UNIV_SYNC_PERF_STAT is defined */
-UNIV_INTERN ib_int64_t	rw_x_exit_count		= 0;
+UNIV_INTERN rw_lock_stats_t	rw_lock_stats;
 
 /* The global list of rw-locks */
 UNIV_INTERN rw_lock_list_t	rw_lock_list;
-UNIV_INTERN mutex_t		rw_lock_list_mutex;
+UNIV_INTERN ib_mutex_t		rw_lock_list_mutex;
 
 #ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	rw_lock_list_mutex_key;
@@ -179,7 +150,7 @@ UNIV_INTERN mysql_pfs_key_t	rw_lock_mutex_key;
 To modify the debug info list of an rw-lock, this mutex has to be
 acquired in addition to the mutex protecting the lock. */
 
-UNIV_INTERN mutex_t		rw_lock_debug_mutex;
+UNIV_INTERN ib_mutex_t		rw_lock_debug_mutex;
 
 # ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	rw_lock_debug_mutex_key;
@@ -258,7 +229,7 @@ rw_lock_create_func(
 	ut_d(lock->mutex.cline = cline);
 
 	lock->mutex.cmutex_name = cmutex_name;
-	ut_d(lock->mutex.mutex_type = 1);
+	ut_d(lock->mutex.ib_mutex_type = 1);
 #else /* INNODB_RW_LOCKS_USE_ATOMICS */
 # ifdef UNIV_DEBUG
 	UT_NOT_USED(cfile_name);
@@ -292,8 +263,8 @@ rw_lock_create_func(
 	lock->last_x_file_name = "not yet reserved";
 	lock->last_s_line = 0;
 	lock->last_x_line = 0;
-	lock->event = os_event_create(NULL);
-	lock->wait_ex_event = os_event_create(NULL);
+	lock->event = os_event_create();
+	lock->wait_ex_event = os_event_create();
 
 	mutex_enter(&rw_lock_list_mutex);
 
@@ -306,6 +277,41 @@ rw_lock_create_func(
 }
 
 /******************************************************************//**
+Creates, or rather, initializes a priority rw-lock object in a specified memory
+location (which must be appropriately aligned). The base rw-lock is set up by
+the regular rw_lock_create_func; the high-priority waiter flags start at zero.
+Explicit freeing is needed only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+	prio_rw_lock_t*	lock,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline,		/*!< in: file line where created */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name)	/*!< in: mutex name */
+{
+	rw_lock_create_func(&lock->base_lock,
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+			    level,
+# endif /* UNIV_SYNC_DEBUG */
+			    cfile_name,
+			    cline,
+#endif /* UNIV_DEBUG */
+			    cmutex_name);
+	lock->high_priority_s_waiters = 0;
+	lock->high_priority_s_event = os_event_create();
+	lock->high_priority_x_waiters = 0;
+	lock->high_priority_x_event = os_event_create();
+	lock->high_priority_wait_ex_waiter = 0;
+}
+
+/******************************************************************//**
 Calling this function is obligatory only if the memory buffer containing
 the rw-lock is freed. Removes an rw-lock object from the global list. The
 rw-lock is checked to be in the non-locked state. */
@@ -315,14 +321,19 @@ rw_lock_free_func(
 /*==============*/
 	rw_lock_t*	lock)	/*!< in: rw-lock */
 {
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	ib_mutex_t*	mutex;
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+
 	ut_ad(rw_lock_validate(lock));
 	ut_a(lock->lock_word == X_LOCK_DECR);
 
+	mutex_enter(&rw_lock_list_mutex);
+
 #ifndef INNODB_RW_LOCKS_USE_ATOMICS
-	mutex_free(rw_lock_get_mutex(lock));
-#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+	mutex = rw_lock_get_mutex(lock);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
 
-	mutex_enter(&rw_lock_list_mutex);
 	os_event_free(lock->event);
 
 	os_event_free(lock->wait_ex_event);
@@ -337,6 +348,27 @@ rw_lock_free_func(
 	mutex_exit(&rw_lock_list_mutex);
 
 	ut_d(lock->magic_n = 0);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	/* We have merely removed the rw_lock from the list, the memory
+	has not been freed. Therefore the pointer to mutex is valid. */
+	mutex_free(mutex);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the priority rw-lock is freed. Removes an rw-lock object from the global list.
+The rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free_func(
+/*==============*/
+	prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	os_event_free(lock->high_priority_s_event);
+	os_event_free(lock->high_priority_x_event);
+	rw_lock_free_func(&lock->base_lock);
 }
 
 #ifdef UNIV_DEBUG
@@ -353,94 +385,146 @@ rw_lock_validate(
 	ulint	waiters;
 	lint	lock_word;
 
-	ut_a(lock);
+	ut_ad(lock);
 
 	waiters = rw_lock_get_waiters(lock);
 	lock_word = lock->lock_word;
 
 	ut_ad(lock->magic_n == RW_LOCK_MAGIC_N);
-	ut_a(waiters == 0 || waiters == 1);
-	ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0);
+	ut_ad(waiters == 0 || waiters == 1);
+	ut_ad(lock_word > -(2 * X_LOCK_DECR));
+	ut_ad(lock_word <= X_LOCK_DECR);
 
 	return(TRUE);
 }
+
+/******************************************************************//**
+Checks that the priority rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return	TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+	prio_rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	ut_ad(lock->high_priority_s_waiters < 2);
+	ut_ad(lock->high_priority_x_waiters < 2);
+	return(rw_lock_validate(&lock->base_lock));
+}
+
 #endif /* UNIV_DEBUG */
 
 /******************************************************************//**
-Lock an rw-lock in shared mode for the current thread. If the rw-lock is
-locked in exclusive mode, or there is an exclusive lock request waiting,
-the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
-for the lock, before suspending the thread. */
+Lock a regular or priority rw-lock in shared mode for the current thread. If
+the rw-lock is locked in exclusive mode, or there is an exclusive lock request
+waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock, before suspending the thread. */
 UNIV_INTERN
 void
 rw_lock_s_lock_spin(
 /*================*/
-	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	void*		_lock,	/*!< in: pointer to rw-lock */
 	ulint		pass,	/*!< in: pass value; != 0, if the lock
 				will be passed to another thread to unlock */
+	bool		priority_lock,
+				/*!< in: whether the lock is a priority lock */
+	bool		high_priority,
+				/*!< in: whether we are acquiring a priority
+				lock with high priority */
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	ulint	 index;	/* index of the reserved wait cell */
-	ulint	 i = 0;	/* spin round count */
+	ulint		index;	/* index of the reserved wait cell */
+	ulint		i = 0;	/* spin round count */
+	sync_array_t*	sync_arr;
+	size_t		counter_index;
+	rw_lock_t*	lock = (rw_lock_t *) _lock;
+
+	/* We reuse the thread id to index into the counter, cache
+	it here for efficiency. */
+
+	counter_index = (size_t) os_thread_get_curr_id();
 
 	ut_ad(rw_lock_validate(lock));
 
-	rw_s_spin_wait_count++;	/*!< Count calls to this function */
+	rw_lock_stats.rw_s_spin_wait_count.add(counter_index, 1);
 lock_loop:
 
-	/* Spin waiting for the writer field to become free */
-	while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
-		if (srv_spin_wait_delay) {
-			ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+	if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority,
+					       lock)) {
+
+		/* Spin waiting for the writer field to become free */
+		while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
+			if (srv_spin_wait_delay) {
+				ut_delay(ut_rnd_interval(0,
+							 srv_spin_wait_delay));
+			}
+
+			i++;
 		}
 
-		i++;
-	}
+		if (i == SYNC_SPIN_ROUNDS) {
+			os_thread_yield();
+		}
 
-	if (i == SYNC_SPIN_ROUNDS) {
-		os_thread_yield();
-	}
+		if (srv_print_latch_waits) {
+			fprintf(stderr,
+				"Thread " ULINTPF " spin wait rw-s-lock at %p"
+				" '%s' rnds " ULINTPF "\n",
+				os_thread_pf(os_thread_get_curr_id()),
+				(void*) lock, lock->lock_name, i);
+		}
+	} else {
 
-	if (srv_print_latch_waits) {
-		fprintf(stderr,
-			"Thread %lu spin wait rw-s-lock at %p"
-			" '%s' rnds %lu\n",
-			(ulong) os_thread_pf(os_thread_get_curr_id()),
-			(void*) lock,
-			lock->lock_name, (ulong) i);
+		/* In case of higher priority waiters already present, perform
+		only this part of the spinning code path.  */
+		os_thread_yield();
 	}
 
 	/* We try once again to obtain the lock */
-	if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
-		rw_s_spin_round_count += i;
+	if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority,
+					       lock)
+	    && (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line))) {
+		rw_lock_stats.rw_s_spin_round_count.add(counter_index, i);
 
 		return; /* Success */
 	} else {
 
-		if (i < SYNC_SPIN_ROUNDS) {
+		if (i > 0 && i < SYNC_SPIN_ROUNDS) {
 			goto lock_loop;
 		}
 
-		rw_s_spin_round_count += i;
+		rw_lock_stats.rw_s_spin_round_count.add(counter_index, i);
+
+		sync_arr = sync_array_get();
 
-		sync_array_reserve_cell(sync_primary_wait_array,
-					lock, RW_LOCK_SHARED,
-					file_name, line,
-					&index);
+		sync_array_reserve_cell(
+			sync_arr, lock,
+			high_priority ? PRIO_RW_LOCK_SHARED : RW_LOCK_SHARED,
+			file_name, line, &index);
 
 		/* Set waiters before checking lock_word to ensure wake-up
-                signal is sent. This may lead to some unnecessary signals. */
-		rw_lock_set_waiter_flag(lock);
+		signal is sent. This may lead to some unnecessary signals. */
+		if (high_priority) {
+			prio_rw_lock_t*	prio_rw_lock
+				= (prio_rw_lock_t *) _lock;
+			prio_rw_lock->high_priority_s_waiters = 1;
+		} else {
+			rw_lock_set_waiter_flag(lock);
+		}
 
-		if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
-			sync_array_free_cell(sync_primary_wait_array, index);
+		if (!rw_lock_higher_prio_waiters_exist(priority_lock,
+						       high_priority, lock)
+		    && (TRUE == rw_lock_s_lock_low(lock, pass,
+						   file_name, line))) {
+			sync_array_free_cell(sync_arr, index);
 			return; /* Success */
 		}
 
 		if (srv_print_latch_waits) {
 			fprintf(stderr,
-				"Thread %lu OS wait rw-s-lock at %p"
+				"Thread " ULINTPF " OS wait rw-s-lock at %p"
 				" '%s'\n",
 				os_thread_pf(os_thread_get_curr_id()),
 				(void*) lock, lock->lock_name);
@@ -448,9 +532,9 @@ lock_loop:
 
 		/* these stats may not be accurate */
 		lock->count_os_wait++;
-		rw_s_os_wait_count++;
+		rw_lock_stats.rw_s_os_wait_count.add(counter_index, 1);
 
-		sync_array_wait_event(sync_primary_wait_array, index);
+		sync_array_wait_event(sync_arr, index);
 
 		i = 0;
 		goto lock_loop;
@@ -485,6 +569,10 @@ void
 rw_lock_x_lock_wait(
 /*================*/
 	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	bool		high_priority,
+				/*!< in: if true, the rw lock is a priority
+				lock and is being acquired with high
+				priority  */
 #ifdef UNIV_SYNC_DEBUG
 	ulint		pass,	/*!< in: pass value; != 0, if the lock will
 				be passed to another thread to unlock */
@@ -492,8 +580,15 @@ rw_lock_x_lock_wait(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	ulint index;
-	ulint i = 0;
+	ulint		index;
+	ulint		i = 0;
+	sync_array_t*	sync_arr;
+	size_t		counter_index;
+
+	/* We reuse the thread id to index into the counter, cache
+	it here for efficiency. */
+
+	counter_index = (size_t) os_thread_get_curr_id();
 
 	ut_ad(lock->lock_word <= 0);
 
@@ -507,59 +602,69 @@ rw_lock_x_lock_wait(
 		}
 
 		/* If there is still a reader, then go to sleep.*/
-		rw_x_spin_round_count += i;
+		rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+
+		sync_arr = sync_array_get();
+
+		sync_array_reserve_cell(
+			sync_arr, lock, RW_LOCK_WAIT_EX,
+			file_name, line, &index);
+
+		if (high_priority) {
+
+			prio_rw_lock_t*	prio_rw_lock
+				= reinterpret_cast<prio_rw_lock_t *>(lock);
+			prio_rw_lock->high_priority_wait_ex_waiter = 1;
+		}
+
 		i = 0;
-		sync_array_reserve_cell(sync_primary_wait_array,
-					lock,
-					RW_LOCK_WAIT_EX,
-					file_name, line,
-					&index);
+
 		/* Check lock_word to ensure wake-up isn't missed.*/
-		if(lock->lock_word < 0) {
+		if (lock->lock_word < 0) {
 
 			/* these stats may not be accurate */
 			lock->count_os_wait++;
-			rw_x_os_wait_count++;
+			rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1);
 
-                        /* Add debug info as it is needed to detect possible
-                        deadlock. We must add info for WAIT_EX thread for
-                        deadlock detection to work properly. */
+			/* Add debug info as it is needed to detect possible
+			deadlock. We must add info for WAIT_EX thread for
+			deadlock detection to work properly. */
 #ifdef UNIV_SYNC_DEBUG
 			rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
 					       file_name, line);
 #endif
 
-			sync_array_wait_event(sync_primary_wait_array,
-					      index);
+			sync_array_wait_event(sync_arr, index);
 #ifdef UNIV_SYNC_DEBUG
-			rw_lock_remove_debug_info(lock, pass,
-					       RW_LOCK_WAIT_EX);
+			rw_lock_remove_debug_info(
+				lock, pass, RW_LOCK_WAIT_EX);
 #endif
-                        /* It is possible to wake when lock_word < 0.
-                        We must pass the while-loop check to proceed.*/
+			/* It is possible to wake when lock_word < 0.
+			We must pass the while-loop check to proceed.*/
 		} else {
-			sync_array_free_cell(sync_primary_wait_array,
-					     index);
+			sync_array_free_cell(sync_arr, index);
 		}
 	}
-	rw_x_spin_round_count += i;
+	rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
 }
 
 /******************************************************************//**
 Low-level function for acquiring an exclusive lock.
-@return	RW_LOCK_NOT_LOCKED if did not succeed, RW_LOCK_EX if success. */
+@return	FALSE if did not succeed, TRUE if success. */
 UNIV_INLINE
 ibool
 rw_lock_x_lock_low(
 /*===============*/
 	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	bool		high_priority,
+				/*!< in: if true, the rw lock is a priority
+				lock and is being acquired with high
+				priority  */
 	ulint		pass,	/*!< in: pass value; != 0, if the lock will
 				be passed to another thread to unlock */
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	os_thread_id_t	curr_thread	= os_thread_get_curr_id();
-
 	if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) {
 
 		/* lock->recursive also tells us if the writer_thread
@@ -569,29 +674,35 @@ rw_lock_x_lock_low(
 		ut_a(!lock->recursive);
 
 		/* Decrement occurred: we are writer or next-writer. */
-		rw_lock_set_writer_id_and_recursion_flag(lock,
-						pass ? FALSE : TRUE);
+		rw_lock_set_writer_id_and_recursion_flag(
+			lock, pass ? FALSE : TRUE);
 
-		rw_lock_x_lock_wait(lock,
+		rw_lock_x_lock_wait(lock, high_priority,
 #ifdef UNIV_SYNC_DEBUG
 				    pass,
 #endif
-                                    file_name, line);
+				    file_name, line);
 
 	} else {
+		os_thread_id_t	thread_id = os_thread_get_curr_id();
+
 		/* Decrement failed: relock or failed lock */
 		if (!pass && lock->recursive
-		    && os_thread_eq(lock->writer_thread, curr_thread)) {
+		    && os_thread_eq(lock->writer_thread, thread_id)) {
 			/* Relock */
-                        lock->lock_word -= X_LOCK_DECR;
+			if (lock->lock_word == 0) {
+				lock->lock_word -= X_LOCK_DECR;
+			} else {
+				--lock->lock_word;
+			}
+
 		} else {
 			/* Another thread locked before us */
 			return(FALSE);
 		}
 	}
 #ifdef UNIV_SYNC_DEBUG
-	rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
-			       file_name, line);
+	rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, line);
 #endif
 	lock->last_x_file_name = file_name;
 	lock->last_x_line = (unsigned int) line;
@@ -616,11 +727,23 @@ rw_lock_x_lock_func(
 	ulint		pass,	/*!< in: pass value; != 0, if the lock will
 				be passed to another thread to unlock */
 	const char*	file_name,/*!< in: file name where lock requested */
-	ulint		line)	/*!< in: line where requested */
+	ulint		line,	/*!< in: line where requested */
+	bool		priority_lock,
+				/*!< in: whether the lock is a priority lock */
+	bool		high_priority)
+				/*!< in: whether we are acquiring a priority
+				lock with high priority */
 {
-	ulint	index;	/*!< index of the reserved wait cell */
-	ulint	i;	/*!< spin round count */
-	ibool	spinning = FALSE;
+	ulint		i;	/*!< spin round count */
+	ulint		index;	/*!< index of the reserved wait cell */
+	sync_array_t*	sync_arr;
+	ibool		spinning = FALSE;
+	size_t		counter_index;
+
+	/* We reuse the thread id to index into the counter, cache
+	it here for efficiency. */
+
+	counter_index = (size_t) os_thread_get_curr_id();
 
 	ut_ad(rw_lock_validate(lock));
 #ifdef UNIV_SYNC_DEBUG
@@ -629,18 +752,26 @@ rw_lock_x_lock_func(
 
 	i = 0;
 
+	ut_ad(priority_lock || !high_priority);
+
 lock_loop:
 
-	if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
-		rw_x_spin_round_count += i;
+	if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority,
+					       lock)
+	    && rw_lock_x_lock_low(lock, high_priority, pass,
+				  file_name, line)) {
+		rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
 
 		return;	/* Locking succeeded */
 
-	} else {
+	} else if (!rw_lock_higher_prio_waiters_exist(priority_lock,
+						      high_priority, lock)) {
 
-                if (!spinning) {
-                        spinning = TRUE;
-                        rw_x_spin_wait_count++;
+		if (!spinning) {
+			spinning = TRUE;
+
+			rw_lock_stats.rw_x_spin_wait_count.add(
+				counter_index, 1);
 		}
 
 		/* Spin waiting for the lock_word to become free */
@@ -658,36 +789,51 @@ lock_loop:
 		} else {
 			goto lock_loop;
 		}
+	} else {
+
+		/* In case we skipped spinning because of higher-priority
+		waiters already waiting, perform only this bit of the spinning
+		code path.  */
+		os_thread_yield();
 	}
 
-	rw_x_spin_round_count += i;
+	if (spinning) {
 
-	if (srv_print_latch_waits) {
-		fprintf(stderr,
-			"Thread %lu spin wait rw-x-lock at %p"
-			" '%s' rnds %lu\n",
-			os_thread_pf(os_thread_get_curr_id()), (void*) lock,
-			lock->lock_name, (ulong) i);
+		rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+
+		if (srv_print_latch_waits) {
+			fprintf(stderr,
+				"Thread " ULINTPF " spin wait rw-x-lock at %p"
+				" '%s' rnds " ULINTPF "\n",
+				os_thread_pf(os_thread_get_curr_id()),
+				(void*) lock,lock->lock_name, i);
+		}
 	}
 
-	sync_array_reserve_cell(sync_primary_wait_array,
-				lock,
-				RW_LOCK_EX,
-				file_name, line,
-				&index);
+	sync_arr = sync_array_get();
+
+	sync_array_reserve_cell(
+		sync_arr, lock,
+		high_priority ? PRIO_RW_LOCK_EX : RW_LOCK_EX,
+		file_name, line, &index);
 
 	/* Waiters must be set before checking lock_word, to ensure signal
 	is sent. This could lead to a few unnecessary wake-up signals. */
-	rw_lock_set_waiter_flag(lock);
+	if (high_priority) {
+		prio_rw_lock_t*	prio_lock = (prio_rw_lock_t *)lock;
+		prio_lock->high_priority_x_waiters = 1;
+	} else {
+		rw_lock_set_waiter_flag(lock);
+	}
 
-	if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
-		sync_array_free_cell(sync_primary_wait_array, index);
+	if (rw_lock_x_lock_low(lock, high_priority, pass, file_name, line)) {
+		sync_array_free_cell(sync_arr, index);
 		return; /* Locking succeeded */
 	}
 
 	if (srv_print_latch_waits) {
 		fprintf(stderr,
-			"Thread %lu OS wait for rw-x-lock at %p"
+			"Thread " ULINTPF " OS wait for rw-x-lock at %p"
 			" '%s'\n",
 			os_thread_pf(os_thread_get_curr_id()), (void*) lock,
 			lock->lock_name);
@@ -695,14 +841,37 @@ lock_loop:
 
 	/* these stats may not be accurate */
 	lock->count_os_wait++;
-	rw_x_os_wait_count++;
+	rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1);
 
-	sync_array_wait_event(sync_primary_wait_array, index);
+	sync_array_wait_event(sync_arr, index);
 
 	i = 0;
 	goto lock_loop;
 }
 
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock a priority
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				  be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	rw_lock_x_lock_func(&lock->base_lock, pass, file_name, line, true,
+			    srv_current_thread_priority > 0);
+}
+
 #ifdef UNIV_SYNC_DEBUG
 /******************************************************************//**
 Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
@@ -871,6 +1040,21 @@ rw_lock_own(
 
 	return(FALSE);
 }
+
+/******************************************************************//**
+Checks if the thread has locked the priority rw-lock in the specified mode,
+with the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+	prio_rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+{
+	return(rw_lock_own(&lock->base_lock, lock_type));
+}
+
 #endif /* UNIV_SYNC_DEBUG */
 
 /******************************************************************//**
diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.cc
index 25f96d9817a..e4c34faf13d 100644
--- a/storage/xtradb/sync/sync0sync.c
+++ b/storage/xtradb/sync/sync0sync.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0sync.c
+@file sync/sync0sync.cc
 Mutex, the basic synchronization primitive
 
 Created 9/5/1995 Heikki Tuuri
@@ -38,6 +38,7 @@ Created 9/5/1995 Heikki Tuuri
 #include "sync0rw.h"
 #include "buf0buf.h"
 #include "srv0srv.h"
+#include "btr0types.h"
 #include "buf0types.h"
 #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
 #ifdef UNIV_SYNC_DEBUG
@@ -171,29 +172,25 @@ Q.E.D. */
 
 /** The number of iterations in the mutex_spin_wait() spin loop.
 Intended for performance monitoring. */
-UNIV_INTERN ib_int64_t	mutex_spin_round_count		= 0;
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_spin_round_count;
 /** The number of mutex_spin_wait() calls.  Intended for
 performance monitoring. */
-UNIV_INTERN ib_int64_t	mutex_spin_wait_count		= 0;
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_spin_wait_count;
 /** The number of OS waits in mutex_spin_wait().  Intended for
 performance monitoring. */
-UNIV_INTERN ib_int64_t	mutex_os_wait_count		= 0;
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS>	mutex_os_wait_count;
 /** The number of mutex_exit() calls. Intended for performance
 monitoring. */
-UNIV_INTERN ib_int64_t	mutex_exit_count		= 0;
-
-/** The global array of wait cells for implementation of the database's own
-mutexes and read-write locks */
-UNIV_INTERN sync_array_t*	sync_primary_wait_array;
+UNIV_INTERN ib_int64_t			mutex_exit_count;
 
 /** This variable is set to TRUE when sync_init is called */
 UNIV_INTERN ibool	sync_initialized	= FALSE;
 
 #ifdef UNIV_SYNC_DEBUG
 /** An acquired mutex or rw-lock and its level in the latching order */
-typedef struct sync_level_struct	sync_level_t;
+struct sync_level_t;
 /** Mutexes or rw-locks held by a thread */
-typedef struct sync_thread_struct	sync_thread_t;
+struct sync_thread_t;
 
 /** The latch levels currently owned by threads are stored in this data
 structure; the size of this array is OS_THREAD_MAX_N */
@@ -201,7 +198,7 @@ structure; the size of this array is OS_THREAD_MAX_N */
 UNIV_INTERN sync_thread_t*	sync_thread_level_arrays;
 
 /** Mutex protecting sync_thread_level_arrays */
-UNIV_INTERN mutex_t		sync_thread_mutex;
+UNIV_INTERN ib_mutex_t		sync_thread_mutex;
 
 # ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	sync_thread_mutex_key;
@@ -212,7 +209,7 @@ UNIV_INTERN mysql_pfs_key_t	sync_thread_mutex_key;
 UNIV_INTERN ut_list_base_node_t  mutex_list;
 
 /** Mutex protecting the mutex_list variable */
-UNIV_INTERN mutex_t mutex_list_mutex;
+UNIV_INTERN ib_mutex_t mutex_list_mutex;
 
 #ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	mutex_list_mutex_key;
@@ -225,10 +222,8 @@ UNIV_INTERN ibool	sync_order_checks_on	= FALSE;
 /** Number of slots reserved for each OS thread in the sync level array */
 static const ulint SYNC_THREAD_N_LEVELS = 10000;
 
-typedef struct sync_arr_struct sync_arr_t;
-
 /** Array for tracking sync levels per thread. */
-struct sync_arr_struct {
+struct sync_arr_t {
 	ulint		in_use;		/*!< Number of active cells */
 	ulint		n_elems;	/*!< Number of elements in the array */
 	ulint		max_elems;	/*!< Maximum elements */
@@ -238,14 +233,14 @@ struct sync_arr_struct {
 };
 
 /** Mutexes or rw-locks held by a thread */
-struct sync_thread_struct{
+struct sync_thread_t{
 	os_thread_id_t	id;		/*!< OS thread id */
 	sync_arr_t*	levels;		/*!< level array for this thread; if
 					this is NULL this slot is unused */
 };
 
 /** An acquired mutex or rw-lock and its level in the latching order */
-struct sync_level_struct{
+struct sync_level_t{
 	void*		latch;		/*!< pointer to a mutex or an
 					rw-lock; NULL means that
 					the slot is empty */
@@ -268,7 +263,7 @@ UNIV_INTERN
 void
 mutex_create_func(
 /*==============*/
-	mutex_t*	mutex,		/*!< in: pointer to memory */
+	ib_mutex_t*	mutex,		/*!< in: pointer to memory */
 #ifdef UNIV_DEBUG
 # ifdef UNIV_SYNC_DEBUG
 	ulint		level,		/*!< in: level */
@@ -281,11 +276,11 @@ mutex_create_func(
 #if defined(HAVE_ATOMIC_BUILTINS)
 	mutex_reset_lock_word(mutex);
 #else
-	os_fast_mutex_init(&(mutex->os_fast_mutex));
+	os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mutex->os_fast_mutex);
 	mutex->lock_word = 0;
 #endif
-	mutex->event = os_event_create(NULL);
-	mutex->waiters = 0;
+	mutex->event = os_event_create();
+	mutex_set_waiters(mutex, 0);
 #ifdef UNIV_DEBUG
 	mutex->magic_n = MUTEX_MAGIC_N;
 #endif /* UNIV_DEBUG */
@@ -300,15 +295,6 @@ mutex_create_func(
 #endif /* UNIV_DEBUG */
 	mutex->count_os_wait = 0;
 	mutex->cmutex_name=	  cmutex_name;
-#ifdef UNIV_DEBUG
-	mutex->count_using=	  0;
-	mutex->mutex_type=	  0;
-	mutex->lspent_time=	  0;
-	mutex->lmax_spent_time=     0;
-	mutex->count_spin_loop= 0;
-	mutex->count_spin_rounds=   0;
-	mutex->count_os_yield=  0;
-#endif /* UNIV_DEBUG */
 
 	/* Check that lock_word is aligned; this is important on Intel */
 	ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0);
@@ -335,6 +321,40 @@ mutex_create_func(
 }
 
 /******************************************************************//**
+Creates, or rather, initializes a priority mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+	ib_prio_mutex_t*	mutex,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint			level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*		cfile_name,	/*!< in: file name where
+						created */
+	ulint			cline,		/*!< in: file line where
+						created */
+#endif /* UNIV_DEBUG */
+	const char*		cmutex_name)	/*!< in: mutex name */
+{
+	mutex_create_func(&mutex->base_mutex,
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+			  level,
+#endif /* UNIV_SYNC_DEBUG */
+			  cfile_name,
+			  cline,
+#endif /* UNIV_DEBUG */
+			  cmutex_name);
+	mutex->high_priority_waiters = 0;
+	mutex->high_priority_event = os_event_create();
+}
+
+/******************************************************************//**
 NOTE! Use the corresponding macro mutex_free(), not directly this function!
 Calling this function is obligatory only if the memory buffer containing
 the mutex is freed. Removes a mutex object from the mutex list. The mutex
@@ -343,7 +363,7 @@ UNIV_INTERN
 void
 mutex_free_func(
 /*============*/
-	mutex_t*	mutex)	/*!< in: mutex */
+	ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	ut_ad(mutex_validate(mutex));
 	ut_a(mutex_get_lock_word(mutex) == 0);
@@ -391,6 +411,23 @@ func_exit:
 #ifdef UNIV_DEBUG
 	mutex->magic_n = 0;
 #endif /* UNIV_DEBUG */
+	return;
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a priority mutex object from the mutex list. The
+mutex is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free_func(
+/*============*/
+	ib_prio_mutex_t*	mutex)	/*!< in: mutex */
+{
+	ut_a(mutex->high_priority_waiters == 0);
+	os_event_free(mutex->high_priority_event);
+	mutex_free_func(&mutex->base_mutex);
 }
 
 /********************************************************************//**
@@ -402,7 +439,7 @@ UNIV_INTERN
 ulint
 mutex_enter_nowait_func(
 /*====================*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	ib_mutex_t*	mutex,		/*!< in: pointer to mutex */
 	const char*	file_name __attribute__((unused)),
 					/*!< in: file name where mutex
 					requested */
@@ -411,7 +448,7 @@ mutex_enter_nowait_func(
 {
 	ut_ad(mutex_validate(mutex));
 
-	if (!mutex_test_and_set(mutex)) {
+	if (!ib_mutex_test_and_set(mutex)) {
 
 		ut_d(mutex->thread_id = os_thread_get_curr_id());
 #ifdef UNIV_SYNC_DEBUG
@@ -432,7 +469,7 @@ UNIV_INTERN
 ibool
 mutex_validate(
 /*===========*/
-	const mutex_t*	mutex)	/*!< in: mutex */
+	const ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	ut_a(mutex);
 	ut_a(mutex->magic_n == MUTEX_MAGIC_N);
@@ -448,13 +485,27 @@ UNIV_INTERN
 ibool
 mutex_own(
 /*======*/
-	const mutex_t*	mutex)	/*!< in: mutex */
+	const ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	ut_ad(mutex_validate(mutex));
 
 	return(mutex_get_lock_word(mutex) == 1
 	       && os_thread_eq(mutex->thread_id, os_thread_get_curr_id()));
 }
+
+/******************************************************************//**
+Checks that the current thread owns the priority mutex. Works only
+in the debug version.
+@return	TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+	const ib_prio_mutex_t*	mutex)	/*!< in: priority mutex */
+{
+	return mutex_own(&mutex->base_mutex);
+}
+
 #endif /* UNIV_DEBUG */
 
 /******************************************************************//**
@@ -463,18 +514,9 @@ UNIV_INTERN
 void
 mutex_set_waiters(
 /*==============*/
-	mutex_t*	mutex,	/*!< in: mutex */
+	ib_mutex_t*	mutex,	/*!< in: mutex */
 	ulint		n)	/*!< in: value to set */
 {
-#ifdef INNODB_RW_LOCKS_USE_ATOMICS
-	ut_ad(mutex);
-
-	if (n) {
-		os_compare_and_swap_ulint(&mutex->waiters, 0, 1);
-	} else {
-		os_compare_and_swap_ulint(&mutex->waiters, 1, 0);
-	}
-#else
 	volatile ulint*	ptr;		/* declared volatile to ensure that
 					the value is stored to memory */
 	ut_ad(mutex);
@@ -483,24 +525,34 @@ mutex_set_waiters(
 
 	*ptr = n;		/* Here we assume that the write of a single
 				word in memory is atomic */
-#endif
 }
 
 /******************************************************************//**
-Reserves a mutex for the current thread. If the mutex is reserved, the
-function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
-for the mutex before suspending the thread. */
+Reserves a mutex or a priority mutex for the current thread. If the mutex is
+reserved, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the mutex before suspending the thread. */
 UNIV_INTERN
 void
 mutex_spin_wait(
 /*============*/
-	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	void*		_mutex,		/*!< in: pointer to mutex */
+	bool		high_priority,	/*!< in: whether the mutex is a
+					priority mutex with high priority
+					specified */
 	const char*	file_name,	/*!< in: file name where mutex
 					requested */
 	ulint		line)		/*!< in: line where requested */
 {
-	ulint	   index; /* index of the reserved wait cell */
-	ulint	   i;	  /* spin round count */
+	ulint		i;		/* spin round count */
+	ulint		index;		/* index of the reserved wait cell */
+	sync_array_t*	sync_arr;
+	size_t		counter_index;
+	/* The typecast below is performed for some of the priority mutexes
+	too, when !high_priority.  This exploits the fact that the regular
+	mutex is a prefix of the priority mutex in memory.  */
+	ib_mutex_t*	mutex = (ib_mutex_t *) _mutex;
+
+	counter_index = (size_t) os_thread_get_curr_id();
 
 	ut_ad(mutex);
 
@@ -508,7 +560,7 @@ mutex_spin_wait(
 	isn't exact. Moved out of ifdef that follows because we are willing
 	to sacrifice the cost of counting this as the data is valuable.
 	Count the number of calls to mutex_spin_wait. */
-	mutex_spin_wait_count++;
+	mutex_spin_wait_count.add(counter_index, 1);
 
 mutex_loop:
 
@@ -521,7 +573,6 @@ mutex_loop:
 	a memory word. */
 
 spin_loop:
-	ut_d(mutex->count_spin_loop++);
 
 	while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) {
 		if (srv_spin_wait_delay) {
@@ -532,39 +583,25 @@ spin_loop:
 	}
 
 	if (i == SYNC_SPIN_ROUNDS) {
-#ifdef UNIV_DEBUG
-		mutex->count_os_yield++;
-#endif /* UNIV_DEBUG */
 		os_thread_yield();
 	}
 
-#ifdef UNIV_SRV_PRINT_LATCH_WAITS
-	fprintf(stderr,
-		"Thread %lu spin wait mutex at %p"
-		" '%s' rnds %lu\n",
-		(ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
-		mutex->cmutex_name, (ulong) i);
-#endif
-
-	mutex_spin_round_count += i;
-
-	ut_d(mutex->count_spin_rounds += i);
+	mutex_spin_round_count.add(counter_index, i);
 
-	if (mutex_test_and_set(mutex) == 0) {
+	if (ib_mutex_test_and_set(mutex) == 0) {
 		/* Succeeded! */
 
 		ut_d(mutex->thread_id = os_thread_get_curr_id());
 #ifdef UNIV_SYNC_DEBUG
 		mutex_set_debug_info(mutex, file_name, line);
 #endif
-
-		goto finish_timing;
+		return;
 	}
 
 	/* We may end up with a situation where lock_word is 0 but the OS
 	fast mutex is still reserved. On FreeBSD the OS does not seem to
 	schedule a thread which is constantly calling pthread_mutex_trylock
-	(in mutex_test_and_set implementation). Then we could end up
+	(in ib_mutex_test_and_set implementation). Then we could end up
 	spinning here indefinitely. The following 'i++' stops this infinite
 	spin. */
 
@@ -574,8 +611,11 @@ spin_loop:
 		goto spin_loop;
 	}
 
-	sync_array_reserve_cell(sync_primary_wait_array, mutex,
-				SYNC_MUTEX, file_name, line, &index);
+	sync_arr = sync_array_get();
+
+	sync_array_reserve_cell(
+		sync_arr, mutex, high_priority ? SYNC_PRIO_MUTEX : SYNC_MUTEX,
+		file_name, line, &index);
 
 	/* The memory order of the array reservation and the change in the
 	waiters field is important: when we suspend a thread, we first
@@ -583,28 +623,25 @@ spin_loop:
 	released in mutex_exit, the waiters field is first set to zero and
 	then the event is set to the signaled state. */
 
-	mutex_set_waiters(mutex, 1);
+	if (high_priority) {
+		((ib_prio_mutex_t *)_mutex)->high_priority_waiters = 1;
+	} else {
+		mutex_set_waiters(mutex, 1);
+	}
 
 	/* Try to reserve still a few times */
 	for (i = 0; i < 4; i++) {
-		if (mutex_test_and_set(mutex) == 0) {
+		if (ib_mutex_test_and_set(mutex) == 0) {
 			/* Succeeded! Free the reserved wait cell */
 
-			sync_array_free_cell(sync_primary_wait_array, index);
+			sync_array_free_cell(sync_arr, index);
 
 			ut_d(mutex->thread_id = os_thread_get_curr_id());
 #ifdef UNIV_SYNC_DEBUG
 			mutex_set_debug_info(mutex, file_name, line);
 #endif
 
-#ifdef UNIV_SRV_PRINT_LATCH_WAITS
-			fprintf(stderr, "Thread %lu spin wait succeeds at 2:"
-				" mutex at %p\n",
-				(ulong) os_thread_pf(os_thread_get_curr_id()),
-				(void*) mutex);
-#endif
-
-			goto finish_timing;
+			return;
 
 			/* Note that in this case we leave the waiters field
 			set to 1. We cannot reset it to zero, as we do not
@@ -616,24 +653,12 @@ spin_loop:
 	after the change in the wait array and the waiters field was made.
 	Now there is no risk of infinite wait on the event. */
 
-#ifdef UNIV_SRV_PRINT_LATCH_WAITS
-	fprintf(stderr,
-		"Thread %lu OS wait mutex at %p '%s' rnds %lu\n",
-		(ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
-		mutex->cmutex_name, (ulong) i);
-#endif
-
-	mutex_os_wait_count++;
+	mutex_os_wait_count.add(counter_index, 1);
 
 	mutex->count_os_wait++;
 
-
-	sync_array_wait_event(sync_primary_wait_array, index);
+	sync_array_wait_event(sync_arr, index);
 	goto mutex_loop;
-
-finish_timing:
-
-	return;
 }
 
 /******************************************************************//**
@@ -642,14 +667,14 @@ UNIV_INTERN
 void
 mutex_signal_object(
 /*================*/
-	mutex_t*	mutex)	/*!< in: mutex */
+	ib_mutex_t*	mutex)	/*!< in: mutex */
 {
 	mutex_set_waiters(mutex, 0);
 
 	/* The memory order of resetting the waiters field and
 	signaling the object is important. See LEMMA 1 above. */
 	os_event_set(mutex->event);
-	sync_array_object_signalled(sync_primary_wait_array);
+	sync_array_object_signalled();
 }
 
 #ifdef UNIV_SYNC_DEBUG
@@ -659,7 +684,7 @@ UNIV_INTERN
 void
 mutex_set_debug_info(
 /*=================*/
-	mutex_t*	mutex,		/*!< in: mutex */
+	ib_mutex_t*	mutex,		/*!< in: mutex */
 	const char*	file_name,	/*!< in: file where requested */
 	ulint		line)		/*!< in: line where requested */
 {
@@ -678,7 +703,7 @@ UNIV_INTERN
 void
 mutex_get_debug_info(
 /*=================*/
-	mutex_t*	mutex,		/*!< in: mutex */
+	ib_mutex_t*	mutex,		/*!< in: mutex */
 	const char**	file_name,	/*!< out: file where requested */
 	ulint*		line,		/*!< out: line where requested */
 	os_thread_id_t* thread_id)	/*!< out: id of the thread which owns
@@ -699,7 +724,7 @@ mutex_list_print_info(
 /*==================*/
 	FILE*	file)		/*!< in: file where to print */
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 	const char*	file_name;
 	ulint		line;
 	os_thread_id_t	thread_id;
@@ -742,7 +767,7 @@ ulint
 mutex_n_reserved(void)
 /*==================*/
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 	ulint		count	= 0;
 
 	mutex_enter(&mutex_list_mutex);
@@ -841,9 +866,9 @@ sync_print_warning(
 	const sync_level_t*	slot)	/*!< in: slot for which to
 					print warning */
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 
-	mutex = slot->latch;
+	mutex = static_cast<ib_mutex_t*>(slot->latch);
 
 	if (mutex->magic_n == MUTEX_MAGIC_N) {
 		fprintf(stderr,
@@ -867,7 +892,9 @@ sync_print_warning(
 			fputs("Not locked\n", stderr);
 		}
 	} else {
-		rw_lock_t*	lock = slot->latch;
+		rw_lock_t*	lock;
+
+		lock = static_cast<rw_lock_t*>(slot->latch);
 
 		rw_lock_print(lock);
 	}
@@ -1027,7 +1054,8 @@ sync_thread_levels_nonempty_gen(
 		if (slot->latch != NULL
 		    && (!dict_mutex_allowed
 			|| (slot->level != SYNC_DICT
-			    && slot->level != SYNC_DICT_OPERATION))) {
+			    && slot->level != SYNC_DICT_OPERATION
+			    && slot->level != SYNC_FTS_CACHE))) {
 
 			mutex_exit(&sync_thread_mutex);
 			ut_error;
@@ -1122,10 +1150,10 @@ sync_thread_add_level(
 		return;
 	}
 
-	if ((latch == (void*)&sync_thread_mutex)
-	    || (latch == (void*)&mutex_list_mutex)
-	    || (latch == (void*)&rw_lock_debug_mutex)
-	    || (latch == (void*)&rw_lock_list_mutex)) {
+	if ((latch == (void*) &sync_thread_mutex)
+	    || (latch == (void*) &mutex_list_mutex)
+	    || (latch == (void*) &rw_lock_debug_mutex)
+	    || (latch == (void*) &rw_lock_list_mutex)) {
 
 		return;
 	}
@@ -1146,7 +1174,7 @@ sync_thread_add_level(
 		   + (sizeof(*array->elems) * SYNC_THREAD_N_LEVELS);
 
 		/* We have to allocate the level array for a new thread */
-		array = calloc(sz, sizeof(char));
+		array = static_cast<sync_arr_t*>(calloc(sz, sizeof(char)));
 		ut_a(array != NULL);
 
 		array->next_free = ULINT_UNDEFINED;
@@ -1186,15 +1214,21 @@ sync_thread_add_level(
 	case SYNC_MEM_POOL:
 	case SYNC_MEM_HASH:
 	case SYNC_RECV:
+	case SYNC_FTS_BG_THREADS:
 	case SYNC_WORK_QUEUE:
+	case SYNC_FTS_OPTIMIZE:
+	case SYNC_FTS_CACHE:
+	case SYNC_FTS_CACHE_INIT:
+	case SYNC_LOG_ONLINE:
 	case SYNC_LOG:
 	case SYNC_LOG_FLUSH_ORDER:
 	case SYNC_ANY_LATCH:
-	case SYNC_OUTER_ANY_LATCH:
 	case SYNC_FILE_FORMAT_TAG:
 	case SYNC_DOUBLEWRITE:
-	case SYNC_TRX_LOCK_HEAP:
-	case SYNC_KERNEL:
+	case SYNC_THREADS:
+	case SYNC_LOCK_SYS:
+	case SYNC_LOCK_WAIT_SYS:
+	case SYNC_TRX_SYS:
 	case SYNC_IBUF_BITMAP_MUTEX:
 	case SYNC_RSEG:
 	case SYNC_TRX_UNDO:
@@ -1206,6 +1240,8 @@ sync_thread_add_level(
 	case SYNC_TRX_I_S_RWLOCK:
 	case SYNC_TRX_I_S_LAST_READ:
 	case SYNC_IBUF_MUTEX:
+	case SYNC_INDEX_ONLINE_LOG:
+	case SYNC_STATS_AUTO_RECALC:
 		if (!sync_thread_levels_g(array, level, TRUE)) {
 			fprintf(stderr,
 				"InnoDB: sync_thread_levels_g(array, %lu)"
@@ -1213,14 +1249,38 @@ sync_thread_add_level(
 			ut_error;
 		}
 		break;
-	case SYNC_SEARCH_SYS:
-	case SYNC_BUF_LRU_LIST:
+	case SYNC_TRX:
+		/* Either the thread must own the lock_sys->mutex, or
+		it is allowed to own only ONE trx->mutex. */
+		if (!sync_thread_levels_g(array, level, FALSE)) {
+			ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+			ut_a(sync_thread_levels_contain(array, SYNC_LOCK_SYS));
+		}
+		break;
+	case SYNC_SEARCH_SYS: {
+		/* Verify the lock order inside the split btr_search_latch
+		array */
+		bool found_current = false;
+		for (ulint i = 0; i < btr_search_index_num; i++) {
+			if (&btr_search_latch_arr[i] == latch) {
+				found_current = true;
+			} else if (found_current) {
+				ut_ad(!rw_lock_own(&btr_search_latch_arr[i],
+						   RW_LOCK_SHARED));
+				ut_ad(!rw_lock_own(&btr_search_latch_arr[i],
+						   RW_LOCK_EX));
+			}
+		}
+		ut_ad(found_current);
+
+		/* fallthrough */
+	}
 	case SYNC_BUF_FLUSH_LIST:
-	case SYNC_BUF_PAGE_HASH:
+	case SYNC_BUF_LRU_LIST:
 	case SYNC_BUF_FREE_LIST:
 	case SYNC_BUF_ZIP_FREE:
 	case SYNC_BUF_ZIP_HASH:
-	case SYNC_BUF_POOL:
+	case SYNC_BUF_FLUSH_STATE:
 		/* We can have multiple mutexes of this type therefore we
 		can only check whether the greater than condition holds. */
 		if (!sync_thread_levels_g(array, level-1, TRUE)) {
@@ -1231,18 +1291,19 @@ sync_thread_add_level(
 		}
 		break;
 
+
+	case SYNC_BUF_PAGE_HASH:
+		/* Multiple page_hash locks are only allowed during
+		buf_validate. */
+		/* Fall through */
+
 	case SYNC_BUF_BLOCK:
-		/* Either the thread must own the buffer pool mutex
-		(buf_pool->mutex), or it is allowed to latch only ONE
-		buffer block (block->mutex or buf_pool->zip_mutex). */
 		if (!sync_thread_levels_g(array, level, FALSE)) {
 			ut_a(sync_thread_levels_g(array, level - 1, TRUE));
-			/* the exact rule is not fixed yet, for now */
-			//ut_a(sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST));
 		}
 		break;
 	case SYNC_REC_LOCK:
-		if (sync_thread_levels_contain(array, SYNC_KERNEL)) {
+		if (sync_thread_levels_contain(array, SYNC_LOCK_SYS)) {
 			ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1,
 						  TRUE));
 		} else {
@@ -1289,8 +1350,7 @@ sync_thread_add_level(
 		ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
 		break;
 	case SYNC_RSEG_HEADER_NEW:
-		ut_a(sync_thread_levels_contain(array, SYNC_KERNEL)
-		     && sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+		ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE));
 		break;
 	case SYNC_TREE_NODE:
 		ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE)
@@ -1392,10 +1452,10 @@ sync_thread_reset_level(
 		return(FALSE);
 	}
 
-	if ((latch == (void*)&sync_thread_mutex)
-	    || (latch == (void*)&mutex_list_mutex)
-	    || (latch == (void*)&rw_lock_debug_mutex)
-	    || (latch == (void*)&rw_lock_list_mutex)) {
+	if ((latch == (void*) &sync_thread_mutex)
+	    || (latch == (void*) &mutex_list_mutex)
+	    || (latch == (void*) &rw_lock_debug_mutex)
+	    || (latch == (void*) &rw_lock_list_mutex)) {
 
 		return(FALSE);
 	}
@@ -1447,7 +1507,7 @@ sync_thread_reset_level(
 		return(TRUE);
 	}
 
-	if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) {
+	if (((ib_mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) {
 		rw_lock_t*	rw_lock;
 
 		rw_lock = (rw_lock_t*) latch;
@@ -1478,17 +1538,15 @@ sync_init(void)
 
 	sync_initialized = TRUE;
 
-	/* Create the primary system wait array which is protected by an OS
-	mutex */
+	sync_array_init(OS_THREAD_MAX_N);
 
-	sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N,
-						    SYNC_ARRAY_OS_MUTEX);
 #ifdef UNIV_SYNC_DEBUG
 	/* Create the thread latch level array where the latch levels
 	are stored for each OS thread */
 
-	sync_thread_level_arrays = calloc(
-		sizeof(sync_thread_t), OS_THREAD_MAX_N);
+	sync_thread_level_arrays = static_cast<sync_thread_t*>(
+		calloc(sizeof(sync_thread_t), OS_THREAD_MAX_N));
+
 	ut_a(sync_thread_level_arrays != NULL);
 
 #endif /* UNIV_SYNC_DEBUG */
@@ -1512,7 +1570,7 @@ sync_init(void)
 	mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex,
 		     SYNC_NO_ORDER_CHECK);
 
-	rw_lock_debug_event = os_event_create(NULL);
+	rw_lock_debug_event = os_event_create();
 	rw_lock_debug_waiters = FALSE;
 #endif /* UNIV_SYNC_DEBUG */
 }
@@ -1553,9 +1611,9 @@ void
 sync_close(void)
 /*===========*/
 {
-	mutex_t*	mutex;
+	ib_mutex_t*	mutex;
 
-	sync_array_free(sync_primary_wait_array);
+	sync_array_close();
 
 	for (mutex = UT_LIST_GET_FIRST(mutex_list);
 	     mutex != NULL;
@@ -1570,20 +1628,20 @@ sync_close(void)
 
 		mutex_free(mutex);
 
-	        mutex = UT_LIST_GET_FIRST(mutex_list);
+		mutex = UT_LIST_GET_FIRST(mutex_list);
 	}
 
 	mutex_free(&mutex_list_mutex);
 #ifdef UNIV_SYNC_DEBUG
 	mutex_free(&sync_thread_mutex);
 
-	/* Switch latching order checks on in sync0sync.c */
+	/* Switch latching order checks off in sync0sync.cc */
 	sync_order_checks_on = FALSE;
 
 	sync_thread_level_arrays_free();
 #endif /* UNIV_SYNC_DEBUG */
 
-	sync_initialized = FALSE;	
+	sync_initialized = FALSE;
 }
 
 /*******************************************************************//**
@@ -1594,34 +1652,34 @@ sync_print_wait_info(
 /*=================*/
 	FILE*	file)		/*!< in: file where to print */
 {
-#ifdef UNIV_SYNC_DEBUG
-	fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
-		mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
-#endif
-
 	fprintf(file,
-		"Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
-		"RW-shared spins %llu, rounds %llu, OS waits %llu\n"
-		"RW-excl spins %llu, rounds %llu, OS waits %llu\n",
-		mutex_spin_wait_count,
-		mutex_spin_round_count,
-		mutex_os_wait_count,
-		rw_s_spin_wait_count,
-		rw_s_spin_round_count,
-		rw_s_os_wait_count,
-		rw_x_spin_wait_count,
-		rw_x_spin_round_count,
-		rw_x_os_wait_count);
+		"Mutex spin waits "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n"
+		"RW-shared spins "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n"
+		"RW-excl spins "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n",
+		(ib_uint64_t) mutex_spin_wait_count,
+		(ib_uint64_t) mutex_spin_round_count,
+		(ib_uint64_t) mutex_os_wait_count,
+		(ib_uint64_t) rw_lock_stats.rw_s_spin_wait_count,
+		(ib_uint64_t) rw_lock_stats.rw_s_spin_round_count,
+		(ib_uint64_t) rw_lock_stats.rw_s_os_wait_count,
+		(ib_uint64_t) rw_lock_stats.rw_x_spin_wait_count,
+		(ib_uint64_t) rw_lock_stats.rw_x_spin_round_count,
+		(ib_uint64_t) rw_lock_stats.rw_x_os_wait_count);
 
 	fprintf(file,
 		"Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
 		"%.2f RW-excl\n",
 		(double) mutex_spin_round_count /
 		(mutex_spin_wait_count ? mutex_spin_wait_count : 1),
-		(double) rw_s_spin_round_count /
-		(rw_s_spin_wait_count ? rw_s_spin_wait_count : 1),
-		(double) rw_x_spin_round_count /
-		(rw_x_spin_wait_count ? rw_x_spin_wait_count : 1));
+		(double) rw_lock_stats.rw_s_spin_round_count /
+		(rw_lock_stats.rw_s_spin_wait_count
+		 ? rw_lock_stats.rw_s_spin_wait_count : 1),
+		(double) rw_lock_stats.rw_x_spin_round_count /
+		(rw_lock_stats.rw_x_spin_wait_count
+		 ? rw_lock_stats.rw_x_spin_wait_count : 1));
 }
 
 /*******************************************************************//**
@@ -1638,7 +1696,7 @@ sync_print(
 	rw_lock_list_print_info(file);
 #endif /* UNIV_SYNC_DEBUG */
 
-	sync_array_print_info(file, sync_primary_wait_array);
+	sync_array_print(file);
 
 	sync_print_wait_info(file);
 }
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.cc
index 8b3a83585cc..f5d4a6c862f 100644
--- a/storage/xtradb/trx/trx0i_s.c
+++ b/storage/xtradb/trx/trx0i_s.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0i_s.c
+@file trx/trx0i_s.cc
 INFORMATION SCHEMA innodb_trx, innodb_locks and
 innodb_lock_waits tables fetch code.
 
@@ -131,31 +131,31 @@ noop because it will be empty. */
 /** Memory for each table in the intermediate buffer is allocated in
 separate chunks. These chunks are considered to be concatenated to
 represent one flat array of rows. */
-typedef struct i_s_mem_chunk_struct {
+struct i_s_mem_chunk_t {
 	ulint	offset;		/*!< offset, in number of rows */
 	ulint	rows_allocd;	/*!< the size of this chunk, in number
 				of rows */
 	void*	base;		/*!< start of the chunk */
-} i_s_mem_chunk_t;
+};
 
 /** This represents one table's cache. */
-typedef struct i_s_table_cache_struct {
+struct i_s_table_cache_t {
 	ulint		rows_used;	/*!< number of used rows */
 	ulint		rows_allocd;	/*!< number of allocated rows */
 	ulint		row_size;	/*!< size of a single row */
 	i_s_mem_chunk_t	chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
 					memory chunks that stores the
 					rows */
-} i_s_table_cache_t;
+};
 
 /** This structure describes the intermediate buffer */
-struct trx_i_s_cache_struct {
+struct trx_i_s_cache_t {
 	rw_lock_t	rw_lock;	/*!< read-write lock protecting
 					the rest of this structure */
 	ullint		last_read;	/*!< last time the cache was read;
 					measured in microseconds since
 					epoch */
-	mutex_t		last_read_mutex;/*!< mutex protecting the
+	ib_mutex_t		last_read_mutex;/*!< mutex protecting the
 					last_read member - it is updated
 					inside a shared lock of the
 					rw_lock member */
@@ -172,9 +172,9 @@ struct trx_i_s_cache_struct {
 /** Number of hash cells in the cache storage */
 #define CACHE_STORAGE_HASH_CELLS	2048
 	ha_storage_t*	storage;	/*!< storage for external volatile
-					data that can possibly not be
-					available later, when we release
-					the kernel mutex */
+					data that may become unavailable
+					when we release
+					lock_sys->mutex or trx_sys->mutex */
 	ulint		mem_allocd;	/*!< the amount of memory
 					allocated with mem_alloc*() */
 	ibool		is_truncated;	/*!< this is TRUE if the memory
@@ -476,7 +476,7 @@ fill_trx_row(
 	size_t		stmt_len;
 	const char*	s;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	row->trx_id = trx->id;
 	row->trx_started = (ib_time_t) trx->start_time;
@@ -485,9 +485,10 @@ fill_trx_row(
 	ut_ad(requested_lock_row == NULL
 	      || i_s_locks_row_validate(requested_lock_row));
 
-	if (trx->wait_lock != NULL) {
+	if (trx->lock.wait_lock != NULL) {
+
 		ut_a(requested_lock_row != NULL);
-		row->trx_wait_started = (ib_time_t) trx->wait_started;
+		row->trx_wait_started = (ib_time_t) trx->lock.wait_started;
 	} else {
 		ut_a(requested_lock_row == NULL);
 		row->trx_wait_started = 0;
@@ -505,6 +506,7 @@ fill_trx_row(
 	}
 
 	row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
 	stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
 
 	if (stmt != NULL) {
@@ -517,9 +519,10 @@ fill_trx_row(
 		memcpy(query, stmt, stmt_len);
 		query[stmt_len] = '\0';
 
-		row->trx_query = ha_storage_put_memlim(
-			cache->storage, query, stmt_len + 1,
-			MAX_ALLOWED_FOR_STORAGE(cache));
+		row->trx_query = static_cast<const char*>(
+			ha_storage_put_memlim(
+				cache->storage, query, stmt_len + 1,
+				MAX_ALLOWED_FOR_STORAGE(cache)));
 
 		row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
 
@@ -553,11 +556,15 @@ thd_done:
 
 	row->trx_tables_locked = trx->mysql_n_tables_locked;
 
-	row->trx_lock_structs = UT_LIST_GET_LEN(trx->trx_locks);
+	/* These are protected by both trx->mutex and lock_sys->mutex,
+	or just lock_sys->mutex. For reading, it suffices to hold
+	lock_sys->mutex. */
+
+	row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
 
-	row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock_heap);
+	row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
 
-	row->trx_rows_locked = lock_number_of_rows_locked(trx);
+	row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
 
 	row->trx_rows_modified = trx->undo_no;
 
@@ -605,6 +612,10 @@ thd_done:
 
 	row->trx_search_latch_timeout = trx->search_latch_timeout;
 
+	row->trx_is_read_only = trx->read_only;
+
+	row->trx_is_autocommit_non_locking = trx_is_autocommit_non_locking(trx);
+
 	return(TRUE);
 }
 
@@ -1132,25 +1143,25 @@ add_trx_relevant_locks_to_cache(
 					requested lock row, or NULL or
 					undefined */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	/* If transaction is waiting we add the wait lock and all locks
 	from another transactions that are blocking the wait lock. */
-	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+	if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
 		const lock_t*		curr_lock;
 		ulint			wait_lock_heap_no;
 		i_s_locks_row_t*	blocking_lock_row;
 		lock_queue_iterator_t	iter;
 
-		ut_a(trx->wait_lock != NULL);
+		ut_a(trx->lock.wait_lock != NULL);
 
 		wait_lock_heap_no
-			= wait_lock_get_heap_no(trx->wait_lock);
+			= wait_lock_get_heap_no(trx->lock.wait_lock);
 
 		/* add the requested lock */
 		*requested_lock_row
-			= add_lock_to_cache(cache, trx->wait_lock,
+			= add_lock_to_cache(cache, trx->lock.wait_lock,
 					    wait_lock_heap_no);
 
 		/* memory could not be allocated */
@@ -1162,17 +1173,18 @@ add_trx_relevant_locks_to_cache(
 		/* then iterate over the locks before the wait lock and
 		add the ones that are blocking it */
 
-		lock_queue_iterator_reset(&iter, trx->wait_lock,
+		lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
 					  ULINT_UNDEFINED);
 
-		curr_lock = lock_queue_iterator_get_prev(&iter);
-		while (curr_lock != NULL) {
+		for (curr_lock = lock_queue_iterator_get_prev(&iter);
+		     curr_lock != NULL;
+		     curr_lock = lock_queue_iterator_get_prev(&iter)) {
 
-			if (lock_has_to_wait(trx->wait_lock,
+			if (lock_has_to_wait(trx->lock.wait_lock,
 					     curr_lock)) {
 
 				/* add the lock that is
-				blocking trx->wait_lock */
+				blocking trx->lock.wait_lock */
 				blocking_lock_row
 					= add_lock_to_cache(
 						cache, curr_lock,
@@ -1197,8 +1209,6 @@ add_trx_relevant_locks_to_cache(
 					return(FALSE);
 				}
 			}
-
-			curr_lock = lock_queue_iterator_get_prev(&iter);
 		}
 	} else {
 
@@ -1268,26 +1278,49 @@ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
 table cache buffer. Cache must be locked for write. */
 static
 void
-fetch_data_into_cache(
-/*==================*/
-	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+fetch_data_into_cache_low(
+/*======================*/
+	trx_i_s_cache_t*	cache,		/*!< in/out: cache */
+	ibool			only_ac_nl,	/*!< in: only select non-locking
+						autocommit transactions */
+	trx_list_t*		trx_list)	/*!< in: trx list */
 {
-	trx_t*			trx;
-	i_s_trx_row_t*		trx_row;
-	i_s_locks_row_t*	requested_lock_row;
+	const trx_t*		trx;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list
+	      || trx_list == &trx_sys->mysql_trx_list);
 
-	trx_i_s_cache_clear(cache);
+	ut_ad(only_ac_nl == (trx_list == &trx_sys->mysql_trx_list));
 
-	/* We iterate over the list of all transactions and add each one
+	/* Iterate over the transaction list and add each one
 	to innodb_trx's cache. We also add all locks that are relevant
 	to each transaction into innodb_locks' and innodb_lock_waits'
 	caches. */
 
-	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
 	     trx != NULL;
-	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+	     trx =
+	     (trx_list == &trx_sys->mysql_trx_list
+	      ? UT_LIST_GET_NEXT(mysql_trx_list, trx)
+	      : UT_LIST_GET_NEXT(trx_list, trx))) {
+
+		i_s_trx_row_t*		trx_row;
+		i_s_locks_row_t*	requested_lock_row;
+
+		if (trx->state == TRX_STATE_NOT_STARTED
+		    || (only_ac_nl && !trx_is_autocommit_non_locking(trx))) {
+
+			continue;
+		}
+
+		assert_trx_nonlocking_or_in_list(trx);
+
+		ut_ad(trx->in_ro_trx_list
+		      == (trx_list == &trx_sys->ro_trx_list));
+
+		ut_ad(trx->in_rw_trx_list
+		      == (trx_list == &trx_sys->rw_trx_list));
 
 		if (!add_trx_relevant_locks_to_cache(cache, trx,
 						     &requested_lock_row)) {
@@ -1315,6 +1348,28 @@ fetch_data_into_cache(
 			return;
 		}
 	}
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx_i_s_cache_clear(cache);
+
+	fetch_data_into_cache_low(cache, FALSE, &trx_sys->rw_trx_list);
+	fetch_data_into_cache_low(cache, FALSE, &trx_sys->ro_trx_list);
+
+	/* Only select autocommit non-locking selects because they can
+	only be on the MySQL transaction list (TRUE). */
+	fetch_data_into_cache_low(cache, TRUE, &trx_sys->mysql_trx_list);
 
 	cache->is_truncated = FALSE;
 }
@@ -1335,11 +1390,16 @@ trx_i_s_possibly_fetch_data_into_cache(
 	}
 
 	/* We need to read trx_sys and record/table lock queues */
-	mutex_enter(&kernel_mutex);
+
+	lock_mutex_enter();
+
+	mutex_enter(&trx_sys->mutex);
 
 	fetch_data_into_cache(cache);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&trx_sys->mutex);
+
+	lock_mutex_exit();
 
 	return(0);
 }
@@ -1367,8 +1427,8 @@ trx_i_s_cache_init(
 {
 	/* The latching is done in the following order:
 	acquire trx_i_s_cache_t::rw_lock, X
-	acquire kernel_mutex
-	release kernel_mutex
+	acquire lock mutex, then trx_sys->mutex
+	release trx_sys->mutex, then lock mutex
 	release trx_i_s_cache_t::rw_lock
 	acquire trx_i_s_cache_t::rw_lock, S
 	acquire trx_i_s_cache_t::last_read_mutex
@@ -1593,7 +1653,7 @@ trx_i_s_create_lock_id(
 	} else {
 		/* table lock */
 		res_len = ut_snprintf(lock_id, lock_id_size,
-				      TRX_ID_FMT ":%llu",
+				      TRX_ID_FMT":"UINT64PF,
 				      row->lock_trx_id,
 				      row->lock_table_id);
 	}
@@ -1605,3 +1665,24 @@ trx_i_s_create_lock_id(
 
 	return(lock_id);
 }
+
+UNIV_INTERN
+void
+trx_i_s_get_lock_sys_memory_usage(ulint *constant, ulint *variable)
+{
+	trx_t* trx;
+
+	*constant = lock_sys->rec_hash->n_cells * sizeof(hash_cell_t);
+	*variable = 0;
+
+	if (trx_sys) {
+		mutex_enter(&trx_sys->mutex);
+		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+		while (trx) {
+			*variable += ((trx->lock.lock_heap) ? mem_heap_get_size(trx->lock.lock_heap) : 0);
+			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+		}
+		mutex_exit(&trx_sys->mutex);
+	}
+
+}
diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.cc
index d343a73c9d8..3dfcf23c3f5 100644
--- a/storage/xtradb/trx/trx0purge.c
+++ b/storage/xtradb/trx/trx0purge.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0purge.c
+@file trx/trx0purge.cc
 Purge old versions
 
 Created 3/26/1996 Heikki Tuuri
@@ -31,7 +31,6 @@ Created 3/26/1996 Heikki Tuuri
 
 #include "fsp0fsp.h"
 #include "mach0data.h"
-#include "mtr0log.h"
 #include "trx0rseg.h"
 #include "trx0trx.h"
 #include "trx0roll.h"
@@ -42,7 +41,16 @@ Created 3/26/1996 Heikki Tuuri
 #include "row0upd.h"
 #include "trx0rec.h"
 #include "srv0srv.h"
+#include "srv0start.h"
 #include "os0thread.h"
+#include "srv0mon.h"
+#include "mtr0log.h"
+
+/** Maximum allowable purge history length.  0 means 'infinite'. */
+UNIV_INTERN ulong		srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+UNIV_INTERN ulong		srv_max_purge_lag_delay = 0;
 
 /** The global data structure coordinating a purge */
 UNIV_INTERN trx_purge_t*	purge_sys = NULL;
@@ -65,155 +73,33 @@ UNIV_INTERN mysql_pfs_key_t	purge_sys_bh_mutex_key;
 UNIV_INTERN my_bool		srv_purge_view_update_only_debug;
 #endif /* UNIV_DEBUG */
 
-/*****************************************************************//**
-Checks if trx_id is >= purge_view: then it is guaranteed that its update
-undo log still exists in the system.
-@return TRUE if is sure that it is preserved, also if the function
-returns FALSE, it is possible that the undo log still exists in the
-system */
-UNIV_INTERN
-ibool
-trx_purge_update_undo_must_exist(
-/*=============================*/
-	trx_id_t	trx_id)	/*!< in: transaction id */
-{
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (!read_view_sees_trx_id(purge_sys->view, trx_id)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/*=================== PURGE RECORD ARRAY =============================*/
-
-/*******************************************************************//**
-Stores info of an undo log record during a purge.
-@return	pointer to the storage cell */
-static
-trx_undo_inf_t*
-trx_purge_arr_store_info(
-/*=====================*/
-	trx_id_t	trx_no,	/*!< in: transaction number */
-	undo_no_t	undo_no)/*!< in: undo number */
-{
-	trx_undo_inf_t*	cell;
-	trx_undo_arr_t*	arr;
-	ulint		i;
-
-	arr = purge_sys->arr;
-
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
-
-		if (!(cell->in_use)) {
-			/* Not in use, we may store here */
-			cell->undo_no = undo_no;
-			cell->trx_no = trx_no;
-			cell->in_use = TRUE;
-
-			arr->n_used++;
-
-			return(cell);
-		}
-	}
-}
-
-/*******************************************************************//**
-Removes info of an undo log record during a purge. */
-UNIV_INLINE
-void
-trx_purge_arr_remove_info(
-/*======================*/
-	trx_undo_inf_t*	cell)	/*!< in: pointer to the storage cell */
-{
-	trx_undo_arr_t*	arr;
-
-	arr = purge_sys->arr;
-
-	cell->in_use = FALSE;
-
-	ut_ad(arr->n_used > 0);
-
-	arr->n_used--;
-}
-
-/*******************************************************************//**
-Gets the biggest pair of a trx number and an undo number in a purge array. */
-static
-void
-trx_purge_arr_get_biggest(
-/*======================*/
-	trx_undo_arr_t*	arr,	/*!< in: purge array */
-	trx_id_t*	trx_no,	/*!< out: transaction number: 0
-				if array is empty */
-	undo_no_t*	undo_no)/*!< out: undo number */
-{
-	trx_undo_inf_t*	cell;
-	trx_id_t	pair_trx_no;
-	undo_no_t	pair_undo_no;
-	ulint		i;
-	ulint		n;
-
-	n = arr->n_used;
-	pair_trx_no = 0;
-	pair_undo_no = 0;
-
-	if (n) {
-		for (i = 0;; i++) {
-			cell = trx_undo_arr_get_nth_info(arr, i);
-
-			if (!cell->in_use) {
-				continue;
-			}
-
-			if ((cell->trx_no > pair_trx_no)
-			    || ((cell->trx_no == pair_trx_no)
-				&& cell->undo_no >= pair_undo_no)) {
-
-				pair_trx_no = cell->trx_no;
-				pair_undo_no = cell->undo_no;
-			}
-
-			if (!--n) {
-				break;
-			}
-		}
-	}
-
-	*trx_no = pair_trx_no;
-	*undo_no = pair_undo_no;
-}
-
 /****************************************************************//**
 Builds a purge 'query' graph. The actual purge is performed by executing
 this query graph.
 @return	own: the query graph */
 static
 que_t*
-trx_purge_graph_build(void)
-/*=======================*/
+trx_purge_graph_build(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	ulint		n_purge_threads)	/*!< in: number of purge
+						threads */
 {
+	ulint		i;
 	mem_heap_t*	heap;
 	que_fork_t*	fork;
-	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
 
 	heap = mem_heap_create(512);
 	fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
-	fork->trx = purge_sys->trx;
-
-	thr = que_thr_create(fork, heap);
+	fork->trx = trx;
 
-	thr->child = row_purge_node_create(thr, heap);
+	for (i = 0; i < n_purge_threads; ++i) {
+		que_thr_t*	thr;
 
-	/*	thr2 = que_thr_create(fork, fork, heap);
+		thr = que_thr_create(fork, heap);
 
-	thr2->child = row_purge_node_create(fork, thr2, heap);	 */
+		thr->child = row_purge_node_create(thr, heap);
+	}
 
 	return(fork);
 }
@@ -225,22 +111,18 @@ UNIV_INTERN
 void
 trx_purge_sys_create(
 /*=================*/
-	ib_bh_t*	ib_bh)	/*!< in, own: UNDO log min binary heap */
+	ulint		n_purge_threads,	/*!< in: number of purge
+						threads */
+	ib_bh_t*	ib_bh)			/*!< in, own: UNDO log min
+						binary heap */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys)));
 
-	purge_sys = mem_zalloc(sizeof(trx_purge_t));
+	purge_sys->state = PURGE_STATE_INIT;
+	purge_sys->event = os_event_create();
 
 	/* Take ownership of ib_bh, we are responsible for freeing it. */
 	purge_sys->ib_bh = ib_bh;
-	purge_sys->state = TRX_STOP_PURGE;
-
-	purge_sys->n_pages_handled = 0;
-
-	purge_sys->purge_trx_no = 0;
-	purge_sys->purge_undo_no = 0;
-	purge_sys->next_stored = FALSE;
-	ut_d(purge_sys->done_trx_no = 0);
 
 	rw_lock_create(trx_purge_latch_key,
 		       &purge_sys->latch, SYNC_PURGE_LATCH);
@@ -251,21 +133,27 @@ trx_purge_sys_create(
 
 	purge_sys->heap = mem_heap_create(256);
 
-	purge_sys->arr = trx_undo_arr_create();
+	ut_a(n_purge_threads > 0);
 
 	purge_sys->sess = sess_open();
 
 	purge_sys->trx = purge_sys->sess->trx;
 
-	purge_sys->trx->is_purge = 1;
+	ut_a(purge_sys->trx->sess == purge_sys->sess);
 
-	ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+	/* A purge transaction is not a real transaction, we use a transaction
+	here only because the query threads code requires it. It is otherwise
+	quite unnecessary. We should get rid of it eventually. */
+	purge_sys->trx->id = 0;
+	purge_sys->trx->start_time = ut_time();
+	purge_sys->trx->state = TRX_STATE_ACTIVE;
+	purge_sys->trx->op_info = "purge trx";
 
-	purge_sys->query = trx_purge_graph_build();
+	purge_sys->query = trx_purge_graph_build(
+		purge_sys->trx, n_purge_threads);
 
-	purge_sys->prebuilt_view =
-		read_view_oldest_copy_or_open_new(0, NULL);
-	purge_sys->view = purge_sys->prebuilt_view;
+	purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone,
+					       purge_sys->prebuilt_view);
 }
 
 /************************************************************************
@@ -275,34 +163,21 @@ void
 trx_purge_sys_close(void)
 /*======================*/
 {
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	que_graph_free(purge_sys->query);
 
-	ut_a(purge_sys->sess->trx->is_purge);
-	purge_sys->sess->trx->state = TRX_NOT_STARTED;
+	ut_a(purge_sys->trx->id == 0);
+	ut_a(purge_sys->sess->trx == purge_sys->trx);
 
-	mutex_enter(&kernel_mutex);
-	trx_release_descriptor(purge_sys->sess->trx);
-	mutex_exit(&kernel_mutex);
+	purge_sys->trx->state = TRX_STATE_NOT_STARTED;
 
 	sess_close(purge_sys->sess);
-	purge_sys->sess = NULL;
 
-	if (purge_sys->view != NULL) {
-		/* Because acquiring the kernel mutex is a pre-condition
-		of read_view_close(). We don't really need it here. */
-		mutex_enter(&kernel_mutex);
+	purge_sys->sess = NULL;
 
-		read_view_close(purge_sys->view);
-		read_view_free(purge_sys->prebuilt_view);
-		purge_sys->prebuilt_view = NULL;
-		purge_sys->view = NULL;
+	read_view_free(purge_sys->prebuilt_view);
+	read_view_free(purge_sys->prebuilt_clone);
 
-		mutex_exit(&kernel_mutex);
-	}
-
-	trx_undo_arr_free(purge_sys->arr);
+	purge_sys->view = NULL;
 
 	rw_lock_free(&purge_sys->latch);
 	mutex_free(&purge_sys->bh_mutex);
@@ -311,6 +186,10 @@ trx_purge_sys_close(void)
 
 	ib_bh_free(purge_sys->ib_bh);
 
+	os_event_free(purge_sys->event);
+
+	purge_sys->event = NULL;
+
 	mem_free(purge_sys);
 
 	purge_sys = NULL;
@@ -331,21 +210,18 @@ trx_purge_add_update_undo_to_history(
 	mtr_t*	mtr)		/*!< in: mtr */
 {
 	trx_undo_t*	undo;
+	trx_rseg_t*	rseg;
 	trx_rsegf_t*	rseg_header;
 	trx_ulogf_t*	undo_header;
 
 	undo = trx->update_undo;
-
-	ut_ad(undo);
-
-	ut_ad(mutex_own(&undo->rseg->mutex));
+	rseg = undo->rseg;
 
 	rseg_header = trx_rsegf_get(
 		undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
 		mtr);
 
 	undo_header = undo_page + undo->hdr_offset;
-	/* Add the log as the first in the history list */
 
 	if (undo->state != TRX_UNDO_CACHED) {
 		ulint		hist_size;
@@ -364,6 +240,8 @@ trx_purge_add_update_undo_to_history(
 
 		trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
 
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
 		hist_size = mtr_read_ulint(
 			rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
 
@@ -375,40 +253,36 @@ trx_purge_add_update_undo_to_history(
 			hist_size + undo->size, MLOG_4BYTES, mtr);
 	}
 
-	flst_add_first(
-		rseg_header + TRX_RSEG_HISTORY,
-		undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+	/* Add the log as the first in the history list */
+	flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+		       undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	os_atomic_increment_ulint(&trx_sys->rseg_history_len, 1);
+#else
+	mutex_enter(&trx_sys->mutex);
+	++trx_sys->rseg_history_len;
+	mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
-	/* Write the trx number to the undo log header */
+	srv_wake_purge_thread_if_not_active();
 
+	/* Write the trx number to the undo log header */
 	mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
 
 	/* Write information about delete markings to the undo log header */
 
 	if (!undo->del_marks) {
-		mlog_write_ulint(
-			undo_header + TRX_UNDO_DEL_MARKS, FALSE,
-			MLOG_2BYTES, mtr);
+		mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+				 MLOG_2BYTES, mtr);
 	}
 
-	if (undo->rseg->last_page_no == FIL_NULL) {
-		undo->rseg->last_trx_no = trx->no;
-		undo->rseg->last_offset = undo->hdr_offset;
-		undo->rseg->last_page_no = undo->hdr_page_no;
-		undo->rseg->last_del_marks = undo->del_marks;
-
-		/* FIXME: Add a bin heap validate function to check that
-		the rseg exists. */
+	if (rseg->last_page_no == FIL_NULL) {
+		rseg->last_page_no = undo->hdr_page_no;
+		rseg->last_offset = undo->hdr_offset;
+		rseg->last_trx_no = trx->no;
+		rseg->last_del_marks = undo->del_marks;
 	}
-
-	mutex_enter(&kernel_mutex);
-	trx_sys->rseg_history_len++;
-	mutex_exit(&kernel_mutex);
-
-//	if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
-		/* Inform the purge thread that there is work to do. */
-		srv_wake_purge_thread_if_not_active();
-//	}
 }
 
 /**********************************************************************//**
@@ -424,49 +298,55 @@ trx_purge_free_segment(
 					will cut off from the end of the
 					history list */
 {
-	page_t*		undo_page;
+	mtr_t		mtr;
 	trx_rsegf_t*	rseg_hdr;
 	trx_ulogf_t*	log_hdr;
 	trx_usegf_t*	seg_hdr;
-	ibool		freed;
 	ulint		seg_size;
 	ulint		hist_size;
 	ibool		marked		= FALSE;
-	mtr_t		mtr;
 
 	/*	fputs("Freeing an update undo log segment\n", stderr); */
 
-loop:
-	mtr_start(&mtr);
-	mutex_enter(&(rseg->mutex));
+	for (;;) {
+		page_t*	undo_page;
 
-	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
-				 rseg->page_no, &mtr);
+		mtr_start(&mtr);
 
-	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
-				      hdr_addr.page, &mtr);
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-	log_hdr = undo_page + hdr_addr.boffset;
+		mutex_enter(&rseg->mutex);
 
-	/* Mark the last undo log totally purged, so that if the system
-	crashes, the tail of the undo log will not get accessed again. The
-	list of pages in the undo log tail gets inconsistent during the
-	freeing of the segment, and therefore purge should not try to access
-	them again. */
+		rseg_hdr = trx_rsegf_get(
+			rseg->space, rseg->zip_size, rseg->page_no, &mtr);
 
-	if (!marked) {
-		mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
-				 MLOG_2BYTES, &mtr);
-		marked = TRUE;
-	}
+		undo_page = trx_undo_page_get(
+			rseg->space, rseg->zip_size, hdr_addr.page, &mtr);
 
-	freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
-					  &mtr);
-	if (!freed) {
-		mutex_exit(&(rseg->mutex));
-		mtr_commit(&mtr);
+		seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+		log_hdr = undo_page + hdr_addr.boffset;
+
+		/* Mark the last undo log totally purged, so that if the
+		system crashes, the tail of the undo log will not get accessed
+		again. The list of pages in the undo log tail gets inconsistent
+		during the freeing of the segment, and therefore purge should
+		not try to access them again. */
 
-		goto loop;
+		if (!marked) {
+			mlog_write_ulint(
+				log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+				MLOG_2BYTES, &mtr);
+
+			marked = TRUE;
+		}
+
+		if (fseg_free_step_not_header(
+			seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) {
+
+			break;
+		}
+
+		mutex_exit(&rseg->mutex);
+
+		mtr_commit(&mtr);
 	}
 
 	/* The page list may now be inconsistent, but the length field
@@ -483,22 +363,22 @@ loop:
 	flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
 		     log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
 
-	mutex_enter(&kernel_mutex);
-	ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+#ifdef HAVE_ATOMIC_BUILTINS
+	os_atomic_decrement_ulint(&trx_sys->rseg_history_len, n_removed_logs);
+#else
+	mutex_enter(&trx_sys->mutex);
 	trx_sys->rseg_history_len -= n_removed_logs;
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
-	freed = FALSE;
+	do {
 
-	while (!freed) {
 		/* Here we assume that a file segment with just the header
 		page can be freed in a few steps, so that the buffer pool
 		is not flooded with bufferfixed pages: see the note in
-		fsp0fsp.c. */
+		fsp0fsp.cc. */
 
-		freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
-				       &mtr);
-	}
+	} while(!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr));
 
 	hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
 				   MLOG_4BYTES, &mtr);
@@ -522,12 +402,8 @@ static
 void
 trx_purge_truncate_rseg_history(
 /*============================*/
-	trx_rseg_t*	rseg,		/*!< in: rollback segment */
-	trx_id_t	limit_trx_no,	/*!< in: remove update undo logs whose
-					trx number is < limit_trx_no */
-	undo_no_t	limit_undo_no)	/*!< in: if transaction number is equal
-					to limit_trx_no, truncate undo records
-					with undo number < limit_undo_no */
+	trx_rseg_t*		rseg,		/*!< in: rollback segment */
+	const purge_iter_t*	limit)		/*!< in: truncate offset */
 {
 	fil_addr_t	hdr_addr;
 	fil_addr_t	prev_hdr_addr;
@@ -561,20 +437,26 @@ loop:
 				      hdr_addr.page, &mtr);
 
 	log_hdr = undo_page + hdr_addr.boffset;
+
 	undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
 
-	if (undo_trx_no >= limit_trx_no) {
-		if (undo_trx_no == limit_trx_no) {
-			trx_undo_truncate_start(rseg, rseg->space,
-						hdr_addr.page,
-						hdr_addr.boffset,
-						limit_undo_no);
+	if (undo_trx_no >= limit->trx_no) {
+
+		if (undo_trx_no == limit->trx_no) {
+
+			trx_undo_truncate_start(
+				rseg, rseg->space, hdr_addr.page,
+				hdr_addr.boffset, limit->undo_no);
 		}
 
-		mutex_enter(&kernel_mutex);
-		ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+#ifdef HAVE_ATOMIC_BUILTINS
+		os_atomic_decrement_ulint(
+			&trx_sys->rseg_history_len, n_removed_logs);
+#else
+		mutex_enter(&trx_sys->mutex);
 		trx_sys->rseg_history_len -= n_removed_logs;
-		mutex_exit(&kernel_mutex);
+		mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
 		flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
 				  log_hdr + TRX_UNDO_HISTORY_NODE,
@@ -624,60 +506,30 @@ Removes unnecessary history data from rollback segments. NOTE that when this
 function is called, the caller must not have any latches on undo log pages! */
 static
 void
-trx_purge_truncate_history(void)
-/*============================*/
+trx_purge_truncate_history(
+/*========================*/
+	purge_iter_t*		limit,		/*!< in: truncate limit */
+	const read_view_t*	view)		/*!< in: purge view */
 {
-	trx_rseg_t*	rseg;
-	trx_id_t	limit_trx_no;
-	undo_no_t	limit_undo_no;
-
-	trx_purge_arr_get_biggest(
-		purge_sys->arr, &limit_trx_no, &limit_undo_no);
-
-	if (limit_trx_no == 0) {
-
-		limit_trx_no = purge_sys->purge_trx_no;
-		limit_undo_no = purge_sys->purge_undo_no;
-	}
+	ulint		i;
 
 	/* We play safe and set the truncate limit at most to the purge view
 	low_limit number, though this is not necessary */
 
-	if (limit_trx_no >= purge_sys->view->low_limit_no) {
-		limit_trx_no = purge_sys->view->low_limit_no;
-		limit_undo_no = 0;
+	if (limit->trx_no >= view->low_limit_no) {
+		limit->trx_no = view->low_limit_no;
+		limit->undo_no = 0;
 	}
 
-	ut_ad(limit_trx_no <= purge_sys->view->low_limit_no);
+	ut_ad(limit->trx_no <= purge_sys->view->low_limit_no);
 
-	for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-	     rseg != NULL;
-	     rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_rseg_t*	rseg = trx_sys->rseg_array[i];
 
-		trx_purge_truncate_rseg_history(
-			rseg, limit_trx_no, limit_undo_no);
-	}
-}
-
-/********************************************************************//**
-Does a truncate if the purge array is empty. NOTE that when this function is
-called, the caller must not have any latches on undo log pages! */
-UNIV_INLINE
-void
-trx_purge_truncate_if_arr_empty(void)
-/*=================================*/
-{
-	static ulint	count;
-
-#ifdef UNIV_DEBUG
-	if (purge_sys->arr->n_used == 0) {
-		purge_sys->done_trx_no = purge_sys->purge_trx_no;
-	}
-#endif /* UNIV_DEBUG */
-
-	if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) {
-
-		trx_purge_truncate_history();
+		if (rseg != NULL) {
+			ut_a(rseg->id == i);
+			trx_purge_truncate_rseg_history(rseg, limit);
+		}
 	}
 }
 
@@ -688,8 +540,11 @@ static
 void
 trx_purge_rseg_get_next_history_log(
 /*================================*/
-	trx_rseg_t*	rseg)	/*!< in: rollback segment */
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	ulint*		n_pages_handled)/*!< in/out: number of UNDO pages
+					handled */
 {
+	const void*	ptr;
 	page_t*		undo_page;
 	trx_ulogf_t*	log_hdr;
 	fil_addr_t	prev_log_addr;
@@ -697,14 +552,13 @@ trx_purge_rseg_get_next_history_log(
 	ibool		del_marks;
 	mtr_t		mtr;
 	rseg_queue_t	rseg_queue;
-	const void*	ptr;
 
 	mutex_enter(&(rseg->mutex));
 
 	ut_a(rseg->last_page_no != FIL_NULL);
 
-	purge_sys->purge_trx_no = rseg->last_trx_no + 1;
-	purge_sys->purge_undo_no = 0;
+	purge_sys->iter.trx_no = rseg->last_trx_no + 1;
+	purge_sys->iter.undo_no = 0;
 	purge_sys->next_stored = FALSE;
 
 	mtr_start(&mtr);
@@ -716,7 +570,7 @@ trx_purge_rseg_get_next_history_log(
 
 	/* Increase the purge page count by one for every handled log */
 
-	purge_sys->n_pages_handled++;
+	(*n_pages_handled)++;
 
 	prev_log_addr = trx_purge_get_log_from_hist(
 		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
@@ -729,10 +583,10 @@ trx_purge_rseg_get_next_history_log(
 		mutex_exit(&(rseg->mutex));
 		mtr_commit(&mtr);
 
-		mutex_enter(&kernel_mutex);
+		mutex_enter(&trx_sys->mutex);
 
 		/* Add debug code to track history list corruption reported
-		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+		on the MySQL mailing list on Nov 9, 2004. The fut0lst.cc
 		file-based list was corrupt. The prev node pointer was
 		FIL_NULL, even though the list length was over 8 million nodes!
 		We assume that purge truncates the history list in large
@@ -752,12 +606,13 @@ trx_purge_rseg_get_next_history_log(
 			ut_ad(0);
 		}
 
-		mutex_exit(&kernel_mutex);
+		mutex_exit(&trx_sys->mutex);
 
 		return;
 	}
 
-	mutex_exit(&(rseg->mutex));
+	mutex_exit(&rseg->mutex);
+
 	mtr_commit(&mtr);
 
 	/* Read the trx number and del marks from the previous log header */
@@ -795,7 +650,7 @@ trx_purge_rseg_get_next_history_log(
 
 	mutex_exit(&purge_sys->bh_mutex);
 
-	mutex_exit(&(rseg->mutex));
+	mutex_exit(&rseg->mutex);
 }
 
 /***********************************************************************//**
@@ -839,18 +694,16 @@ trx_purge_get_rseg_with_min_trx_id(
 
 	ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
 
-	/* We assume in purge of externally stored fields
-	that space id == 0 */
-	ut_a(purge_sys->rseg->space == 0);
+	/* We assume in purge of externally stored fields that space id is
+	in the range of UNDO tablespace space ids */
+	ut_a(purge_sys->rseg->space <= srv_undo_tablespaces_open);
 
 	zip_size = purge_sys->rseg->zip_size;
 
-	ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no);
-
-	purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no;
+	ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no);
 
+	purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no;
 	purge_sys->hdr_offset = purge_sys->rseg->last_offset;
-
 	purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
 
 	mutex_exit(&purge_sys->rseg->mutex);
@@ -867,21 +720,22 @@ trx_purge_read_undo_rec(
 	trx_purge_t*	purge_sys,		/*!< in/out: purge instance */
 	ulint		zip_size)		/*!< in: block size or 0 */
 {
+	ulint		offset;
 	ulint		page_no;
-	ulint		offset = 0;
-	ib_uint64_t	undo_no = 0;
+	ib_uint64_t	undo_no;
 
 	purge_sys->hdr_offset = purge_sys->rseg->last_offset;
 	page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
 
 	if (purge_sys->rseg->last_del_marks) {
 		mtr_t		mtr;
-		trx_undo_rec_t*	undo_rec;
+		trx_undo_rec_t*	undo_rec = NULL;
 
 		mtr_start(&mtr);
 
 		undo_rec = trx_undo_get_first_rec(
-			0 /* System space id */, zip_size,
+			purge_sys->rseg->space,
+			zip_size,
 			purge_sys->hdr_page_no,
 			purge_sys->hdr_offset, RW_S_LATCH, &mtr);
 
@@ -889,14 +743,20 @@ trx_purge_read_undo_rec(
 			offset = page_offset(undo_rec);
 			undo_no = trx_undo_rec_get_undo_no(undo_rec);
 			page_no = page_get_page_no(page_align(undo_rec));
+		} else {
+			offset = 0;
+			undo_no = 0;
 		}
 
 		mtr_commit(&mtr);
+	} else {
+		offset = 0;
+		undo_no = 0;
 	}
 
 	purge_sys->offset = offset;
 	purge_sys->page_no = page_no;
-	purge_sys->purge_undo_no = undo_no;
+	purge_sys->iter.undo_no = undo_no;
 
 	purge_sys->next_stored = TRUE;
 }
@@ -918,7 +778,6 @@ trx_purge_choose_next_log(void)
 	zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
 
 	if (purge_sys->rseg != NULL) {
-
 		trx_purge_read_undo_rec(purge_sys, zip_size);
 	} else {
 		/* There is nothing to do yet. */
@@ -933,23 +792,23 @@ static
 trx_undo_rec_t*
 trx_purge_get_next_rec(
 /*===================*/
-	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO pages
+					handled */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
 	trx_undo_rec_t*	rec;
 	trx_undo_rec_t*	rec_copy;
 	trx_undo_rec_t*	rec2;
-	trx_undo_rec_t*	next_rec;
 	page_t*		undo_page;
 	page_t*		page;
 	ulint		offset;
 	ulint		page_no;
 	ulint		space;
 	ulint		zip_size;
-	ulint		type;
-	ulint		cmpl_info;
 	mtr_t		mtr;
 
 	ut_ad(purge_sys->next_stored);
+	ut_ad(purge_sys->iter.trx_no < purge_sys->view->low_limit_no);
 
 	space = purge_sys->rseg->space;
 	zip_size = purge_sys->rseg->zip_size;
@@ -960,7 +819,8 @@ trx_purge_get_next_rec(
 		/* It is the dummy undo log record, which means that there is
 		no need to purge this undo log */
 
-		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+		trx_purge_rseg_get_next_history_log(
+			purge_sys->rseg, n_pages_handled);
 
 		/* Look for the next undo log and record to purge */
 
@@ -978,6 +838,10 @@ trx_purge_get_next_rec(
 	rec2 = rec;
 
 	for (;;) {
+		ulint		type;
+		trx_undo_rec_t*	next_rec;
+		ulint		cmpl_info;
+
 		/* Try first to find the next record which requires a purge
 		operation from the same page of the same undo log */
 
@@ -1015,7 +879,8 @@ trx_purge_get_next_rec(
 	if (rec2 == NULL) {
 		mtr_commit(&mtr);
 
-		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+		trx_purge_rseg_get_next_history_log(
+			purge_sys->rseg, n_pages_handled);
 
 		/* Look for the next undo log and record to purge */
 
@@ -1023,20 +888,20 @@ trx_purge_get_next_rec(
 
 		mtr_start(&mtr);
 
-		undo_page = trx_undo_page_get_s_latched(space, zip_size,
-							page_no, &mtr);
+		undo_page = trx_undo_page_get_s_latched(
+			space, zip_size, page_no, &mtr);
 
 		rec = undo_page + offset;
 	} else {
 		page = page_align(rec2);
 
-		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
-		purge_sys->page_no = page_get_page_no(page);
 		purge_sys->offset = rec2 - page;
+		purge_sys->page_no = page_get_page_no(page);
+		purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2);
 
 		if (undo_page != page) {
 			/* We advance to a new page of the undo log: */
-			purge_sys->n_pages_handled++;
+			(*n_pages_handled)++;
 		}
 	}
 
@@ -1052,88 +917,262 @@ Fetches the next undo log record from the history list to purge. It must be
 released with the corresponding release function.
 @return copy of an undo log record or pointer to trx_purge_dummy_rec,
 if the whole undo log can skipped in purge; NULL if none left */
-UNIV_INTERN
+static __attribute__((warn_unused_result, nonnull))
 trx_undo_rec_t*
 trx_purge_fetch_next_rec(
 /*=====================*/
-	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
-	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
-				purge array */
-	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll pointer to undo record */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO log pages
+					handled */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
-	trx_undo_rec_t*	undo_rec;
-
-
-	if (purge_sys->state == TRX_STOP_PURGE) {
-		trx_purge_truncate_if_arr_empty();
-
-		return(NULL);
-	} else if (!purge_sys->next_stored) {
+	if (!purge_sys->next_stored) {
 		trx_purge_choose_next_log();
 
 		if (!purge_sys->next_stored) {
-			purge_sys->state = TRX_STOP_PURGE;
-
-			trx_purge_truncate_if_arr_empty();
 
 			if (srv_print_thread_releases) {
 				fprintf(stderr,
 					"Purge: No logs left in the"
-					" history list; pages handled %lu\n",
-					(ulong) purge_sys->n_pages_handled);
+					" history list\n");
 			}
 
 			return(NULL);
 		}
 	}
 
-	if (purge_sys->n_pages_handled >= purge_sys->handle_limit) {
+	if (purge_sys->iter.trx_no >= purge_sys->view->low_limit_no) {
 
-		purge_sys->state = TRX_STOP_PURGE;
+		return(NULL);
+	}
 
-		trx_purge_truncate_if_arr_empty();
+	/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+	os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
 
-		return(NULL);
-	} else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
-		purge_sys->state = TRX_STOP_PURGE;
+	*roll_ptr = trx_undo_build_roll_ptr(
+		FALSE, purge_sys->rseg->id,
+		purge_sys->page_no, purge_sys->offset);
 
-		trx_purge_truncate_if_arr_empty();
+	/* The following call will advance the stored values of the
+	purge iterator. */
 
-		return(NULL);
+	return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/*******************************************************************//**
+Fetches the UNDO records for a purge batch and adds them to the purge nodes.
+@return	number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(
+/*=======================*/
+	ulint		n_purge_threads,/*!< in: number of purge threads */
+	trx_purge_t*	purge_sys,	/*!< in/out: purge instance */
+	purge_iter_t*	limit,		/*!< out: records read up to */
+	ulint		batch_size)	/*!< in: no. of pages to purge */
+{
+	que_thr_t*	thr;
+	ulint		i = 0;
+	ulint		n_pages_handled = 0;
+	ulint		n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs);
+
+	ut_a(n_purge_threads > 0);
+
+	*limit = purge_sys->iter;
+
+	/* Validate some pre-requisites and reset the done flag of each node. */
+	for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	     thr != NULL && i < n_purge_threads;
+	     thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+		purge_node_t*		node;
+
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+		ut_a(node->undo_recs == NULL);
+		ut_a(node->done);
+
+		node->done = FALSE;
 	}
 
-	/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
-	os_thread_get_curr_id(),
-	(ullint) purge_sys->purge_trx_no,
-	(ullint) purge_sys->purge_undo_no); */
+	/* There should never be fewer nodes than threads, the inverse
+	however is allowed because we only use purge threads as needed. */
+	ut_a(i == n_purge_threads);
 
+	/* Fetch and parse the UNDO records. The UNDO records are added
+	to a per purge node vector. */
+	thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	ut_a(n_thrs > 0 && thr != NULL);
 
-	*roll_ptr = trx_undo_build_roll_ptr(
-		FALSE, (purge_sys->rseg)->id, purge_sys->page_no,
-		purge_sys->offset);
+	ut_ad(trx_purge_check_limit());
+
+	i = 0;
 
-	*cell = trx_purge_arr_store_info(
-		purge_sys->purge_trx_no, purge_sys->purge_undo_no);
+	for (;;) {
+		purge_node_t*		node;
+		trx_purge_rec_t*	purge_rec;
 
-	ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no);
+		ut_a(!thr->is_active);
 
-	/* The following call will advance the stored values of purge_trx_no
-	and purge_undo_no, therefore we had to store them first */
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+		purge_rec = static_cast<trx_purge_rec_t*>(
+			mem_heap_zalloc(node->heap, sizeof(*purge_rec)));
+
+		/* Track the max {trx_id, undo_no} for truncating the
+		UNDO logs once we have purged the records. */
+
+		if (purge_sys->iter.trx_no > limit->trx_no
+		    || (purge_sys->iter.trx_no == limit->trx_no
+			&& purge_sys->iter.undo_no >= limit->undo_no)) {
+
+			*limit = purge_sys->iter;
+		}
 
-	undo_rec = trx_purge_get_next_rec(heap);
+		/* Fetch the next record, and advance the purge_sys->iter. */
+		purge_rec->undo_rec = trx_purge_fetch_next_rec(
+			&purge_rec->roll_ptr, &n_pages_handled, node->heap);
 
-	return(undo_rec);
+		if (purge_rec->undo_rec != NULL) {
+
+			if (node->undo_recs == NULL) {
+				node->undo_recs = ib_vector_create(
+					ib_heap_allocator_create(node->heap),
+					sizeof(trx_purge_rec_t),
+					batch_size);
+			} else {
+				ut_a(!ib_vector_is_empty(node->undo_recs));
+			}
+
+			ib_vector_push(node->undo_recs, purge_rec);
+
+			if (n_pages_handled >= batch_size) {
+
+				break;
+			}
+		} else {
+			break;
+		}
+
+		thr = UT_LIST_GET_NEXT(thrs, thr);
+
+		if (!(++i % n_purge_threads)) {
+			thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+		}
+
+		ut_a(thr != NULL);
+	}
+
+	ut_ad(trx_purge_check_limit());
+
+	return(n_pages_handled);
 }
 
 /*******************************************************************//**
-Releases a reserved purge undo record. */
-UNIV_INTERN
+Calculate the DML delay required.
+@return delay in microseconds, at most srv_max_purge_lag_delay; 0 if none
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how much data manipulation language (DML) statements
+	need to be delayed in order to reduce the lagging of the purge
+	thread. */
+	ulint	delay = 0; /* in microseconds; default: no delay */
+
+	/* If purge lag is set (i.e. > 0) then calculate the new DML delay.
+	Note: we do a dirty read of the trx_sys_t data structure here,
+	without holding trx_sys->mutex. */
+
+	if (srv_max_purge_lag > 0) {
+		float	ratio;
+
+		ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag;
+
+		if (ratio > 1.0) {
+			/* If the history list length exceeds the
+			srv_max_purge_lag, the data manipulation
+			statements are delayed by at least 5000
+			microseconds. */
+			delay = (ulint) ((ratio - .5) * 10000);
+		}
+
+		if (delay > srv_max_purge_lag_delay) {
+			delay = srv_max_purge_lag_delay;
+		}
+
+		MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+	}
+
+	return(delay);
+}
+
+/*******************************************************************//**
+Wait for pending purge jobs to complete. */
+static
 void
-trx_purge_rec_release(
-/*==================*/
-	trx_undo_inf_t*	cell)	/*!< in: storage cell */
+trx_purge_wait_for_workers_to_complete(
+/*===================================*/
+	trx_purge_t*	purge_sys)	/*!< in: purge instance */
 {
-	trx_purge_arr_remove_info(cell);
+	ulint		n_submitted = purge_sys->n_submitted;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	/* Ensure that the work queue empties out. */
+	while (!os_compare_and_swap_ulint(
+			&purge_sys->n_completed, n_submitted, n_submitted)) {
+#else
+	mutex_enter(&purge_sys->bh_mutex);
+
+	while (purge_sys->n_completed < n_submitted) {
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+		mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+		if (srv_get_task_queue_length() > 0) {
+			srv_release_threads(SRV_WORKER, 1);
+		}
+
+		os_thread_yield();
+
+#ifndef HAVE_ATOMIC_BUILTINS
+		mutex_enter(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+	}
+
+#ifndef HAVE_ATOMIC_BUILTINS
+	mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+	/* None of the worker threads should be doing any work. */
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+	/* There should be no outstanding tasks as long
+	as the worker threads are active. */
+	ut_a(srv_get_task_queue_length() == 0);
+}
+
+/******************************************************************//**
+Remove old historical changes from the rollback segments. */
+static
+void
+trx_purge_truncate(void)
+/*====================*/
+{
+	ut_ad(trx_purge_check_limit());
+
+	if (purge_sys->limit.trx_no == 0) {
+		trx_purge_truncate_history(&purge_sys->iter, purge_sys->view);
+	} else {
+		trx_purge_truncate_history(&purge_sys->limit, purge_sys->view);
+	}
 }
 
 /*******************************************************************//**
@@ -1143,112 +1182,227 @@ UNIV_INTERN
 ulint
 trx_purge(
 /*======*/
-	ulint	limit)		/*!< in: the maximum number of records to
-				purge in one batch */
+	ulint	n_purge_threads,	/*!< in: number of purge tasks; all
+					but one go to the work queue */
+	ulint	batch_size,		/*!< in: the maximum number of records
+					to purge in one batch */
+	bool	truncate)		/*!< in: truncate history if true */
 {
-	que_thr_t*	thr;
-	ulint		old_pages_handled;
+	que_thr_t*	thr = NULL;
+	ulint		n_pages_handled;
 
-	ut_a(purge_sys->trx->n_active_thrs == 0);
+	ut_a(n_purge_threads > 0);
 
-	rw_lock_x_lock(&purge_sys->latch);
+	srv_dml_needed_delay = trx_purge_dml_delay();
 
-	mutex_enter(&kernel_mutex);
+	/* All tasks submitted so far must already have completed. */
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
 
-	/* Close and free the old purge view */
+	rw_lock_x_lock(&purge_sys->latch);
 
-	read_view_close(purge_sys->view);
 	purge_sys->view = NULL;
+
 	mem_heap_empty(purge_sys->heap);
 
-	/* Determine how much data manipulation language (DML) statements
-	need to be delayed in order to reduce the lagging of the purge
-	thread. */
-	srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
+	purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone,
+					       purge_sys->prebuilt_view);
 
-	/* If we cannot advance the 'purge view' because of an old
-	'consistent read view', then the DML statements cannot be delayed.
-	Also, srv_max_purge_lag <= 0 means 'infinity'. */
-	if (srv_max_purge_lag > 0) {
-		float	ratio = (float) trx_sys->rseg_history_len
-			/ srv_max_purge_lag;
-		if (ratio > ULINT_MAX / 10000) {
-			/* Avoid overflow: maximum delay is 4295 seconds */
-			srv_dml_needed_delay = ULINT_MAX;
-		} else if (ratio > 1) {
-			/* If the history list length exceeds the
-			innodb_max_purge_lag, the
-			data manipulation statements are delayed
-			by at least 5000 microseconds. */
-			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
-		}
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG
+	if (srv_purge_view_update_only_debug) {
+		return(0);
 	}
+#endif
 
-	purge_sys->view = read_view_oldest_copy_or_open_new(
-		0, purge_sys->prebuilt_view);
+	/* Fetch the UNDO recs that need to be purged. */
+	n_pages_handled = trx_purge_attach_undo_recs(
+		n_purge_threads, purge_sys, &purge_sys->limit, batch_size);
+
+	/* More than one purge task: hand tasks to the work queue. */
+	if (n_purge_threads > 1) {
+		ulint	i = 0;
+
+		/* Enqueue n_purge_threads - 1 tasks; the last runs here. */
+		for (i = 0; i < n_purge_threads - 1; ++i) {
+			thr = que_fork_scheduler_round_robin(
+				purge_sys->query, thr);
+
+			ut_a(thr != NULL);
+
+			srv_que_task_enqueue_low(thr);
+		}
 
-	mutex_exit(&kernel_mutex);
+		thr = que_fork_scheduler_round_robin(purge_sys->query, thr);
+		ut_a(thr != NULL);
 
-	rw_lock_x_unlock(&(purge_sys->latch));
+		purge_sys->n_submitted += n_purge_threads - 1;
+
+		goto run_synchronously;
+
+	/* Single purge task: run it in the calling thread. */
+	} else {
+		thr = que_fork_scheduler_round_robin(purge_sys->query, NULL);
+		ut_ad(thr);
+
+run_synchronously:
+		++purge_sys->n_submitted;
+
+		que_run_threads(thr);
+
+		os_atomic_inc_ulint(
+			&purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+
+		if (n_purge_threads > 1) {
+			trx_purge_wait_for_workers_to_complete(purge_sys);
+		}
+	}
+
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
 
 #ifdef UNIV_DEBUG
-	if (srv_purge_view_update_only_debug) {
-		return(0);
+	if (purge_sys->limit.trx_no == 0) {
+		purge_sys->done = purge_sys->iter;
+	} else {
+		purge_sys->done = purge_sys->limit;
 	}
-#endif
+#endif /* UNIV_DEBUG */
 
-	purge_sys->state = TRX_PURGE_ON;
+	if (truncate) {
+		trx_purge_truncate();
+	}
 
-	purge_sys->handle_limit = purge_sys->n_pages_handled + limit;
+	MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+	MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
 
-	old_pages_handled = purge_sys->n_pages_handled;
+	return(n_pages_handled);
+}
 
+/*******************************************************************//**
+Read purge_sys->state while holding purge_sys->latch in exclusive mode.
+@return the purge state at the time of the read */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void)
+/*=================*/
+{
+	purge_state_t	state;
 
-	mutex_enter(&kernel_mutex);
+	rw_lock_x_lock(&purge_sys->latch);
+
+	state = purge_sys->state;
+
+	rw_lock_x_unlock(&purge_sys->latch);
 
-	thr = que_fork_start_command(purge_sys->query);
+	return(state);
+}
 
-	ut_ad(thr);
+/*******************************************************************//**
+Set PURGE_STATE_STOP, bump purge_sys->n_stop, and wait for purge to stop. */
+UNIV_INTERN
+void
+trx_purge_stop(void)
+/*================*/
+{
+	purge_state_t	state;
+	ib_int64_t	sig_count = os_event_reset(purge_sys->event);
+
+	ut_a(srv_n_purge_threads > 0);
+
+	rw_lock_x_lock(&purge_sys->latch);
 
-	mutex_exit(&kernel_mutex);
+	ut_a(purge_sys->state != PURGE_STATE_INIT);
+	ut_a(purge_sys->state != PURGE_STATE_EXIT);
+	ut_a(purge_sys->state != PURGE_STATE_DISABLED);
 
-	if (srv_print_thread_releases) {
+	++purge_sys->n_stop;
 
-		fputs("Starting purge\n", stderr);
+	state = purge_sys->state;
+
+	if (state == PURGE_STATE_RUN) {
+		ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge");
+
+		/* We need to wake up the purge thread in case it is suspended,
+		so that it can acknowledge the state change. */
+
+		srv_purge_wakeup();
 	}
 
-	que_run_threads(thr);
+	purge_sys->state = PURGE_STATE_STOP;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+	if (state != PURGE_STATE_STOP) {
+
+		/* Wait for purge coordinator to signal that it
+		is suspended. */
+		os_event_wait_low(purge_sys->event, sig_count);
+	} else { 
+		bool	once = true; 
+
+		rw_lock_x_lock(&purge_sys->latch);
+
+		/* State was already STOP: poll until purge is not running. */ 
+		while (purge_sys->running) { 
+
+			if (once) { 
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"Waiting for purge to stop");
+				once = false; 
+			}
+
+			rw_lock_x_unlock(&purge_sys->latch);
+
+			os_thread_sleep(10000); 
 
-	if (srv_print_thread_releases) {
+			rw_lock_x_lock(&purge_sys->latch);
+		} 
 
-		fprintf(stderr,
-			"Purge ends; pages handled %lu\n",
-			(ulong) purge_sys->n_pages_handled);
+		rw_lock_x_unlock(&purge_sys->latch);
 	}
 
-	return((ulint) (purge_sys->n_pages_handled - old_pages_handled));
+	MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1);
 }
 
-/******************************************************************//**
-Prints information of the purge system to stderr. */
+/*******************************************************************//**
+Resume purge: drop one stop request; at n_stop == 0 move to PURGE_STATE_RUN. */
 UNIV_INTERN
 void
-trx_purge_sys_print(void)
-/*=====================*/
+trx_purge_run(void)
+/*===============*/
 {
-	fprintf(stderr, "InnoDB: Purge system view:\n");
-	read_view_print(stderr, purge_sys->view);
-
-	fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT
-		", undo n:o " TRX_ID_FMT "\n",
-		(ullint) purge_sys->purge_trx_no,
-		(ullint) purge_sys->purge_undo_no);
-	fprintf(stderr,
-		"InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
-		"InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
-		(ulong) purge_sys->next_stored,
-		(ulong) purge_sys->page_no,
-		(ulong) purge_sys->offset,
-		(ulong) purge_sys->hdr_page_no,
-		(ulong) purge_sys->hdr_offset);
+	rw_lock_x_lock(&purge_sys->latch);
+
+	switch(purge_sys->state) {
+	case PURGE_STATE_INIT:
+	case PURGE_STATE_EXIT:
+	case PURGE_STATE_DISABLED:
+		ut_error;
+
+	case PURGE_STATE_RUN:
+	case PURGE_STATE_STOP:
+		break;
+	}
+
+	if (purge_sys->n_stop > 0) {
+
+		ut_a(purge_sys->state == PURGE_STATE_STOP);
+
+		--purge_sys->n_stop;
+
+		if (purge_sys->n_stop == 0) {
+
+			ib_logf(IB_LOG_LEVEL_INFO, "Resuming purge");
+
+			purge_sys->state = PURGE_STATE_RUN;
+		}
+
+		MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1);
+	} else {
+		ut_a(purge_sys->state == PURGE_STATE_RUN);
+	}
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+	srv_purge_wakeup();
 }
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.cc
index ef42152aeb7..a698b37c2a6 100644
--- a/storage/xtradb/trx/trx0rec.c
+++ b/storage/xtradb/trx/trx0rec.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0rec.c
+@file trx/trx0rec.cc
 Transaction undo log record
 
 Created 3/26/1996 Heikki Tuuri
@@ -287,7 +287,7 @@ trx_undo_rec_get_pars(
 					TRX_UNDO_INSERT_REC, ... */
 	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
 					for update type records */
-	ibool*		updated_extern,	/*!< out: TRUE if we updated an
+	bool*		updated_extern,	/*!< out: true if we updated an
 					externally stored fild */
 	undo_no_t*	undo_no,	/*!< out: undo log record number */
 	table_id_t*	table_id)	/*!< out: table id */
@@ -300,12 +300,8 @@ trx_undo_rec_get_pars(
 	type_cmpl = mach_read_from_1(ptr);
 	ptr++;
 
-	if (type_cmpl & TRX_UNDO_UPD_EXTERN) {
-		*updated_extern = TRUE;
-		type_cmpl -= TRX_UNDO_UPD_EXTERN;
-	} else {
-		*updated_extern = FALSE;
-	}
+	*updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+	type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
 
 	*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
 	*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
@@ -353,8 +349,9 @@ trx_undo_rec_get_col_val(
 		ut_ad(*len > *orig_len);
 		/* @see dtuple_convert_big_rec() */
 		ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
 		/* we do not have access to index->table here
-		ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+		ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B
 		      || *len >= col->max_prefix
 		      + BTR_EXTERN_FIELD_REF_SIZE);
 		*/
@@ -587,6 +584,7 @@ trx_undo_page_report_modify(
 	/* Store first some general parameters to the undo log */
 
 	if (!update) {
+		ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table)));
 		type_cmpl = TRX_UNDO_DEL_MARK_REC;
 	} else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
 		type_cmpl = TRX_UNDO_UPD_DEL_REC;
@@ -670,27 +668,14 @@ trx_undo_page_report_modify(
 	/* Save to the undo log the old values of the columns to be updated. */
 
 	if (update) {
-		ulint	extended = 0;
-
 		if (trx_undo_left(undo_page, ptr) < 5) {
 
 			return(0);
 		}
 
-		if (srv_use_sys_stats_table
-		    && index == UT_LIST_GET_FIRST(dict_sys->sys_stats->indexes)) {
-			for (i = 0; i < upd_get_n_fields(update); i++) {
-				ulint	pos = upd_get_nth_field(update, i)->field_no;
-
-				if (pos >= rec_offs_n_fields(offsets)) {
-					extended++;
-				}
-			}
-		}
-
-		ptr += mach_write_compressed(ptr, upd_get_n_fields(update) - extended);
+		ptr += mach_write_compressed(ptr, upd_get_n_fields(update));
 
-		for (i = 0; i < upd_get_n_fields(update) - extended; i++) {
+		for (i = 0; i < upd_get_n_fields(update); i++) {
 
 			ulint	pos = upd_get_nth_field(update, i)->field_no;
 
@@ -973,7 +958,9 @@ trx_undo_update_rec_get_update(
 	/* Store first trx id and roll ptr to update vector */
 
 	upd_field = upd_get_nth_field(update, n_fields);
-	buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
 	trx_write_trx_id(buf, trx_id);
 
 	upd_field_set_field_no(upd_field,
@@ -982,7 +969,9 @@ trx_undo_update_rec_get_update(
 	dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
 
 	upd_field = upd_get_nth_field(update, n_fields + 1);
-	buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
 	trx_write_roll_ptr(buf, roll_ptr);
 
 	upd_field_set_field_no(
@@ -1048,8 +1037,9 @@ trx_undo_update_rec_get_update(
 }
 
 /*******************************************************************//**
-Builds a partial row from an update undo log record. It contains the
-columns which occur as ordering in any index of the table.
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by mtype == DATA_MISSING in the field type.
 @return	pointer to remaining part of undo record */
 UNIV_INTERN
 byte*
@@ -1083,7 +1073,12 @@ trx_undo_rec_get_partial_row(
 
 	*row = dtuple_create(heap, row_len);
 
-	dict_table_copy_types(*row, index->table);
+	/* Mark all columns in the row uninitialized, so that
+	we can distinguish missing fields from fields that are SQL NULL. */
+	for (ulint i = 0; i < row_len; i++) {
+		dfield_get_type(dtuple_get_nth_field(*row, i))
+			->mtype = DATA_MISSING;
+	}
 
 	end_ptr = ptr + mach_read_from_2(ptr);
 	ptr += 2;
@@ -1105,7 +1100,9 @@ trx_undo_rec_get_partial_row(
 		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
 
 		dfield = dtuple_get_nth_field(*row, col_no);
-
+		dict_col_copy_type(
+			dict_table_get_nth_col(index->table, col_no),
+			dfield_get_type(dfield));
 		dfield_set_data(dfield, field, len);
 
 		if (len != UNIV_SQL_NULL
@@ -1120,9 +1117,9 @@ trx_undo_rec_get_partial_row(
 				ut_a(dfield_get_len(dfield)
 				     >= BTR_EXTERN_FIELD_REF_SIZE);
 				ut_a(dict_table_get_format(index->table)
-				     >= DICT_TF_FORMAT_ZIP
+				     >= UNIV_FORMAT_B
 				     || dfield_get_len(dfield)
-				     >= REC_ANTELOPE_MAX_INDEX_COL_LEN 
+				     >= REC_ANTELOPE_MAX_INDEX_COL_LEN
 				     + BTR_EXTERN_FIELD_REF_SIZE);
 			}
 		}
@@ -1185,7 +1182,7 @@ transaction and in consistent reads that must look to the history of this
 transaction.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
-ulint
+dberr_t
 trx_undo_report_row_operation(
 /*==========================*/
 	ulint		flags,		/*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
@@ -1204,6 +1201,7 @@ trx_undo_report_row_operation(
 	const rec_t*	rec,		/*!< in: in case of an update or delete
 					marking, the record in the clustered
 					index, otherwise NULL */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
 	roll_ptr_t*	roll_ptr)	/*!< out: rollback pointer to the
 					inserted undo log record,
 					0 if BTR_NO_UNDO_LOG
@@ -1215,16 +1213,14 @@ trx_undo_report_row_operation(
 	buf_block_t*	undo_block;
 	trx_rseg_t*	rseg;
 	mtr_t		mtr;
-	ulint		err		= DB_SUCCESS;
-	mem_heap_t*	heap		= NULL;
-	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
-	ulint*		offsets		= offsets_;
+	dberr_t		err		= DB_SUCCESS;
 #ifdef UNIV_DEBUG
 	int		loop_count	= 0;
 #endif /* UNIV_DEBUG */
-	rec_offs_init(offsets_);
 
+	ut_ad(!srv_read_only_mode);
 	ut_a(dict_index_is_clust(index));
+	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
 
 	if (flags & BTR_NO_UNDO_LOG_FLAG) {
 
@@ -1238,55 +1234,61 @@ trx_undo_report_row_operation(
 	      || (clust_entry && !update && !rec));
 
 	trx = thr_get_trx(thr);
+
+	/* This table is visible only to the session that created it. */
+	if (trx->read_only) {
+		ut_ad(!srv_read_only_mode);
+		/* MySQL should block writes to non-temporary tables. */
+		ut_a(DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_TEMPORARY));
+		if (trx->rseg == 0) {
+			trx_assign_rseg(trx);
+		}
+	}
+
 	rseg = trx->rseg;
 
-	mutex_enter(&(trx->undo_mutex));
+	mtr_start(&mtr);
+	mutex_enter(&trx->undo_mutex);
 
 	/* If the undo log is not assigned yet, assign one */
 
-	if (op_type == TRX_UNDO_INSERT_OP) {
+	switch (op_type) {
+	case TRX_UNDO_INSERT_OP:
+		undo = trx->insert_undo;
 
-		if (trx->insert_undo == NULL) {
+		if (undo == NULL) {
 
 			err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
-		}
+			undo = trx->insert_undo;
 
-		undo = trx->insert_undo;
-
-		if (UNIV_UNLIKELY(!undo)) {
-			/* Did not succeed */
-			ut_ad(err != DB_SUCCESS);
-			mutex_exit(&(trx->undo_mutex));
+			if (undo == NULL) {
+				/* Did not succeed */
+				ut_ad(err != DB_SUCCESS);
+				goto err_exit;
+			}
 
-			return(err);
+			ut_ad(err == DB_SUCCESS);
 		}
-
-		ut_ad(err == DB_SUCCESS);
-	} else {
+		break;
+	default:
 		ut_ad(op_type == TRX_UNDO_MODIFY_OP);
 
-		if (trx->update_undo == NULL) {
+		undo = trx->update_undo;
 
+		if (undo == NULL) {
 			err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+			undo = trx->update_undo;
 
-		}
-
-		undo = trx->update_undo;
-
-		if (UNIV_UNLIKELY(!undo)) {
-			/* Did not succeed */
-			ut_ad(err != DB_SUCCESS);
-			mutex_exit(&(trx->undo_mutex));
-			return(err);
+			if (undo == NULL) {
+				/* Did not succeed */
+				ut_ad(err != DB_SUCCESS);
+				goto err_exit;
+			}
 		}
 
 		ut_ad(err == DB_SUCCESS);
-		offsets = rec_get_offsets(rec, index, offsets,
-					  ULINT_UNDEFINED, &heap);
 	}
 
-	mtr_start(&mtr);
-
 	page_no = undo->last_page_no;
 	undo_block = buf_page_get_gen(
 		undo->space, undo->zip_size, page_no, RW_X_LATCH,
@@ -1300,10 +1302,13 @@ trx_undo_report_row_operation(
 		undo_page = buf_block_get_frame(undo_block);
 		ut_ad(page_no == buf_block_get_page_no(undo_block));
 
-		if (op_type == TRX_UNDO_INSERT_OP) {
+		switch (op_type) {
+		case TRX_UNDO_INSERT_OP:
 			offset = trx_undo_page_report_insert(
 				undo_page, trx, index, clust_entry, &mtr);
-		} else {
+			break;
+		default:
+			ut_ad(op_type == TRX_UNDO_MODIFY_OP);
 			offset = trx_undo_page_report_modify(
 				undo_page, trx, index, rec, offsets, update,
 				cmpl_info, &mtr);
@@ -1360,8 +1365,7 @@ trx_undo_report_row_operation(
 			*roll_ptr = trx_undo_build_roll_ptr(
 				op_type == TRX_UNDO_INSERT_OP,
 				rseg->id, page_no, offset);
-			err = DB_SUCCESS;
-			goto func_exit;
+			return(DB_SUCCESS);
 		}
 
 		ut_ad(page_no == undo->last_page_no);
@@ -1378,6 +1382,7 @@ trx_undo_report_row_operation(
 		mutex_enter(&rseg->mutex);
 		undo_block = trx_undo_add_page(trx, undo, &mtr);
 		mutex_exit(&rseg->mutex);
+
 		page_no = undo->last_page_no;
 	} while (undo_block != NULL);
 
@@ -1387,10 +1392,6 @@ trx_undo_report_row_operation(
 err_exit:
 	mutex_exit(&trx->undo_mutex);
 	mtr_commit(&mtr);
-func_exit:
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
 	return(err);
 }
 
@@ -1435,59 +1436,62 @@ trx_undo_get_undo_rec_low(
 /******************************************************************//**
 Copies an undo record to heap.
 
-NOTE: the caller must have latches on the clustered index page and
-purge_view.
+NOTE: the caller must have latches on the clustered index page.
 
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
-truncated and we cannot fetch the old version */
-UNIV_INTERN
-ulint
+@retval true if purge_sys->view sees trx_id: the undo log may have been
+truncated, the old version is not fetched and *undo_rec is not set
+@retval false if the undo log record was copied to *undo_rec */
+static __attribute__((nonnull, warn_unused_result))
+bool
 trx_undo_get_undo_rec(
 /*==================*/
 	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
 	trx_id_t	trx_id,		/*!< in: id of the trx that generated
 					the roll pointer: it points to an
 					undo log of this transaction */
-	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
+	trx_undo_rec_t**undo_rec,	/*!< out, own: copy of the record; written only when false is returned */
 	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (!trx_purge_update_undo_must_exist(trx_id)) {
+	bool		missing_history;
 
-		/* It may be that the necessary undo log has already been
-		deleted */
+	rw_lock_s_lock(&purge_sys->latch);
+	missing_history = read_view_sees_trx_id(purge_sys->view, trx_id);
 
-		return(DB_MISSING_HISTORY);
+	if (!missing_history) {
+		*undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
 	}
 
-	*undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+	rw_lock_s_unlock(&purge_sys->latch);
 
-	return(DB_SUCCESS);
+	return(missing_history);
 }
 
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG	__attribute__((unused))
+#endif /* UNIV_DEBUG */
+
 /*******************************************************************//**
-Build a previous version of a clustered index record. This function checks
-that the caller has a latch on the index page of the clustered index record
-and an s-latch on the purge_view. This guarantees that the stack of versions
-is locked all the way down to the purge_view.
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
-earlier than purge_view, which means that it may have been removed,
-DB_ERROR if corrupted record */
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+which means that it may have been removed */
 UNIV_INTERN
-ulint
+bool
 trx_undo_prev_version_build(
 /*========================*/
-	const rec_t*	index_rec,/*!< in: clustered index record in the
+	const rec_t*	index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+				/*!< in: clustered index record in the
 				index tree */
-	mtr_t*		index_mtr __attribute__((unused)),
+	mtr_t*		index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
 				/*!< in: mtr which contains the latch to
 				index_rec page and purge_view */
 	const rec_t*	rec,	/*!< in: version of a clustered index record */
 	dict_index_t*	index,	/*!< in: clustered index */
-	ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
 	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
 				needed is allocated */
 	rec_t**		old_vers)/*!< out, own: previous version, or NULL if
@@ -1504,63 +1508,48 @@ trx_undo_prev_version_build(
 	table_id_t	table_id;
 	trx_id_t	trx_id;
 	roll_ptr_t	roll_ptr;
-	roll_ptr_t	old_roll_ptr;
 	upd_t*		update;
 	byte*		ptr;
 	ulint		info_bits;
 	ulint		cmpl_info;
-	ibool		dummy_extern;
+	bool		dummy_extern;
 	byte*		buf;
-	ulint		err;
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
 	      || mtr_memo_contains_page(index_mtr, index_rec,
 					MTR_MEMO_PAGE_X_FIX));
 	ut_ad(rec_offs_validate(rec, index, offsets));
-
-	if (!dict_index_is_clust(index)) {
-		fprintf(stderr, "InnoDB: Error: trying to access"
-			" update undo rec for non-clustered index %s\n"
-			"InnoDB: Submit a detailed bug report to"
-			" http://bugs.mysql.com\n"
-			"InnoDB: index record ", index->name);
-		rec_print(stderr, index_rec, index);
-		fputs("\n"
-		      "InnoDB: record version ", stderr);
-		rec_print_new(stderr, rec, offsets);
-		putc('\n', stderr);
-		ut_ad(0);
-		return(DB_ERROR);
-	}
+	ut_a(dict_index_is_clust(index));
 
 	roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
-	old_roll_ptr = roll_ptr;
 
 	*old_vers = NULL;
 
 	if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
-
 		/* The record rec is the first inserted version */
-
-		return(DB_SUCCESS);
+		return(true);
 	}
 
 	rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
 
-	err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap);
-
-	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-		/* The undo record may already have been purged.
-		This should never happen in InnoDB. */
-
-		return(err);
+	if (trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap)) {
+		/* The undo record may already have been purged,
+		during purge or semi-consistent read. */
+		return(false);
 	}
 
 	ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
 				    &dummy_extern, &undo_no, &table_id);
 
+	if (table_id != index->table->id) {
+		/* The table should have been rebuilt, but purge has
+		not yet removed the undo log records for the
+		now-dropped old table (table_id). */
+		return(true);
+	}
+
 	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
 					       &info_bits);
 
@@ -1591,59 +1580,11 @@ trx_undo_prev_version_build(
 	ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
 					     roll_ptr, info_bits,
 					     NULL, heap, &update);
+	ut_a(ptr);
 
-	if (UNIV_UNLIKELY(table_id != index->table->id)) {
-		ptr = NULL;
-
-		fprintf(stderr,
-			"InnoDB: Error: trying to access update undo rec"
-			" for table %s\n"
-			"InnoDB: but the table id in the"
-			" undo record is wrong\n"
-			"InnoDB: Submit a detailed bug report"
-			" to http://bugs.mysql.com\n"
-			"InnoDB: Run also CHECK TABLE %s\n",
-			index->table_name, index->table_name);
-	}
-
-	if (ptr == NULL) {
-		/* The record was corrupted, return an error; these printfs
-		should catch an elusive bug in row_vers_old_has_index_entry */
-
-		fprintf(stderr,
-			"InnoDB: table %s, index %s, n_uniq %lu\n"
-			"InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
-			"InnoDB: undo rec table id %llu,"
-			" index table id %llu\n"
-			"InnoDB: dump of 150 bytes in undo rec: ",
-			index->table_name, index->name,
-			(ulong) dict_index_get_n_unique(index),
-			undo_rec, (ulong) type, (ulong) cmpl_info,
-			(ullint) table_id,
-			(ullint) index->table->id);
-		ut_print_buf(stderr, undo_rec, 150);
-		fputs("\n"
-		      "InnoDB: index record ", stderr);
-		rec_print(stderr, index_rec, index);
-		fputs("\n"
-		      "InnoDB: record version ", stderr);
-		rec_print_new(stderr, rec, offsets);
-		fprintf(stderr, "\n"
-			"InnoDB: Record trx id " TRX_ID_FMT
-			", update rec trx id " TRX_ID_FMT "\n"
-			"InnoDB: Roll ptr in rec " TRX_ID_FMT
-			", in update rec" TRX_ID_FMT "\n",
-			(ullint) rec_trx_id, (ullint) trx_id,
-			(ullint) old_roll_ptr, (ullint) roll_ptr);
-
-		trx_purge_sys_print();
-		ut_ad(0);
-		return(DB_ERROR);
-	}
-
-# ifdef UNIV_BLOB_NULL_DEBUG
+# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 	ut_a(!rec_offs_any_null_extern(rec, offsets));
-# endif /* UNIV_BLOB_NULL_DEBUG */
+# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (row_upd_changes_field_size_or_external(index, offsets, update)) {
 		ulint	n_ext;
@@ -1660,11 +1601,24 @@ trx_undo_prev_version_build(
 		delete-marked record by trx_id, no transactions need to access
 		the BLOB. */
 
+		/* the row_upd_changes_disowned_external(update) call could be
+		omitted, but the synchronization on purge_sys->latch is likely
+		more expensive. */
+
 		if ((update->info_bits & REC_INFO_DELETED_FLAG)
-		    && read_view_sees_trx_id(purge_sys->view, trx_id)) {
-			/* treat as a fresh insert, not to
-			cause assertion error at the caller. */
-			return(DB_SUCCESS);
+		    && row_upd_changes_disowned_external(update)) {
+			bool	missing_extern;
+
+			rw_lock_s_lock(&purge_sys->latch);
+			missing_extern = read_view_sees_trx_id(purge_sys->view,
+							       trx_id);
+			rw_lock_s_unlock(&purge_sys->latch);
+
+			if (missing_extern) {
+				/* treat as a fresh insert, not to
+				cause assertion error at the caller. */
+				return(true);
+			}
 		}
 
 		/* We have to set the appropriate extern storage bits in the
@@ -1673,26 +1627,30 @@ trx_undo_prev_version_build(
 		those fields that update updates to become externally stored
 		fields. Store the info: */
 
-		entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index,
-					       offsets, &n_ext, heap);
+		entry = row_rec_to_index_entry(
+			rec, index, offsets, &n_ext, heap);
 		n_ext += btr_push_update_extern_fields(entry, update, heap);
 		/* The page containing the clustered index record
 		corresponding to entry is latched in mtr.  Thus the
 		following call is safe. */
 		row_upd_index_replace_new_col_vals(entry, index, update, heap);
 
-		buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry,
-								  n_ext));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(
+				heap,
+				rec_get_converted_size(index, entry, n_ext)));
 
 		*old_vers = rec_convert_dtuple_to_rec(buf, index,
 						      entry, n_ext);
 	} else {
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
+
 		*old_vers = rec_copy(buf, rec, offsets);
 		rec_offs_make_valid(*old_vers, index, offsets);
 		row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
 	}
 
-	return(DB_SUCCESS);
+	return(true);
 }
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.cc
index 25c1d5d4692..1089607c6d1 100644
--- a/storage/xtradb/trx/trx0roll.c
+++ b/storage/xtradb/trx/trx0roll.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0roll.c
+@file trx/trx0roll.cc
 Transaction rollback
 
 Created 3/26/1996 Heikki Tuuri
@@ -38,10 +38,13 @@ Created 3/26/1996 Heikki Tuuri
 #include "que0que.h"
 #include "usr0sess.h"
 #include "srv0start.h"
+#include "read0read.h"
 #include "row0undo.h"
 #include "row0mysql.h"
 #include "lock0lock.h"
 #include "pars0pars.h"
+#include "srv0mon.h"
+#include "trx0sys.h"
 
 /** This many pages must be undone before a truncate is tried within
 rollback */
@@ -57,164 +60,273 @@ static undo_no_t	trx_roll_max_undo_no;
 /** Auxiliary variable which tells the previous progress % we printed */
 static ulint		trx_roll_progress_printed_pct;
 
+/****************************************************************//**
+Finishes a transaction rollback. */
+static
+void
+trx_rollback_finish(
+/*================*/
+	trx_t*		trx);	/*!< in: transaction */
+
 /*******************************************************************//**
-Rollback a transaction used in MySQL.
-@return	error code or DB_SUCCESS */
-UNIV_INTERN
-int
-trx_general_rollback_for_mysql(
-/*===========================*/
+Rollback a transaction used in MySQL. */
+static
+void
+trx_rollback_to_savepoint_low(
+/*==========================*/
 	trx_t*		trx,	/*!< in: transaction handle */
 	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
 				partial rollback requested, or NULL for
 				complete rollback */
 {
-	mem_heap_t*	heap;
 	que_thr_t*	thr;
+	mem_heap_t*	heap;
 	roll_node_t*	roll_node;
 
-	/* Tell Innobase server that there might be work for
-	utility threads: */
-
-	srv_active_wake_master_thread();
-
-	trx_start_if_not_started(trx);
-
 	heap = mem_heap_create(512);
 
 	roll_node = roll_node_create(heap);
 
-	if (savept) {
+	if (savept != NULL) {
 		roll_node->partial = TRUE;
 		roll_node->savept = *savept;
+		assert_trx_in_list(trx);
+	}  else {
+		assert_trx_nonlocking_or_in_list(trx);
 	}
 
 	trx->error_state = DB_SUCCESS;
 
-	thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+	if (trx->insert_undo || trx->update_undo) {
+		thr = pars_complete_graph_for_exec(roll_node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
-	que_run_threads(thr);
+		ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
 
-	mutex_enter(&kernel_mutex);
+		que_run_threads(thr);
 
-	while (trx->que_state != TRX_QUE_RUNNING) {
+		ut_a(roll_node->undo_thr != NULL);
+		que_run_threads(roll_node->undo_thr);
 
-		mutex_exit(&kernel_mutex);
-
-		os_thread_sleep(100000);
+		/* Free the memory reserved by the undo graph. */
+		que_graph_free(static_cast<que_t*>(
+				       roll_node->undo_thr->common.parent));
+	}
 
-		mutex_enter(&kernel_mutex);
+	if (savept == NULL) {
+		trx_rollback_finish(trx);
+		MONITOR_INC(MONITOR_TRX_ROLLBACK);
+	} else {
+		trx->lock.que_state = TRX_QUE_RUNNING;
+		MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
 	}
 
-	mutex_exit(&kernel_mutex);
+	ut_a(trx->error_state == DB_SUCCESS);
+	ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
 
 	mem_heap_free(heap);
 
-	ut_a(trx->error_state == DB_SUCCESS);
+	MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
+
+/*******************************************************************//**
+Rollback a transaction to a given savepoint or do a complete rollback.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_to_savepoint(
+/*======================*/
+	trx_t*		trx,	/*!< in: transaction handle */
+	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
+				partial rollback requested, or NULL for
+				complete rollback */
+{
+	ut_ad(!trx_mutex_own(trx));
 
 	/* Tell Innobase server that there might be work for
 	utility threads: */
 
 	srv_active_wake_master_thread();
 
-	return((int) trx->error_state);
+	trx_start_if_not_started_xa(trx);
+
+	trx_rollback_to_savepoint_low(trx, savept);
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	return(trx->error_state);
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return	error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	srv_active_wake_master_thread();
+
+	trx->op_info = "rollback";
+
+	/* If we are doing the XA recovery of prepared transactions,
+	then the transaction object does not have an InnoDB session
+	object, and we set a dummy session that we use for all MySQL
+	transactions. */
+
+	trx_rollback_to_savepoint_low(trx, NULL);
+
+	trx->op_info = "";
+
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	srv_active_wake_master_thread();
+
+	return(trx->error_state);
 }
 
 /*******************************************************************//**
 Rollback a transaction used in MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 trx_rollback_for_mysql(
 /*===================*/
-	trx_t*	trx)	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
 {
-	int	err;
-
-	if (trx->state == TRX_NOT_STARTED) {
-
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the rollback should be invoked for a running
+	active MySQL transaction (or recovered prepared transaction)
+	that is associated with the current thread. */
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		ut_ad(trx->in_mysql_trx_list);
 		return(DB_SUCCESS);
-	}
 
-	trx->op_info = "rollback";
-
-	/* If we are doing the XA recovery of prepared transactions, then
-	the transaction object does not have an InnoDB session object, and we
-	set a dummy session that we use for all MySQL transactions. */
+	case TRX_STATE_ACTIVE:
+		ut_ad(trx->in_mysql_trx_list);
+		assert_trx_nonlocking_or_in_list(trx);
+		return(trx_rollback_for_mysql_low(trx));
 
-	err = trx_general_rollback_for_mysql(trx, NULL);
+	case TRX_STATE_PREPARED:
+		ut_ad(!trx_is_autocommit_non_locking(trx));
+		return(trx_rollback_for_mysql_low(trx));
 
-	trx->op_info = "";
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		assert_trx_in_list(trx);
+		break;
+	}
 
-	return(err);
+	ut_error;
+	return(DB_CORRUPTION);
 }
 
 /*******************************************************************//**
 Rollback the latest SQL statement for MySQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-int
+dberr_t
 trx_rollback_last_sql_stat_for_mysql(
 /*=================================*/
-	trx_t*	trx)	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
 {
-	int	err;
+	dberr_t	err;
 
-	if (trx->state == TRX_NOT_STARTED) {
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the statement rollback should be invoked for a
+	running active MySQL transaction that is associated with the
+	current thread. */
+	ut_ad(trx->in_mysql_trx_list);
 
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
 		return(DB_SUCCESS);
+	case TRX_STATE_ACTIVE:
+		assert_trx_nonlocking_or_in_list(trx);
+
+		trx->op_info = "rollback of SQL statement";
+
+		err = trx_rollback_to_savepoint(
+			trx, &trx->last_sql_stat_start);
+
+		if (trx->fts_trx) {
+			fts_savepoint_rollback_last_stmt(trx);
+		}
+
+		/* The following call should not be needed,
+		but we play it safe: */
+		trx_mark_sql_stat_end(trx);
+
+		trx->op_info = "";
+
+		return(err);
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		/* The statement rollback is only allowed on an ACTIVE
+		transaction, not a PREPARED or COMMITTED one. */
+		break;
 	}
 
-	trx->op_info = "rollback of SQL statement";
+	ut_error;
+	return(DB_CORRUPTION);
+}
 
-	err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start);
-	/* The following call should not be needed, but we play safe: */
-	trx_mark_sql_stat_end(trx);
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name)			/*!< in: savepoint name */
+{
+	trx_named_savept_t*	savep;
 
-	trx->op_info = "";
+	for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     savep != NULL;
+	     savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
 
-	return(err);
+		if (0 == ut_strcmp(savep->name, name)) {
+			return(savep);
+		}
+	}
+
+	return(NULL);
 }
 
 /*******************************************************************//**
 Frees a single savepoint struct. */
-UNIV_INTERN
+static
 void
 trx_roll_savepoint_free(
 /*=====================*/
 	trx_t*			trx,	/*!< in: transaction handle */
 	trx_named_savept_t*	savep)	/*!< in: savepoint to free */
 {
-	ut_a(savep != NULL);
-	ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0);
-
 	UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
 	mem_free(savep->name);
 	mem_free(savep);
 }
 
 /*******************************************************************//**
-Frees savepoint structs starting from savep, if savep == NULL then
-free all savepoints. */
+Frees savepoint structs starting from savep. */
 UNIV_INTERN
 void
 trx_roll_savepoints_free(
 /*=====================*/
 	trx_t*			trx,	/*!< in: transaction handle */
-	trx_named_savept_t*	savep)	/*!< in: free all savepoints > this one;
-					if this is NULL, free all savepoints
-					of trx */
+	trx_named_savept_t*	savep)	/*!< in: free all savepoints starting
+					with this savepoint */
 {
-	trx_named_savept_t*	next_savep;
-
-	if (savep == NULL) {
-		savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
-	} else {
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
-
 	while (savep != NULL) {
+		trx_named_savept_t*	next_savep;
+
 		next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
 
 		trx_roll_savepoint_free(trx, savep);
@@ -232,8 +344,58 @@ the row, these locks are naturally released in the rollback. Savepoints which
 were set after this savepoint are deleted.
 @return if no savepoint of the name found then DB_NO_SAVEPOINT,
 otherwise DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	trx_named_savept_t*	savep,	/*!< in/out: savepoint */
+	ib_int64_t*		mysql_binlog_cache_pos)
+					/*!< out: the MySQL binlog
+					cache position corresponding
+					to this savepoint; MySQL needs
+					this information to remove the
+					binlog entries of the queries
+					executed after the savepoint */
+{
+	dberr_t	err;
+
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	ut_ad(trx->in_mysql_trx_list);
+
+	/* Free all savepoints strictly later than savep. */
+
+	trx_roll_savepoints_free(
+		trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+	trx->op_info = "rollback to a savepoint";
+
+	err = trx_rollback_to_savepoint(trx, &savep->savept);
+
+	/* Store the current undo_no of the transaction so that
+	we know where to roll back if we have to roll back the
+	next SQL statement: */
+
+	trx_mark_sql_stat_end(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_rollback_to_savepoint_for_mysql(
 /*================================*/
 	trx_t*		trx,			/*!< in: transaction handle */
@@ -246,49 +408,38 @@ trx_rollback_to_savepoint_for_mysql(
 						executed after the savepoint */
 {
 	trx_named_savept_t*	savep;
-	ulint			err;
 
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the savepoint rollback should be invoked for a
+	running active MySQL transaction that is associated with the
+	current thread. */
+	ut_ad(trx->in_mysql_trx_list);
 
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			/* Found */
-			break;
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
+	savep = trx_savepoint_find(trx, savepoint_name);
 
 	if (savep == NULL) {
-
 		return(DB_NO_SAVEPOINT);
 	}
 
-	if (trx->state == TRX_NOT_STARTED) {
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Error: transaction has a savepoint ", stderr);
 		ut_print_name(stderr, trx, FALSE, savep->name);
 		fputs(" though it is not started\n", stderr);
 		return(DB_ERROR);
+	case TRX_STATE_ACTIVE:
+		return(trx_rollback_to_savepoint_for_mysql_low(
+				trx, savep, mysql_binlog_cache_pos));
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		/* The savepoint rollback is only allowed on an ACTIVE
+		transaction, not a PREPARED or COMMITTED one. */
+		break;
 	}
 
-	/* We can now free all savepoints strictly later than this one */
-
-	trx_roll_savepoints_free(trx, savep);
-
-	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
-
-	trx->op_info = "rollback to a savepoint";
-
-	err = trx_general_rollback_for_mysql(trx, &savep->savept);
-
-	/* Store the current undo_no of the transaction so that we know where
-	to roll back if we have to roll back the next SQL statement: */
-
-	trx_mark_sql_stat_end(trx);
-
-	trx->op_info = "";
-
-	return(err);
+	ut_error;
+	return(DB_CORRUPTION);
 }
 
 /*******************************************************************//**
@@ -298,7 +449,7 @@ savepoint and replaces it with a new. Savepoints are deleted in a transaction
 commit or rollback.
 @return	always DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_savepoint_for_mysql(
 /*====================*/
 	trx_t*		trx,			/*!< in: transaction handle */
@@ -310,20 +461,9 @@ trx_savepoint_for_mysql(
 {
 	trx_named_savept_t*	savep;
 
-	ut_a(trx);
-	ut_a(savepoint_name);
-
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
-
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			/* Found */
-			break;
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
+	savep = trx_savepoint_find(trx, savepoint_name);
 
 	if (savep) {
 		/* There is a savepoint with the same name: free that */
@@ -336,7 +476,7 @@ trx_savepoint_for_mysql(
 
 	/* Create a new savepoint and add it as the last in the list */
 
-	savep = mem_alloc(sizeof(trx_named_savept_t));
+	savep = static_cast<trx_named_savept_t*>(mem_alloc(sizeof(*savep)));
 
 	savep->name = mem_strdup(savepoint_name);
 
@@ -355,7 +495,7 @@ savepoint are left as is.
 @return if no savepoint of the name found then DB_NO_SAVEPOINT,
 otherwise DB_SUCCESS */
 UNIV_INTERN
-ulint
+dberr_t
 trx_release_savepoint_for_mysql(
 /*============================*/
 	trx_t*		trx,			/*!< in: transaction handle */
@@ -363,18 +503,16 @@ trx_release_savepoint_for_mysql(
 {
 	trx_named_savept_t*	savep;
 
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	ut_ad(trx->in_mysql_trx_list);
 
-	/* Search for the savepoint by name and free if found. */
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			trx_roll_savepoint_free(trx, savep);
-			return(DB_SUCCESS);
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+	savep = trx_savepoint_find(trx, savepoint_name);
+
+	if (savep != NULL) {
+		trx_roll_savepoint_free(trx, savep);
 	}
 
-	return(DB_NO_SAVEPOINT);
+	return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
 }
 
 /*******************************************************************//**
@@ -436,17 +574,22 @@ trx_rollback_active(
 	thr->child = roll_node;
 	roll_node->common.parent = thr;
 
-	mutex_enter(&kernel_mutex);
-
 	trx->graph = fork;
 
 	ut_a(thr == que_fork_start_command(fork));
 
+	mutex_enter(&trx_sys->mutex);
+
 	trx_roll_crash_recv_trx	= trx;
+
 	trx_roll_max_undo_no = trx->undo_no;
+
 	trx_roll_progress_printed_pct = 0;
+
 	rows_to_undo = trx_roll_max_undo_no;
 
+	mutex_exit(&trx_sys->mutex);
+
 	if (rows_to_undo > 1000000000) {
 		rows_to_undo = rows_to_undo / 1000000;
 		unit = "M";
@@ -456,9 +599,8 @@ trx_rollback_active(
 	fprintf(stderr,
 		"  InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
 		" rows to undo\n",
-		(ullint) trx->id,
+		trx->id,
 		(ulong) rows_to_undo, unit);
-	mutex_exit(&kernel_mutex);
 
 	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
 		row_mysql_lock_data_dictionary(trx);
@@ -466,48 +608,51 @@ trx_rollback_active(
 	}
 
 	que_run_threads(thr);
+	ut_a(roll_node->undo_thr != NULL);
 
-	mutex_enter(&kernel_mutex);
-
-	while (trx->que_state != TRX_QUE_RUNNING) {
-
-		mutex_exit(&kernel_mutex);
+	que_run_threads(roll_node->undo_thr);
 
-		fprintf(stderr,
-			"InnoDB: Waiting for rollback of trx id "
-			TRX_ID_FMT " to end\n",
-			(ullint) trx->id);
-		os_thread_sleep(100000);
+	trx_rollback_finish(thr_get_trx(roll_node->undo_thr));
 
-		mutex_enter(&kernel_mutex);
-	}
+	/* Free the memory reserved by the undo graph */
+	que_graph_free(static_cast<que_t*>(
+			       roll_node->undo_thr->common.parent));
 
-	mutex_exit(&kernel_mutex);
+	ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
 
 	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
 	    && trx->table_id != 0) {
 
-		/* If the transaction was for a dictionary operation, we
-		drop the relevant table, if it still exists */
+		/* If the transaction was for a dictionary operation,
+		we drop the relevant table, if it still exists and is
+		not flagged as DISCARDED. */
 
-		fprintf(stderr,
-			"InnoDB: Dropping table with id %llu"
-			" in recovery if it exists\n",
-			(ullint) trx->table_id);
+		table = dict_table_open_on_id(
+			trx->table_id, dictionary_locked,
+			DICT_TABLE_OP_NORMAL);
+
+		if (table && !dict_table_is_discarded(table)) {
 
-		table = dict_table_get_on_id_low(trx->table_id);
+			dberr_t	err;
 
-		if (table) {
-			ulint	err;
+			/* Ensure that the table doesn't get evicted from the
+			cache, keeps things simple for drop. */
 
-			fputs("InnoDB: Table found: dropping table ", stderr);
-			ut_print_name(stderr, trx, TRUE, table->name);
-			fputs(" in recovery\n", stderr);
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			dict_table_close(table, dictionary_locked, FALSE);
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Dropping table '%s', with id " UINT64PF " "
+				"in recovery",
+				table->name, trx->table_id);
 
 			err = row_drop_table_for_mysql(table->name, trx, TRUE);
 			trx_commit_for_mysql(trx);
 
-			ut_a(err == (int) DB_SUCCESS);
+			ut_a(err == DB_SUCCESS);
 		}
 	}
 
@@ -515,15 +660,72 @@ trx_rollback_active(
 		row_mysql_unlock_data_dictionary(trx);
 	}
 
-	fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
-		" completed\n",
-		(ullint) trx->id);
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Rollback of trx with id " TRX_ID_FMT " completed", trx->id);
+
 	mem_heap_free(heap);
 
 	trx_roll_crash_recv_trx	= NULL;
 }
 
 /*******************************************************************//**
+Rollback or clean up any resurrected incomplete transactions. It assumes
+that the caller holds the trx_sys_t::mutex and it will release the
+lock if it does a clean up or rollback.
+@return TRUE if the transaction was cleaned up or rolled back
+and trx_sys->mutex was released. */
+static
+ibool
+trx_rollback_resurrected(
+/*=====================*/
+	trx_t*	trx,	/*!< in: transaction to rollback or clean */
+	ibool	all)	/*!< in: FALSE=roll back dictionary transactions;
+			TRUE=roll back all non-PREPARED transactions */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* The trx->is_recovered flag and trx->state are set
+	atomically under the protection of the trx->mutex (and
+	lock_sys->mutex) in lock_trx_release_locks(). We do not want
+	to accidentally clean up a non-recovered transaction here. */
+
+	trx_mutex_enter(trx);
+	bool		is_recovered	= trx->is_recovered;
+	trx_state_t	state		= trx->state;
+	trx_mutex_exit(trx);
+
+	if (!is_recovered) {
+		return(FALSE);
+	}
+
+	switch (state) {
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		mutex_exit(&trx_sys->mutex);
+		fprintf(stderr,
+			"InnoDB: Cleaning up trx with id " TRX_ID_FMT "\n",
+			trx->id);
+		trx_cleanup_at_db_startup(trx);
+		trx_free_for_background(trx);
+		return(TRUE);
+	case TRX_STATE_ACTIVE:
+		if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+			mutex_exit(&trx_sys->mutex);
+			trx_rollback_active(trx);
+			trx_free_for_background(trx);
+			return(TRUE);
+		}
+		return(FALSE);
+	case TRX_STATE_PREPARED:
+		return(FALSE);
+	case TRX_STATE_NOT_STARTED:
+		break;
+	}
+
+	ut_error;
+	return(FALSE);
+}
+
+/*******************************************************************//**
 Rollback or clean up any incomplete transactions which were
 encountered in crash recovery.  If the transaction already was
 committed, then we clean up a possible insert undo log. If the
@@ -537,10 +739,11 @@ trx_rollback_or_clean_recovered(
 {
 	trx_t*	trx;
 
-	mutex_enter(&kernel_mutex);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+	if (trx_sys_get_n_rw_trx() == 0) {
 
-	if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
-		goto leave_function;
+		return;
 	}
 
 	if (all) {
@@ -549,40 +752,38 @@ trx_rollback_or_clean_recovered(
 			" of uncommitted transactions\n");
 	}
 
-	mutex_exit(&kernel_mutex);
+	/* Note: For XA recovered transactions, we rely on MySQL to
+	do rollback. They will be in TRX_STATE_PREPARED state. If the server
+	is shut down and they are still lingering in trx_sys_t::trx_list
+	then the shutdown will hang. */
 
-loop:
-	mutex_enter(&kernel_mutex);
+	/* Loop over the transaction list as long as there are
+	recovered transactions to clean up or recover. */
 
-	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
-	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
-		if (!trx->is_recovered) {
-			continue;
-		}
+	do {
+		mutex_enter(&trx_sys->mutex);
+
+		for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+		     trx != NULL;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
 
-		switch (trx->state) {
-		case TRX_NOT_STARTED:
-		case TRX_PREPARED:
-			continue;
-
-		case TRX_COMMITTED_IN_MEMORY:
-			mutex_exit(&kernel_mutex);
-			fprintf(stderr,
-				"InnoDB: Cleaning up trx with id "
-				TRX_ID_FMT "\n",
-				(ullint) trx->id);
-			trx_cleanup_at_db_startup(trx);
-			goto loop;
-
-		case TRX_ACTIVE:
-			if (all || trx_get_dict_operation(trx)
-			    != TRX_DICT_OP_NONE) {
-				mutex_exit(&kernel_mutex);
-				trx_rollback_active(trx);
-				goto loop;
+			assert_trx_in_rw_list(trx);
+
+			/* If this function does a cleanup or rollback
+			then it will release the trx_sys->mutex, therefore
+			we need to reacquire it before retrying the loop. */
+
+			if (trx_rollback_resurrected(trx, all)) {
+
+				mutex_enter(&trx_sys->mutex);
+
+				break;
 			}
 		}
-	}
+
+		mutex_exit(&trx_sys->mutex);
+
+	} while (trx != NULL);
 
 	if (all) {
 		ut_print_timestamp(stderr);
@@ -590,9 +791,6 @@ loop:
 			"  InnoDB: Rollback of non-prepared"
 			" transactions completed\n");
 	}
-
-leave_function:
-	mutex_exit(&kernel_mutex);
 }
 
 /*******************************************************************//**
@@ -602,14 +800,16 @@ committed, then we clean up a possible insert undo log. If the
 transaction was not yet committed, then we roll it back.
 Note: this is done in a background thread.
 @return	a dummy parameter */
-UNIV_INTERN
+extern "C" UNIV_INTERN
 os_thread_ret_t
-trx_rollback_or_clean_all_recovered(
-/*================================*/
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
 	void*	arg __attribute__((unused)))
 			/*!< in: a dummy parameter required by
 			os_thread_create */
 {
+	ut_ad(!srv_read_only_mode);
+
 #ifdef UNIV_PFS_THREAD
 	pfs_register_thread(trx_rollback_clean_thread_key);
 #endif /* UNIV_PFS_THREAD */
@@ -627,30 +827,25 @@ trx_rollback_or_clean_all_recovered(
 /*******************************************************************//**
 Creates an undo number array.
 @return	own: undo number array */
-UNIV_INTERN
+static
 trx_undo_arr_t*
-trx_undo_arr_create(void)
-/*=====================*/
+trx_undo_arr_create(
+/*================*/
+	ulint		n_cells)	/*!< Number of cells */
 {
 	trx_undo_arr_t*	arr;
 	mem_heap_t*	heap;
-	ulint		i;
+	ulint		sz = sizeof(*arr) + sizeof(*arr->infos) * n_cells;
 
-	heap = mem_heap_create(1024);
+	heap = mem_heap_create(sz);
 
-	arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t));
+	arr = static_cast<trx_undo_arr_t*>(mem_heap_zalloc(heap, sz));
 
-	arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t)
-				    * UNIV_MAX_PARALLELISM);
-	arr->n_cells = UNIV_MAX_PARALLELISM;
-	arr->n_used = 0;
+	arr->n_cells = n_cells;
 
-	arr->heap = heap;
+	arr->infos = (trx_undo_inf_t*) (arr + 1);
 
-	for (i = 0; i < UNIV_MAX_PARALLELISM; i++) {
-
-		(trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE;
-	}
+	arr->heap = heap;
 
 	return(arr);
 }
@@ -663,8 +858,6 @@ trx_undo_arr_free(
 /*==============*/
 	trx_undo_arr_t*	arr)	/*!< in: undo number array */
 {
-	ut_ad(arr->n_used == 0);
-
 	mem_heap_free(arr->heap);
 }
 
@@ -678,19 +871,18 @@ trx_undo_arr_store_info(
 	trx_t*		trx,	/*!< in: transaction */
 	undo_no_t	undo_no)/*!< in: undo number */
 {
-	trx_undo_inf_t*	cell;
-	trx_undo_inf_t*	stored_here;
+	ulint		i;
 	trx_undo_arr_t*	arr;
+	ulint		n = 0;
 	ulint		n_used;
-	ulint		n;
-	ulint		i;
+	trx_undo_inf_t*	stored_here = NULL;
 
-	n = 0;
 	arr = trx->undo_no_arr;
 	n_used = arr->n_used;
-	stored_here = NULL;
 
-	for (i = 0;; i++) {
+	for (i = 0; i < arr->n_cells; i++) {
+		trx_undo_inf_t*	cell;
+
 		cell = trx_undo_arr_get_nth_info(arr, i);
 
 		if (!cell->in_use) {
@@ -727,6 +919,10 @@ trx_undo_arr_store_info(
 			return(TRUE);
 		}
 	}
+
+	ut_error;
+
+	return(FALSE);
 }
 
 /*******************************************************************//**
@@ -738,22 +934,19 @@ trx_undo_arr_remove_info(
 	trx_undo_arr_t*	arr,	/*!< in: undo number array */
 	undo_no_t	undo_no)/*!< in: undo number */
 {
-	trx_undo_inf_t*	cell;
 	ulint		i;
 
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
+	for (i = 0; i < arr->n_cells; i++) {
 
-		if (cell->in_use
-		    && cell->undo_no == undo_no) {
+		trx_undo_inf_t*	cell;
 
-			cell->in_use = FALSE;
+		cell = trx_undo_arr_get_nth_info(arr, i);
 
+		if (cell->in_use && cell->undo_no == undo_no) {
+			cell->in_use = FALSE;
 			ut_ad(arr->n_used > 0);
-
-			arr->n_used--;
-
-			return;
+			--arr->n_used;
+			break;
 		}
 	}
 }
@@ -765,46 +958,40 @@ static
 undo_no_t
 trx_undo_arr_get_biggest(
 /*=====================*/
-	trx_undo_arr_t*	arr)	/*!< in: undo number array */
+	const trx_undo_arr_t*	arr)	/*!< in: undo number array */
 {
-	trx_undo_inf_t*	cell;
-	ulint		n_used;
-	undo_no_t	biggest;
-	ulint		n;
 	ulint		i;
+	undo_no_t	biggest = 0;
+	ulint		n_checked = 0;
 
-	n = 0;
-	n_used = arr->n_used;
-	biggest = 0;
+	for (i = 0; i < arr->n_cells && n_checked < arr->n_used; ++i) {
 
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
+		const trx_undo_inf_t*	cell = &arr->infos[i];
 
 		if (cell->in_use) {
-			n++;
+
+			++n_checked;
+
 			if (cell->undo_no > biggest) {
 
 				biggest = cell->undo_no;
 			}
 		}
-
-		if (n == n_used) {
-			return(biggest);
-		}
 	}
+
+	return(biggest);
 }
 
 /***********************************************************************//**
 Tries truncate the undo logs. */
-UNIV_INTERN
+static
 void
 trx_roll_try_truncate(
 /*==================*/
 	trx_t*	trx)	/*!< in/out: transaction */
 {
-	trx_undo_arr_t*	arr;
-	undo_no_t	limit;
-	undo_no_t	biggest;
+	undo_no_t		limit;
+	const trx_undo_arr_t*	arr;
 
 	ut_ad(mutex_own(&(trx->undo_mutex)));
 	ut_ad(mutex_own(&((trx->rseg)->mutex)));
@@ -816,6 +1003,8 @@ trx_roll_try_truncate(
 	limit = trx->undo_no;
 
 	if (arr->n_used > 0) {
+		undo_no_t	biggest;
+
 		biggest = trx_undo_arr_get_biggest(arr);
 
 		if (biggest >= limit) {
@@ -850,19 +1039,21 @@ trx_roll_pop_top_rec(
 	trx_undo_rec_t*	prev_rec;
 	page_t*		prev_rec_page;
 
-	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&trx->undo_mutex));
+
+	undo_page = trx_undo_page_get_s_latched(
+		undo->space, undo->zip_size, undo->top_page_no, mtr);
 
-	undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size,
-						undo->top_page_no, mtr);
 	offset = undo->top_offset;
 
 	/*	fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT
 			" undo record " TRX_ID_FMT "\n",
 	os_thread_get_curr_id(), trx->id, undo->top_undo_no); */
 
-	prev_rec = trx_undo_get_prev_rec(undo_page + offset,
-					 undo->hdr_page_no, undo->hdr_offset,
-					 mtr);
+	prev_rec = trx_undo_get_prev_rec(
+		undo_page + offset, undo->hdr_page_no, undo->hdr_offset,
+		true, mtr);
+
 	if (prev_rec == NULL) {
 
 		undo->empty = TRUE;
@@ -915,11 +1106,11 @@ try_again:
 	mutex_enter(&(trx->undo_mutex));
 
 	if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
-		mutex_enter(&(rseg->mutex));
+		mutex_enter(&rseg->mutex);
 
 		trx_roll_try_truncate(trx);
 
-		mutex_exit(&(rseg->mutex));
+		mutex_exit(&rseg->mutex);
 	}
 
 	ins_undo = trx->insert_undo;
@@ -935,8 +1126,7 @@ try_again:
 		undo = ins_undo;
 	}
 
-	if (!undo || undo->empty
-	    || limit > undo->top_undo_no) {
+	if (!undo || undo->empty || limit > undo->top_undo_no) {
 
 		if ((trx->undo_no_arr)->n_used == 0) {
 			/* Rollback is ending */
@@ -953,15 +1143,11 @@ try_again:
 		return(NULL);
 	}
 
-	if (undo == ins_undo) {
-		is_insert = TRUE;
-	} else {
-		is_insert = FALSE;
-	}
+	is_insert = (undo == ins_undo);
+
+	*roll_ptr = trx_undo_build_roll_ptr(
+		is_insert, undo->rseg->id, undo->top_page_no, undo->top_offset);
 
-	*roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
-					    undo->top_page_no,
-					    undo->top_offset);
 	mtr_start(&mtr);
 
 	undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
@@ -1055,83 +1241,13 @@ trx_undo_rec_release(
 	mutex_exit(&(trx->undo_mutex));
 }
 
-/*********************************************************************//**
-Starts a rollback operation. */
-UNIV_INTERN
-void
-trx_rollback(
-/*=========*/
-	trx_t*		trx,	/*!< in: transaction */
-	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if the passed value is
-				NULL, the parameter is ignored */
-{
-	que_t*		roll_graph;
-	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
-
-	/* Initialize the rollback field in the transaction */
-
-	switch (sig->type) {
-	case TRX_SIG_TOTAL_ROLLBACK:
-		trx->roll_limit = 0;
-		break;
-	case TRX_SIG_ROLLBACK_TO_SAVEPT:
-		trx->roll_limit = (sig->savept).least_undo_no;
-		break;
-	case TRX_SIG_ERROR_OCCURRED:
-		trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
-		break;
-	default:
-		ut_error;
-	}
-
-	ut_a(trx->roll_limit <= trx->undo_no);
-
-	trx->pages_undone = 0;
-
-	if (trx->undo_no_arr == NULL) {
-		trx->undo_no_arr = trx_undo_arr_create();
-	}
-
-	/* Build a 'query' graph which will perform the undo operations */
-
-	roll_graph = trx_roll_graph_build(trx);
-
-	trx->graph = roll_graph;
-	trx->que_state = TRX_QUE_ROLLING_BACK;
-
-	thr = que_fork_start_command(roll_graph);
-
-	ut_ad(thr);
-
-	/*	thr2 = que_fork_start_command(roll_graph);
-
-	ut_ad(thr2); */
-
-	if (next_thr && (*next_thr == NULL)) {
-		*next_thr = thr;
-		/*		srv_que_task_enqueue_low(thr2); */
-	} else {
-		srv_que_task_enqueue_low(thr);
-		/*		srv_que_task_enqueue_low(thr2); */
-	}
-}
-
 /****************************************************************//**
 Builds an undo 'query' graph for a transaction. The actual rollback is
 performed by executing this query graph like a query subprocedure call.
 The reply about the completion of the rollback will be sent by this
 graph.
 @return	own: the query graph */
-UNIV_INTERN
+static
 que_t*
 trx_roll_graph_build(
 /*=================*/
@@ -1140,147 +1256,76 @@ trx_roll_graph_build(
 	mem_heap_t*	heap;
 	que_fork_t*	fork;
 	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx_mutex_own(trx));
 
 	heap = mem_heap_create(512);
 	fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
 	fork->trx = trx;
 
 	thr = que_thr_create(fork, heap);
-	/*	thr2 = que_thr_create(fork, heap); */
 
 	thr->child = row_undo_node_create(trx, thr, heap);
-	/*	thr2->child = row_undo_node_create(trx, thr2, heap); */
 
 	return(fork);
 }
 
 /*********************************************************************//**
-Finishes error processing after the necessary partial rollback has been
-done. */
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
 static
-void
-trx_finish_error_processing(
-/*========================*/
-	trx_t*	trx)	/*!< in: transaction */
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+	trx_t*		trx,		/*!< in: transaction */
+	ib_id_t		roll_limit)	/*!< in: rollback to undo no (for
+					partial undo), 0 if we are rolling back
+					the entire transaction */
 {
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
+	que_t*		roll_graph;
 
-	sig = UT_LIST_GET_FIRST(trx->signals);
+	ut_ad(trx_mutex_own(trx));
 
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
+	ut_ad(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
 
-		if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+	/* Initialize the rollback field in the transaction */
 
-			trx_sig_remove(trx, sig);
-		}
+	trx->roll_limit = roll_limit;
 
-		sig = next_sig;
-	}
+	ut_a(trx->roll_limit <= trx->undo_no);
 
-	trx->que_state = TRX_QUE_RUNNING;
-}
+	trx->pages_undone = 0;
 
-/*********************************************************************//**
-Finishes a partial rollback operation. */
-static
-void
-trx_finish_partial_rollback_off_kernel(
-/*===================================*/
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is a pointer
-				to a NULL pointer, then the calling function
-				can start running a new query thread; if this
-				parameter is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
+	if (trx->undo_no_arr == NULL) {
+		/* Single query thread -> 1 */
+		trx->undo_no_arr = trx_undo_arr_create(1);
+	}
 
-	ut_ad(mutex_own(&kernel_mutex));
+	/* Build a 'query' graph which will perform the undo operations */
 
-	sig = UT_LIST_GET_FIRST(trx->signals);
+	roll_graph = trx_roll_graph_build(trx);
 
-	/* Remove the signal from the signal queue and send reply message
-	to it */
+	trx->graph = roll_graph;
 
-	trx_sig_reply(sig, next_thr);
-	trx_sig_remove(trx, sig);
+	trx->lock.que_state = TRX_QUE_ROLLING_BACK;
 
-	trx->que_state = TRX_QUE_RUNNING;
+	return(que_fork_start_command(roll_graph));
 }
 
 /****************************************************************//**
 Finishes a transaction rollback. */
-UNIV_INTERN
+static
 void
-trx_finish_rollback_off_kernel(
-/*===========================*/
-	que_t*		graph,	/*!< in: undo graph which can now be freed */
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if this parameter is
-				NULL, it is ignored */
+trx_rollback_finish(
+/*================*/
+	trx_t*		trx)	/*!< in: transaction */
 {
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
 	ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
 
-	/* Free the memory reserved by the undo graph */
-	que_graph_free(graph);
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
-
-		trx_finish_partial_rollback_off_kernel(trx, next_thr);
-
-		return;
-
-	} else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
-
-		trx_finish_error_processing(trx);
-
-		return;
-	}
-
-#ifdef UNIV_DEBUG
-	if (lock_print_waits) {
-		fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n",
-			(ullint) trx->id);
-	}
-#endif /* UNIV_DEBUG */
-
-	trx_commit_off_kernel(trx);
-
-	/* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and
-	send reply messages to them */
-
-	trx->que_state = TRX_QUE_RUNNING;
-
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
-
-		if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+	trx_commit(trx);
 
-			trx_sig_reply(sig, next_thr);
-
-			trx_sig_remove(trx, sig);
-		}
-
-		sig = next_sig;
-	}
+	trx->lock.que_state = TRX_QUE_RUNNING;
 }
 
 /*********************************************************************//**
@@ -1294,11 +1339,11 @@ roll_node_create(
 {
 	roll_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(roll_node_t));
-	node->common.type = QUE_NODE_ROLLBACK;
+	node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
 	node->state = ROLL_NODE_SEND;
 
-	node->partial = FALSE;
+	node->common.type = QUE_NODE_ROLLBACK;
 
 	return(node);
 }
@@ -1313,10 +1358,8 @@ trx_rollback_step(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	roll_node_t*	node;
-	ulint		sig_no;
-	trx_savept_t*	savept;
 
-	node = thr->run_node;
+	node = static_cast<roll_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
 
@@ -1325,33 +1368,30 @@ trx_rollback_step(
 	}
 
 	if (node->state == ROLL_NODE_SEND) {
-		mutex_enter(&kernel_mutex);
+		trx_t*		trx;
+		ib_id_t		roll_limit = 0;
 
-		node->state = ROLL_NODE_WAIT;
+		trx = thr_get_trx(thr);
 
-		if (node->partial) {
-			sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT;
-			savept = &(node->savept);
-		} else {
-			sig_no = TRX_SIG_TOTAL_ROLLBACK;
-			savept = NULL;
-		}
+		trx_mutex_enter(trx);
 
-		/* Send a rollback signal to the transaction */
+		node->state = ROLL_NODE_WAIT;
 
-		trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr,
-			     savept, NULL);
+		ut_a(node->undo_thr == NULL);
 
-		thr->state = QUE_THR_SIG_REPLY_WAIT;
+		roll_limit = node->partial ? node->savept.least_undo_no : 0;
 
-		mutex_exit(&kernel_mutex);
+		trx_commit_or_rollback_prepare(trx);
 
-		return(NULL);
-	}
+		node->undo_thr = trx_rollback_start(trx, roll_limit);
 
-	ut_ad(node->state == ROLL_NODE_WAIT);
+		trx_mutex_exit(trx);
 
-	thr->run_node = que_node_get_parent(node);
+	} else {
+		ut_ad(node->state == ROLL_NODE_WAIT);
+
+		thr->run_node = que_node_get_parent(node);
+	}
 
 	return(thr);
 }
diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.cc
index ed3c27326d4..003d1036a8c 100644
--- a/storage/xtradb/trx/trx0rseg.c
+++ b/storage/xtradb/trx/trx0rseg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0rseg.c
+@file trx/trx0rseg.cc
 Rollback segment
 
 Created 3/26/1996 Heikki Tuuri
@@ -33,32 +33,14 @@ Created 3/26/1996 Heikki Tuuri
 #include "fut0lst.h"
 #include "srv0srv.h"
 #include "trx0purge.h"
+#include "ut0bh.h"
+#include "srv0mon.h"
 
 #ifdef UNIV_PFS_MUTEX
 /* Key to register rseg_mutex_key with performance schema */
 UNIV_INTERN mysql_pfs_key_t	rseg_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
-/******************************************************************//**
-Looks for a rollback segment, based on the rollback segment id.
-@return	rollback segment */
-UNIV_INTERN
-trx_rseg_t*
-trx_rseg_get_on_id(
-/*===============*/
-	ulint	id)	/*!< in: rollback segment id */
-{
-	trx_rseg_t*	rseg;
-
-	ut_a(id < TRX_SYS_N_RSEGS);
-
-	rseg = trx_sys->rseg_array[id];
-
-	ut_a(rseg == NULL || id == rseg->id);
-
-	return(rseg);
-}
-
 /****************************************************************//**
 Creates a rollback segment header. This function is called only when
 a new rollback segment is created in the database.
@@ -81,13 +63,11 @@ trx_rseg_header_create(
 	buf_block_t*	block;
 
 	ut_ad(mtr);
-	ut_ad(mutex_own(&kernel_mutex));
 	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
 				MTR_MEMO_X_LOCK));
 
 	/* Allocate a new file segment for the rollback segment */
-	block = fseg_create(space, 0,
-			    TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+	block = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
 
 	if (block == NULL) {
 		/* No space left */
@@ -137,6 +117,7 @@ trx_rseg_mem_free(
 	trx_rseg_t*	rseg)	/* in, own: instance to free */
 {
 	trx_undo_t*	undo;
+	trx_undo_t*	next_undo;
 
 	mutex_free(&rseg->mutex);
 
@@ -144,29 +125,36 @@ trx_rseg_mem_free(
 	ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
 	ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
 
-	undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+	for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+	     undo != NULL;
+	     undo = next_undo) {
+
+		next_undo = UT_LIST_GET_NEXT(undo_list, undo);
 
-	while (undo != NULL) {
-		trx_undo_t*	prev_undo = undo;
+		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
 
-		undo = UT_LIST_GET_NEXT(undo_list, undo);
-		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo);
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 
-		trx_undo_mem_free(prev_undo);
+		trx_undo_mem_free(undo);
 	}
 
-	undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+	for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+	     undo != NULL;
+	     undo = next_undo) {
 
-	while (undo != NULL) {
-		trx_undo_t*	prev_undo = undo;
+		next_undo = UT_LIST_GET_NEXT(undo_list, undo);
 
-		undo = UT_LIST_GET_NEXT(undo_list, undo);
-		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo);
+		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
 
-		trx_undo_mem_free(prev_undo);
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+		trx_undo_mem_free(undo);
 	}
 
-	trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL);
+	/* const_cast<trx_rseg_t*>() because this function is
+	like a destructor.  */
+
+	*((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = NULL;
 
 	mem_free(rseg);
 }
@@ -198,9 +186,7 @@ trx_rseg_mem_create(
 	trx_ulogf_t*	undo_log_hdr;
 	ulint		sum_of_undo_sizes;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = mem_zalloc(sizeof(trx_rseg_t));
+	rseg = static_cast<trx_rseg_t*>(mem_zalloc(sizeof(trx_rseg_t)));
 
 	rseg->id = id;
 	rseg->space = space;
@@ -209,41 +195,43 @@ trx_rseg_mem_create(
 
 	mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG);
 
-	UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
-
-	trx_sys_set_nth_rseg(trx_sys, id, rseg);
+	/* const_cast<trx_rseg_t*>() because this function is
+	like a constructor.  */
+	*((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = rseg;
 
 	rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
 
-	rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE,
-					MLOG_4BYTES, mtr);
+	rseg->max_size = mtr_read_ulint(
+		rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr);
 
 	/* Initialize the undo log lists according to the rseg header */
 
 	sum_of_undo_sizes = trx_undo_lists_init(rseg);
 
-	rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
-					 MLOG_4BYTES, mtr)
+	rseg->curr_size = mtr_read_ulint(
+		rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr)
 		+ 1 + sum_of_undo_sizes;
 
 	len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+
 	if (len > 0) {
-		const void*	ptr;
 		rseg_queue_t	rseg_queue;
 
 		trx_sys->rseg_history_len += len;
 
 		node_addr = trx_purge_get_log_from_hist(
 			flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+
 		rseg->last_page_no = node_addr.page;
 		rseg->last_offset = node_addr.boffset;
 
-		undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size,
-						 node_addr.page,
-						 mtr) + node_addr.boffset;
+		undo_log_hdr = trx_undo_page_get(
+			rseg->space, rseg->zip_size, node_addr.page,
+			mtr) + node_addr.boffset;
 
 		rseg->last_trx_no = mach_read_from_8(
 			undo_log_hdr + TRX_UNDO_TRX_NO);
+
 		rseg->last_del_marks = mtr_read_ulint(
 			undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
 
@@ -251,6 +239,8 @@ trx_rseg_mem_create(
 		rseg_queue.trx_no = rseg->last_trx_no;
 
 		if (rseg->last_page_no != FIL_NULL) {
+			const void*	ptr;
+
 			/* There is no need to cover this operation by the purge
 			mutex because we are still bootstrapping. */
 
@@ -266,7 +256,7 @@ trx_rseg_mem_create(
 
 /********************************************************************
 Creates the memory copies for the rollback segments and initializes the
-rseg list and array in trx_sys at a database startup. */
+rseg array in trx_sys at a database startup. */
 static
 void
 trx_rseg_create_instance(
@@ -282,9 +272,7 @@ trx_rseg_create_instance(
 
 		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
 
-		if (page_no == FIL_NULL) {
-			trx_sys_set_nth_rseg(trx_sys, i, NULL);
-		} else {
+		if (page_no != FIL_NULL) {
 			ulint		space;
 			ulint		zip_size;
 			trx_rseg_t*	rseg = NULL;
@@ -299,6 +287,8 @@ trx_rseg_create_instance(
 				i, space, zip_size, page_no, ib_bh, mtr);
 
 			ut_a(rseg->id == i);
+		} else {
+			ut_a(trx_sys->rseg_array[i] == NULL);
 		}
 	}
 }
@@ -308,8 +298,9 @@ Creates a rollback segment.
 @return pointer to new rollback segment if create successful */
 UNIV_INTERN
 trx_rseg_t*
-trx_rseg_create(void)
-/*=================*/
+trx_rseg_create(
+/*============*/
+	ulint		space)		/*!< in: id of UNDO tablespace */
 {
 	mtr_t		mtr;
 	ulint		slot_no;
@@ -318,29 +309,26 @@ trx_rseg_create(void)
 	mtr_start(&mtr);
 
 	/* To obey the latching order, acquire the file space
-	x-latch before the kernel mutex. */
-	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);
-
-	mutex_enter(&kernel_mutex);
+	x-latch before the trx_sys->mutex. */
+	mtr_x_lock(fil_space_get_latch(space, NULL), &mtr);
 
 	slot_no = trx_sysf_rseg_find_free(&mtr);
 
 	if (slot_no != ULINT_UNDEFINED) {
-		ulint		space;
+		ulint		id;
 		ulint		page_no;
 		ulint		zip_size;
 		trx_sysf_t*	sys_header;
 
 		page_no = trx_rseg_header_create(
-			TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);
+			space, 0, ULINT_MAX, slot_no, &mtr);
 
 		ut_a(page_no != FIL_NULL);
 
-		ut_ad(!trx_rseg_get_on_id(slot_no));
-
 		sys_header = trx_sysf_get(&mtr);
 
-		space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+		id = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+		ut_a(id == space);
 
 		zip_size = space ? fil_space_get_zip_size(space) : 0;
 
@@ -349,26 +337,89 @@ trx_rseg_create(void)
 			purge_sys->ib_bh, &mtr);
 	}
 
-	mutex_exit(&kernel_mutex);
 	mtr_commit(&mtr);
 
 	return(rseg);
 }
 
-/********************************************************************
-Initialize the rollback instance list. */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg array in trx_sys at a database startup. */
 UNIV_INTERN
 void
-trx_rseg_list_and_array_init(
-/*=========================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+trx_rseg_array_init(
+/*================*/
+	trx_sysf_t*	sys_header,	/*!< in/out: trx system header */
 	ib_bh_t*	ib_bh,		/*!< in: rseg queue */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	UT_LIST_INIT(trx_sys->rseg_list);
-
 	trx_sys->rseg_history_len = 0;
 
 	trx_rseg_create_instance(sys_header, ib_bh, mtr);
 }
 
+/******************************************************************//**
+Get the number of unique rollback tablespaces in use except space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have space for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+	ulint*		space_ids)	/*!< out: array of space ids of
+					UNDO tablespaces */
+{
+	ulint		i;
+	mtr_t		mtr;
+	trx_sysf_t*	sys_header;
+	ulint		n_undo_tablespaces = 0;
+	ulint		space_ids_aux[TRX_SYS_N_RSEGS + 1];
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+		ulint	page_no;
+		ulint	space;
+
+		page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr);
+
+		if (page_no == FIL_NULL) {
+			continue;
+		}
+
+		space = trx_sysf_rseg_get_space(sys_header, i, &mtr);
+
+		if (space != 0) {
+			ulint	j;
+			ibool	found = FALSE;
+
+			for (j = 0; j < n_undo_tablespaces; ++j) {
+				if (space_ids[j] == space) {
+					found = TRUE;
+					break;
+				}
+			}
+
+			if (!found) {
+				ut_a(n_undo_tablespaces <= i);
+				space_ids[n_undo_tablespaces++] = space;
+			}
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS);
+
+	space_ids[n_undo_tablespaces] = ULINT_UNDEFINED;
+
+	if (n_undo_tablespaces > 0) {
+		ut_ulint_sort(space_ids, space_ids_aux, 0, n_undo_tablespaces);
+	}
+
+	return(n_undo_tablespaces);
+}
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
deleted file mode 100644
index a56e55c0e19..00000000000
--- a/storage/xtradb/trx/trx0sys.c
+++ /dev/null
@@ -1,2049 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file trx/trx0sys.c
-Transaction system
-
-Created 3/26/1996 Heikki Tuuri
-*******************************************************/
-
-#include "trx0sys.h"
-
-#ifdef UNIV_NONINL
-#include "trx0sys.ic"
-#endif
-
-#ifndef UNIV_HOTBACKUP
-#include "fsp0fsp.h"
-#include "mtr0log.h"
-#include "mtr0log.h"
-#include "trx0trx.h"
-#include "trx0rseg.h"
-#include "trx0undo.h"
-#include "srv0srv.h"
-#include "srv0start.h"
-#include "trx0purge.h"
-#include "log0log.h"
-#include "log0recv.h"
-#include "os0file.h"
-#include "read0read.h"
-
-/** The file format tag structure with id and name. */
-struct file_format_struct {
-	ulint		id;		/*!< id of the file format */
-	const char*	name;		/*!< text representation of the
-					file format */
-	mutex_t		mutex;		/*!< covers changes to the above
-					fields */
-};
-
-/** The file format tag */
-typedef struct file_format_struct	file_format_t;
-
-/** The transaction system */
-UNIV_INTERN trx_sys_t*		trx_sys		= NULL;
-/** The doublewrite buffer */
-UNIV_INTERN trx_doublewrite_t*	trx_doublewrite = NULL;
-
-/** The following is set to TRUE when we are upgrading from pre-4.1
-format data files to the multiple tablespaces format data files */
-UNIV_INTERN ibool	trx_doublewrite_must_reset_space_ids	= FALSE;
-/** Set to TRUE when the doublewrite buffer is being created */
-UNIV_INTERN ibool	trx_doublewrite_buf_is_being_created = FALSE;
-
-/** The following is TRUE when we are using the database in the
-post-4.1 format, i.e., we have successfully upgraded, or have created
-a new database installation */
-UNIV_INTERN ibool	trx_sys_multiple_tablespace_format	= FALSE;
-
-/** In a MySQL replication slave, in crash recovery we store the master log
-file name and position here. */
-/* @{ */
-/** Master binlog file name */
-UNIV_INTERN char	trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
-/** Master binlog file position.  We have successfully got the updates
-up to this position.  -1 means that no crash recovery was needed, or
-there was no master log position info inside InnoDB.*/
-UNIV_INTERN ib_int64_t	trx_sys_mysql_master_log_pos	= -1;
-/* @} */
-
-UNIV_INTERN char	trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
-UNIV_INTERN ib_int64_t	trx_sys_mysql_relay_log_pos	= -1;
-
-/** If this MySQL server uses binary logging, after InnoDB has been inited
-and if it has done a crash recovery, we store the binlog file name and position
-here. */
-/* @{ */
-/** Binlog file name */
-UNIV_INTERN char	trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
-/** Binlog file position, or -1 if unknown */
-UNIV_INTERN ib_int64_t	trx_sys_mysql_bin_log_pos	= -1;
-/* @} */
-#endif /* !UNIV_HOTBACKUP */
-
-/** List of animal names representing file format. */
-static const char*	file_format_name_map[] = {
-	"Antelope",
-	"Barracuda",
-	"Cheetah",
-	"Dragon",
-	"Elk",
-	"Fox",
-	"Gazelle",
-	"Hornet",
-	"Impala",
-	"Jaguar",
-	"Kangaroo",
-	"Leopard",
-	"Moose",
-	"Nautilus",
-	"Ocelot",
-	"Porpoise",
-	"Quail",
-	"Rabbit",
-	"Shark",
-	"Tiger",
-	"Urchin",
-	"Viper",
-	"Whale",
-	"Xenops",
-	"Yak",
-	"Zebra"
-};
-
-/** The number of elements in the file format name array. */
-static const ulint	FILE_FORMAT_NAME_N
-	= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	trx_doublewrite_mutex_key;
-UNIV_INTERN mysql_pfs_key_t	file_format_max_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-#ifndef UNIV_HOTBACKUP
-#ifdef UNIV_DEBUG
-/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
-UNIV_INTERN uint	trx_rseg_n_slots_debug = 0;
-#endif
-
-/** This is used to track the maximum file format id known to InnoDB. It's
-updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
-or create a table. */
-static	file_format_t	file_format_max;
-
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-UNIV_INTERN
-ibool
-trx_doublewrite_page_inside(
-/*========================*/
-	ulint	page_no)	/*!< in: page number */
-{
-	if (trx_doublewrite == NULL) {
-
-		return(FALSE);
-	}
-
-	if (page_no >= trx_doublewrite->block1
-	    && page_no < trx_doublewrite->block1
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	if (page_no >= trx_doublewrite->block2
-	    && page_no < trx_doublewrite->block2
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/****************************************************************//**
-Creates or initialializes the doublewrite buffer at a database start. */
-static
-void
-trx_doublewrite_init(
-/*=================*/
-	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
-				header on trx sys page */
-{
-	trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
-
-	/* Since we now start to use the doublewrite buffer, no need to call
-	fsync() after every write to a data file */
-#ifdef UNIV_DO_FLUSH
-	os_do_not_call_flush_at_each_write = TRUE;
-#endif /* UNIV_DO_FLUSH */
-
-	mutex_create(trx_doublewrite_mutex_key,
-		     &trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
-
-	trx_doublewrite->first_free = 0;
-
-	trx_doublewrite->block1 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
-	trx_doublewrite->block2 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
-	trx_doublewrite->write_buf_unaligned = ut_malloc(
-		(1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
-
-	trx_doublewrite->write_buf = ut_align(
-		trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
-	trx_doublewrite->buf_block_arr = mem_alloc(
-		2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
-}
-
-/****************************************************************//**
-Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
-multiple tablespace format. */
-UNIV_INTERN
-void
-trx_sys_mark_upgraded_to_multiple_tablespaces(void)
-/*===============================================*/
-{
-	buf_block_t*	block;
-	byte*		doublewrite;
-	mtr_t		mtr;
-
-	/* We upgraded to 4.1.x and reset the space id fields in the
-	doublewrite buffer. Let us mark to the trx_sys header that the upgrade
-	has been done. */
-
-	mtr_start(&mtr);
-
-	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
-			     RW_X_LATCH, &mtr);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
-
-	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-			 MLOG_4BYTES, &mtr);
-	mtr_commit(&mtr);
-
-	/* Flush the modified pages to disk and make a checkpoint */
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-
-	trx_sys_multiple_tablespace_format = TRUE;
-}
-
-/****************************************************************//**
-Creates the doublewrite buffer to a new InnoDB installation. The header of the
-doublewrite buffer is placed on the trx system header page. */
-UNIV_INTERN
-void
-trx_sys_create_doublewrite_buf(void)
-/*================================*/
-{
-	buf_block_t*	block;
-	buf_block_t*	block2;
-	buf_block_t*	new_block;
-	byte*	doublewrite;
-	byte*	fseg_header;
-	ulint	page_no;
-	ulint	prev_page_no;
-	ulint	i;
-	mtr_t	mtr;
-
-	if (trx_doublewrite) {
-		/* Already inited */
-
-		return;
-	}
-
-start_again:
-	mtr_start(&mtr);
-	trx_doublewrite_buf_is_being_created = TRUE;
-
-	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
-			     RW_X_LATCH, &mtr);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has already been created:
-		just read in some numbers */
-
-		trx_doublewrite_init(doublewrite);
-
-		mtr_commit(&mtr);
-		trx_doublewrite_buf_is_being_created = FALSE;
-	} else {
-		fprintf(stderr,
-			"InnoDB: Doublewrite buffer not found:"
-			" creating new\n");
-
-		if (buf_pool_get_curr_size()
-		    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			+ FSP_EXTENT_SIZE / 2 + 100)
-		       * UNIV_PAGE_SIZE)) {
-			fprintf(stderr,
-				"InnoDB: Cannot create doublewrite buffer:"
-				" you must\n"
-				"InnoDB: increase your buffer pool size.\n"
-				"InnoDB: Cannot continue operation.\n");
-
-			exit(1);
-		}
-
-		block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
-				     TRX_SYS_DOUBLEWRITE
-				     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
-
-		/* fseg_create acquires a second latch on the page,
-		therefore we must declare it: */
-
-		buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
-
-		if (block2 == NULL) {
-			fprintf(stderr,
-				"InnoDB: Cannot create doublewrite buffer:"
-				" you must\n"
-				"InnoDB: increase your tablespace size.\n"
-				"InnoDB: Cannot continue operation.\n");
-
-			/* We exit without committing the mtr to prevent
-			its modifications to the database getting to disk */
-
-			exit(1);
-		}
-
-		fseg_header = buf_block_get_frame(block)
-			+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
-		prev_page_no = 0;
-
-		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			     + FSP_EXTENT_SIZE / 2; i++) {
-			new_block = fseg_alloc_free_page(
-				fseg_header, prev_page_no + 1, FSP_UP, &mtr);
-			if (new_block == NULL) {
-				fprintf(stderr,
-					"InnoDB: Cannot create doublewrite"
-					" buffer: you must\n"
-					"InnoDB: increase your"
-					" tablespace size.\n"
-					"InnoDB: Cannot continue operation.\n"
-					);
-
-				exit(1);
-			}
-
-			/* We read the allocated pages to the buffer pool;
-			when they are written to disk in a flush, the space
-			id and page number fields are also written to the
-			pages. When we at database startup read pages
-			from the doublewrite buffer, we know that if the
-			space id and page number in them are the same as
-			the page position in the tablespace, then the page
-			has not been written to in doublewrite. */
-
-			ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
-			page_no = buf_block_get_page_no(new_block);
-
-			if (i == FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i == FSP_EXTENT_SIZE / 2
-				   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				ut_a(page_no == 2 * FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i > FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == prev_page_no + 1);
-			}
-
-			prev_page_no = page_no;
-		}
-
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
-				 + TRX_SYS_DOUBLEWRITE_REPEAT,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-
-		mlog_write_ulint(doublewrite
-				 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-				 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-				 MLOG_4BYTES, &mtr);
-		mtr_commit(&mtr);
-
-		/* Flush the modified pages to disk and make a checkpoint */
-		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-
-		fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
-
-		trx_sys_multiple_tablespace_format = TRUE;
-
-		goto start_again;
-	}
-
-    if (srv_doublewrite_file) {
-	/* the same doublewrite buffer to TRX_SYS_SPACE should exist.
-	check and create if not exist.*/
-
-	mtr_start(&mtr);
-	trx_doublewrite_buf_is_being_created = TRUE;
-
-	block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
-			     RW_X_LATCH, &mtr);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has already been created:
-		just read in some numbers */
-
-		trx_doublewrite_init(doublewrite);
-
-		mtr_commit(&mtr);
-		trx_doublewrite_buf_is_being_created = FALSE;
-	} else {
-		fprintf(stderr,
-			"InnoDB: Doublewrite buffer not found in the doublewrite file:"
-			" creating new doublewrite buffer.\n");
-
-		if (buf_pool_get_curr_size()
-		    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			+ FSP_EXTENT_SIZE / 2 + 100)
-		       * UNIV_PAGE_SIZE)) {
-			fprintf(stderr,
-				"InnoDB: Cannot create the doublewrite buffer:"
-				" You must\n"
-				"InnoDB: increase your buffer pool size.\n"
-				"InnoDB: Cannot continue processing.\n");
-
-			exit(1);
-		}
-
-		block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
-				     TRX_SYS_DOUBLEWRITE
-				     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
-
-		/* fseg_create acquires a second latch on the page,
-		therefore we must declare it: */
-
-		buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
-
-		if (block2 == NULL) {
-			fprintf(stderr,
-				"InnoDB: Cannot create the doublewrite buffer:"
-				" You must\n"
-				"InnoDB: increase your tablespace size.\n"
-				"InnoDB: Cannot continue processing.\n");
-
-			/* We exit without committing the mtr to prevent
-			its modifications to the database getting to disk */
-
-			exit(1);
-		}
-
-		fseg_header = buf_block_get_frame(block)
-			+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
-		prev_page_no = 0;
-
-		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			     + FSP_EXTENT_SIZE / 2; i++) {
-			new_block = fseg_alloc_free_page(
-				fseg_header, prev_page_no + 1, FSP_UP, &mtr);
-			if (new_block == NULL) {
-				fprintf(stderr,
-					"InnoDB: Cannot create doublewrite"
-					" buffer: you must\n"
-					"InnoDB: increase your"
-					" tablespace size.\n"
-					"InnoDB: Cannot continue operation.\n"
-					);
-
-				exit(1);
-			}
-
-			/* We read the allocated pages to the buffer pool;
-			when they are written to disk in a flush, the space
-			id and page number fields are also written to the
-			pages. When we at database startup read pages
-			from the doublewrite buffer, we know that if the
-			space id and page number in them are the same as
-			the page position in the tablespace, then the page
-			has not been written to in doublewrite. */
-
-			ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
-			page_no = buf_block_get_page_no(new_block);
-
-			if (i == FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i == FSP_EXTENT_SIZE / 2
-				   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				ut_a(page_no == 2 * FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i > FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == prev_page_no + 1);
-			}
-
-			prev_page_no = page_no;
-		}
-
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
-				 + TRX_SYS_DOUBLEWRITE_REPEAT,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-
-		mlog_write_ulint(doublewrite
-				 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-				 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-				 MLOG_4BYTES, &mtr);
-		mtr_commit(&mtr);
-
-		/* Flush the modified pages to disk and make a checkpoint */
-		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-
-		fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
-		trx_sys_multiple_tablespace_format = TRUE;
-	}
-	trx_doublewrite_buf_is_being_created = FALSE;
-    }
-}
-
-/****************************************************************//**
-At a database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function uses a possible doublewrite buffer to restore
-half-written pages in the data files. */
-UNIV_INTERN
-void
-trx_sys_doublewrite_init_or_restore_pages(
-/*======================================*/
-	ibool	restore_corrupt_pages)	/*!< in: TRUE=restore pages */
-{
-	byte*	buf;
-	byte*	read_buf;
-	byte*	unaligned_read_buf;
-	ulint	block1;
-	ulint	block2;
-	ulint	source_page_no;
-	byte*	page;
-	byte*	doublewrite;
-	ulint	doublewrite_space_id;
-	ulint	space_id;
-	ulint	page_no;
-	ulint	i;
-
-	doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
-
-	if (srv_doublewrite_file) {
-		fprintf(stderr,
-			"InnoDB: doublewrite file '%s' is used.\n",
-			srv_doublewrite_file);
-	}
-
-	/* We do the file i/o past the buffer pool */
-
-	unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
-	read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
-
-	/* Read the trx sys header to check if we are using the doublewrite
-	buffer */
-
-	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
-	       UNIV_PAGE_SIZE, read_buf, NULL);
-	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has been created */
-
-		trx_doublewrite_init(doublewrite);
-
-		block1 = trx_doublewrite->block1;
-		block2 = trx_doublewrite->block2;
-
-		buf = trx_doublewrite->write_buf;
-	} else {
-		goto leave_func;
-	}
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
-	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
-		/* We are upgrading from a version < 4.1.x to a version where
-		multiple tablespaces are supported. We must reset the space id
-		field in the pages in the doublewrite buffer because starting
-		from this version the space id is stored to
-		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
-
-		trx_doublewrite_must_reset_space_ids = TRUE;
-
-		fprintf(stderr,
-			"InnoDB: Resetting space id's in the"
-			" doublewrite buffer\n");
-	} else {
-		trx_sys_multiple_tablespace_format = TRUE;
-	}
-
-	/* Read the pages from the doublewrite buffer to memory */
-
-	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
-	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       buf, NULL);
-	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
-	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       NULL);
-	/* Check if any of these pages is half-written in data files, in the
-	intended position */
-
-	page = buf;
-
-	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
-
-		page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
-
-		if (trx_doublewrite_must_reset_space_ids) {
-
-			space_id = 0;
-			mach_write_to_4(page
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
-			/* We do not need to calculate new checksums for the
-			pages because the field .._SPACE_ID does not affect
-			them. Write the page back to where we read it from. */
-
-			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				source_page_no = block1 + i;
-			} else {
-				source_page_no = block2
-					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-			}
-
-			fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
-			       UNIV_PAGE_SIZE, page, NULL);
-			/* printf("Resetting space id in page %lu\n",
-			source_page_no); */
-		} else {
-			space_id = mach_read_from_4(
-				page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-		}
-
-		if (!restore_corrupt_pages) {
-			/* The database was shut down gracefully: no need to
-			restore pages */
-
-		} else if (!fil_tablespace_exists_in_mem(space_id)) {
-			/* Maybe we have dropped the single-table tablespace
-			and this page once belonged to it: do nothing */
-
-		} else if (!fil_check_adress_in_tablespace(space_id,
-							   page_no)) {
-			fprintf(stderr,
-				"InnoDB: Warning: a page in the"
-				" doublewrite buffer is not within space\n"
-				"InnoDB: bounds; space id %lu"
-				" page number %lu, page %lu in"
-				" doublewrite buf.\n",
-				(ulong) space_id, (ulong) page_no, (ulong) i);
-
-		} else if ((space_id == TRX_SYS_SPACE
-			    || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
-			   && ((page_no >= block1
-				&& page_no
-				< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-			       || (page_no >= block2
-				   && page_no
-				   < (block2
-				      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
-
-			/* It is an unwritten doublewrite buffer page:
-			do nothing */
-		} else {
-			ulint	zip_size = fil_space_get_zip_size(space_id);
-
-			/* Read in the actual page from the file */
-			fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
-			       page_no, 0,
-			       zip_size ? zip_size : UNIV_PAGE_SIZE,
-			       read_buf, NULL);
-
-			if (srv_recovery_stats && recv_recovery_is_on()) {
-				mutex_enter(&(recv_sys->mutex));
-				recv_sys->stats_doublewrite_check_pages++;
-				mutex_exit(&(recv_sys->mutex));
-			}
-
-			/* Check if the page is corrupt */
-
-			if (UNIV_UNLIKELY
-			    (buf_page_is_corrupted(
-				    TRUE, read_buf, zip_size))) {
-
-				fprintf(stderr,
-					"InnoDB: Warning: database page"
-					" corruption or a failed\n"
-					"InnoDB: file read of"
-					" space %lu page %lu.\n"
-					"InnoDB: Trying to recover it from"
-					" the doublewrite buffer.\n",
-					(ulong) space_id, (ulong) page_no);
-
-				if (buf_page_is_corrupted(
-					    TRUE, page, zip_size)) {
-					fprintf(stderr,
-						"InnoDB: Dump of the page:\n");
-					buf_page_print(
-						read_buf, zip_size,
-						BUF_PAGE_PRINT_NO_CRASH);
-					fprintf(stderr,
-						"InnoDB: Dump of"
-						" corresponding page"
-						" in doublewrite buffer:\n");
-					buf_page_print(
-						page, zip_size,
-						BUF_PAGE_PRINT_NO_CRASH);
-
-					fprintf(stderr,
-						"InnoDB: Also the page in the"
-						" doublewrite buffer"
-						" is corrupt.\n"
-						"InnoDB: Cannot continue"
-						" operation.\n"
-						"InnoDB: You can try to"
-						" recover the database"
-						" with the my.cnf\n"
-						"InnoDB: option:\n"
-						"InnoDB:"
-						" innodb_force_recovery=6\n");
-					ut_error;
-				}
-
-				/* Write the good page from the
-				doublewrite buffer to the intended
-				position */
-
-				fil_io(OS_FILE_WRITE, TRUE, space_id,
-				       zip_size, page_no, 0,
-				       zip_size ? zip_size : UNIV_PAGE_SIZE,
-				       page, NULL);
-
-				if (srv_recovery_stats && recv_recovery_is_on()) {
-					mutex_enter(&(recv_sys->mutex));
-					recv_sys->stats_doublewrite_overwrite_pages++;
-					mutex_exit(&(recv_sys->mutex));
-				}
-
-				fprintf(stderr,
-					"InnoDB: Recovered the page from"
-					" the doublewrite buffer.\n");
-			}
-		}
-
-		page += UNIV_PAGE_SIZE;
-	}
-
-	fil_flush_file_spaces(FIL_TABLESPACE);
-
-leave_func:
-	ut_free(unaligned_read_buf);
-}
-
-/****************************************************************//**
-Checks that trx is in the trx list.
-@return	TRUE if is in */
-UNIV_INTERN
-ibool
-trx_in_trx_list(
-/*============*/
-	trx_t*	in_trx)	/*!< in: trx */
-{
-	trx_t*	trx;
-
-	ut_ad(mutex_own(&(kernel_mutex)));
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx != NULL) {
-
-		if (trx == in_trx) {
-
-			return(TRUE);
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	return(FALSE);
-}
-
-/*****************************************************************//**
-Writes the value of max_trx_id to the file based trx system header. */
-UNIV_INTERN
-void
-trx_sys_flush_max_trx_id(void)
-/*==========================*/
-{
-	trx_sysf_t*	sys_header;
-	mtr_t		mtr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	mtr_start(&mtr);
-
-	sys_header = trx_sysf_get(&mtr);
-
-	mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE,
-		       trx_sys->max_trx_id, &mtr);
-	mtr_commit(&mtr);
-}
-
-/*****************************************************************//**
-Updates the offset information about the end of the MySQL binlog entry
-which corresponds to the transaction just being committed. In a MySQL
-replication slave updates the latest master binlog position up to which
-replication has proceeded. */
-UNIV_INTERN
-void
-trx_sys_update_mysql_binlog_offset(
-/*===============================*/
-	trx_sysf_t*	sys_header,
-	const char*	file_name_in,/*!< in: MySQL log file name */
-	ib_int64_t	offset,	/*!< in: position in that log file */
-	ulint		field,	/*!< in: offset of the MySQL log info field in
-				the trx sys header */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	const char*	file_name;
-
-	if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) {
-
-		/* We cannot fit the name to the 512 bytes we have reserved */
-		/* -> To store relay log file information, file_name must fit to the 480 bytes */
-
-		file_name = "";
-	} else {
-		file_name = file_name_in;
-	}
-
-	if (mach_read_from_4(sys_header + field
-			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
-	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
-
-		mlog_write_ulint(sys_header + field
-				 + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
-				 TRX_SYS_MYSQL_LOG_MAGIC_N,
-				 MLOG_4BYTES, mtr);
-	}
-
-	if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
-			file_name)) {
-
-		mlog_write_string(sys_header + field
-				  + TRX_SYS_MYSQL_LOG_NAME,
-				  (byte*) file_name, 1 + ut_strlen(file_name),
-				  mtr);
-	}
-
-	if (mach_read_from_4(sys_header + field
-			     + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
-	    || (offset >> 32) > 0) {
-
-		mlog_write_ulint(sys_header + field
-				 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
-				 (ulint)(offset >> 32),
-				 MLOG_4BYTES, mtr);
-	}
-
-	mlog_write_ulint(sys_header + field
-			 + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
-			 (ulint)(offset & 0xFFFFFFFFUL),
-			 MLOG_4BYTES, mtr);
-}
-
-/*****************************************************************//**
-Stores the MySQL binlog offset info in the trx system header if
-the magic number shows it valid, and print the info to stderr */
-UNIV_INTERN
-void
-trx_sys_print_mysql_binlog_offset(void)
-/*===================================*/
-{
-	trx_sysf_t*	sys_header;
-	mtr_t		mtr;
-	ulint		trx_sys_mysql_bin_log_pos_high;
-	ulint		trx_sys_mysql_bin_log_pos_low;
-
-	mtr_start(&mtr);
-
-	sys_header = trx_sysf_get(&mtr);
-
-	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
-			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
-	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
-
-		mtr_commit(&mtr);
-
-		return;
-	}
-
-	trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
-		sys_header + TRX_SYS_MYSQL_LOG_INFO
-		+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
-	trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
-		sys_header + TRX_SYS_MYSQL_LOG_INFO
-		+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
-
-	trx_sys_mysql_bin_log_pos
-		= (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
-		+ (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
-
-	ut_memcpy(trx_sys_mysql_bin_log_name,
-		  sys_header + TRX_SYS_MYSQL_LOG_INFO
-		  + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
-
-	fprintf(stderr,
-		"InnoDB: Last MySQL binlog file position %lu %lu,"
-		" file name %s\n",
-		trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
-		trx_sys_mysql_bin_log_name);
-
-	mtr_commit(&mtr);
-}
-
-/*****************************************************************//**
-Reads the log coordinates at the given offset in the trx sys header. */
-static
-void
-trx_sys_read_log_pos(
-/*=================*/
-	const trx_sysf_t*	sys_header,	/*!< in: the trx sys header */
-	uint			header_offset,	/*!< in: coord offset in the
-						header */
-	char*			log_fn,		/*!< out: the log file name */
-	ib_int64_t*		log_pos)	/*!< out: the log poistion */
-{
-	ut_memcpy(log_fn, sys_header + header_offset + TRX_SYS_MYSQL_LOG_NAME,
-		  TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
-
-	*log_pos =
-		(((ib_int64_t)mach_read_from_4(sys_header + header_offset
-				+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
-		+ mach_read_from_4(sys_header + header_offset
-				   + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
-}
-
-/*****************************************************************//**
-Prints to stderr the MySQL master log offset info in the trx system header
-PREPARE set of fields if the magic number shows it valid and stores it
-in global variables. */
-UNIV_INTERN
-void
-trx_sys_print_mysql_master_log_pos(void)
-/*====================================*/
-{
-	trx_sysf_t*	sys_header;
-	mtr_t		mtr;
-
-	mtr_start(&mtr);
-
-	sys_header = trx_sysf_get(&mtr);
-
-	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
-			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
-	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
-
-		mtr_commit(&mtr);
-
-		return;
-	}
-
-	/* Copy the master log position info to global variables we can
-	use in ha_innobase.cc to initialize glob_mi to right values */
-	trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_MASTER_LOG_INFO,
-			     trx_sys_mysql_master_log_name,
-			     &trx_sys_mysql_master_log_pos);
-
-	trx_sys_read_log_pos(sys_header, TRX_SYS_MYSQL_RELAY_LOG_INFO,
-			     trx_sys_mysql_relay_log_name,
-			     &trx_sys_mysql_relay_log_pos);
-
-	mtr_commit(&mtr);
-
-	fprintf(stderr,
-		"InnoDB: In a MySQL replication slave the last"
-		" master binlog file\n"
-		"InnoDB: position %llu, file name %s\n",
-		trx_sys_mysql_master_log_pos,
-		trx_sys_mysql_master_log_name);
-
-	fprintf(stderr,
-		"InnoDB: and relay log file\n"
-		"InnoDB: position %llu, file name %s\n",
-		trx_sys_mysql_relay_log_pos,
-		trx_sys_mysql_relay_log_name);
-}
-
-/*****************************************************************//**
-Prints to stderr the MySQL master log offset info in the trx system header
-COMMIT set of fields if the magic number shows it valid and stores it
-in global variables. */
-UNIV_INTERN
-void
-trx_sys_print_committed_mysql_master_log_pos(void)
-/*==============================================*/
-{
-	trx_sysf_t*	sys_header;
-	mtr_t		mtr;
-
-	mtr_start(&mtr);
-
-	sys_header = trx_sysf_get(&mtr);
-
-	if (mach_read_from_4(sys_header + TRX_SYS_COMMIT_MASTER_LOG_INFO
-			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
-	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
-
-		mtr_commit(&mtr);
-
-		return;
-	}
-
-	/* Copy the master log position info to global variables we can
-	   use in ha_innobase.cc to initialize glob_mi to right values */
-	trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_MASTER_LOG_INFO,
-			     trx_sys_mysql_master_log_name,
-			     &trx_sys_mysql_master_log_pos);
-
-	trx_sys_read_log_pos(sys_header, TRX_SYS_COMMIT_RELAY_LOG_INFO,
-			     trx_sys_mysql_relay_log_name,
-			     &trx_sys_mysql_relay_log_pos);
-
-	mtr_commit(&mtr);
-
-	fprintf(stderr,
-		"InnoDB: In a MySQL replication slave the last"
-		" master binlog file\n"
-		"InnoDB: position %llu, file name %s\n",
-		trx_sys_mysql_master_log_pos, trx_sys_mysql_master_log_name);
-
-	fprintf(stderr,
-		"InnoDB: and relay log file\n"
-		"InnoDB: position %llu, file name %s\n",
-		trx_sys_mysql_relay_log_pos, trx_sys_mysql_relay_log_name);
-}
-
-/****************************************************************//**
-Looks for a free slot for a rollback segment in the trx system file copy.
-@return	slot index or ULINT_UNDEFINED if not found */
-UNIV_INTERN
-ulint
-trx_sysf_rseg_find_free(
-/*====================*/
-	mtr_t*	mtr)	/*!< in: mtr */
-{
-	trx_sysf_t*	sys_header;
-	ulint		page_no;
-	ulint		i;
-
-	ut_ad(mutex_own(&(kernel_mutex)));
-
-	sys_header = trx_sysf_get(mtr);
-
-	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
-
-		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
-
-		if (page_no == FIL_NULL) {
-
-			return(i);
-		}
-	}
-
-	return(ULINT_UNDEFINED);
-}
-
-/*****************************************************************//**
-Creates the file page for the transaction system. This function is called only
-at the database creation, before trx_sys_init. */
-static
-void
-trx_sysf_create(
-/*============*/
-	mtr_t*	mtr)	/*!< in: mtr */
-{
-	trx_sysf_t*	sys_header;
-	ulint		slot_no;
-	buf_block_t*	block;
-	page_t*		page;
-	ulint		page_no;
-	byte*		ptr;
-	ulint		len;
-
-	ut_ad(mtr);
-
-	/* Note that below we first reserve the file space x-latch, and
-	then enter the kernel: we must do it in this order to conform
-	to the latching order rules. */
-
-	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
-	mutex_enter(&kernel_mutex);
-
-	/* Create the trx sys file block in a new allocated file segment */
-	block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
-			    mtr);
-	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
-
-	ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
-
-	page = buf_block_get_frame(block);
-
-	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
-			 MLOG_2BYTES, mtr);
-
-	/* Reset the doublewrite buffer magic number to zero so that we
-	know that the doublewrite buffer has not yet been created (this
-	suppresses a Valgrind warning) */
-
-	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
-			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
-
-	sys_header = trx_sysf_get(mtr);
-
-	/* Start counting transaction ids from number 1 up */
-	mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
-
-	/* Reset the rollback segment slots.  Old versions of InnoDB
-	define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
-	that the whole array is initialized. */
-	ptr = TRX_SYS_RSEGS + sys_header;
-	len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
-		* TRX_SYS_RSEG_SLOT_SIZE;
-	memset(ptr, 0xff, len);
-	ptr += len;
-	ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
-
-	/* Initialize all of the page.  This part used to be uninitialized. */
-	memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
-
-	mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
-			+ page - sys_header, mtr);
-
-	/* Create the first rollback segment in the SYSTEM tablespace */
-	slot_no = trx_sysf_rseg_find_free(mtr);
-	page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
-					 mtr);
-	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
-	ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
-
-	mutex_exit(&kernel_mutex);
-}
-
-/*****************************************************************//**
-Compare two trx_rseg_t instances on last_trx_no. */
-static
-int
-trx_rseg_compare_last_trx_no(
-/*=========================*/
-	const void*	p1,		/*!< in: elem to compare */
-	const void*	p2)		/*!< in: elem to compare */
-{
-	ib_int64_t	cmp;
-
-	const rseg_queue_t*	rseg_q1 = (const rseg_queue_t*) p1;
-	const rseg_queue_t*	rseg_q2 = (const rseg_queue_t*) p2;
-
-	cmp = rseg_q1->trx_no - rseg_q2->trx_no;
-
-	if (cmp < 0) {
-		return(-1);
-	} else if (cmp > 0) {
-		return(1);
-	}
-
-	return(0);
-}
-
-/*****************************************************************//**
-Creates dummy of the file page for the transaction system. */
-static
-void
-trx_sysf_dummy_create(
-/*==================*/
-	ulint	space,
-	mtr_t*	mtr)
-{
-	buf_block_t*	block;
-	page_t*		page;
-
-	ut_ad(mtr);
-
-	/* Note that below we first reserve the file space x-latch, and
-	then enter the kernel: we must do it in this order to conform
-	to the latching order rules. */
-
-	mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
-	mutex_enter(&kernel_mutex);
-
-	/* Create the trx sys file block in a new allocated file segment */
-	block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
-			    mtr);
-	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
-
-	fprintf(stderr, "%lu\n", buf_block_get_page_no(block));
-	ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
-
-	page = buf_block_get_frame(block);
-
-	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
-			 MLOG_2BYTES, mtr);
-
-	/* Reset the doublewrite buffer magic number to zero so that we
-	know that the doublewrite buffer has not yet been created (this
-	suppresses a Valgrind warning) */
-
-	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
-			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
-
-#ifdef UNDEFINED
-	/* TODO: REMOVE IT: The bellow is not needed, I think */
-	sys_header = trx_sysf_get(mtr);
-
-	/* Start counting transaction ids from number 1 up */
-	mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
-			  ut_dulint_create(0, 1), mtr);
-
-	/* Reset the rollback segment slots */
-	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
-
-		trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
-		trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
-	}
-
-	/* The remaining area (up to the page trailer) is uninitialized.
-	Silence Valgrind warnings about it. */
-	UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
-				     + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
-				     + TRX_SYS_RSEG_SPACE),
-		       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
-			- (TRX_SYS_RSEGS
-			   + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
-			   + TRX_SYS_RSEG_SPACE))
-		       + page - sys_header);
-
-	/* Create the first rollback segment in the SYSTEM tablespace */
-	page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
-					 mtr);
-	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
-	ut_a(page_no != FIL_NULL);
-#endif
-
-	mutex_exit(&kernel_mutex);
-}
-
-/*****************************************************************//**
-Creates and initializes the central memory structures for the transaction
-system. This is called when the database is started. */
-UNIV_INTERN
-void
-trx_sys_init_at_db_start(void)
-/*==========================*/
-{
-	trx_sysf_t*	sys_header;
-	ib_uint64_t	rows_to_undo	= 0;
-	const char*	unit		= "";
-	trx_t*		trx;
-	mtr_t		mtr;
-	ib_bh_t*	ib_bh;
-
-	mtr_start(&mtr);
-
-	ut_ad(trx_sys == NULL);
-
-	mutex_enter(&kernel_mutex);
-
-	/* We create the min binary heap here and pass ownership to
-	purge when we init the purge sub-system. Purge is responsible
-	for freeing the binary heap. */
-
-	ib_bh = ib_bh_create(
-		trx_rseg_compare_last_trx_no,
-		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
-
-	trx_sys = mem_zalloc(sizeof(*trx_sys));
-
-	/* Allocate the trx descriptors array */
-	trx_sys->descriptors = ut_malloc(sizeof(trx_id_t) *
-					 TRX_DESCR_ARRAY_INITIAL_SIZE);
-	trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE;
-	trx_sys->descr_n_used = 0;
-	srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE *
-		sizeof(trx_id_t);
-
-	sys_header = trx_sysf_get(&mtr);
-
-	trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);
-
-	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-
-	/* VERY important: after the database is started, max_trx_id value is
-	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
-	trx_sys_get_new_trx_id will evaluate to TRUE when the function
-	is first time called, and the value for trx id will be written
-	to the disk-based header! Thus trx id values will not overlap when
-	the database is repeatedly started! */
-
-	trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
-		+ ut_uint64_align_up(mach_read_from_8(sys_header
-						   + TRX_SYS_TRX_ID_STORE),
-				     TRX_SYS_TRX_ID_WRITE_MARGIN);
-
-	UT_LIST_INIT(trx_sys->mysql_trx_list);
-	trx_dummy_sess = sess_open();
-	trx_lists_init_at_db_start();
-
-	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
-		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-		for (;;) {
-
-			if (trx->state != TRX_PREPARED) {
-				rows_to_undo += trx->undo_no;
-			}
-
-			trx = UT_LIST_GET_NEXT(trx_list, trx);
-
-			if (!trx) {
-				break;
-			}
-		}
-
-		if (rows_to_undo > 1000000000) {
-			unit = "M";
-			rows_to_undo = rows_to_undo / 1000000;
-		}
-
-		fprintf(stderr,
-			"InnoDB: %lu transaction(s) which must be"
-			" rolled back or cleaned up\n"
-			"InnoDB: in total %lu%s row operations to undo\n",
-			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
-			(ulong) rows_to_undo, unit);
-
-		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
-			(ullint) trx_sys->max_trx_id);
-	}
-
-	UT_LIST_INIT(trx_sys->view_list);
-
-	/* Transfer ownership to purge. */
-	trx_purge_sys_create(ib_bh);
-
-	mutex_exit(&kernel_mutex);
-
-	mtr_commit(&mtr);
-}
-
-/*****************************************************************//**
-Creates and initializes the transaction system at the database creation. */
-UNIV_INTERN
-void
-trx_sys_create(void)
-/*================*/
-{
-	mtr_t	mtr;
-
-	mtr_start(&mtr);
-
-	trx_sysf_create(&mtr);
-
-	mtr_commit(&mtr);
-
-	trx_sys_init_at_db_start();
-}
-
-/*****************************************************************//**
-Update the file format tag.
-@return	always TRUE */
-static
-ibool
-trx_sys_file_format_max_write(
-/*==========================*/
-	ulint		format_id,	/*!< in: file format id */
-	const char**	name)		/*!< out: max file format name, can
-					be NULL */
-{
-	mtr_t		mtr;
-	byte*		ptr;
-	buf_block_t*	block;
-	ib_uint64_t	tag_value;
-
-	mtr_start(&mtr);
-
-	block = buf_page_get(
-		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
-
-	file_format_max.id = format_id;
-	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
-
-	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
-	tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
-
-	if (name) {
-		*name = file_format_max.name;
-	}
-
-	mlog_write_ull(ptr, tag_value, &mtr);
-
-	mtr_commit(&mtr);
-
-	return(TRUE);
-}
-
-/*****************************************************************//**
-Read the file format tag.
-@return	the file format or ULINT_UNDEFINED if not set. */
-static
-ulint
-trx_sys_file_format_max_read(void)
-/*==============================*/
-{
-	mtr_t			mtr;
-	const byte*		ptr;
-	const buf_block_t*	block;
-	ib_id_t			file_format_id;
-
-	/* Since this is called during the startup phase it's safe to
-	read the value without a covering mutex. */
-	mtr_start(&mtr);
-
-	block = buf_page_get(
-		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
-
-	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
-	file_format_id = mach_read_from_8(ptr);
-
-	mtr_commit(&mtr);
-
-	file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
-
-	if (file_format_id >= FILE_FORMAT_NAME_N) {
-
-		/* Either it has never been tagged, or garbage in it. */
-		return(ULINT_UNDEFINED);
-	}
-
-	return((ulint) file_format_id);
-}
-
-/*****************************************************************//**
-Get the name representation of the file format from its id.
-@return	pointer to the name */
-UNIV_INTERN
-const char*
-trx_sys_file_format_id_to_name(
-/*===========================*/
-	const ulint	id)	/*!< in: id of the file format */
-{
-	ut_a(id < FILE_FORMAT_NAME_N);
-
-	return(file_format_name_map[id]);
-}
-
-/*****************************************************************//**
-Check for the max file format tag stored on disk. Note: If max_format_id
-is == DICT_TF_FORMAT_MAX + 1 then we only print a warning.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-ulint
-trx_sys_file_format_max_check(
-/*==========================*/
-	ulint	max_format_id)	/*!< in: max format id to check */
-{
-	ulint	format_id;
-
-	/* Check the file format in the tablespace. Do not try to
-	recover if the file format is not supported by the engine
-	unless forced by the user. */
-	format_id = trx_sys_file_format_max_read();
-	if (format_id == ULINT_UNDEFINED) {
-		/* Format ID was not set. Set it to minimum possible
-		value. */
-		format_id = DICT_TF_FORMAT_MIN;
-	}
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		" InnoDB: highest supported file format is %s.\n",
-		trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
-
-	if (format_id > DICT_TF_FORMAT_MAX) {
-
-		ut_a(format_id < FILE_FORMAT_NAME_N);
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: %s: the system tablespace is in a file "
-			"format that this version doesn't support - %s\n",
-			((max_format_id <= DICT_TF_FORMAT_MAX)
-				? "Error" : "Warning"),
-			trx_sys_file_format_id_to_name(format_id));
-
-		if (max_format_id <= DICT_TF_FORMAT_MAX) {
-			return(DB_ERROR);
-		}
-	}
-
-	format_id = (format_id > max_format_id) ? format_id : max_format_id;
-
-	/* We don't need a mutex here, as this function should only
-	be called once at start up. */
-	file_format_max.id = format_id;
-	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
-
-	return(DB_SUCCESS);
-}
-
-/*****************************************************************//**
-Set the file format id unconditionally except if it's already the
-same value.
-@return	TRUE if value updated */
-UNIV_INTERN
-ibool
-trx_sys_file_format_max_set(
-/*========================*/
-	ulint		format_id,	/*!< in: file format id */
-	const char**	name)		/*!< out: max file format name or
-					NULL if not needed. */
-{
-	ibool		ret = FALSE;
-
-	ut_a(format_id <= DICT_TF_FORMAT_MAX);
-
-	mutex_enter(&file_format_max.mutex);
-
-	/* Only update if not already same value. */
-	if (format_id != file_format_max.id) {
-
-		ret = trx_sys_file_format_max_write(format_id, name);
-	}
-
-	mutex_exit(&file_format_max.mutex);
-
-	return(ret);
-}
-
-/********************************************************************//**
-Tags the system table space with minimum format id if it has not been
-tagged yet.
-WARNING: This function is only called during the startup and AFTER the
-redo log application during recovery has finished. */
-UNIV_INTERN
-void
-trx_sys_file_format_tag_init(void)
-/*==============================*/
-{
-	ulint	format_id;
-
-	format_id = trx_sys_file_format_max_read();
-
-	/* If format_id is not set then set it to the minimum. */
-	if (format_id == ULINT_UNDEFINED) {
-		trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL);
-	}
-}
-
-/********************************************************************//**
-Update the file format tag in the system tablespace only if the given
-format id is greater than the known max id.
-@return	TRUE if format_id was bigger than the known max id */
-UNIV_INTERN
-ibool
-trx_sys_file_format_max_upgrade(
-/*============================*/
-	const char**	name,		/*!< out: max file format name */
-	ulint		format_id)	/*!< in: file format identifier */
-{
-	ibool		ret = FALSE;
-
-	ut_a(name);
-	ut_a(file_format_max.name != NULL);
-	ut_a(format_id <= DICT_TF_FORMAT_MAX);
-
-	mutex_enter(&file_format_max.mutex);
-
-	if (format_id > file_format_max.id) {
-
-		ret = trx_sys_file_format_max_write(format_id, name);
-	}
-
-	mutex_exit(&file_format_max.mutex);
-
-	return(ret);
-}
-
-/*****************************************************************//**
-Get the name representation of the file format from its id.
-@return	pointer to the max format name */
-UNIV_INTERN
-const char*
-trx_sys_file_format_max_get(void)
-/*=============================*/
-{
-	return(file_format_max.name);
-}
-
-/*****************************************************************//**
-Initializes the tablespace tag system. */
-UNIV_INTERN
-void
-trx_sys_file_format_init(void)
-/*==========================*/
-{
-	mutex_create(file_format_max_mutex_key,
-		     &file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
-
-	/* We don't need a mutex here, as this function should only
-	be called once at start up. */
-	file_format_max.id = DICT_TF_FORMAT_MIN;
-
-	file_format_max.name = trx_sys_file_format_id_to_name(
-		file_format_max.id);
-}
-
-/*****************************************************************//**
-Closes the tablespace tag system. */
-UNIV_INTERN
-void
-trx_sys_file_format_close(void)
-/*===========================*/
-{
-	/* Does nothing at the moment */
-}
-
-/*****************************************************************//**
-Creates and initializes the dummy transaction system page for tablespace. */
-UNIV_INTERN
-void
-trx_sys_dummy_create(
-/*=================*/
-	ulint	space)
-{
-	mtr_t	mtr;
-
-	/* This function is only for doublewrite file for now */
-	ut_a(space == TRX_DOUBLEWRITE_SPACE);
-
-	mtr_start(&mtr);
-
-	trx_sysf_dummy_create(space, &mtr);
-
-	mtr_commit(&mtr);
-}
-
-/*********************************************************************
-Creates the rollback segments */
-UNIV_INTERN
-void
-trx_sys_create_rsegs(
-/*=================*/
-	ulint	n_rsegs)	/*!< number of rollback segments to create */
-{
-	ulint	new_rsegs = 0;
-
-	/* Do not create additional rollback segments if
-	innodb_force_recovery has been set and the database
-	was not shutdown cleanly. */
-	if (!srv_force_recovery && !recv_needed_recovery) {
-		ulint	i;
-
-		for (i = 0;  i < n_rsegs; ++i) {
-
-			if (trx_rseg_create() != NULL) {
-				++new_rsegs;
-			} else {
-				break;
-			}
-		}
-	}
-
-	if (new_rsegs > 0) {
-		fprintf(stderr,
-			"InnoDB: %lu rollback segment(s) active.\n",
-		       	new_rsegs);
-	}
-}
-
-#else /* !UNIV_HOTBACKUP */
-/*****************************************************************//**
-Prints to stderr the MySQL binlog info in the system header if the
-magic number shows it valid. */
-UNIV_INTERN
-void
-trx_sys_print_mysql_binlog_offset_from_page(
-/*========================================*/
-	const byte*	page)	/*!< in: buffer containing the trx
-				system header page, i.e., page number
-				TRX_SYS_PAGE_NO in the tablespace */
-{
-	const trx_sysf_t*	sys_header;
-
-	sys_header = page + TRX_SYS;
-
-	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
-			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
-	    == TRX_SYS_MYSQL_LOG_MAGIC_N) {
-
-		fprintf(stderr,
-			"ibbackup: Last MySQL binlog file position %lu %lu,"
-			" file name %s\n",
-			(ulong) mach_read_from_4(
-				sys_header + TRX_SYS_MYSQL_LOG_INFO
-				+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
-			(ulong) mach_read_from_4(
-				sys_header + TRX_SYS_MYSQL_LOG_INFO
-				+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
-			sys_header + TRX_SYS_MYSQL_LOG_INFO
-			+ TRX_SYS_MYSQL_LOG_NAME);
-	}
-}
-
-
-/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
-   (This code duplication should be fixed at some point!)
-*/
-
-#define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
-/* The offset of the file format tag on the trx system header page */
-#define TRX_SYS_FILE_FORMAT_TAG		(UNIV_PAGE_SIZE - 16)
-/* We use these random constants to reduce the probability of reading
-garbage (from previous versions) that maps to an actual format id. We
-use these as bit masks at the time of  reading and writing from/to disk. */
-#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW	3645922177UL
-#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH	2745987765UL
-
-/* END OF COPIED DEFINITIONS */
-
-
-/*****************************************************************//**
-Reads the file format id from the first system table space file.
-Even if the call succeeds and returns TRUE, the returned format id
-may be ULINT_UNDEFINED signalling that the format id was not present
-in the data file.
-@return TRUE if call succeeds */
-UNIV_INTERN
-ibool
-trx_sys_read_file_format_id(
-/*========================*/
-	const char *pathname,  /*!< in: pathname of the first system
-				        table space file */
-	ulint *format_id)      /*!< out: file format of the system table
-				         space */
-{
-	os_file_t	file;
-	ibool		success;
-	byte		buf[UNIV_PAGE_SIZE * 2];
-	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
-	const byte*	ptr;
-	ib_id_t		file_format_id;
-
-	*format_id = ULINT_UNDEFINED;
-
-	file = os_file_create_simple_no_error_handling(
-		innodb_file_data_key,
-		pathname,
-		OS_FILE_OPEN,
-		OS_FILE_READ_ONLY,
-		&success
-	);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-
-		ut_print_timestamp(stderr);
-
-		fprintf(stderr,
-"  ibbackup: Error: trying to read system tablespace file format,\n"
-"  ibbackup: but could not open the tablespace file %s!\n",
-			pathname
-		);
-		return(FALSE);
-	}
-
-	/* Read the page on which file format is stored */
-
-	success = os_file_read_no_error_handling(
-		file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
-	);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-
-		ut_print_timestamp(stderr);
-
-		fprintf(stderr,
-"  ibbackup: Error: trying to read system table space file format,\n"
-"  ibbackup: but failed to read the tablespace file %s!\n",
-			pathname
-		);
-		os_file_close(file);
-		return(FALSE);
-	}
-	os_file_close(file);
-
-	/* get the file format from the page */
-	ptr = page + TRX_SYS_FILE_FORMAT_TAG;
-	file_format_id = mach_read_from_8(ptr);
-	file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
-
-	if (file_format_id >= FILE_FORMAT_NAME_N) {
-
-		/* Either it has never been tagged, or garbage in it. */
-		return(TRUE);
-	}
-
-	*format_id = (ulint) file_format_id;
-
-	return(TRUE);
-}
-
-
-/*****************************************************************//**
-Reads the file format id from the given per-table data file.
-@return TRUE if call succeeds */
-UNIV_INTERN
-ibool
-trx_sys_read_pertable_file_format_id(
-/*=================================*/
-	const char *pathname,  /*!< in: pathname of a per-table
-				        datafile */
-	ulint *format_id)      /*!< out: file format of the per-table
-				         data file */
-{
-	os_file_t	file;
-	ibool		success;
-	byte		buf[UNIV_PAGE_SIZE * 2];
-	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
-	const byte*	ptr;
-	ib_uint32_t	flags;
-
-	*format_id = ULINT_UNDEFINED;
-
-	file = os_file_create_simple_no_error_handling(
-		innodb_file_data_key,
-		pathname,
-		OS_FILE_OPEN,
-		OS_FILE_READ_ONLY,
-		&success
-	);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-        
-		ut_print_timestamp(stderr);
-        
-		fprintf(stderr,
-"  ibbackup: Error: trying to read per-table tablespace format,\n"
-"  ibbackup: but could not open the tablespace file %s!\n",
-			pathname
-		);
-		return(FALSE);
-	}
-
-	/* Read the first page of the per-table datafile */
-
-	success = os_file_read_no_error_handling(
-		file, page, 0, 0, UNIV_PAGE_SIZE
-	);
-	if (!success) {
-		/* The following call prints an error message */
-		os_file_get_last_error(TRUE);
-        
-		ut_print_timestamp(stderr);
-        
-		fprintf(stderr,
-"  ibbackup: Error: trying to per-table data file format,\n"
-"  ibbackup: but failed to read the tablespace file %s!\n",
-			pathname
-		);
-		os_file_close(file);
-		return(FALSE);
-	}
-	os_file_close(file);
-
-	/* get the file format from the page */
-	ptr = page + 54;
-	flags = mach_read_from_4(ptr);
-	if (flags == 0) {
-		/* file format is Antelope */
-		*format_id = 0;
-		return (TRUE);
-	} else if (flags & 1) {
-		/* tablespace flags are ok */
-		*format_id = (flags / 32) % 128;
-		return (TRUE);
-	} else {
-		/* bad tablespace flags */
-		return(FALSE);
-	}
-}
-
-
-/*****************************************************************//**
-Get the name representation of the file format from its id.
-@return	pointer to the name */
-UNIV_INTERN
-const char*
-trx_sys_file_format_id_to_name(
-/*===========================*/
-	const ulint	id)	/*!< in: id of the file format */
-{
-	if (!(id < FILE_FORMAT_NAME_N)) {
-		/* unknown id */
-		return ("Unknown");
-	}
-
-	return(file_format_name_map[id]);
-}
-
-#endif /* !UNIV_HOTBACKUP */
-
-#ifndef UNIV_HOTBACKUP
-/*********************************************************************
-Shutdown/Close the transaction system. */
-UNIV_INTERN
-void
-trx_sys_close(void)
-/*===============*/
-{
-	trx_t*		trx;
-	trx_rseg_t*	rseg;
-	read_view_t*	view;
-
-	ut_ad(trx_sys != NULL);
-	ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
-
-	/* Check that all read views are closed except read view owned
-	by a purge. */
-
-	if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
-		fprintf(stderr,
-			"InnoDB: Error: all read views were not closed"
-			" before shutdown:\n"
-			"InnoDB: %lu read views open \n",
-			UT_LIST_GET_LEN(trx_sys->view_list) - 1);
-	}
-
-	sess_close(trx_dummy_sess);
-	trx_dummy_sess = NULL;
-
-	trx_purge_sys_close();
-
-	mutex_enter(&kernel_mutex);
-
-	/* Free the double write data structures. */
-	ut_a(trx_doublewrite != NULL);
-	ut_free(trx_doublewrite->write_buf_unaligned);
-	trx_doublewrite->write_buf_unaligned = NULL;
-
-	mem_free(trx_doublewrite->buf_block_arr);
-	trx_doublewrite->buf_block_arr = NULL;
-
-	mutex_free(&trx_doublewrite->mutex);
-	mem_free(trx_doublewrite);
-	trx_doublewrite = NULL;
-
-	/* Only prepared transactions may be left in the system. Free them. */
-	ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared);
-
-	while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) {
-		trx_free_prepared(trx);
-	}
-
-	/* There can't be any active transactions. */
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-
-	while (rseg != NULL) {
-		trx_rseg_t*	prev_rseg = rseg;
-
-		rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
-		UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
-
-		trx_rseg_mem_free(prev_rseg);
-	}
-
-	view = UT_LIST_GET_FIRST(trx_sys->view_list);
-
-	while (view != NULL) {
-		read_view_t*	prev_view = view;
-
-		view = UT_LIST_GET_NEXT(view_list, prev_view);
-
-		/* Views are allocated from the trx_sys->global_read_view_heap.
-		So, we simply remove the element here. */
-		UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
-	ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
-	ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
-	ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
-
-	ut_ad(trx_sys->descr_n_used == 0);
-	ut_free(trx_sys->descriptors);
-
-	mem_free(trx_sys);
-
-	trx_sys = NULL;
-	mutex_exit(&kernel_mutex);
-}
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0sys.cc b/storage/xtradb/trx/trx0sys.cc
new file mode 100644
index 00000000000..b86ee90b1e9
--- /dev/null
+++ b/storage/xtradb/trx/trx0sys.cc
@@ -0,0 +1,1323 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+#include "read0read.h"
+
+/** The file format tag structure with id and name. */
+struct file_format_t {
+	ulint		id;		/*!< id of the file format */
+	const char*	name;		/*!< text representation of the
+					file format */
+	ib_mutex_t		mutex;		/*!< covers changes to the above
+					fields */
+};
+
+/** The transaction system */
+UNIV_INTERN trx_sys_t*		trx_sys		= NULL;
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+UNIV_INTERN char	trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Master binlog file position.  We have successfully got the updates
+up to this position.  -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+UNIV_INTERN ib_int64_t	trx_sys_mysql_master_log_pos	= -1;
+/* @} */
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+UNIV_INTERN char	trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Binlog file position, or -1 if unknown */
+UNIV_INTERN ib_int64_t	trx_sys_mysql_bin_log_pos	= -1;
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** List of animal names representing file format. */
+static const char*	file_format_name_map[] = {
+	"Antelope",
+	"Barracuda",
+	"Cheetah",
+	"Dragon",
+	"Elk",
+	"Fox",
+	"Gazelle",
+	"Hornet",
+	"Impala",
+	"Jaguar",
+	"Kangaroo",
+	"Leopard",
+	"Moose",
+	"Nautilus",
+	"Ocelot",
+	"Porpoise",
+	"Quail",
+	"Rabbit",
+	"Shark",
+	"Tiger",
+	"Urchin",
+	"Viper",
+	"Whale",
+	"Xenops",
+	"Yak",
+	"Zebra"
+};
+
+/** The number of elements in the file format name array. */
+static const ulint	FILE_FORMAT_NAME_N
+	= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	file_format_max_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	trx_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+UNIV_INTERN uint	trx_rseg_n_slots_debug = 0;
+#endif
+
+/** This is used to track the maximum file format id known to InnoDB. It's
+updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
+or create a table. */
+static	file_format_t	file_format_max;
+
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Checks whether a trx is in one of rw_trx_list or ro_trx_list.
+@return	TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+	const trx_t*	in_trx)	/*!< in: transaction */
+{
+	const trx_t*	trx;
+	trx_list_t*	trx_list;
+
+	/* Non-locking autocommits should not hold any locks. */
+	assert_trx_in_list(in_trx);
+
+	trx_list = in_trx->read_only
+		? &trx_sys->ro_trx_list : &trx_sys->rw_trx_list;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_assert_started(in_trx));
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL && trx != in_trx;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+	}
+
+	return(trx != NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+	mtr_t		mtr;
+	trx_sysf_t*	sys_header;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	if (!srv_read_only_mode) {
+		mtr_start(&mtr);
+
+		sys_header = trx_sysf_get(&mtr);
+
+		mlog_write_ull(
+			sys_header + TRX_SYS_TRX_ID_STORE,
+			trx_sys->max_trx_id, &mtr);
+
+		mtr_commit(&mtr);
+	}
+}
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	const char*	file_name,/*!< in: MySQL log file name */
+	ib_int64_t	offset,	/*!< in: position in that log file */
+	ulint		field,	/*!< in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_sysf_t*	sys_header;
+
+	if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
+
+		/* We cannot fit the name to the 512 bytes we have reserved */
+
+		return;
+	}
+
+	sys_header = trx_sysf_get(mtr);
+
+	if (mach_read_from_4(sys_header + field
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mlog_write_ulint(sys_header + field
+				 + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+				 TRX_SYS_MYSQL_LOG_MAGIC_N,
+				 MLOG_4BYTES, mtr);
+	}
+
+	if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
+			file_name)) {
+
+		mlog_write_string(sys_header + field
+				  + TRX_SYS_MYSQL_LOG_NAME,
+				  (byte*) file_name, 1 + ut_strlen(file_name),
+				  mtr);
+	}
+
+	if (mach_read_from_4(sys_header + field
+			     + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+	    || (offset >> 32) > 0) {
+
+		mlog_write_ulint(sys_header + field
+				 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+				 (ulint)(offset >> 32),
+				 MLOG_4BYTES, mtr);
+	}
+
+	mlog_write_ulint(sys_header + field
+			 + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+			 (ulint)(offset & 0xFFFFFFFFUL),
+			 MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Reads the MySQL binlog offset info from the trx system header if the magic
+number shows it valid, stores it in globals, and prints it to stderr. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+	ulint		trx_sys_mysql_bin_log_pos_high;
+	ulint		trx_sys_mysql_bin_log_pos_low;
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+		sys_header + TRX_SYS_MYSQL_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+	trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+		sys_header + TRX_SYS_MYSQL_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+	trx_sys_mysql_bin_log_pos
+		= (((ib_int64_t) trx_sys_mysql_bin_log_pos_high) << 32)
+		+ (ib_int64_t) trx_sys_mysql_bin_log_pos_low;
+
+	ut_memcpy(trx_sys_mysql_bin_log_name,
+		  sys_header + TRX_SYS_MYSQL_LOG_INFO
+		  + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+	fprintf(stderr,
+		"InnoDB: Last MySQL binlog file position %lu %lu,"
+		" file name %s\n",
+		trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
+		trx_sys_mysql_bin_log_name);
+
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	fprintf(stderr,
+		"InnoDB: In a MySQL replication slave the last"
+		" master binlog file\n"
+		"InnoDB: position %lu %lu, file name %s\n",
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_NAME);
+	/* Copy the master log position info to global variables we can
+	use in ha_innobase.cc to initialize glob_mi to right values */
+
+	ut_memcpy(trx_sys_mysql_master_log_name,
+		  sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+		  + TRX_SYS_MYSQL_LOG_NAME,
+		  TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+	trx_sys_mysql_master_log_pos
+		= (((ib_int64_t) mach_read_from_4(
+			    sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			    + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+		+ ((ib_int64_t) mach_read_from_4(
+			   sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			   + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+	mtr_commit(&mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return	slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint		i;
+	trx_sysf_t*	sys_header;
+
+	sys_header = trx_sysf_get(mtr);
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+		ulint	page_no;
+
+		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+		if (page_no == FIL_NULL) {
+
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	trx_sysf_t*	sys_header;
+	ulint		slot_no;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		page_no;
+	byte*		ptr;
+	ulint		len;
+
+	ut_ad(mtr);
+
+	/* Note that below we first reserve the file space x-latch, and
+	then enter the kernel: we must do it in this order to conform
+	to the latching order rules. */
+
+	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
+
+	/* Create the trx sys file block in a new allocated file segment */
+	block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+			    mtr);
+	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+	ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+			 MLOG_2BYTES, mtr);
+
+	/* Reset the doublewrite buffer magic number to zero so that we
+	know that the doublewrite buffer has not yet been created (this
+	suppresses a Valgrind warning) */
+
+	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+	sys_header = trx_sysf_get(mtr);
+
+	/* Start counting transaction ids from number 1 up */
+	mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
+
+	/* Reset the rollback segment slots.  Old versions of InnoDB
+	define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
+	that the whole array is initialized. */
+	ptr = TRX_SYS_RSEGS + sys_header;
+	len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
+		* TRX_SYS_RSEG_SLOT_SIZE;
+	memset(ptr, 0xff, len);
+	ptr += len;
+	ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
+
+	/* Initialize all of the page.  This part used to be uninitialized. */
+	memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
+
+	mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+			+ page - sys_header, mtr);
+
+	/* Create the first rollback segment in the SYSTEM tablespace */
+	slot_no = trx_sysf_rseg_find_free(mtr);
+	page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
+					 mtr);
+
+	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+	ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
+}
+
+/*****************************************************************//**
+Compare two rseg_queue_t elements on trx_no (binary heap ordering callback).
+@return -1 if p1 sorts before p2, 1 if after, 0 if their trx_no are equal */
+static
+int
+trx_rseg_compare_last_trx_no(
+/*=========================*/
+	const void*	p1,		/*!< in: elem to compare */
+	const void*	p2)		/*!< in: elem to compare */
+{
+	const rseg_queue_t*	rseg_q1 = (const rseg_queue_t*) p1;
+	const rseg_queue_t*	rseg_q2 = (const rseg_queue_t*) p2;
+
+	/* Compare the values directly instead of testing the sign of
+	their difference: the difference can wrap or overflow when the
+	two values are far apart, which would invert the ordering. */
+	if (rseg_q1->trx_no < rseg_q2->trx_no) {
+		return(-1);
+	} else if (rseg_q1->trx_no > rseg_q2->trx_no) {
+		return(1);
+	}
+
+	return(0);
+}
+
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
+UNIV_INTERN
+ib_bh_t*
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+	mtr_t		mtr;
+	ib_bh_t*	ib_bh;
+	trx_sysf_t*	sys_header;
+	ib_uint64_t	rows_to_undo	= 0;
+	const char*	unit		= "";
+
+	/* We create the min binary heap here and pass ownership to
+	purge when we init the purge sub-system. Purge is responsible
+	for freeing the binary heap. */
+
+	ib_bh = ib_bh_create(
+		trx_rseg_compare_last_trx_no,
+		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
+
+	mtr_start(&mtr);
+
+	/* Allocate the trx descriptors array */
+	trx_sys->descriptors = static_cast<trx_id_t*>(
+		ut_malloc(sizeof(trx_id_t) *
+			  TRX_DESCR_ARRAY_INITIAL_SIZE));
+	trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE;
+	trx_sys->descr_n_used = 0;
+	srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE *
+		sizeof(trx_id_t);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+		trx_rseg_array_init(sys_header, ib_bh, &mtr);
+	}
+
+	/* VERY important: after the database is started, max_trx_id value is
+	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
+	trx_sys_get_new_trx_id will evaluate to TRUE when the function
+	is first time called, and the value for trx id will be written
+	to the disk-based header! Thus trx id values will not overlap when
+	the database is repeatedly started! */
+
+	trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
+		+ ut_uint64_align_up(mach_read_from_8(sys_header
+						   + TRX_SYS_TRX_ID_STORE),
+				     TRX_SYS_TRX_ID_WRITE_MARGIN);
+
+	ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id);
+
+	UT_LIST_INIT(trx_sys->mysql_trx_list);
+
+	trx_dummy_sess = sess_open();
+
+	trx_lists_init_at_db_start();
+
+	/* This mutex is not strictly required, it is here only to satisfy
+	the debug code (assertions). We are still running in single threaded
+	bootstrap mode. */
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+
+	if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) {
+		const trx_t*	trx;
+
+		for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+		     trx != NULL;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+			ut_ad(trx->is_recovered);
+			assert_trx_in_rw_list(trx);
+
+			if (trx_state_eq(trx, TRX_STATE_ACTIVE)) {
+				rows_to_undo += trx->undo_no;
+			}
+		}
+
+		if (rows_to_undo > 1000000000) {
+			unit = "M";
+			rows_to_undo = rows_to_undo / 1000000;
+		}
+
+		fprintf(stderr,
+			"InnoDB: %lu transaction(s) which must be"
+			" rolled back or cleaned up\n"
+			"InnoDB: in total %lu%s row operations to undo\n",
+			(ulong) UT_LIST_GET_LEN(trx_sys->rw_trx_list),
+			(ulong) rows_to_undo, unit);
+
+		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
+			trx_sys->max_trx_id);
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	UT_LIST_INIT(trx_sys->view_list);
+
+	mtr_commit(&mtr);
+
+	return(ib_bh);
+}
+
+/*****************************************************************//**
+Creates the trx_sys instance and initializes its mutex. */
+UNIV_INTERN
+void
+trx_sys_create(void)
+/*================*/
+{
+	ut_ad(trx_sys == NULL);
+
+	trx_sys = static_cast<trx_sys_t*>(mem_zalloc(sizeof(*trx_sys)));
+
+	mutex_create(trx_sys_mutex_key, &trx_sys->mutex, SYNC_TRX_SYS);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+
+	trx_sysf_create(&mtr);
+
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Update the file format tag.
+@return	always TRUE */
+static
+ibool
+trx_sys_file_format_max_write(
+/*==========================*/
+	ulint		format_id,	/*!< in: file format id */
+	const char**	name)		/*!< out: max file format name, can
+					be NULL */
+{
+	mtr_t		mtr;
+	byte*		ptr;
+	buf_block_t*	block;
+	ib_uint64_t	tag_value;
+
+	mtr_start(&mtr);
+
+	block = buf_page_get(
+		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+	file_format_max.id = format_id;
+	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+	tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+	if (name) {
+		*name = file_format_max.name;
+	}
+
+	mlog_write_ull(ptr, tag_value, &mtr);
+
+	mtr_commit(&mtr);
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Read the file format tag.
+@return	the file format or ULINT_UNDEFINED if not set. */
+static
+ulint
+trx_sys_file_format_max_read(void)
+/*==============================*/
+{
+	mtr_t			mtr;
+	const byte*		ptr;
+	const buf_block_t*	block;
+	ib_id_t			file_format_id;
+
+	/* Since this is called during the startup phase it's safe to
+	read the value without a covering mutex. */
+	mtr_start(&mtr);
+
+	block = buf_page_get(
+		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+	file_format_id = mach_read_from_8(ptr);
+
+	mtr_commit(&mtr);
+
+	file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+	if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+		/* Either it has never been tagged, or garbage in it. */
+		return(ULINT_UNDEFINED);
+	}
+
+	return((ulint) file_format_id);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id)	/*!< in: id of the file format */
+{
+	ut_a(id < FILE_FORMAT_NAME_N);
+
+	return(file_format_name_map[id]);
+}
+
+/*****************************************************************//**
+Check for the max file format tag stored on disk. Note: If max_format_id
+is == UNIV_FORMAT_MAX + 1 then we only print a warning.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_sys_file_format_max_check(
+/*==========================*/
+	ulint	max_format_id)	/*!< in: max format id to check */
+{
+	ulint	format_id;
+
+	/* Check the file format in the tablespace. Do not try to
+	recover if the file format is not supported by the engine
+	unless forced by the user. */
+	format_id = trx_sys_file_format_max_read();
+	if (format_id == ULINT_UNDEFINED) {
+		/* Format ID was not set. Set it to minimum possible
+		value. */
+		format_id = UNIV_FORMAT_MIN;
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Highest supported file format is %s.",
+		trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX));
+
+	if (format_id > UNIV_FORMAT_MAX) {
+
+		ut_a(format_id < FILE_FORMAT_NAME_N);
+
+		ib_logf(max_format_id <= UNIV_FORMAT_MAX
+			? IB_LOG_LEVEL_ERROR : IB_LOG_LEVEL_WARN,
+			"The system tablespace is in a file "
+			"format that this version doesn't support - %s.",
+			trx_sys_file_format_id_to_name(format_id));
+
+		if (max_format_id <= UNIV_FORMAT_MAX) {
+			return(DB_ERROR);
+		}
+	}
+
+	format_id = (format_id > max_format_id) ? format_id : max_format_id;
+
+	/* We don't need a mutex here, as this function should only
+	be called once at start up. */
+	file_format_max.id = format_id;
+	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set the file format id unconditionally except if it's already the
+same value.
+@return	TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+	ulint		format_id,	/*!< in: file format id */
+	const char**	name)		/*!< out: max file format name or
+					NULL if not needed. */
+{
+	ibool		ret = FALSE;
+
+	ut_a(format_id <= UNIV_FORMAT_MAX);
+
+	mutex_enter(&file_format_max.mutex);
+
+	/* Only update if not already same value. */
+	if (format_id != file_format_max.id) {
+
+		ret = trx_sys_file_format_max_write(format_id, name);
+	}
+
+	mutex_exit(&file_format_max.mutex);
+
+	return(ret);
+}
+
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void)
+/*==============================*/
+{
+	ulint	format_id;
+
+	format_id = trx_sys_file_format_max_read();
+
+	/* If format_id is not set then set it to the minimum. */
+	if (format_id == ULINT_UNDEFINED) {
+		trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL);
+	}
+}
+
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return	TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+	const char**	name,		/*!< out: max file format name */
+	ulint		format_id)	/*!< in: file format identifier */
+{
+	ibool		ret = FALSE;
+
+	ut_a(name);
+	ut_a(file_format_max.name != NULL);
+	ut_a(format_id <= UNIV_FORMAT_MAX);
+
+	mutex_enter(&file_format_max.mutex);
+
+	if (format_id > file_format_max.id) {
+
+		ret = trx_sys_file_format_max_write(format_id, name);
+	}
+
+	mutex_exit(&file_format_max.mutex);
+
+	return(ret);
+}
+
+/*****************************************************************//**
+Get the name of the currently known max file format.
+@return	pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void)
+/*=============================*/
+{
+	return(file_format_max.name);
+}
+
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void)
+/*==========================*/
+{
+	mutex_create(file_format_max_mutex_key,
+		     &file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+
+	/* We don't need a mutex here, as this function should only
+	be called once at start up. */
+	file_format_max.id = UNIV_FORMAT_MIN;
+
+	file_format_max.name = trx_sys_file_format_id_to_name(
+		file_format_max.id);
+}
+
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void)
+/*===========================*/
+{
+	/* Does nothing at the moment */
+}
+
+/*********************************************************************
+Creates the rollback segments.
+@return number of active rollback segments, or ULINT_UNDEFINED if skipped */
+UNIV_INTERN
+ulint
+trx_sys_create_rsegs(
+/*=================*/
+	ulint	n_spaces,	/*!< number of tablespaces for UNDO logs */
+	ulint	n_rsegs)	/*!< number of rollback segments to create */
+{
+	mtr_t	mtr;
+	ulint	n_used;
+
+	ut_a(n_spaces < TRX_SYS_N_RSEGS);
+	ut_a(n_rsegs <= TRX_SYS_N_RSEGS);
+
+	if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) {
+		return(ULINT_UNDEFINED);
+	}
+
+	/* This is executed in single-threaded mode therefore it is not
+	necessary to use the same mtr in trx_rseg_create(). n_used cannot
+	change while the function is executing. */
+
+	mtr_start(&mtr);
+	n_used = trx_sysf_rseg_find_free(&mtr);
+	mtr_commit(&mtr);
+
+	if (n_used == ULINT_UNDEFINED) {
+		n_used = TRX_SYS_N_RSEGS;
+	}
+
+	/* Do not create additional rollback segments if innodb_force_recovery
+	has been set and the database was not shutdown cleanly. */
+
+	if (!srv_force_recovery && !recv_needed_recovery && n_used < n_rsegs) {
+		ulint	i;
+		ulint	new_rsegs = n_rsegs - n_used;
+
+		for (i = 0; i < new_rsegs; ++i) {
+			ulint	space;
+
+			/* Tablespace 0 is the system tablespace. All UNDO
+			log tablespaces start from 1. */
+
+			if (n_spaces > 0) {
+				space = (i % n_spaces) + 1;
+			} else {
+				space = 0; /* System tablespace */
+			}
+
+			if (trx_rseg_create(space) != NULL) {
+				++n_used;
+			} else {
+				break;
+			}
+		}
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"%lu rollback segment(s) are active.", (ulong) n_used);
+
+	return(n_used);
+}
+
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+	const byte*	page)	/*!< in: buffer containing the trx
+				system header page, i.e., page number
+				TRX_SYS_PAGE_NO in the tablespace */
+{
+	const trx_sysf_t*	sys_header;
+
+	sys_header = page + TRX_SYS;
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		fprintf(stderr,
+			"ibbackup: Last MySQL binlog file position %lu %lu,"
+			" file name %s\n",
+			(ulong) mach_read_from_4(
+				sys_header + TRX_SYS_MYSQL_LOG_INFO
+				+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+			(ulong) mach_read_from_4(
+				sys_header + TRX_SYS_MYSQL_LOG_INFO
+				+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+			sys_header + TRX_SYS_MYSQL_LOG_INFO
+			+ TRX_SYS_MYSQL_LOG_NAME);
+	}
+}
+
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+	const char *pathname,  /*!< in: pathname of the first system
+				        table space file */
+	ulint *format_id)      /*!< out: file format of the system table
+				         space */
+{
+	os_file_t	file;
+	ibool		success;
+	byte		buf[UNIV_PAGE_SIZE * 2];
+	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
+	const byte*	ptr;
+	ib_id_t		file_format_id;
+
+	*format_id = ULINT_UNDEFINED;
+
+	file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key,
+		pathname,
+		OS_FILE_OPEN,
+		OS_FILE_READ_ONLY,
+		&success
+	);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  ibbackup: Error: trying to read system tablespace "
+			"file format,\n"
+			"  ibbackup: but could not open the tablespace "
+			"file %s!\n", pathname);
+		return(FALSE);
+	}
+
+	/* Read the page on which file format is stored */
+
+	success = os_file_read_no_error_handling(
+		file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, UNIV_PAGE_SIZE);
+
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  ibbackup: Error: trying to read system tablespace "
+			"file format,\n"
+			"  ibbackup: but failed to read the tablespace "
+			"file %s!\n", pathname);
+
+		os_file_close(file);
+		return(FALSE);
+	}
+	os_file_close(file);
+
+	/* get the file format from the page */
+	ptr = page + TRX_SYS_FILE_FORMAT_TAG;
+	file_format_id = mach_read_from_8(ptr);
+	file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+	if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+		/* Either it has never been tagged, or garbage in it. */
+		return(TRUE);
+	}
+
+	*format_id = (ulint) file_format_id;
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds, FALSE on I/O error or bad tablespace flags */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+	const char *pathname,  /*!< in: pathname of a per-table
+				        datafile */
+	ulint *format_id)      /*!< out: file format of the per-table
+				         data file */
+{
+	os_file_t	file;
+	ibool		success;
+	byte		buf[UNIV_PAGE_SIZE * 2];
+	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
+	const byte*	ptr;
+	ib_uint32_t	flags;
+
+	*format_id = ULINT_UNDEFINED;
+
+	file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key,
+		pathname,
+		OS_FILE_OPEN,
+		OS_FILE_READ_ONLY,
+		&success
+	);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  ibbackup: Error: trying to read per-table "
+			"tablespace format,\n"
+			"  ibbackup: but could not open the tablespace "
+			"file %s!\n", pathname);
+
+		return(FALSE);
+	}
+
+	/* Read the first page of the per-table datafile */
+
+	success = os_file_read_no_error_handling(file, page, 0, UNIV_PAGE_SIZE);
+
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  ibbackup: Error: trying to read per-table data "
+			"file format,\n"
+			"  ibbackup: but failed to read the tablespace "
+			"file %s!\n", pathname);
+
+		os_file_close(file);
+		return(FALSE);
+	}
+	os_file_close(file);
+
+	/* read the 4-byte tablespace flags at byte offset 54 of the page */
+	ptr = page + 54;
+	flags = mach_read_from_4(ptr);
+	if (flags == 0) {
+		/* file format is Antelope */
+		*format_id = 0;
+		return(TRUE);
+	} else if (flags & 1) {
+		/* tablespace flags are ok */
+		*format_id = (flags / 32) % 128;
+		return(TRUE);
+	} else {
+		/* bad tablespace flags */
+		return(FALSE);
+	}
+}
+
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id)	/*!< in: id of the file format */
+{
+	if (!(id < FILE_FORMAT_NAME_N)) {
+		/* unknown id */
+		return("Unknown");
+	}
+
+	return(file_format_name_map[id]);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+	ulint		i;
+	trx_t*		trx;
+	read_view_t*	view;
+
+	ut_ad(trx_sys != NULL);
+	ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+	/* Check that all read views are closed except read view owned
+	by a purge. */
+
+	mutex_enter(&trx_sys->mutex);
+
+	if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+		fprintf(stderr,
+			"InnoDB: Error: all read views were not closed"
+			" before shutdown:\n"
+			"InnoDB: %lu read views open \n",
+			(ulong) (UT_LIST_GET_LEN(trx_sys->view_list) - 1));
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	sess_close(trx_dummy_sess);
+	trx_dummy_sess = NULL;
+
+	trx_purge_sys_close();
+
+	/* Free the double write data structures. */
+	buf_dblwr_free();
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+
+	/* Only prepared transactions may be left in the system. Free them. */
+	ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx);
+
+	while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) {
+		trx_free_prepared(trx);
+	}
+
+	/* Free the rollback segments; stop at the first empty slot. */
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_rseg_t*	rseg;
+
+		rseg = trx_sys->rseg_array[i];
+
+		if (rseg != NULL) {
+			trx_rseg_mem_free(rseg);
+		} else {
+			break;
+		}
+	}
+
+	view = UT_LIST_GET_FIRST(trx_sys->view_list);
+
+	while (view != NULL) {
+		read_view_t*	prev_view = view;
+
+		view = UT_LIST_GET_NEXT(view_list, prev_view);
+
+		/* Views are allocated from the trx_sys->global_read_view_heap.
+		So, we simply remove the element here. */
+		UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
+
+	mutex_exit(&trx_sys->mutex);
+
+	mutex_free(&trx_sys->mutex);
+
+	ut_ad(trx_sys->descr_n_used == 0);
+	ut_free(trx_sys->descriptors);
+
+	mem_free(trx_sys);
+
+	trx_sys = NULL;
+}
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
+UNIV_INTERN
+ulint
+trx_sys_any_active_transactions(void)
+/*=================================*/
+{
+	ulint	total_trx = 0;
+
+	mutex_enter(&trx_sys->mutex);
+
+	total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list)
+		  + UT_LIST_GET_LEN(trx_sys->mysql_trx_list);
+
+	ut_a(total_trx >= trx_sys->n_prepared_trx);
+	total_trx -= trx_sys->n_prepared_trx;
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(total_trx);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_list_t.
+@return TRUE if valid. */
+static
+ibool
+trx_sys_validate_trx_list_low(
+/*===========================*/
+	trx_list_t*	trx_list)	/*!< in: &trx_sys->ro_trx_list
+					or &trx_sys->rw_trx_list */
+{
+	const trx_t*	trx;
+	const trx_t*	prev_trx = NULL;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->ro_trx_list
+	      || trx_list == &trx_sys->rw_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) {
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		ut_a(prev_trx == NULL || prev_trx->id > trx->id);
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Validate the trx_sys_t::ro_trx_list and trx_sys_t::rw_trx_list.
+@return TRUE if lists are valid. */
+UNIV_INTERN
+ibool
+trx_sys_validate_trx_list(void)
+/*===========================*/
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_a(trx_sys_validate_trx_list_low(&trx_sys->ro_trx_list));
+	ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list));
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
deleted file mode 100644
index b2df6d471f1..00000000000
--- a/storage/xtradb/trx/trx0trx.c
+++ /dev/null
@@ -1,2445 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file trx/trx0trx.c
-The transaction
-
-Created 3/26/1996 Heikki Tuuri
-*******************************************************/
-
-#include "trx0trx.h"
-
-#ifdef UNIV_NONINL
-#include "trx0trx.ic"
-#endif
-
-#include "trx0undo.h"
-#include "trx0rseg.h"
-#include "log0log.h"
-#include "que0que.h"
-#include "lock0lock.h"
-#include "trx0roll.h"
-#include "usr0sess.h"
-#include "read0read.h"
-#include "srv0srv.h"
-#include "btr0sea.h"
-#include "os0proc.h"
-#include "trx0xa.h"
-#include "trx0purge.h"
-#include "ha_prototypes.h"
-
-/** Dummy session used currently in MySQL interface */
-UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
-
-/** Number of transactions currently allocated for MySQL: protected by
-the kernel mutex */
-UNIV_INTERN ulint	trx_n_mysql_transactions = 0;
-/** Number of transactions currently in the XA PREPARED state: protected by
-the kernel mutex */
-UNIV_INTERN ulint	trx_n_prepared = 0;
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-/*************************************************************//**
-Set detailed error message for the transaction. */
-UNIV_INTERN
-void
-trx_set_detailed_error(
-/*===================*/
-	trx_t*		trx,	/*!< in: transaction struct */
-	const char*	msg)	/*!< in: detailed error message */
-{
-	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
-}
-
-/*************************************************************//**
-Set detailed error message for the transaction from a file. Note that the
-file is rewinded before reading from it. */
-UNIV_INTERN
-void
-trx_set_detailed_error_from_file(
-/*=============================*/
-	trx_t*	trx,	/*!< in: transaction struct */
-	FILE*	file)	/*!< in: file to read message from */
-{
-	os_file_read_string(file, trx->detailed_error,
-			    sizeof(trx->detailed_error));
-}
-
-/*************************************************************//**
-Callback function for trx_find_descriptor() to compare trx IDs. */
-UNIV_INTERN
-int
-trx_descr_cmp(
-/*==========*/
-	const void *a,	/*!< in: pointer to first comparison argument */
-	const void *b)	/*!< in: pointer to second comparison argument */
-{
-	const trx_id_t*	da = (const trx_id_t*) a;
-	const trx_id_t*	db = (const trx_id_t*) b;
-
-	if (*da < *db) {
-		return -1;
-	} else if (*da > *db) {
-		return 1;
-	}
-
-	return 0;
-}
-
-/*************************************************************//**
-Reserve a slot for a given trx in the global descriptors array. */
-UNIV_INLINE
-void
-trx_reserve_descriptor(
-/*===================*/
-	const trx_t* trx)	/*!< in: trx pointer */
-{
-	ulint		n_used;
-	ulint		n_max;
-	trx_id_t*	descr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(!trx_find_descriptor(trx_sys->descriptors,
-				   trx_sys->descr_n_used,
-				   trx->id));
-
-	n_used = trx_sys->descr_n_used + 1;
-	n_max = trx_sys->descr_n_max;
-
-	if (UNIV_UNLIKELY(n_used > n_max)) {
-
-		n_max = n_max * 2;
-
-		trx_sys->descriptors =
-			ut_realloc(trx_sys->descriptors,
-				   n_max * sizeof(trx_id_t));
-
-		trx_sys->descr_n_max = n_max;
-		srv_descriptors_memory = n_max * sizeof(trx_id_t);
-	}
-
-	descr = trx_sys->descriptors + n_used - 1;
-
-	if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) {
-
-		/* Find the slot where it should be inserted. We could use a
-		binary search, but in reality linear search should be faster,
-		because the slot we are looking for is near the array end. */
-
-		trx_id_t*	tdescr;
-
-		for (tdescr = descr - 1;
-		     tdescr >= trx_sys->descriptors && *tdescr > trx->id;
-		     tdescr--) {
-		}
-
-		tdescr++;
-
-		ut_memmove(tdescr + 1, tdescr, (descr - tdescr) *
-			   sizeof(trx_id_t));
-
-		descr = tdescr;
-	}
-
-	*descr = trx->id;
-
-	trx_sys->descr_n_used = n_used;
-}
-
-/*************************************************************//**
-Release a slot for a given trx in the global descriptors array. */
-UNIV_INTERN
-void
-trx_release_descriptor(
-/*===================*/
-	trx_t* trx)	/*!< in: trx pointer */
-{
-	ulint		size;
-	trx_id_t*	descr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (UNIV_LIKELY(trx->is_in_trx_serial_list)) {
-
-		UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list,
-			       trx);
-		trx->is_in_trx_serial_list = 0;
-	}
-
-	descr = trx_find_descriptor(trx_sys->descriptors,
-				    trx_sys->descr_n_used,
-				    trx->id);
-
-	if (UNIV_UNLIKELY(descr == NULL)) {
-
-		return;
-	}
-
-	size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) *
-		sizeof(trx_id_t);
-
-	if (UNIV_LIKELY(size > 0)) {
-
-		ut_memmove(descr, descr + 1, size);
-	}
-
-	trx_sys->descr_n_used--;
-}
-
-/****************************************************************//**
-Creates and initializes a transaction object.
-@return	own: the transaction */
-UNIV_INTERN
-trx_t*
-trx_create(
-/*=======*/
-	sess_t*	sess)	/*!< in: session */
-{
-	trx_t*	trx;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(sess);
-
-	trx = mem_alloc(sizeof(trx_t));
-
-	trx->magic_n = TRX_MAGIC_N;
-
-	trx->op_info = "";
-
-	trx->is_purge = 0;
-	trx->is_recovered = 0;
-	trx->state = TRX_NOT_STARTED;
-
-	trx->is_registered = 0;
-	trx->active_commit_ordered = 0;
-
-	trx->start_time = ut_time();
-
-	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
-
-	trx->id = 0;
-	trx->no = IB_ULONGLONG_MAX;
-	trx->is_in_trx_serial_list = 0;
-
-	trx->support_xa = TRUE;
-
-	trx->fake_changes = FALSE;
-
-	trx->check_foreigns = TRUE;
-	trx->check_unique_secondary = TRUE;
-
-	trx->flush_log_later = FALSE;
-	trx->must_flush_log_later = FALSE;
-
-	trx->dict_operation = TRX_DICT_OP_NONE;
-	trx->table_id = 0;
-
-	trx->mysql_thd = NULL;
-	trx->duplicates = 0;
-
-	trx->n_mysql_tables_in_use = 0;
-	trx->mysql_n_tables_locked = 0;
-
-	trx->mysql_log_file_name = NULL;
-	trx->mysql_log_offset = 0;
-	trx->mysql_master_log_file_name = "";
-	trx->mysql_master_log_pos = 0;
-	trx->mysql_relay_log_file_name = "";
-	trx->mysql_relay_log_pos = 0;
-
-	trx->idle_start = 0;
-	trx->last_stmt_start = 0;
-
-	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
-
-	trx->rseg = NULL;
-
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-	trx->insert_undo = NULL;
-	trx->update_undo = NULL;
-	trx->undo_no_arr = NULL;
-
-	trx->error_state = DB_SUCCESS;
-	trx->error_key_num = 0;
-	trx->detailed_error[0] = '\0';
-
-	trx->sess = sess;
-	trx->que_state = TRX_QUE_RUNNING;
-	trx->n_active_thrs = 0;
-
-	trx->handling_signals = FALSE;
-
-	UT_LIST_INIT(trx->signals);
-	UT_LIST_INIT(trx->reply_signals);
-
-	trx->graph = NULL;
-
-	trx->wait_lock = NULL;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	UT_LIST_INIT(trx->wait_thrs);
-
-	trx->lock_heap = mem_heap_create_in_buffer(256);
-	UT_LIST_INIT(trx->trx_locks);
-
-	UT_LIST_INIT(trx->trx_savepoints);
-
-	trx->dict_operation_lock_mode = 0;
-	trx->has_search_latch = FALSE;
-	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
-
-	trx->declared_to_be_inside_innodb = FALSE;
-	trx->n_tickets_to_enter_innodb = 0;
-
-	trx->global_read_view = NULL;
-	trx->read_view = NULL;
-	trx->prebuilt_view = NULL;
-
-	trx->io_reads = 0;
-	trx->io_read = 0;
-	trx->io_reads_wait_timer = 0;
-	trx->lock_que_wait_timer = 0;
-	trx->innodb_que_wait_timer = 0;
-	trx->distinct_page_access = 0;
-	trx->distinct_page_access_hash = NULL;
-	trx->take_stats = FALSE;
-
-	/* Set X/Open XA transaction identification to NULL */
-	memset(&trx->xid, 0, sizeof(trx->xid));
-	trx->xid.formatID = -1;
-
-	trx->n_autoinc_rows = 0;
-
-	/* Remember to free the vector explicitly. */
-	trx->autoinc_locks = ib_vector_create(
-		mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
-
-	return(trx);
-}
-
-/********************************************************************//**
-Creates a transaction object for MySQL.
-@return	own: transaction object */
-UNIV_INTERN
-trx_t*
-trx_allocate_for_mysql(void)
-/*========================*/
-{
-	trx_t*	trx;
-
-	mutex_enter(&kernel_mutex);
-
-	trx = trx_create(trx_dummy_sess);
-
-	trx_n_mysql_transactions++;
-
-	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
-
-	mutex_exit(&kernel_mutex);
-
-	if (UNIV_UNLIKELY(trx->take_stats)) {
-		trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
-		memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
-	}
-
-	return(trx);
-}
-
-/********************************************************************//**
-Creates a transaction object for background operations by the master thread.
-@return	own: transaction object */
-UNIV_INTERN
-trx_t*
-trx_allocate_for_background(void)
-/*=============================*/
-{
-	trx_t*	trx;
-
-	mutex_enter(&kernel_mutex);
-
-	trx = trx_create(trx_dummy_sess);
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx);
-}
-
-/********************************************************************//**
-Frees a transaction object. */
-UNIV_INTERN
-void
-trx_free(
-/*=====*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (trx->declared_to_be_inside_innodb) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: Freeing a trx which is declared"
-		      " to be processing\n"
-		      "InnoDB: inside InnoDB.\n", stderr);
-		trx_print(stderr, trx, 600);
-		putc('\n', stderr);
-
-		/* This is an error but not a fatal error. We must keep
-		the counters like srv_conc_n_threads accurate. */
-		srv_conc_force_exit_innodb(trx);
-	}
-
-	if (trx->n_mysql_tables_in_use != 0
-	    || trx->mysql_n_tables_locked != 0) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: MySQL is freeing a thd\n"
-			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
-			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
-			(ulong)trx->n_mysql_tables_in_use,
-			(ulong)trx->mysql_n_tables_locked);
-
-		trx_print(stderr, trx, 600);
-
-		ut_print_buf(stderr, trx, sizeof(trx_t));
-		putc('\n', stderr);
-	}
-
-	ut_a(trx->magic_n == TRX_MAGIC_N);
-
-	trx->magic_n = 11112222;
-
-	ut_a(trx->state == TRX_NOT_STARTED);
-
-	mutex_free(&(trx->undo_mutex));
-
-	ut_a(trx->insert_undo == NULL);
-	ut_a(trx->update_undo == NULL);
-
-	if (trx->undo_no_arr) {
-		trx_undo_arr_free(trx->undo_no_arr);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
-	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-
-	ut_a(trx->wait_lock == NULL);
-	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	ut_a(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-#endif
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->lock_heap) {
-		mem_heap_free(trx->lock_heap);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
-
-	if (trx->prebuilt_view != NULL) {
-		read_view_free(trx->prebuilt_view);
-	}
-
-	ut_a(trx->read_view == NULL);
-
-	ut_a(ib_vector_is_empty(trx->autoinc_locks));
-	/* We allocated a dedicated heap for the vector. */
-	ib_vector_free(trx->autoinc_locks);
-
-	trx_release_descriptor(trx);
-
-	mem_free(trx);
-}
-
-/********************************************************************//**
-At shutdown, frees a transaction object that is in the PREPARED state. */
-UNIV_INTERN
-void
-trx_free_prepared(
-/*==============*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_a(trx->state == TRX_PREPARED);
-	ut_a(trx->magic_n == TRX_MAGIC_N);
-
-	/* Prepared transactions are sort of active; they allow
-	ROLLBACK and COMMIT operations. Because the system does not
-	contain any other transactions than prepared transactions at
-	the shutdown stage and because a transaction cannot become
-	PREPARED while holding locks, it is safe to release the locks
-	held by PREPARED transactions here at shutdown.*/
-	lock_release_off_kernel(trx);
-
-	trx_undo_free_prepared(trx);
-
-	mutex_free(&trx->undo_mutex);
-
-	if (trx->undo_no_arr) {
-		trx_undo_arr_free(trx->undo_no_arr);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
-	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-
-	ut_a(trx->wait_lock == NULL);
-	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	ut_a(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!btr_search_own_any());
-#endif
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->lock_heap) {
-		mem_heap_free(trx->lock_heap);
-	}
-
-	ut_a(ib_vector_is_empty(trx->autoinc_locks));
-	ib_vector_free(trx->autoinc_locks);
-
-	trx_release_descriptor(trx);
-
-	if (trx->prebuilt_view != NULL) {
-		read_view_free(trx->prebuilt_view);
-	}
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-
-	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
-
-	mem_free(trx);
-}
-
-/********************************************************************//**
-Frees a transaction object for MySQL. */
-UNIV_INTERN
-void
-trx_free_for_mysql(
-/*===============*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	if (trx->distinct_page_access_hash)
-	{
-		mem_free(trx->distinct_page_access_hash);
-		trx->distinct_page_access_hash= NULL;
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
-
-	trx_free(trx);
-
-	ut_a(trx_n_mysql_transactions > 0);
-
-	trx_n_mysql_transactions--;
-
-	mutex_exit(&kernel_mutex);
-}
-
-/********************************************************************//**
-Frees a transaction object of a background operation of the master thread. */
-UNIV_INTERN
-void
-trx_free_for_background(
-/*====================*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	if (trx->distinct_page_access_hash)
-	{
-		mem_free(trx->distinct_page_access_hash);
-		trx->distinct_page_access_hash= NULL;
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	trx_free(trx);
-
-	mutex_exit(&kernel_mutex);
-}
-
-/****************************************************************//**
-Inserts the trx handle in the trx system trx list in the right position.
-The list is sorted on the trx id so that the biggest id is at the list
-start. This function is used at the database startup to insert incomplete
-transactions to the list. */
-static
-void
-trx_list_insert_ordered(
-/*====================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	trx_t*	trx2;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx2 != NULL) {
-		if (trx->id >= trx2->id) {
-
-			ut_ad(trx->id > trx2->id);
-			break;
-		}
-		trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
-	}
-
-	if (trx2 != NULL) {
-		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
-
-		if (trx2 == NULL) {
-			UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
-		} else {
-			UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
-					     trx2, trx);
-		}
-	} else {
-		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
-	}
-}
-
-/****************************************************************//**
-Creates trx objects for transactions and initializes the trx list of
-trx_sys at database start. Rollback segment and undo log lists must
-already exist when this function is called, because the lists of
-transactions to be rolled back or cleaned up are built based on the
-undo log lists. */
-UNIV_INTERN
-void
-trx_lists_init_at_db_start(void)
-/*============================*/
-{
-	trx_rseg_t*	rseg;
-	trx_undo_t*	undo;
-	trx_t*		trx;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	UT_LIST_INIT(trx_sys->trx_list);
-	UT_LIST_INIT(trx_sys->trx_serial_list);
-
-	/* Look from the rollback segments if there exist undo logs for
-	transactions */
-
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-
-	while (rseg != NULL) {
-		undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
-
-		while (undo != NULL) {
-
-			trx = trx_create(trx_dummy_sess);
-
-			trx->is_recovered = TRUE;
-			trx->id = undo->trx_id;
-			trx->xid = undo->xid;
-			trx->insert_undo = undo;
-			trx->rseg = rseg;
-
-			if (undo->state != TRX_UNDO_ACTIVE) {
-
-				/* Prepared transactions are left in
-				the prepared state waiting for a
-				commit or abort decision from MySQL */
-
-				if (undo->state == TRX_UNDO_PREPARED) {
-
-					fprintf(stderr,
-						"InnoDB: Transaction "
-						TRX_ID_FMT
-						" was in the"
-						" XA prepared state.\n",
-						(ullint) trx->id);
-
-					if (srv_force_recovery == 0) {
-
-						trx->state = TRX_PREPARED;
-						trx_n_prepared++;
-					} else {
-						fprintf(stderr,
-							"InnoDB: Since"
-							" innodb_force_recovery"
-							" > 0, we will"
-							" rollback it"
-							" anyway.\n");
-
-						trx->state = TRX_ACTIVE;
-					}
-
-					trx_reserve_descriptor(trx);
-				} else {
-					trx->state = TRX_COMMITTED_IN_MEMORY;
-				}
-
-				/* We give a dummy value for the trx no;
-				this should have no relevance since purge
-				is not interested in committed transaction
-				numbers, unless they are in the history
-				list, in which case it looks the number
-				from the disk based undo log structure */
-
-				trx->no = trx->id;
-			} else {
-				trx->state = TRX_ACTIVE;
-
-				/* A running transaction always has the number
-				field inited to IB_ULONGLONG_MAX */
-
-				trx->no = IB_ULONGLONG_MAX;
-
-				trx_reserve_descriptor(trx);
-
-			}
-
-			if (undo->dict_operation) {
-				trx_set_dict_operation(
-					trx, TRX_DICT_OP_TABLE);
-				trx->table_id = undo->table_id;
-			}
-
-			if (!undo->empty) {
-				trx->undo_no = undo->top_undo_no + 1;
-			}
-
-			trx_list_insert_ordered(trx);
-
-			undo = UT_LIST_GET_NEXT(undo_list, undo);
-		}
-
-		undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
-
-		while (undo != NULL) {
-			trx = trx_get_on_id(undo->trx_id);
-
-			if (NULL == trx) {
-				trx = trx_create(trx_dummy_sess);
-
-				trx->is_recovered = TRUE;
-				trx->id = undo->trx_id;
-				trx->xid = undo->xid;
-
-				if (undo->state != TRX_UNDO_ACTIVE) {
-
-					/* Prepared transactions are left in
-					the prepared state waiting for a
-					commit or abort decision from MySQL */
-
-					if (undo->state == TRX_UNDO_PREPARED) {
-						fprintf(stderr,
-							"InnoDB: Transaction "
-							TRX_ID_FMT " was in the"
-							" XA prepared state.\n",
-							(ullint) trx->id);
-
-						if (srv_force_recovery == 0) {
-
-							trx->state
-								= TRX_PREPARED;
-							trx_n_prepared++;
-						} else {
-							fprintf(stderr,
-								"InnoDB: Since"
-								" innodb_force_recovery"
-								" > 0, we will"
-								" rollback it"
-								" anyway.\n");
-
-							trx->state = TRX_ACTIVE;
-							trx_reserve_descriptor(
-								trx);
-						}
-					} else {
-						trx->state
-							= TRX_COMMITTED_IN_MEMORY;
-					}
-
-					/* We give a dummy value for the trx
-					number */
-
-					trx->no = trx->id;
-				} else {
-					trx->state = TRX_ACTIVE;
-					/* A running transaction always has
-					the number field inited to
-					IB_ULONGLONG_MAX */
-
-					trx->no = IB_ULONGLONG_MAX;
-
-					trx_reserve_descriptor(trx);
-				}
-
-				trx->rseg = rseg;
-				trx_list_insert_ordered(trx);
-
-				if (undo->dict_operation) {
-					trx_set_dict_operation(
-						trx, TRX_DICT_OP_TABLE);
-					trx->table_id = undo->table_id;
-				}
-			}
-
-			trx->update_undo = undo;
-
-			if ((!undo->empty)
-			    && undo->top_undo_no >= trx->undo_no) {
-
-				trx->undo_no = undo->top_undo_no + 1;
-			}
-
-			undo = UT_LIST_GET_NEXT(undo_list, undo);
-		}
-
-		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-	}
-}
-
-/******************************************************************//**
-Assigns a rollback segment to a transaction in a round-robin fashion.
-@return	assigned rollback segment instance */
-UNIV_INLINE
-trx_rseg_t*
-trx_assign_rseg(
-/*============*/
-	ulint	max_undo_logs)	/*!< in: maximum number of UNDO logs to use */
-{
-	trx_rseg_t*	rseg = trx_sys->latest_rseg;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-
-	if (rseg == NULL || rseg->id == max_undo_logs - 1) {
-		rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-	}
-
-	trx_sys->latest_rseg = rseg;
-
-	return(rseg);
-}
-
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start_low(
-/*==========*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-{
-	trx_rseg_t*	rseg;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->rseg == NULL);
-
-	if (trx->is_purge) {
-		trx->id = 0;
-		/* Don't reserve a descriptor, since this trx is not added to
-		trx_list. */
-		trx->state = TRX_ACTIVE;
-		trx->start_time = time(NULL);
-
-		return(TRUE);
-	}
-
-	ut_ad(trx->state != TRX_ACTIVE);
-
-	ut_a(rseg_id == ULINT_UNDEFINED);
-
-	rseg = trx_assign_rseg(srv_rollback_segments);
-
-	trx->id = trx_sys_get_new_trx_id();
-
-	/* The initial value for trx->no: IB_ULONGLONG_MAX is used in
-	read_view_open_now: */
-
-	trx->no = IB_ULONGLONG_MAX;
-
-	trx->rseg = rseg;
-
-	trx->state = TRX_ACTIVE;
-
-	trx_reserve_descriptor(trx);
-
-	trx->start_time = time(NULL);
-
-	UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
-
-	return(TRUE);
-}
-
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start(
-/*======*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-{
-	ibool	ret;
-
-	/* Update the info whether we should skip XA steps that eat CPU time
-	For the duration of the transaction trx->support_xa is not reread
-	from thd so any changes in the value take effect in the next
-	transaction. This is to avoid a scenario where some undo
-	generated by a transaction, has XA stuff, and other undo,
-	generated by the same transaction, doesn't. */
-	trx->support_xa = thd_supports_xa(trx->mysql_thd);
-
-	mutex_enter(&kernel_mutex);
-
-	ret = trx_start_low(trx, rseg_id);
-
-	mutex_exit(&kernel_mutex);
-
-	return(ret);
-}
-
-/****************************************************************//**
-Set the transaction serialisation number. */
-static
-void
-trx_serialisation_number_get(
-/*=========================*/
-	trx_t*		trx)	/*!< in: transaction */
-{
-	trx_rseg_t*	rseg;
-
-	rseg = trx->rseg;
-
-	ut_ad(mutex_own(&rseg->mutex));
-
-	mutex_enter(&kernel_mutex);
-
-	trx->no = trx_sys_get_new_trx_id();
-
-	if (UNIV_LIKELY(trx->is_in_trx_serial_list == 0)) {
-
-		UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
-				 trx);
-
-		trx->is_in_trx_serial_list = 1;
-	}
-
-	/* If the rollack segment is not empty then the
-	new trx_t::no can't be less than any trx_t::no
-	already in the rollback segment. User threads only
-	produce events when a rollback segment is empty. */
-
-	if (rseg->last_page_no == FIL_NULL) {
-		void*		ptr;
-		rseg_queue_t	rseg_queue;
-
-		rseg_queue.rseg = rseg;
-		rseg_queue.trx_no = trx->no;
-
-		mutex_enter(&purge_sys->bh_mutex);
-
-		/* This is to reduce the pressure on the kernel mutex,
-		though in reality it should make very little (read no)
-		difference because this code path is only taken when the
-		rbs is empty. */
-
-		mutex_exit(&kernel_mutex);
-
-		ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
-		ut_a(ptr);
-
-		mutex_exit(&purge_sys->bh_mutex);
-	} else {
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/****************************************************************//**
-Assign the transaction its history serialisation number and write the
-update UNDO log record to the assigned rollback segment.
-@return the LSN of the UNDO log write. */
-static
-ib_uint64_t
-trx_write_serialisation_history(
-/*============================*/
-	trx_t*		trx)	/*!< in: transaction */
-{
-	mtr_t		mtr;
-	trx_rseg_t*	rseg;
-	trx_sysf_t*	sys_header = NULL;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	rseg = trx->rseg;
-
-	mtr_start(&mtr);
-
-	/* Change the undo log segment states from TRX_UNDO_ACTIVE
-	to some other state: these modifications to the file data
-	structure define the transaction as committed in the file
-	based domain, at the serialization point of the log sequence
-	number lsn obtained below. */
-
-	if (trx->update_undo != NULL) {
-		page_t*		undo_hdr_page;
-		trx_undo_t*	undo = trx->update_undo;
-
-		/* We have to hold the rseg mutex because update
-		log headers have to be put to the history list in the
-		(serialisation) order of the UNDO trx number. This is
-		required for the purge in-memory data structures too. */
-
-		mutex_enter(&rseg->mutex);
-
-		/* Assign the transaction serialisation number and also
-		update the purge min binary heap if this is the first
-		UNDO log being written to the assigned rollback segment. */
-
-		trx_serialisation_number_get(trx);
-
-		/* It is not necessary to obtain trx->undo_mutex here
-		because only a single OS thread is allowed to do the
-		transaction commit for this transaction. */
-
-		undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
-
-		trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
-	} else {
-		mutex_enter(&rseg->mutex);
-	}
-
-	if (trx->insert_undo != NULL) {
-		trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
-	}
-
-	mutex_exit(&rseg->mutex);
-
-	/* Update the latest MySQL binlog name and offset info
-	in trx sys header if MySQL binlogging is on or the database
-	server is a MySQL replication slave */
-
-	if (trx->mysql_log_file_name
-	    && trx->mysql_log_file_name[0] != '\0') {
-		if (!sys_header) {
-			sys_header = trx_sysf_get(&mtr);
-		}
-
-		trx_sys_update_mysql_binlog_offset(
-			sys_header,
-			trx->mysql_log_file_name,
-			trx->mysql_log_offset,
-			TRX_SYS_MYSQL_LOG_INFO, &mtr);
-
-		trx->mysql_log_file_name = NULL;
-	}
-
-	if (trx->mysql_master_log_file_name[0] != '\0') {
-		/* This database server is a MySQL replication slave */
-		if (!sys_header) {
-			sys_header = trx_sysf_get(&mtr);
-		}
-
-		trx_sys_update_mysql_binlog_offset(
-			sys_header,
-			trx->mysql_relay_log_file_name,
-			trx->mysql_relay_log_pos,
-			TRX_SYS_COMMIT_RELAY_LOG_INFO, &mtr);
-
-		trx_sys_update_mysql_binlog_offset(
-			sys_header,
-			trx->mysql_master_log_file_name,
-			trx->mysql_master_log_pos,
-			TRX_SYS_COMMIT_MASTER_LOG_INFO, &mtr);
-
-		trx_sys_update_mysql_binlog_offset(
-			sys_header,
-			trx->mysql_relay_log_file_name,
-			trx->mysql_relay_log_pos,
-			TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
-
-		trx_sys_update_mysql_binlog_offset(
-			sys_header,
-			trx->mysql_master_log_file_name,
-			trx->mysql_master_log_pos,
-			TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
-
-		trx->mysql_master_log_file_name = "";
-	}
-
-	/* The following call commits the mini-transaction, making the
-	whole transaction committed in the file-based world, at this
-	log sequence number. The transaction becomes 'durable' when
-	we write the log to disk, but in the logical sense the commit
-	in the file-based data structures (undo logs etc.) happens
-	here.
-
-	NOTE that transaction numbers, which are assigned only to
-	transactions with an update undo log, do not necessarily come
-	in exactly the same order as commit lsn's, if the transactions
-	have different rollback segments. To get exactly the same
-	order we should hold the kernel mutex up to this point,
-	adding to the contention of the kernel mutex. However, if
-	a transaction T2 is able to see modifications made by
-	a transaction T1, T2 will always get a bigger transaction
-	number and a bigger commit lsn than T1. */
-
-	/*--------------*/
-	mtr_commit(&mtr);
-	/*--------------*/
-
-	return(mtr.end_lsn);
-}
-
-/****************************************************************//**
-Commits a transaction. */
-UNIV_INTERN
-void
-trx_commit_off_kernel(
-/*==================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	ib_uint64_t	lsn;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx->must_flush_log_later = FALSE;
-
-	/* If the transaction made any updates then we need to write the
-	UNDO logs for the updates to the assigned rollback segment. */
-
-	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
-		mutex_exit(&kernel_mutex);
-
-		lsn = trx_write_serialisation_history(trx);
-
-		mutex_enter(&kernel_mutex);
-	} else {
-		lsn = 0;
-	}
-
-	ut_ad(trx->state == TRX_ACTIVE || trx->state == TRX_PREPARED);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (UNIV_UNLIKELY(trx->state == TRX_PREPARED)) {
-		ut_a(trx_n_prepared > 0);
-		trx_n_prepared--;
-	}
-
-	/* The following assignment makes the transaction committed in memory
-	and makes its changes to data visible to other transactions.
-	NOTE that there is a small discrepancy from the strict formal
-	visibility rules here: a human user of the database can see
-	modifications made by another transaction T even before the necessary
-	log segment has been flushed to the disk. If the database happens to
-	crash before the flush, the user has seen modifications from T which
-	will never be a committed transaction. However, any transaction T2
-	which sees the modifications of the committing transaction T, and
-	which also itself makes modifications to the database, will get an lsn
-	larger than the committing transaction T. In the case where the log
-	flush fails, and T never gets committed, also T2 will never get
-	committed. */
-
-	/*--------------------------------------*/
-	trx->state = TRX_COMMITTED_IN_MEMORY;
-	/* The following also removes trx from trx_serial_list */
-	trx_release_descriptor(trx);
-	/*--------------------------------------*/
-
-	/* If we release kernel_mutex below and we are still doing
-	recovery i.e.: back ground rollback thread is still active
-	then there is a chance that the rollback thread may see
-	this trx as COMMITTED_IN_MEMORY and goes adhead to clean it
-	up calling trx_cleanup_at_db_startup(). This can happen
-	in the case we are committing a trx here that is left in
-	PREPARED state during the crash. Note that commit of the
-	rollback of a PREPARED trx happens in the recovery thread
-	while the rollback of other transactions happen in the
-	background thread. To avoid this race we unconditionally
-	unset the is_recovered flag from the trx. */
-
-	trx->is_recovered = FALSE;
-
-	lock_release_off_kernel(trx);
-
-	if (trx->global_read_view) {
-		read_view_close(trx->global_read_view);
-		trx->global_read_view = NULL;
-	}
-
-	trx->read_view = NULL;
-
-	if (lsn) {
-		ulint	flush_log_at_trx_commit;
-
-		mutex_exit(&kernel_mutex);
-
-		if (trx->insert_undo != NULL) {
-
-			trx_undo_insert_cleanup(trx);
-		}
-
-		if (srv_use_global_flush_log_at_trx_commit) {
-			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
-		} else {
-			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
-		}
-
-		/* NOTE that we could possibly make a group commit more
-		efficient here: call os_thread_yield here to allow also other
-		trxs to come to commit! */
-
-		/*-------------------------------------*/
-
-		/* Depending on the my.cnf options, we may now write the log
-		buffer to the log files, making the transaction durable if
-		the OS does not crash. We may also flush the log files to
-		disk, making the transaction durable also at an OS crash or a
-		power outage.
-
-		The idea in InnoDB's group commit is that a group of
-		transactions gather behind a trx doing a physical disk write
-		to log files, and when that physical write has been completed,
-		one of those transactions does a write which commits the whole
-		group. Note that this group commit will only bring benefit if
-		there are > 2 users in the database. Then at least 2 users can
-		gather behind one doing the physical log write to disk.
-
-		If we are calling trx_commit() under prepare_commit_mutex, we
-		will delay possible log write and flush to a separate function
-		trx_commit_complete_for_mysql(), which is only called when the
-		thread has released the mutex. This is to make the
-		group commit algorithm to work. Otherwise, the prepare_commit
-		mutex would serialize all commits and prevent a group of
-		transactions from gathering. */
-
-		if (trx->flush_log_later) {
-			/* Do nothing yet */
-			trx->must_flush_log_later = TRUE;
-		} else if (flush_log_at_trx_commit == 0) {
-			/* Do nothing */
-		} else if (flush_log_at_trx_commit == 1 ||
-			   flush_log_at_trx_commit == 3) {
-			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-				/* Write the log but do not flush it to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
-						FALSE);
-			} else {
-				/* Write the log to the log files AND flush
-				them to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-			}
-		} else if (flush_log_at_trx_commit == 2) {
-
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			ut_error;
-		}
-
-		trx->commit_lsn = lsn;
-
-		/*-------------------------------------*/
-
-		mutex_enter(&kernel_mutex);
-	}
-
-	/* Free all savepoints */
-	trx_roll_free_all_savepoints(trx);
-
-	trx->state = TRX_NOT_STARTED;
-	trx->rseg = NULL;
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-
-	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
-
-	trx->error_state = DB_SUCCESS;
-}
-
-/****************************************************************//**
-Cleans up a transaction at database startup. The cleanup is needed if
-the transaction already got to the middle of a commit when the database
-crashed, and we cannot roll it back. */
-UNIV_INTERN
-void
-trx_cleanup_at_db_startup(
-/*======================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	if (trx->insert_undo != NULL) {
-
-		trx_undo_insert_cleanup(trx);
-	}
-
-	trx->state = TRX_NOT_STARTED;
-	trx_release_descriptor(trx);
-	trx->rseg = NULL;
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-
-	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->trx_list));
-}
-
-/********************************************************************//**
-Assigns a read view for a consistent read query. All the consistent reads
-within the same transaction will get the same read view, which is created
-when this function is first called for a new started transaction.
-@return	consistent read view */
-UNIV_INTERN
-read_view_t*
-trx_assign_read_view(
-/*=================*/
-	trx_t*	trx)	/*!< in: active transaction */
-{
-	ut_ad(trx->state == TRX_ACTIVE);
-
-	if (trx->read_view) {
-		return(trx->read_view);
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view, TRUE);
-	trx->prebuilt_view = trx->read_view;
-	trx->global_read_view = trx->read_view;
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx->read_view);
-}
-
-/****************************************************************//**
-Commits a transaction. NOTE that the kernel mutex is temporarily released. */
-static
-void
-trx_handle_commit_sig_off_kernel(
-/*=============================*/
-	trx_t*		trx,		/*!< in: transaction */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-{
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx->que_state = TRX_QUE_COMMITTING;
-
-	trx_commit_off_kernel(trx);
-
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	/* Remove all TRX_SIG_COMMIT signals from the signal queue and send
-	reply messages to them */
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
-
-		if (sig->type == TRX_SIG_COMMIT) {
-
-			trx_sig_reply(sig, next_thr);
-			trx_sig_remove(trx, sig);
-		}
-
-		sig = next_sig;
-	}
-
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
-the TRX_QUE_RUNNING state and releases query threads which were
-waiting for a lock in the wait_thrs list. */
-UNIV_INTERN
-void
-trx_end_lock_wait(
-/*==============*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	que_thr_t*	thr;
-	ulint           sec;
-	ulint           ms;
-	ib_uint64_t     now;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
-
-	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-
-	while (thr != NULL) {
-		que_thr_end_wait_no_next_thr(thr);
-
-		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
-
-		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-	}
-
-	if (UNIV_UNLIKELY(trx->take_stats)) {
-		ut_usectime(&sec, &ms);
-		now = (ib_uint64_t)sec * 1000000 + ms;
-		trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
-	}
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-Moves the query threads in the lock wait list to the SUSPENDED state and puts
-the transaction to the TRX_QUE_RUNNING state. */
-static
-void
-trx_lock_wait_to_suspended(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
-{
-	que_thr_t*	thr;
-	ulint           sec;
-	ulint           ms;
-	ib_uint64_t     now;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
-
-	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-
-	while (thr != NULL) {
-		thr->state = QUE_THR_SUSPENDED;
-
-		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
-
-		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-	}
-
-	if (UNIV_UNLIKELY(trx->take_stats)) {
-		ut_usectime(&sec, &ms);
-		now = (ib_uint64_t)sec * 1000000 + ms;
-		trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
-	}
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-Moves the query threads in the sig reply wait list of trx to the SUSPENDED
-state. */
-static
-void
-trx_sig_reply_wait_to_suspended(
-/*============================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	trx_sig_t*	sig;
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	sig = UT_LIST_GET_FIRST(trx->reply_signals);
-
-	while (sig != NULL) {
-		thr = sig->receiver;
-
-		ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
-
-		thr->state = QUE_THR_SUSPENDED;
-
-		sig->receiver = NULL;
-
-		UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
-
-		sig = UT_LIST_GET_FIRST(trx->reply_signals);
-	}
-}
-
-/*****************************************************************//**
-Checks the compatibility of a new signal with the other signals in the
-queue.
-@return	TRUE if the signal can be queued */
-static
-ibool
-trx_sig_is_compatible(
-/*==================*/
-	trx_t*	trx,	/*!< in: trx handle */
-	ulint	type,	/*!< in: signal type */
-	ulint	sender)	/*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
-{
-	trx_sig_t*	sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (UT_LIST_GET_LEN(trx->signals) == 0) {
-
-		return(TRUE);
-	}
-
-	if (sender == TRX_SIG_SELF) {
-		if (type == TRX_SIG_ERROR_OCCURRED) {
-
-			return(TRUE);
-
-		} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-			return(TRUE);
-		} else {
-			return(FALSE);
-		}
-	}
-
-	ut_ad(sender == TRX_SIG_OTHER_SESS);
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	if (type == TRX_SIG_COMMIT) {
-		while (sig != NULL) {
-
-			if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
-
-				return(FALSE);
-			}
-
-			sig = UT_LIST_GET_NEXT(signals, sig);
-		}
-
-		return(TRUE);
-
-	} else if (type == TRX_SIG_TOTAL_ROLLBACK) {
-		while (sig != NULL) {
-
-			if (sig->type == TRX_SIG_COMMIT) {
-
-				return(FALSE);
-			}
-
-			sig = UT_LIST_GET_NEXT(signals, sig);
-		}
-
-		return(TRUE);
-
-	} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-		return(TRUE);
-	} else {
-		ut_error;
-
-		return(FALSE);
-	}
-}
-
-/****************************************************************//**
-Sends a signal to a trx object. */
-UNIV_INTERN
-void
-trx_sig_send(
-/*=========*/
-	trx_t*		trx,		/*!< in: trx handle */
-	ulint		type,		/*!< in: signal type */
-	ulint		sender,		/*!< in: TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
-					reply, or NULL; if type is
-					TRX_SIG_END_WAIT, this must be NULL */
-	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
-					NULL */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
-	trx_t*		receiver_trx;
-
-	ut_ad(trx);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (!trx_sig_is_compatible(trx, type, sender)) {
-		/* The signal is not compatible with the other signals in
-		the queue: die */
-
-		ut_error;
-	}
-
-	/* Queue the signal object */
-
-	if (UT_LIST_GET_LEN(trx->signals) == 0) {
-
-		/* The signal list is empty: the 'sig' slot must be unused
-		(we improve performance a bit by avoiding mem_alloc) */
-		sig = &(trx->sig);
-	} else {
-		/* It might be that the 'sig' slot is unused also in this
-		case, but we choose the easy way of using mem_alloc */
-
-		sig = mem_alloc(sizeof(trx_sig_t));
-	}
-
-	UT_LIST_ADD_LAST(signals, trx->signals, sig);
-
-	sig->type = type;
-	sig->sender = sender;
-	sig->receiver = receiver_thr;
-
-	if (savept) {
-		sig->savept = *savept;
-	}
-
-	if (receiver_thr) {
-		receiver_trx = thr_get_trx(receiver_thr);
-
-		UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
-				 sig);
-	}
-
-	if (trx->sess->state == SESS_ERROR) {
-
-		trx_sig_reply_wait_to_suspended(trx);
-	}
-
-	if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
-		ut_error;
-	}
-
-	/* If there were no other signals ahead in the queue, try to start
-	handling of the signal */
-
-	if (UT_LIST_GET_FIRST(trx->signals) == sig) {
-
-		trx_sig_start_handle(trx, next_thr);
-	}
-}
-
-/****************************************************************//**
-Ends signal handling. If the session is in the error state, and
-trx->graph_before_signal_handling != NULL, then returns control to the error
-handling routine of the graph (currently just returns the control to the
-graph root which then will send an error message to the client). */
-UNIV_INTERN
-void
-trx_end_signal_handling(
-/*====================*/
-	trx_t*	trx)	/*!< in: trx */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->handling_signals == TRUE);
-
-	trx->handling_signals = FALSE;
-
-	trx->graph = trx->graph_before_signal_handling;
-
-	if (trx->graph && (trx->sess->state == SESS_ERROR)) {
-
-		que_fork_error_handle(trx, trx->graph);
-	}
-}
-
-/****************************************************************//**
-Starts handling of a trx signal. */
-UNIV_INTERN
-void
-trx_sig_start_handle(
-/*=================*/
-	trx_t*		trx,		/*!< in: trx handle */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
-	ulint		type;
-loop:
-	/* We loop in this function body as long as there are queued signals
-	we can process immediately */
-
-	ut_ad(trx);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
-
-		trx_end_signal_handling(trx);
-
-		return;
-	}
-
-	if (trx->state == TRX_NOT_STARTED) {
-
-		trx_start_low(trx, ULINT_UNDEFINED);
-	}
-
-	/* If the trx is in a lock wait state, moves the waiting query threads
-	to the suspended state */
-
-	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
-
-		trx_lock_wait_to_suspended(trx);
-	}
-
-	/* If the session is in the error state and this trx has threads
-	waiting for reply from signals, moves these threads to the suspended
-	state, canceling wait reservations; note that if the transaction has
-	sent a commit or rollback signal to itself, and its session is not in
-	the error state, then nothing is done here. */
-
-	if (trx->sess->state == SESS_ERROR) {
-		trx_sig_reply_wait_to_suspended(trx);
-	}
-
-	/* If there are no running query threads, we can start processing of a
-	signal, otherwise we have to wait until all query threads of this
-	transaction are aware of the arrival of the signal. */
-
-	if (trx->n_active_thrs > 0) {
-
-		return;
-	}
-
-	if (trx->handling_signals == FALSE) {
-		trx->graph_before_signal_handling = trx->graph;
-
-		trx->handling_signals = TRUE;
-	}
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-	type = sig->type;
-
-	if (type == TRX_SIG_COMMIT) {
-
-		trx_handle_commit_sig_off_kernel(trx, next_thr);
-
-	} else if ((type == TRX_SIG_TOTAL_ROLLBACK)
-		   || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
-
-		trx_rollback(trx, sig, next_thr);
-
-		/* No further signals can be handled until the rollback
-		completes, therefore we return */
-
-		return;
-
-	} else if (type == TRX_SIG_ERROR_OCCURRED) {
-
-		trx_rollback(trx, sig, next_thr);
-
-		/* No further signals can be handled until the rollback
-		completes, therefore we return */
-
-		return;
-
-	} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-		trx_sig_reply(sig, next_thr);
-		trx_sig_remove(trx, sig);
-	} else {
-		ut_error;
-	}
-
-	goto loop;
-}
-
-/****************************************************************//**
-Send the reply message when a signal in the queue of the trx has been
-handled. */
-UNIV_INTERN
-void
-trx_sig_reply(
-/*==========*/
-	trx_sig_t*	sig,		/*!< in: signal */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-{
-	trx_t*	receiver_trx;
-
-	ut_ad(sig);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (sig->receiver != NULL) {
-		ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
-
-		receiver_trx = thr_get_trx(sig->receiver);
-
-		UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
-			       sig);
-		ut_ad(receiver_trx->sess->state != SESS_ERROR);
-
-		que_thr_end_wait(sig->receiver, next_thr);
-
-		sig->receiver = NULL;
-
-	}
-}
-
-/****************************************************************//**
-Removes a signal object from the trx signal queue. */
-UNIV_INTERN
-void
-trx_sig_remove(
-/*===========*/
-	trx_t*		trx,	/*!< in: trx handle */
-	trx_sig_t*	sig)	/*!< in, own: signal */
-{
-	ut_ad(trx && sig);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	ut_ad(sig->receiver == NULL);
-
-	UT_LIST_REMOVE(signals, trx->signals, sig);
-	sig->type = 0;	/* reset the field to catch possible bugs */
-
-	if (sig != &(trx->sig)) {
-		mem_free(sig);
-	}
-}
-
-/*********************************************************************//**
-Creates a commit command node struct.
-@return	own: commit node struct */
-UNIV_INTERN
-commit_node_t*
-commit_node_create(
-/*===============*/
-	mem_heap_t*	heap)	/*!< in: mem heap where created */
-{
-	commit_node_t*	node;
-
-	node = mem_heap_alloc(heap, sizeof(commit_node_t));
-	node->common.type  = QUE_NODE_COMMIT;
-	node->state = COMMIT_NODE_SEND;
-
-	return(node);
-}
-
-/***********************************************************//**
-Performs an execution step for a commit type node in a query graph.
-@return	query thread to run next, or NULL */
-UNIV_INTERN
-que_thr_t*
-trx_commit_step(
-/*============*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	commit_node_t*	node;
-	que_thr_t*	next_thr;
-
-	node = thr->run_node;
-
-	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
-
-	if (thr->prev_node == que_node_get_parent(node)) {
-		node->state = COMMIT_NODE_SEND;
-	}
-
-	if (node->state == COMMIT_NODE_SEND) {
-		mutex_enter(&kernel_mutex);
-
-		node->state = COMMIT_NODE_WAIT;
-
-		next_thr = NULL;
-
-		thr->state = QUE_THR_SIG_REPLY_WAIT;
-
-		/* Send the commit signal to the transaction */
-
-		trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
-			     thr, NULL, &next_thr);
-
-		mutex_exit(&kernel_mutex);
-
-		return(next_thr);
-	}
-
-	ut_ad(node->state == COMMIT_NODE_WAIT);
-
-	node->state = COMMIT_NODE_SEND;
-
-	thr->run_node = que_node_get_parent(node);
-
-	return(thr);
-}
-
-/**********************************************************************//**
-Does the transaction commit for MySQL.
-@return	DB_SUCCESS or error number */
-UNIV_INTERN
-ulint
-trx_commit_for_mysql(
-/*=================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	/* Because we do not do the commit by sending an Innobase
-	sig to the transaction, we must here make sure that trx has been
-	started. */
-
-	ut_a(trx);
-
-	trx_start_if_not_started(trx);
-
-	trx->op_info = "committing";
-
-	mutex_enter(&kernel_mutex);
-
-	trx_commit_off_kernel(trx);
-
-	mutex_exit(&kernel_mutex);
-
-	trx->op_info = "";
-
-	return(DB_SUCCESS);
-}
-
-/**********************************************************************//**
-If required, flushes the log to disk if we called trx_commit_for_mysql()
-with trx->flush_log_later == TRUE.
-@return	0 or error number */
-UNIV_INTERN
-ulint
-trx_commit_complete_for_mysql(
-/*==========================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	ib_uint64_t	lsn	= trx->commit_lsn;
-	ulint		flush_log_at_trx_commit;
-
-	ut_a(trx);
-
-	trx->op_info = "flushing log";
-
-	if (srv_use_global_flush_log_at_trx_commit) {
-		flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
-	} else {
-		flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
-	}
-
-	if (!trx->must_flush_log_later) {
-		/* Do nothing */
-	} else if (flush_log_at_trx_commit == 0) {
-		/* Do nothing */
-	} else if (flush_log_at_trx_commit == 1 && trx->active_commit_ordered) {
-		/* Do nothing - we already flushed the prepare and binlog write
-		to disk, so transaction is durable (will be recovered from
-		binlog if necessary) */
-	} else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) {
-		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			/* Write the log to the log files AND flush them to
-			disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-		}
-	} else if (flush_log_at_trx_commit == 2) {
-
-		/* Write the log but do not flush it to disk */
-
-		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-	} else {
-		ut_error;
-	}
-
-	trx->must_flush_log_later = FALSE;
-
-	trx->op_info = "";
-
-	return(0);
-}
-
-/**********************************************************************//**
-Marks the latest SQL statement ended. */
-UNIV_INTERN
-void
-trx_mark_sql_stat_end(
-/*==================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	ut_a(trx);
-
-	if (trx->state == TRX_NOT_STARTED) {
-		trx->undo_no = 0;
-	}
-
-	trx->last_sql_stat_start.least_undo_no = trx->undo_no;
-}
-
-/**********************************************************************//**
-Prints info about a transaction to the given file. The caller must own the
-kernel mutex. */
-UNIV_INTERN
-void
-trx_print(
-/*======*/
-	FILE*	f,		/*!< in: output stream */
-	trx_t*	trx,		/*!< in: transaction */
-	ulint	max_query_len)	/*!< in: max query length to print, or 0 to
-				   use the default max length */
-{
-	ibool	newline;
-
-	fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id);
-
-	switch (trx->state) {
-	case TRX_NOT_STARTED:
-		fputs(", not started", f);
-		break;
-	case TRX_ACTIVE:
-		fprintf(f, ", ACTIVE %lu sec",
-			(ulong)difftime(time(NULL), trx->start_time));
-		break;
-	case TRX_PREPARED:
-		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
-			(ulong)difftime(time(NULL), trx->start_time));
-		break;
-	case TRX_COMMITTED_IN_MEMORY:
-		fputs(", COMMITTED IN MEMORY", f);
-		break;
-	default:
-		fprintf(f, " state %lu", (ulong) trx->state);
-	}
-
-	if (*trx->op_info) {
-		putc(' ', f);
-		fputs(trx->op_info, f);
-	}
-
-	if (trx->is_recovered) {
-		fputs(" recovered trx", f);
-	}
-
-	if (trx->is_purge) {
-		fputs(" purge trx", f);
-	}
-
-	if (trx->declared_to_be_inside_innodb) {
-		fprintf(f, ", thread declared inside InnoDB %lu",
-			(ulong) trx->n_tickets_to_enter_innodb);
-	}
-
-	putc('\n', f);
-
-	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
-		fprintf(f, "mysql tables in use %lu, locked %lu\n",
-			(ulong) trx->n_mysql_tables_in_use,
-			(ulong) trx->mysql_n_tables_locked);
-	}
-
-	newline = TRUE;
-
-	switch (trx->que_state) {
-	case TRX_QUE_RUNNING:
-		newline = FALSE; break;
-	case TRX_QUE_LOCK_WAIT:
-		fputs("LOCK WAIT ", f); break;
-	case TRX_QUE_ROLLING_BACK:
-		fputs("ROLLING BACK ", f); break;
-	case TRX_QUE_COMMITTING:
-		fputs("COMMITTING ", f); break;
-	default:
-		fprintf(f, "que state %lu ", (ulong) trx->que_state);
-	}
-
-	if (0 < UT_LIST_GET_LEN(trx->trx_locks)
-	    || mem_heap_get_size(trx->lock_heap) > 400) {
-		newline = TRUE;
-
-		fprintf(f, "%lu lock struct(s), heap size %lu,"
-			" %lu row lock(s)",
-			(ulong) UT_LIST_GET_LEN(trx->trx_locks),
-			(ulong) mem_heap_get_size(trx->lock_heap),
-			(ulong) lock_number_of_rows_locked(trx));
-	}
-
-	if (trx->has_search_latch) {
-		newline = TRUE;
-		fputs(", holds adaptive hash latch", f);
-	}
-
-	if (trx->undo_no != 0) {
-		newline = TRUE;
-		fprintf(f, ", undo log entries %llu",
-			(ullint) trx->undo_no);
-	}
-
-	if (newline) {
-		putc('\n', f);
-	}
-
-	if (trx->mysql_thd != NULL) {
-		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
-	}
-}
-
-/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
-@return	TRUE if weight(a) >= weight(b) */
-UNIV_INTERN
-ibool
-trx_weight_ge(
-/*==========*/
-	const trx_t*	a,	/*!< in: the first transaction to be compared */
-	const trx_t*	b)	/*!< in: the second transaction to be compared */
-{
-	ibool	a_notrans_edit;
-	ibool	b_notrans_edit;
-
-	/* If mysql_thd is NULL for a transaction we assume that it has
-	not edited non-transactional tables. */
-
-	a_notrans_edit = a->mysql_thd != NULL
-		&& thd_has_edited_nontrans_tables(a->mysql_thd);
-
-	b_notrans_edit = b->mysql_thd != NULL
-		&& thd_has_edited_nontrans_tables(b->mysql_thd);
-
-	if (a_notrans_edit != b_notrans_edit) {
-
-		return(a_notrans_edit);
-	}
-
-	/* Either both had edited non-transactional tables or both had
-	not, we fall back to comparing the number of altered/locked
-	rows. */
-
-#if 0
-	fprintf(stderr,
-		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
-		__func__,
-		a->undo_no, UT_LIST_GET_LEN(a->trx_locks),
-		b->undo_no, UT_LIST_GET_LEN(b->trx_locks));
-#endif
-
-	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
-}
-
-/****************************************************************//**
-Prepares a transaction. */
-UNIV_INTERN
-void
-trx_prepare_off_kernel(
-/*===================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	trx_rseg_t*	rseg;
-	ib_uint64_t	lsn		= 0;
-	mtr_t		mtr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = trx->rseg;
-
-	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
-
-		mutex_exit(&kernel_mutex);
-
-		mtr_start(&mtr);
-
-		/* Change the undo log segment states from TRX_UNDO_ACTIVE
-		to TRX_UNDO_PREPARED: these modifications to the file data
-		structure define the transaction as prepared in the
-		file-based world, at the serialization point of lsn. */
-
-		mutex_enter(&(rseg->mutex));
-
-		if (trx->insert_undo != NULL) {
-
-			/* It is not necessary to obtain trx->undo_mutex here
-			because only a single OS thread is allowed to do the
-			transaction prepare for this transaction. */
-
-			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
-						      &mtr);
-		}
-
-		if (trx->update_undo) {
-			trx_undo_set_state_at_prepare(
-				trx, trx->update_undo, &mtr);
-		}
-
-		mutex_exit(&(rseg->mutex));
-
-		if (trx->mysql_master_log_file_name[0] != '\0') {
-			/* This database server is a MySQL replication slave */
-			trx_sysf_t*	sys_header	= trx_sysf_get(&mtr);
-
-			trx_sys_update_mysql_binlog_offset(
-				sys_header,
-				trx->mysql_relay_log_file_name,
-				trx->mysql_relay_log_pos,
-				TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
-			trx_sys_update_mysql_binlog_offset(
-				sys_header,
-				trx->mysql_master_log_file_name,
-				trx->mysql_master_log_pos,
-				TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
-			trx->mysql_master_log_file_name = "";
-		}
-
-		/*--------------*/
-		mtr_commit(&mtr);	/* This mtr commit makes the
-					transaction prepared in the file-based
-					world */
-		/*--------------*/
-		lsn = mtr.end_lsn;
-
-		mutex_enter(&kernel_mutex);
-	}
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	/*--------------------------------------*/
-	if (UNIV_UNLIKELY(trx->state != TRX_ACTIVE)) {
-
-		trx_reserve_descriptor(trx);
-	}
-	trx->state = TRX_PREPARED;
-	trx_n_prepared++;
-	/*--------------------------------------*/
-
-	if (lsn) {
-		ulint	flush_log_at_trx_commit;
-
-		/* Depending on the my.cnf options, we may now write the log
-		buffer to the log files, making the prepared state of the
-		transaction durable if the OS does not crash. We may also
-		flush the log files to disk, making the prepared state of the
-		transaction durable also at an OS crash or a power outage.
-
-		The idea in InnoDB's group prepare is that a group of
-		transactions gather behind a trx doing a physical disk write
-		to log files, and when that physical write has been completed,
-		one of those transactions does a write which prepares the whole
-		group. Note that this group prepare will only bring benefit if
-		there are > 2 users in the database. Then at least 2 users can
-		gather behind one doing the physical log write to disk.
-
-		TODO: find out if MySQL holds some mutex when calling this.
-		That would spoil our group prepare algorithm. */
-
-		mutex_exit(&kernel_mutex);
-
-		if (srv_use_global_flush_log_at_trx_commit) {
-			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
-		} else {
-			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
-		}
-
-		if (flush_log_at_trx_commit == 0) {
-			/* Do nothing */
-		} else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) {
-			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-				/* Write the log but do not flush it to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
-						FALSE);
-			} else {
-				/* Write the log to the log files AND flush
-				them to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-			}
-		} else if (flush_log_at_trx_commit == 2) {
-
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			ut_error;
-		}
-
-		mutex_enter(&kernel_mutex);
-	}
-}
-
-/**********************************************************************//**
-Does the transaction prepare for MySQL.
-@return	0 or error number */
-UNIV_INTERN
-ulint
-trx_prepare_for_mysql(
-/*==================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	/* Because we do not do the prepare by sending an Innobase
-	sig to the transaction, we must here make sure that trx has been
-	started. */
-
-	ut_a(trx);
-
-	trx->op_info = "preparing";
-
-	trx_start_if_not_started(trx);
-
-	mutex_enter(&kernel_mutex);
-
-	trx_prepare_off_kernel(trx);
-
-	mutex_exit(&kernel_mutex);
-
-	trx->op_info = "";
-
-	return(0);
-}
-
-/**********************************************************************//**
-This function is used to find number of prepared transactions and
-their transaction objects for a recovery.
-@return	number of prepared transactions stored in xid_list */
-UNIV_INTERN
-int
-trx_recover_for_mysql(
-/*==================*/
-	XID*	xid_list,	/*!< in/out: prepared transactions */
-	ulint	len)		/*!< in: number of slots in xid_list */
-{
-	trx_t*	trx;
-	ulint	count = 0;
-
-	ut_ad(xid_list);
-	ut_ad(len);
-
-	/* We should set those transactions which are in the prepared state
-	to the xid_list */
-
-	mutex_enter(&kernel_mutex);
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		if (trx->state == TRX_PREPARED) {
-			xid_list[count] = trx->xid;
-
-			if (count == 0) {
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: Starting recovery for"
-					" XA transactions...\n");
-			}
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Transaction " TRX_ID_FMT " in"
-				" prepared state after recovery\n",
-				(ullint) trx->id);
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Transaction contains changes"
-				" to %llu rows\n",
-				(ullint) trx->undo_no);
-
-			count++;
-
-			if (count == len) {
-				break;
-			}
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	if (count > 0){
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: %lu transactions in prepared state"
-			" after recovery\n",
-			(ulong) count);
-	}
-
-	return ((int) count);
-}
-
-/*******************************************************************//**
-This function is used to find one X/Open XA distributed transaction
-which is in the prepared state
-@return	trx or NULL; on match, the trx->xid will be invalidated */
-UNIV_INTERN
-trx_t*
-trx_get_trx_by_xid(
-/*===============*/
-	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
-{
-	trx_t*	trx;
-
-	if (xid == NULL) {
-
-		return(NULL);
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		/* Compare two X/Open XA transaction id's: their
-		length should be the same and binary comparison
-		of gtrid_length+bqual_length bytes should be
-		the same */
-
-		if (trx->is_recovered
-		    && trx->state == TRX_PREPARED
-		    && xid->gtrid_length == trx->xid.gtrid_length
-		    && xid->bqual_length == trx->xid.bqual_length
-		    && memcmp(xid->data, trx->xid.data,
-			      xid->gtrid_length + xid->bqual_length) == 0) {
-
-			/* Invalidate the XID, so that subsequent calls
-			will not find it. */
-			memset(&trx->xid, 0, sizeof(trx->xid));
-			trx->xid.formatID = -1;
-			break;
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx);
-}
diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc
new file mode 100644
index 00000000000..432cb8f6330
--- /dev/null
+++ b/storage/xtradb/trx/trx0trx.cc
@@ -0,0 +1,2502 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0types.h"
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+#include "ha_prototypes.h"
+#include "srv0mon.h"
+#include "ut0vec.h"
+
+#include<set>
+
+/** Set of table_id */
+typedef std::set<table_id_t>	table_id_set;
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg)	/*!< in: detailed error message */
+{
+	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file)	/*!< in: file to read message from */
+{
+	os_file_read_string(file, trx->detailed_error,
+			    sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Callback function for trx_find_descriptor() to compare trx IDs. */
+UNIV_INTERN
+int
+trx_descr_cmp(
+/*==========*/
+	const void *a,	/*!< in: pointer to first comparison argument */
+	const void *b)	/*!< in: pointer to second comparison argument */
+{
+	const trx_id_t*	da = (const trx_id_t*) a;
+	const trx_id_t*	db = (const trx_id_t*) b;
+
+	if (*da < *db) {
+		return -1;
+	} else if (*da > *db) {
+		return 1;
+	}
+
+	return 0;
+}
+
+/*************************************************************//**
+Reserve a slot for a given trx in the global descriptors array. */
+UNIV_INLINE
+void
+trx_reserve_descriptor(
+/*===================*/
+	const trx_t* trx)	/*!< in: trx pointer */
+{
+	ulint		n_used;
+	ulint		n_max;
+	trx_id_t*	descr;
+
+	ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started);
+	ut_ad(srv_is_being_started ||
+	      !trx_find_descriptor(trx_sys->descriptors,
+				   trx_sys->descr_n_used,
+				   trx->id));
+
+	n_used = trx_sys->descr_n_used + 1;
+	n_max = trx_sys->descr_n_max;
+
+	if (UNIV_UNLIKELY(n_used > n_max)) {
+
+		n_max = n_max * 2;
+
+		trx_sys->descriptors = static_cast<trx_id_t*>(
+			ut_realloc(trx_sys->descriptors,
+				   n_max * sizeof(trx_id_t)));
+
+		trx_sys->descr_n_max = n_max;
+		srv_descriptors_memory = n_max * sizeof(trx_id_t);
+	}
+
+	descr = trx_sys->descriptors + n_used - 1;
+
+	if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) {
+
+		/* Find the slot where it should be inserted. We could use a
+		binary search, but in reality linear search should be faster,
+		because the slot we are looking for is near the array end. */
+
+		trx_id_t*	tdescr;
+
+		for (tdescr = descr - 1;
+		     tdescr >= trx_sys->descriptors && *tdescr > trx->id;
+		     tdescr--) {
+		}
+
+		tdescr++;
+
+		ut_memmove(tdescr + 1, tdescr, (descr - tdescr) *
+			   sizeof(trx_id_t));
+
+		descr = tdescr;
+	}
+
+	*descr = trx->id;
+
+	trx_sys->descr_n_used = n_used;
+}
+
+/*************************************************************//**
+Release a slot for a given trx in the global descriptors array. */
+UNIV_INTERN
+void
+trx_release_descriptor(
+/*===================*/
+	trx_t* trx)	/*!< in: trx pointer */
+{
+	ulint		size;
+	trx_id_t*	descr;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	if (UNIV_LIKELY(trx->in_trx_serial_list)) {
+
+		UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list,
+			       trx);
+		trx->in_trx_serial_list = false;
+	}
+
+	descr = trx_find_descriptor(trx_sys->descriptors,
+				    trx_sys->descr_n_used,
+				    trx->id);
+
+	if (UNIV_UNLIKELY(descr == NULL)) {
+
+		return;
+	}
+
+	size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) *
+		sizeof(trx_id_t);
+
+	if (UNIV_LIKELY(size > 0)) {
+
+		ut_memmove(descr, descr + 1, size);
+	}
+
+	trx_sys->descr_n_used--;
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object. It must be explicitly
+started with trx_start_if_not_started() before using it. The default
+isolation level is TRX_ISO_REPEATABLE_READ.
+@return transaction instance, should never be NULL */
+static
+trx_t*
+trx_create(void)
+/*============*/
+{
+	trx_t*		trx;
+	mem_heap_t*	heap;
+	ib_alloc_t*	heap_alloc;
+
+	trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx)));
+
+	mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX);
+
+	trx->magic_n = TRX_MAGIC_N;
+
+	trx->active_commit_ordered = 0;
+	trx->state = TRX_STATE_NOT_STARTED;
+
+	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+	trx->no = TRX_ID_MAX;
+	trx->in_trx_serial_list = false;
+
+	trx->support_xa = TRUE;
+
+	trx->fake_changes = FALSE;
+
+	trx->check_foreigns = TRUE;
+	trx->check_unique_secondary = TRUE;
+
+	trx->dict_operation = TRX_DICT_OP_NONE;
+
+	trx->idle_start = 0;
+	trx->last_stmt_start = 0;
+
+	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
+
+	trx->error_state = DB_SUCCESS;
+
+	trx->lock.que_state = TRX_QUE_RUNNING;
+
+	trx->lock.lock_heap = mem_heap_create_typed(
+		256, MEM_HEAP_FOR_LOCK_HEAP);
+
+	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+	trx->io_reads = 0;
+	trx->io_read = 0;
+	trx->io_reads_wait_timer = 0;
+	trx->lock_que_wait_timer = 0;
+	trx->innodb_que_wait_timer = 0;
+	trx->distinct_page_access = 0;
+	trx->distinct_page_access_hash = NULL;
+	trx->take_stats = FALSE;
+
+	trx->xid.formatID = -1;
+
+	trx->op_info = "";
+
+	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* Remember to free the vector explicitly in trx_free_low(). */
+	trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4);
+
+	/* Remember to free the vector explicitly in trx_free_low(). */
+	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	trx->lock.table_locks = ib_vector_create(
+		heap_alloc, sizeof(void**), 32);
+
+	return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+	trx_t*	trx;
+
+	trx = trx_create();
+
+	trx->sess = trx_dummy_sess;
+
+	return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_d(trx->in_mysql_trx_list = TRUE);
+	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+	mutex_exit(&trx_sys->mutex);
+
+	if (UNIV_UNLIKELY(trx->take_stats)) {
+		trx->distinct_page_access_hash
+			= static_cast<byte *>(mem_alloc(DPAH_SIZE));
+		memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+	}
+
+	return(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object without releasing the corresponding descriptor.
+Should be used by callers that already own trx_sys->mutex. */
+static
+void
+trx_free_low(
+/*=========*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	ut_a(trx->magic_n == TRX_MAGIC_N);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_mysql_trx_list);
+
+	mutex_free(&trx->undo_mutex);
+
+	if (trx->undo_no_arr != NULL) {
+		trx_undo_arr_free(trx->undo_no_arr);
+	}
+
+	ut_a(trx->lock.wait_lock == NULL);
+	ut_a(trx->lock.wait_thr == NULL);
+
+	ut_a(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!btr_search_own_any());
+#endif
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	if (trx->lock.lock_heap) {
+		mem_heap_free(trx->lock.lock_heap);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	/* We allocated a dedicated heap for the vector. */
+	ib_vector_free(trx->autoinc_locks);
+
+	if (trx->lock.table_locks != NULL) {
+		/* We allocated a dedicated heap for the vector. */
+		ib_vector_free(trx->lock.table_locks);
+	}
+
+	mutex_free(&trx->mutex);
+
+	read_view_free(trx->prebuilt_view);
+
+	mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object. */
+static
+void
+trx_free(
+/*=========*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	mutex_enter(&trx_sys->mutex);
+	trx_release_descriptor(trx);
+	mutex_exit(&trx_sys->mutex);
+
+	trx_free_low(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+
+	if (trx->distinct_page_access_hash)
+	{
+		mem_free(trx->distinct_page_access_hash);
+		trx->distinct_page_access_hash= NULL;
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Freeing a trx (%p, " TRX_ID_FMT ") which is declared "
+			"to be processing inside InnoDB", trx, trx->id);
+
+		trx_print(stderr, trx, 600);
+		putc('\n', stderr);
+
+		/* This is an error but not a fatal error. We must keep
+		the counters like srv_conc_n_threads accurate. */
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	if (trx->n_mysql_tables_in_use != 0
+	    || trx->mysql_n_tables_locked != 0) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"MySQL is freeing a thd though "
+			"trx->n_mysql_tables_in_use is %lu and "
+			"trx->mysql_n_tables_locked is %lu.",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+
+		trx_print(stderr, trx, 600);
+		ut_print_buf(stderr, trx, sizeof(trx_t));
+		putc('\n', stderr);
+	}
+
+	ut_a(trx->state == TRX_STATE_NOT_STARTED);
+	ut_a(trx->insert_undo == NULL);
+	ut_a(trx->update_undo == NULL);
+	ut_a(trx->read_view == NULL);
+
+	trx_free(trx);
+}
+
+/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
+	ut_a(trx->magic_n == TRX_MAGIC_N);
+
+	trx_undo_free_prepared(trx);
+
+	assert_trx_in_rw_list(trx);
+
+	ut_a(!trx->read_only);
+
+	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+	ut_d(trx->in_rw_trx_list = FALSE);
+
+	trx_release_descriptor(trx);
+
+	/* Undo trx_resurrect_table_locks(). */
+	UT_LIST_INIT(trx->lock.trx_locks);
+
+	trx_free_low(trx);
+
+	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
+}
+
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	if (trx->distinct_page_access_hash)
+	{
+		mem_free(trx->distinct_page_access_hash);
+		trx->distinct_page_access_hash= NULL;
+	}
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_ad(trx->in_mysql_trx_list);
+	ut_d(trx->in_mysql_trx_list = FALSE);
+	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+	ut_ad(trx_sys_validate_trx_list());
+
+	mutex_exit(&trx_sys->mutex);
+
+	trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the list
+start. This function is used at the database startup to insert incomplete
+transactions to the list. */
+static
+void
+trx_list_rw_insert_ordered(
+/*=======================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	trx_t*	trx2;
+
+	ut_ad(!trx->read_only);
+
+	ut_d(trx->start_file = __FILE__);
+	ut_d(trx->start_line = __LINE__);
+
+	ut_a(srv_is_being_started);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+	ut_ad(trx->is_recovered);
+
+	for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx2 != NULL;
+	     trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) {
+
+		assert_trx_in_rw_list(trx2);
+
+		if (trx->id >= trx2->id) {
+
+			ut_ad(trx->id > trx2->id);
+			break;
+		}
+	}
+
+	if (trx2 != NULL) {
+		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
+
+		if (trx2 == NULL) {
+			UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+			ut_d(trx_sys->rw_max_trx_id = trx->id);
+		} else {
+			UT_LIST_INSERT_AFTER(
+				trx_list, trx_sys->rw_trx_list, trx2, trx);
+		}
+	} else {
+		UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx);
+	}
+
+	ut_ad(!trx->in_rw_trx_list);
+	ut_d(trx->in_rw_trx_list = TRUE);
+}
+
+/****************************************************************//**
+Resurrects the table locks of a resurrected (recovered) transaction.
+Scans the given undo log backwards from its top record, collects the
+ids of the tables referenced by the undo records, and calls
+lock_table_ix_resurrect() for every such table that can be opened.
+Tables whose .ibd file is missing and temporary tables are not locked;
+they are closed and removed from the dictionary cache instead.
+Returns without doing anything if the transaction is in
+TRX_STATE_COMMITTED_IN_MEMORY or if the undo log is empty. */
+static
+void
+trx_resurrect_table_locks(
+/*======================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	const trx_undo_t*	undo)	/*!< in: undo log; must be either
+					trx->insert_undo or
+					trx->update_undo */
+{
+	mtr_t			mtr;
+	page_t*			undo_page;
+	trx_undo_rec_t*		undo_rec;
+	table_id_set		tables;
+
+	ut_ad(undo == trx->insert_undo || undo == trx->update_undo);
+
+	if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
+	    || undo->empty) {
+		return;
+	}
+
+	mtr_start(&mtr);
+	/* trx_rseg_mem_create() may have acquired an X-latch on this
+	page, so we cannot acquire an S-latch. */
+	undo_page = trx_undo_page_get(
+		undo->space, undo->zip_size, undo->top_page_no, &mtr);
+	undo_rec = undo_page + undo->top_offset;
+
+	do {
+		ulint		type;
+		ulint		cmpl_info;
+		bool		updated_extern;
+		undo_no_t	undo_no;
+		table_id_t	table_id;
+
+		page_t*		undo_rec_page = page_align(undo_rec);
+
+		if (undo_rec_page != undo_page) {
+			if (!mtr_memo_release(&mtr,
+					      buf_block_align(undo_page),
+					      MTR_MEMO_PAGE_X_FIX)) {
+				/* The page of the previous undo_rec
+				should have been latched by
+				trx_undo_page_get() or
+				trx_undo_get_prev_rec(), so failing to
+				release it here is unexpected. */
+				ut_ad(0);
+			}
+
+			undo_page = undo_rec_page;
+		}
+
+		trx_undo_rec_get_pars(
+			undo_rec, &type, &cmpl_info,
+			&updated_extern, &undo_no, &table_id);
+		tables.insert(table_id);
+
+		undo_rec = trx_undo_get_prev_rec(
+			undo_rec, undo->hdr_page_no,
+			undo->hdr_offset, false, &mtr);
+	} while (undo_rec);
+
+	mtr_commit(&mtr);
+
+	for (table_id_set::const_iterator i = tables.begin();
+	     i != tables.end(); i++) {
+		if (dict_table_t* table = dict_table_open_on_id(
+			    *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+			if (table->ibd_file_missing
+			    || dict_table_is_temporary(table)) {
+				mutex_enter(&dict_sys->mutex);
+				dict_table_close(table, TRUE, FALSE);
+				dict_table_remove_from_cache(table);
+				mutex_exit(&dict_sys->mutex);
+				continue;
+			}
+
+			lock_table_ix_resurrect(table, trx);
+
+			DBUG_PRINT("ib_trx",
+				   ("resurrect" TRX_ID_FMT
+				    "  table '%s' IX lock from %s undo",
+				    trx->id, table->name,
+				    undo == trx->insert_undo
+				    ? "insert" : "update"));
+
+			dict_table_close(table, FALSE, FALSE);
+		}
+	}
+}
+
+/****************************************************************//**
+Resurrects a transaction that was doing inserts at the time of the
+crash, based on its insert undo log. A new trx_t is allocated with
+trx_allocate_for_background() and its state is derived from undo->state:
+TRX_UNDO_ACTIVE gives TRX_STATE_ACTIVE; TRX_UNDO_PREPARED gives
+TRX_STATE_PREPARED, or TRX_STATE_ACTIVE if srv_force_recovery is
+non-zero; any other undo state gives TRX_STATE_COMMITTED_IN_MEMORY.
+@return the newly allocated trx_t instance  */
+static
+trx_t*
+trx_resurrect_insert(
+/*=================*/
+	trx_undo_t*	undo,		/*!< in: insert undo log; stored
+					in trx->insert_undo */
+	trx_rseg_t*	rseg)		/*!< in: rollback segment */
+{
+	trx_t*		trx;
+
+	trx = trx_allocate_for_background();
+
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->insert_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+
+		/* Prepared transactions are left in the prepared state
+		waiting for a commit or abort decision from MySQL */
+
+		if (undo->state == TRX_UNDO_PREPARED) {
+
+			fprintf(stderr,
+				"InnoDB: Transaction " TRX_ID_FMT " was in the"
+				" XA prepared state.\n", trx->id);
+
+			if (srv_force_recovery == 0) {
+
+				trx->state = TRX_STATE_PREPARED;
+				trx_sys->n_prepared_trx++;
+				trx_sys->n_prepared_recovered_trx++;
+			} else {
+				fprintf(stderr,
+					"InnoDB: Since innodb_force_recovery"
+					" > 0, we will rollback it anyway.\n");
+
+				trx->state = TRX_STATE_ACTIVE;
+			}
+		} else {
+			trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+		}
+
+		/* We give a dummy value for the trx no; this should have no
+		relevance since purge is not interested in committed
+		transaction numbers, unless they are in the history
+		list, in which case it looks up the number from the
+		disk-based undo log structure */
+
+		trx->no = trx->id;
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number
+		field initialized to TRX_ID_MAX */
+
+		trx->no = TRX_ID_MAX;
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty) {
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+
+	return(trx);
+}
+
+/****************************************************************//**
+Sets the state of a resurrected transaction from an update undo log
+that is not in the TRX_UNDO_ACTIVE state. Prepared transactions are
+left in the prepared state waiting for a commit or abort decision from
+MySQL, unless srv_force_recovery is non-zero, in which case the state
+becomes TRX_STATE_ACTIVE. Any other undo state results in
+TRX_STATE_COMMITTED_IN_MEMORY. The prepared-transaction counters of
+trx_sys are incremented only if the transaction is still in
+TRX_STATE_NOT_STARTED; otherwise it is asserted with ut_ad() to be in
+TRX_STATE_PREPARED already. */
+static
+void
+trx_resurrect_update_in_prepared_state(
+/*===================================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	const trx_undo_t*	undo)	/*!< in: update undo log */
+{
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state == TRX_UNDO_PREPARED) {
+		fprintf(stderr,
+			"InnoDB: Transaction " TRX_ID_FMT
+			" was in the XA prepared state.\n", trx->id);
+
+		if (srv_force_recovery == 0) {
+			if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+				trx_sys->n_prepared_trx++;
+				trx_sys->n_prepared_recovered_trx++;
+			} else {
+				ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+			}
+
+			trx->state = TRX_STATE_PREPARED;
+		} else {
+			fprintf(stderr,
+				"InnoDB: Since innodb_force_recovery"
+				" > 0, we will rollback it anyway.\n");
+
+			trx->state = TRX_STATE_ACTIVE;
+		}
+	} else {
+		trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+	}
+}
+
+/****************************************************************//**
+Resurrects a transaction that was doing updates at the time of the
+crash, based on its update undo log. The trx object is supplied by the
+caller; its rseg, xid, id, update_undo, state and no fields are set
+here. trx->undo_no is only raised, never lowered: it is set to
+undo->top_undo_no + 1 when the undo log is not empty and its top undo
+number is at least the current trx->undo_no. */
+static
+void
+trx_resurrect_update(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	trx_undo_t*	undo,	/*!< in/out: update undo log; stored in
+				trx->update_undo */
+	trx_rseg_t*	rseg)	/*!< in/out: rollback segment */
+{
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->update_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+		trx_resurrect_update_in_prepared_state(trx, undo);
+
+		/* We give a dummy value for the trx number */
+
+		trx->no = trx->id;
+
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number field
+		initialized to TRX_ID_MAX */
+
+		trx->no = TRX_ID_MAX;
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
+
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. For every rollback segment the insert undo logs are
+processed first and the update undo logs second; a transaction that
+has both is created from its insert undo log and then found again by
+its id in trx_sys->rw_trx_list. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+	ulint		i;
+
+	ut_a(srv_is_being_started);
+
+	UT_LIST_INIT(trx_sys->ro_trx_list);
+	UT_LIST_INIT(trx_sys->rw_trx_list);
+	UT_LIST_INIT(trx_sys->trx_serial_list);
+
+	/* Look from the rollback segments if there exist undo logs for
+	transactions */
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_undo_t*	undo;
+		trx_rseg_t*	rseg;
+
+		rseg = trx_sys->rseg_array[i];
+
+		if (rseg == NULL) {
+			continue;
+		}
+
+		/* Resurrect transactions that were doing inserts. A
+		descriptor is reserved only for transactions that end up
+		in the active or prepared state. */
+		for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+		     undo != NULL;
+		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+			trx_t*	trx;
+
+			trx = trx_resurrect_insert(undo, rseg);
+
+			if (trx->state == TRX_STATE_ACTIVE ||
+			    trx->state == TRX_STATE_PREPARED) {
+
+				trx_reserve_descriptor(trx);
+			}
+			trx_list_rw_insert_ordered(trx);
+
+			trx_resurrect_table_locks(trx, undo);
+		}
+
+		/* Resurrect transactions that were doing updates. */
+		for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+		     undo != NULL;
+		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+			trx_t*	trx;
+			ibool	trx_created;
+
+			/* Check the trx_sys->rw_trx_list first: the
+			transaction may already have been resurrected
+			from an insert undo log. Only a newly created
+			trx is inserted into the list below. */
+			mutex_enter(&trx_sys->mutex);
+			trx = trx_get_rw_trx_by_id(undo->trx_id);
+			mutex_exit(&trx_sys->mutex);
+
+			if (trx == NULL) {
+				trx = trx_allocate_for_background();
+				trx_created = TRUE;
+			} else {
+				trx_created = FALSE;
+			}
+
+			trx_resurrect_update(trx, undo, rseg);
+
+			if (trx_created) {
+				if (trx->state == TRX_STATE_ACTIVE ||
+				    trx->state == TRX_STATE_PREPARED) {
+
+					trx_reserve_descriptor(trx);
+				}
+				trx_list_rw_insert_ordered(trx);
+			}
+
+			trx_resurrect_table_locks(trx, undo);
+		}
+	}
+}
+
+/******************************************************************//**
+Assigns a rollback segment to a transaction in a round-robin fashion.
+The starting slot is taken from a function-local static counter modulo
+max_undo_logs; NULL slots restart the search at slot 0, and rollback
+segments in space 0 are skipped when n_tablespaces > 0 and slot 1 of
+trx_sys->rseg_array is in use.
+@return	assigned rollback segment instance, or NULL if
+srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO or srv_read_only_mode is
+set */
+static
+trx_rseg_t*
+trx_assign_rseg_low(
+/*================*/
+	ulong	max_undo_logs,	/*!< in: maximum number of UNDO logs to use */
+	ulint	n_tablespaces)	/*!< in: number of rollback tablespaces */
+{
+	ulint		i;
+	trx_rseg_t*	rseg;
+	static ulint	latest_rseg = 0;
+
+	if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO || srv_read_only_mode) {
+		ut_a(max_undo_logs == ULONG_UNDEFINED);
+		return(NULL);
+	}
+
+	/* This breaks true round robin but that should be OK.
+	NOTE(review): presumably this refers to latest_rseg being
+	incremented below without any mutex taken in this function;
+	confirm. */
+
+	ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS);
+
+	i = latest_rseg++;
+        i %= max_undo_logs;
+
+	/* Note: The assumption here is that there can't be any gaps in
+	the array. Once we implement more flexible rollback segment
+	management this may not hold. The assertion checks for that case. */
+
+	ut_a(trx_sys->rseg_array[0] != NULL);
+
+	/* Skip the system tablespace if we have more than one tablespace
+	defined for rollback segments. We want all UNDO records to be in
+	the non-system tablespaces.
+	NOTE(review): inside the loop i is advanced with i + 1 and is not
+	reduced modulo max_undo_logs or TRX_SYS_N_RSEGS again; confirm
+	that the index cannot run past the end of rseg_array when several
+	consecutive slots are skipped. */
+
+	do {
+		rseg = trx_sys->rseg_array[i];
+		ut_a(rseg == NULL || i == rseg->id);
+
+		i = (rseg == NULL) ? 0 : i + 1;
+
+	} while (rseg == NULL
+		 || (rseg->space == 0
+		     && n_tablespaces > 0
+		     && trx_sys->rseg_array[1] != NULL));
+
+	return(rseg);
+}
+
+/****************************************************************//**
+Assigns a rollback segment to a read-only transaction that is
+attempting to write to a TEMPORARY table. The transaction must not
+have a rollback segment yet, must not be an autocommit non-locking
+transaction, and the server must not be in srv_read_only_mode; all of
+these are enforced with ut_a(). */
+UNIV_INTERN
+void
+trx_assign_rseg(
+/*============*/
+	trx_t*		trx)		/*!< in/out: a read-only transaction
+					that needs to be assigned a RBS. */
+{
+	ut_a(trx->rseg == 0);
+	ut_a(trx->read_only);
+	ut_a(!srv_read_only_mode);
+	ut_a(!trx_is_autocommit_non_locking(trx));
+
+	trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces);
+}
+
+/****************************************************************//**
+Starts a transaction. Determines the auto_commit and read_only flags,
+assigns a rollback segment to read-write transactions, sets the state
+to TRX_STATE_ACTIVE, assigns a new transaction id and adds the
+transaction to trx_sys->ro_trx_list or trx_sys->rw_trx_list.
+Autocommit non-locking read-only transactions are added to neither
+list. A descriptor is reserved for read-write transactions only. */
+static
+void
+trx_start_low(
+/*==========*/
+	trx_t*	trx)		/*!< in/out: transaction; must be in
+				TRX_STATE_NOT_STARTED */
+{
+	ut_ad(trx->rseg == NULL);
+
+	ut_ad(trx->start_file != 0);
+	ut_ad(trx->start_line != 0);
+	ut_ad(!trx->is_recovered);
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+	/* Check whether it is an AUTOCOMMIT SELECT. The transaction is
+	read-only if the server is in read-only mode, or if it is not a
+	DDL transaction and the session is read-only. An autocommit
+	transaction with will_lock == 0 is also made read-only below;
+	for a non-autocommit transaction will_lock is incremented. */
+	trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+	trx->read_only =
+		(!trx->ddl && thd_trx_is_read_only(trx->mysql_thd))
+		|| srv_read_only_mode;
+
+	if (!trx->auto_commit) {
+		++trx->will_lock;
+	} else if (trx->will_lock == 0) {
+		trx->read_only = TRUE;
+	}
+
+	if (!trx->read_only) {
+		trx->rseg = trx_assign_rseg_low(
+			srv_undo_logs, srv_undo_tablespaces);
+	}
+
+	/* The initial value for trx->no: TRX_ID_MAX is used in
+	read_view_open_now: */
+
+	trx->no = TRX_ID_MAX;
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* If this transaction came from trx_allocate_for_mysql(),
+	trx->in_mysql_trx_list would hold. In that case, the trx->state
+	change must be protected by the trx_sys->mutex, so that
+	lock_print_info_all_transactions() will have a consistent view. */
+
+	trx->state = TRX_STATE_ACTIVE;
+
+	trx->id = trx_sys_get_new_trx_id();
+
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_ro_trx_list);
+
+	if (trx->read_only) {
+
+		/* Note: The trx_sys_t::ro_trx_list doesn't really need to
+		be ordered, we should exploit this using a list type that
+		doesn't need a list wide lock to increase concurrency. */
+
+		if (!trx_is_autocommit_non_locking(trx)) {
+			UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
+			ut_d(trx->in_ro_trx_list = TRUE);
+		}
+	} else {
+
+		ut_ad(trx->rseg != NULL
+		      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+		ut_ad(!trx_is_autocommit_non_locking(trx));
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+		ut_d(trx->in_rw_trx_list = TRUE);
+		ut_d(trx_sys->rw_max_trx_id = trx->id);
+
+		trx_reserve_descriptor(trx);
+	}
+
+	ut_ad(trx_sys_validate_trx_list());
+
+	mutex_exit(&trx_sys->mutex);
+
+	trx->start_time = ut_time();
+
+	MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/****************************************************************//**
+Sets the transaction serialisation number. Assigns a new trx->no under
+trx_sys->mutex, appends the transaction to trx_sys->trx_serial_list if
+it is not there yet and, if the rollback segment of the transaction is
+empty (rseg->last_page_no == FIL_NULL), pushes the rollback segment
+together with the new number to the purge binary heap. The caller is
+expected to hold rseg->mutex (checked with ut_ad()). */
+static
+void
+trx_serialisation_number_get(
+/*=========================*/
+	trx_t*		trx)	/*!< in/out: transaction */
+{
+	trx_rseg_t*	rseg;
+
+	rseg = trx->rseg;
+
+	ut_ad(mutex_own(&rseg->mutex));
+
+	mutex_enter(&trx_sys->mutex);
+
+	trx->no = trx_sys_get_new_trx_id();
+
+	if (UNIV_LIKELY(!trx->in_trx_serial_list)) {
+
+		UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
+				 trx);
+
+		trx->in_trx_serial_list = true;
+	}
+
+	/* If the rollback segment is not empty then the
+	new trx_t::no can't be less than any trx_t::no
+	already in the rollback segment. User threads only
+	produce events when a rollback segment is empty. */
+
+	if (rseg->last_page_no == FIL_NULL) {
+		void*		ptr;
+		rseg_queue_t	rseg_queue;
+
+		rseg_queue.rseg = rseg;
+		rseg_queue.trx_no = trx->no;
+
+		mutex_enter(&purge_sys->bh_mutex);
+
+		/* This is to reduce the pressure on the trx_sys_t::mutex
+		though in reality it should make very little (read no)
+		difference because this code path is only taken when the
+		rbs is empty. Note that purge_sys->bh_mutex is acquired
+		before trx_sys->mutex is released. */
+
+		mutex_exit(&trx_sys->mutex);
+
+		ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+		ut_a(ptr);
+
+		mutex_exit(&purge_sys->bh_mutex);
+	} else {
+		mutex_exit(&trx_sys->mutex);
+	}
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. The rollback
+segment mutex is held while the undo log states are changed; a
+serialisation number is assigned only if the transaction has an update
+undo log. Afterwards, if trx->mysql_log_file_name is set and non-empty,
+the binlog name and offset are written to the trx sys header within
+the same mini-transaction and trx->mysql_log_file_name is reset. */
+static __attribute__((nonnull))
+void
+trx_write_serialisation_history(
+/*============================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	trx_rseg_t*	rseg;
+
+	rseg = trx->rseg;
+
+	/* Change the undo log segment states from TRX_UNDO_ACTIVE
+	to some other state: these modifications to the file data
+	structure define the transaction as committed in the file
+	based domain, at the serialization point of the log sequence
+	number lsn obtained below. */
+
+	if (trx->update_undo != NULL) {
+		page_t*		undo_hdr_page;
+		trx_undo_t*	undo = trx->update_undo;
+
+		/* We have to hold the rseg mutex because update
+		log headers have to be put to the history list in the
+		(serialisation) order of the UNDO trx number. This is
+		required for the purge in-memory data structures too. */
+
+		mutex_enter(&rseg->mutex);
+
+		/* Assign the transaction serialisation number and also
+		update the purge min binary heap if this is the first
+		UNDO log being written to the assigned rollback segment. */
+
+		trx_serialisation_number_get(trx);
+
+		/* It is not necessary to obtain trx->undo_mutex here
+		because only a single OS thread is allowed to do the
+		transaction commit for this transaction. */
+
+		undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr);
+
+		trx_undo_update_cleanup(trx, undo_hdr_page, mtr);
+	} else {
+		mutex_enter(&rseg->mutex);
+	}
+
+	if (trx->insert_undo != NULL) {
+		trx_undo_set_state_at_finish(trx->insert_undo, mtr);
+	}
+
+	mutex_exit(&rseg->mutex);
+
+	MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+
+	/* Update the latest MySQL binlog name and offset info
+	in trx sys header if MySQL binlogging is on or the database
+	server is a MySQL replication slave */
+
+	if (trx->mysql_log_file_name
+	    && trx->mysql_log_file_name[0] != '\0') {
+
+		trx_sys_update_mysql_binlog_offset(
+			trx->mysql_log_file_name,
+			trx->mysql_log_offset,
+			TRX_SYS_MYSQL_LOG_INFO, mtr);
+
+		trx->mysql_log_file_name = NULL;
+	}
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table. Unless the
+BG_THREAD_STOP flag is set in the FTS status of the table, the list of
+added doc ids is handed over to the add work queue of the table and
+ftt->added_doc_ids is reset to NULL. The status flag is read under
+fts->bg_threads_mutex, which is released before the list is queued. */
+static __attribute__((nonnull))
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+        fts_trx_table_t*        ftt)            /* in/out: FTS trx table */
+{
+	fts_t*                  fts = ftt->table->fts;
+	fts_doc_ids_t*          doc_ids = ftt->added_doc_ids;
+
+	mutex_enter(&fts->bg_threads_mutex);
+
+	if (fts->fts_status & BG_THREAD_STOP) {
+		/* The table is about to be dropped, no use
+		adding anything to its work queue. */
+
+		mutex_exit(&fts->bg_threads_mutex);
+	} else {
+		mem_heap_t*     heap;
+		mutex_exit(&fts->bg_threads_mutex);
+
+		ut_a(fts->add_wq);
+
+		heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
+		ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+		/* fts_trx_table_t no longer owns the list. */
+		ftt->added_doc_ids = NULL;
+	}
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. On commit,
+every table in the last FTS savepoint of the transaction that has
+added doc ids is passed to trx_finalize_for_fts_table(). In both the
+commit and the rollback case trx->fts_trx is freed and reset to NULL. */
+static __attribute__((nonnull))
+void
+trx_finalize_for_fts(
+/*=================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	is_commit)	/*!< in: true if the transaction was
+				committed, false if it was rolled back. */
+{
+	if (is_commit) {
+		const ib_rbt_node_t*	node;
+		ib_rbt_t*		tables;
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_last(trx->fts_trx->savepoints));
+
+		tables = savepoint->tables;
+
+		for (node = rbt_first(tables);
+		     node;
+		     node = rbt_next(tables, node)) {
+			fts_trx_table_t**	ftt;
+
+			ftt = rbt_value(fts_trx_table_t*, node);
+
+			if ((*ftt)->added_doc_ids) {
+				trx_finalize_for_fts_table(*ftt);
+			}
+		}
+	}
+
+	fts_trx_free(trx->fts_trx);
+	trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. The value is obtained with
+thd_flush_log_at_trx_commit(), called with a NULL argument when
+srv_use_global_flush_log_at_trx_commit is set and with trx->mysql_thd
+otherwise. Values other than 0, 1, 2 and 3 hit ut_error. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
+			flushed. */
+	trx_t*	trx)	/*!< in: transaction; only trx->mysql_thd is
+			read here */
+{
+	ulint	flush_log_at_trx_commit;
+
+	flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit
+		? thd_flush_log_at_trx_commit(NULL)
+		: thd_flush_log_at_trx_commit(trx->mysql_thd);
+
+	switch (flush_log_at_trx_commit) {
+	case 0:
+		/* Do nothing */
+		break;
+	case 1:
+        case 3:
+		/* Write the log and flush it to disk, unless
+		srv_unix_file_flush_method is SRV_UNIX_NOSYNC */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+				srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
+		break;
+	case 2:
+		/* Write the log but do not flush it to disk */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+
+		break;
+	default:
+		ut_error;
+	}
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. Wrapper around
+trx_flush_log_if_needed_low() that sets trx->op_info for the duration
+of the call and resets it to an empty string afterwards. */
+static __attribute__((nonnull))
+void
+trx_flush_log_if_needed(
+/*====================*/
+	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
+			flushed. */
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	trx->op_info = "flushing log";
+	trx_flush_log_if_needed_low(lsn, trx);
+	trx->op_info = "";
+}
+
+/****************************************************************//**
+Commits a transaction in memory. For autocommit non-locking
+transactions only the state and the read view are reset. For all other
+transactions the locks are released and the transaction is removed
+from trx_sys->ro_trx_list or trx_sys->rw_trx_list under
+trx_sys->mutex. If lsn is non-zero, the insert undo log is cleaned up
+and the redo log is written or flushed as configured, or the flush is
+deferred when trx->flush_log_later is set. Finally the savepoints are
+freed and the per-transaction fields are reset for reuse. */
+static __attribute__((nonnull))
+void
+trx_commit_in_memory(
+/*=================*/
+	trx_t*	trx,	/*!< in/out: transaction */
+	lsn_t	lsn)	/*!< in: log sequence number of the mini-transaction
+			commit of trx_write_serialisation_history(), or 0
+			if the transaction did not modify anything */
+{
+	trx->must_flush_log_later = FALSE;
+
+	if (trx_is_autocommit_non_locking(trx)) {
+		ut_ad(trx->read_only);
+		ut_a(!trx->is_recovered);
+		ut_ad(trx->rseg == NULL);
+		ut_ad(!trx->in_ro_trx_list);
+		ut_ad(!trx->in_rw_trx_list);
+
+		/* Note: We are asserting without holding the lock mutex. But
+		that is OK because this transaction is not waiting and cannot
+		be rolled back and no new locks can (or should not) be added
+		because it is flagged as a non-locking read-only transaction. */
+
+		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+		/* This state change is not protected by any mutex, therefore
+		there is an inherent race here around state transition during
+		printouts. We ignore this race for the sake of efficiency.
+		However, the trx_sys_t::mutex will protect the trx_t instance
+		and it cannot be removed from the mysql_trx_list and freed
+		without first acquiring the trx_sys_t::mutex. */
+
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+
+		trx->state = TRX_STATE_NOT_STARTED;
+
+		read_view_remove(trx->global_read_view, false);
+
+		MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+	} else {
+		lock_trx_release_locks(trx);
+
+		/* Remove the transaction from the list of active
+		transactions now that it no longer holds any user locks. */
+
+		ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+		mutex_enter(&trx_sys->mutex);
+
+		assert_trx_in_list(trx);
+
+		if (trx->read_only) {
+			UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
+			ut_d(trx->in_ro_trx_list = FALSE);
+			MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+		} else {
+			UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+			ut_d(trx->in_rw_trx_list = FALSE);
+			ut_ad(trx_sys->descr_n_used <=
+			      UT_LIST_GET_LEN(trx_sys->rw_trx_list));
+			MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+		}
+
+		/* If this transaction came from trx_allocate_for_mysql(),
+		trx->in_mysql_trx_list would hold. In that case, the
+		trx->state change must be protected by trx_sys->mutex, so that
+		lock_print_info_all_transactions() will have a consistent
+		view. */
+
+		trx->state = TRX_STATE_NOT_STARTED;
+
+		/* We already own the trx_sys_t::mutex, by doing it here we
+		avoid a potential context switch later. */
+		read_view_remove(trx->global_read_view, true);
+
+		ut_ad(trx_sys_validate_trx_list());
+
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	if (trx->global_read_view != NULL) {
+
+		trx->global_read_view = NULL;
+	}
+
+	trx->read_view = NULL;
+
+	if (lsn) {
+		ulint	flush_log_at_trx_commit;
+
+		if (trx->insert_undo != NULL) {
+
+			trx_undo_insert_cleanup(trx);
+		}
+
+		if (srv_use_global_flush_log_at_trx_commit) {
+			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
+		} else {
+			flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
+		}
+
+		/* NOTE that we could possibly make a group commit more
+		efficient here: call os_thread_yield here to allow also other
+		trxs to come to commit! */
+
+		/*-------------------------------------*/
+
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the transaction durable if
+		the OS does not crash. We may also flush the log files to
+		disk, making the transaction durable also at an OS crash or a
+		power outage.
+
+		The idea in InnoDB's group commit is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which commits the whole
+		group. Note that this group commit will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		If we are calling trx_commit() under prepare_commit_mutex, we
+		will delay possible log write and flush to a separate function
+		trx_commit_complete_for_mysql(), which is only called when the
+		thread has released the mutex. This is to make the
+		group commit algorithm to work. Otherwise, the prepare_commit
+		mutex would serialize all commits and prevent a group of
+		transactions from gathering. */
+
+		if (trx->flush_log_later) {
+			/* Do nothing yet */
+			trx->must_flush_log_later = TRUE;
+		} else if (flush_log_at_trx_commit == 0
+			   || thd_requested_durability(trx->mysql_thd)
+			   == HA_IGNORE_DURABILITY) {
+			/* Do nothing */
+		} else {
+			trx_flush_log_if_needed(lsn, trx);
+		}
+
+		trx->commit_lsn = lsn;
+	}
+
+	/* undo_no is non-zero if we're doing the final commit. The value
+	is sampled here, before trx->undo_no is reset below, and later
+	passed to trx_finalize_for_fts() as its is_commit argument. */
+	bool			not_rollback = trx->undo_no != 0;
+	/* Free all savepoints, starting from the first. */
+	trx_named_savept_t*	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	trx_roll_savepoints_free(trx, savep);
+
+	trx->rseg = NULL;
+	trx->undo_no = 0;
+	trx->last_sql_stat_start.least_undo_no = 0;
+
+	trx->ddl = false;
+#ifdef UNIV_DEBUG
+	ut_ad(trx->start_file != 0);
+	ut_ad(trx->start_line != 0);
+	trx->start_file = 0;
+	trx->start_line = 0;
+#endif /* UNIV_DEBUG */
+
+	trx->will_lock = 0;
+	trx->read_only = FALSE;
+	trx->auto_commit = FALSE;
+
+        if (trx->fts_trx) {
+                trx_finalize_for_fts(trx, not_rollback);
+        }
+
+	ut_ad(trx->lock.wait_thr == NULL);
+	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+
+	trx->dict_operation = TRX_DICT_OP_NONE;
+
+	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
+
+	trx->error_state = DB_SUCCESS;
+
+	/* trx->in_mysql_trx_list would hold between
+	trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
+	hold for recovered transactions or system transactions. */
+}
+
+/****************************************************************//**
+Commits a transaction and a mini-transaction. If the transaction has
+FTS changes and a non-zero undo_no, fts_commit() is called first. If a
+mini-transaction is given, the serialisation history is written and
+the mini-transaction is committed; its end lsn is then passed on to
+trx_commit_in_memory(). Without a mini-transaction the lsn passed on
+is 0. */
+UNIV_INTERN
+void
+trx_commit_low(
+/*===========*/
+	trx_t*	trx,	/*!< in/out: transaction */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction (will be committed),
+			or NULL if trx made no modifications */
+{
+	lsn_t	lsn;
+
+	assert_trx_nonlocking_or_in_list(trx);
+	ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+	ut_ad(!mtr || mtr->state == MTR_ACTIVE);
+	ut_ad(!mtr == !(trx->insert_undo || trx->update_undo));
+
+	/* undo_no is non-zero if we're doing the final commit. */
+	if (trx->fts_trx && trx->undo_no != 0) {
+		dberr_t	error;
+
+		ut_a(!trx_is_autocommit_non_locking(trx));
+
+		error = fts_commit(trx);
+
+		/* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY
+		instead of dying. This is a possible scenario if there
+		is a crash between insert to DELETED table committing
+		and transaction committing. The fix would be to be able
+		to return an error from this function */
+		if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) {
+			/* FTS-FIXME: once we can return values from this
+			function, we should do so and signal an error
+			instead of just dying. */
+
+			ut_error;
+		}
+	}
+
+	if (mtr) {
+		trx_write_serialisation_history(trx, mtr);
+		/* The following call commits the mini-transaction, making the
+		whole transaction committed in the file-based world, at this
+		log sequence number. The transaction becomes 'durable' when
+		we write the log to disk, but in the logical sense the commit
+		in the file-based data structures (undo logs etc.) happens
+		here.
+
+		NOTE that transaction numbers, which are assigned only to
+		transactions with an update undo log, do not necessarily come
+		in exactly the same order as commit lsn's, if the transactions
+		have different rollback segments. To get exactly the same
+		order we should hold the kernel mutex up to this point,
+		adding to the contention of the kernel mutex. However, if
+		a transaction T2 is able to see modifications made by
+		a transaction T1, T2 will always get a bigger transaction
+		number and a bigger commit lsn than T1. */
+
+		/*--------------*/
+		mtr_commit(mtr);
+		/*--------------*/
+		lsn = mtr->end_lsn;
+	} else {
+		lsn = 0;
+	}
+
+	trx_commit_in_memory(trx, lsn);
+}
+
+/****************************************************************//**
+Commits a transaction. A local mini-transaction is started and passed
+to trx_commit_low() only if the transaction has an insert or an update
+undo log; otherwise trx_commit_low() is called with a NULL
+mini-transaction. */
+UNIV_INTERN
+void
+trx_commit(
+/*=======*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	mtr_t	local_mtr;
+	mtr_t*	mtr;
+
+	if (trx->insert_undo || trx->update_undo) {
+		mtr = &local_mtr;
+		mtr_start(mtr);
+	} else {
+		mtr = NULL;
+	}
+
+	trx_commit_low(trx, mtr);
+}
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. The insert undo log, if any, is
+cleaned up, and the transaction is removed from trx_sys->rw_trx_list,
+set to TRX_STATE_NOT_STARTED and its descriptor is released, all under
+trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx)	/*!< in/out: recovered transaction */
+{
+	ut_ad(trx->is_recovered);
+
+	if (trx->insert_undo != NULL) {
+
+		trx_undo_insert_cleanup(trx);
+	}
+
+	trx->rseg = NULL;
+	trx->undo_no = 0;
+	trx->last_sql_stat_start.least_undo_no = 0;
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_a(!trx->read_only);
+
+	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
+
+	assert_trx_in_rw_list(trx);
+	ut_d(trx->in_rw_trx_list = FALSE);
+
+	trx->state = TRX_STATE_NOT_STARTED;
+	trx_release_descriptor(trx);
+
+	mutex_exit(&trx_sys->mutex);
+
+	/* The transaction state was changed above while still holding
+	trx_sys->mutex. The transaction is no longer in the trx_list.
+	Recovered transactions are never placed in the mysql_trx_list. */
+	ut_ad(trx->is_recovered);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_mysql_trx_list);
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction. A newly
+opened view is stored in both trx->read_view and trx->global_read_view.
+@return	consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: active transaction */
+{
+	ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+	if (trx->read_view != NULL) {
+		return(trx->read_view);
+	}
+
+	trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view);
+	trx->global_read_view = trx->read_view;
+
+	return(trx->read_view);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. A transaction that has not
+been started is started first. For a started, active or prepared
+transaction a pending lock wait is cancelled, and exactly one active
+query thread is required (ut_a). A transaction in
+TRX_STATE_COMMITTED_IN_MEMORY is not expected here and ends in
+ut_error. */
+UNIV_INTERN
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the commit or rollback should be invoked for a
+	running (or recovered prepared) transaction that is associated
+	with the current thread. */
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_PREPARED:
+		/* If the trx is in a lock wait state, moves the waiting
+		query thread to the suspended state. When trx->take_stats
+		is set, the elapsed wait time is added to
+		trx->lock_que_wait_timer; the local variable ms is added
+		to sec * 1000000, i.e. it is used as a microsecond
+		count despite its name. */
+
+		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+			ulint		sec;
+			ulint		ms;
+			ib_uint64_t	now;
+
+			ut_a(trx->lock.wait_thr != NULL);
+			trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+			trx->lock.wait_thr = NULL;
+
+			if (UNIV_UNLIKELY(trx->take_stats)) {
+				ut_usectime(&sec, &ms);
+				now = (ib_uint64_t)sec * 1000000 + ms;
+				trx->lock_que_wait_timer
+					+= (ulint)
+					(now - trx->lock_que_wait_ustarted);
+			}
+
+			trx->lock.que_state = TRX_QUE_RUNNING;
+		}
+
+		ut_a(trx->lock.n_active_thrs == 1);
+		return;
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	}
+
+	ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct. The node is allocated from the
+given memory heap; only its common.type and state fields are
+initialized here (to QUE_NODE_COMMIT and COMMIT_NODE_SEND).
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	commit_node_t*	node;
+
+	node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+	node->common.type  = QUE_NODE_COMMIT;
+	node->state = COMMIT_NODE_SEND;
+
+	return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+When the node is in the COMMIT_NODE_SEND state (which is also forced
+when control arrives from the parent node), the transaction of the
+query thread is committed, the node moves to COMMIT_NODE_WAIT and NULL
+is returned. When the node is in the COMMIT_NODE_WAIT state, it is
+reset to COMMIT_NODE_SEND and control is passed to the parent node.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+	que_thr_t*	thr)	/*!< in/out: query thread */
+{
+	commit_node_t*	node;
+
+	node = static_cast<commit_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = COMMIT_NODE_SEND;
+	}
+
+	if (node->state == COMMIT_NODE_SEND) {
+		trx_t*	trx;
+
+		node->state = COMMIT_NODE_WAIT;
+
+		trx = thr_get_trx(thr);
+
+		ut_a(trx->lock.wait_thr == NULL);
+		ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+		trx_commit_or_rollback_prepare(trx);
+
+		trx->lock.que_state = TRX_QUE_COMMITTING;
+
+		trx_commit(trx);
+
+		ut_ad(trx->lock.wait_thr == NULL);
+
+		trx->lock.que_state = TRX_QUE_RUNNING;
+
+		thr = NULL;
+	} else {
+		ut_ad(node->state == COMMIT_NODE_WAIT);
+
+		node->state = COMMIT_NODE_SEND;
+
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL, starting the trx first if necessary.
+@return	DB_SUCCESS, or DB_CORRUPTION after ut_error (COMMITTED_IN_MEMORY) */
+UNIV_INTERN
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	/* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must here make sure that trx has been
+	started. */
+
+	ut_a(trx);
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		/* Update the info whether we should skip XA steps that eat
+		CPU time.
+
+		For the duration of the transaction trx->support_xa is
+		not reread from thd so any changes in the value take
+		effect in the next transaction. This is to avoid a
+		scenario where some undo log records generated by a
+		transaction contain XA information and other undo log
+		records, generated by the same transaction do not. */
+		trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+		ut_d(trx->start_file = __FILE__);
+		ut_d(trx->start_line = __LINE__);
+
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_PREPARED:
+		trx->op_info = "committing";
+		trx_commit(trx);
+		MONITOR_DEC(MONITOR_TRX_ACTIVE);
+		trx->op_info = "";
+		return(DB_SUCCESS);
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;	/* not accepted here: falls out to ut_error */
+	}
+	ut_error;
+	return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+UNIV_INTERN
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	ut_a(trx);
+
+	if (!trx->must_flush_log_later
+	    || thd_requested_durability(trx->mysql_thd)
+	       == HA_IGNORE_DURABILITY) {
+		return;	/* flag not set, or the thd ignores durability */
+	}
+
+	ulint	flush_log_at_trx_commit;
+
+	flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit
+		? thd_flush_log_at_trx_commit(NULL)
+		: thd_flush_log_at_trx_commit(trx->mysql_thd);
+
+	if (flush_log_at_trx_commit == 1 && trx->active_commit_ordered) {
+		return;	/* note: must_flush_log_later stays set here */
+	}
+
+	trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+	trx->must_flush_log_later = FALSE;
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended; last_sql_stat_start takes trx->undo_no. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	ut_a(trx);
+
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;	/* not accepted here: falls out to ut_error */
+	case TRX_STATE_NOT_STARTED:
+		trx->undo_no = 0;
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+		if (trx->fts_trx) {
+			fts_savepoint_laststmt_refresh(trx);
+		}
+
+		return;
+	}
+
+	ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction: id, state, op_info, lock and undo figures.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+	FILE*		f,
+			/*!< in: output stream */
+	const trx_t*	trx,
+			/*!< in: transaction */
+	ulint		max_query_len,
+			/*!< in: max query length to print,
+			or 0 to use the default max length */
+	ulint		n_rec_locks,
+			/*!< in: lock_number_of_rows_locked(&trx->lock) */
+	ulint		n_trx_locks,
+			/*!< in: length of trx->lock.trx_locks */
+	ulint		heap_size)
+			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+	ibool		newline;
+	const char*	op_info;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
+
+	/* trx->state cannot change from or to NOT_STARTED while we
+	are holding the trx_sys->mutex. It may change from ACTIVE to
+	PREPARED or COMMITTED. */
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		fputs(", not started", f);
+		goto state_ok;
+	case TRX_STATE_ACTIVE:
+		fprintf(f, ", ACTIVE %lu sec",
+			(ulong) difftime(time(NULL), trx->start_time));
+		goto state_ok;
+	case TRX_STATE_PREPARED:
+		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+			(ulong) difftime(time(NULL), trx->start_time));
+		goto state_ok;
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		fputs(", COMMITTED IN MEMORY", f);
+		goto state_ok;
+	}
+	fprintf(f, ", state %lu", (ulong) trx->state);
+	ut_ad(0);
+state_ok:
+
+	/* read the pointer once: the test and the print must agree */
+	op_info = trx->op_info;
+
+	if (*op_info) {
+		putc(' ', f);
+		fputs(op_info, f);
+	}
+
+	if (trx->is_recovered) {
+		fputs(" recovered trx", f);
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+		fprintf(f, ", thread declared inside InnoDB %lu",
+			(ulong) trx->n_tickets_to_enter_innodb);
+	}
+
+	putc('\n', f);
+
+	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+		fprintf(f, "mysql tables in use %lu, locked %lu\n",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+	}
+
+	newline = TRUE;	/* cleared only when que_state is TRX_QUE_RUNNING */
+
+	/* trx->lock.que_state of an ACTIVE transaction may change
+	while we are not holding trx->mutex. We perform a dirty read
+	for performance reasons. */
+
+	switch (trx->lock.que_state) {
+	case TRX_QUE_RUNNING:
+		newline = FALSE; break;
+	case TRX_QUE_LOCK_WAIT:
+		fputs("LOCK WAIT ", f); break;
+	case TRX_QUE_ROLLING_BACK:
+		fputs("ROLLING BACK ", f); break;
+	case TRX_QUE_COMMITTING:
+		fputs("COMMITTING ", f); break;
+	default:
+		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+	}
+
+	if (n_trx_locks > 0 || heap_size > 400) {
+		newline = TRUE;
+
+		fprintf(f, "%lu lock struct(s), heap size %lu,"
+			" %lu row lock(s)",
+			(ulong) n_trx_locks,
+			(ulong) heap_size,
+			(ulong) n_rec_locks);
+	}
+
+	if (trx->has_search_latch) {
+		newline = TRUE;
+		fputs(", holds adaptive hash latch", f);
+	}
+
+	if (trx->undo_no != 0) {
+		newline = TRUE;
+		fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+	}
+
+	if (newline) {
+		putc('\n', f);	/* terminates the line started above */
+	}
+
+	if (trx->mysql_thd != NULL) {
+		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
+	}
+}
+
+/**********************************************************************//**
+Prints info about a transaction, reading the lock figures from trx->lock.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx_print_low(f, trx, max_query_len,
+		      lock_number_of_rows_locked(&trx->lock),
+		      UT_LIST_GET_LEN(trx->lock.trx_locks),
+		      mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction, sampling the lock figures beforehand.
+Acquires and releases lock_sys->mutex first, and then trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+{
+	ulint	n_rec_locks;
+	ulint	n_trx_locks;
+	ulint	heap_size;
+
+	lock_mutex_enter();
+	n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+	n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	heap_size = mem_heap_get_size(trx->lock.lock_heap);
+	lock_mutex_exit();	/* released before trx_sys->mutex is taken */
+
+	mutex_enter(&trx_sys->mutex);
+	trx_print_low(f, trx, max_query_len,
+		      n_rec_locks, n_trx_locks, heap_size);
+	mutex_exit(&trx_sys->mutex);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started; a NOT_STARTED trx falls out to ut_error */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* Non-locking autocommits should not hold any locks and this
+	function is only called from the locking code. */
+	assert_trx_in_list(trx);
+
+	/* trx->state can change from or to NOT_STARTED while we are holding
+	trx_sys->mutex for non-locking autocommit selects but not for other
+	types of transactions. It may change from ACTIVE to PREPARED. Unless
+	we are holding lock_sys->mutex, it may also change to COMMITTED. */
+
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+		return(TRUE);
+
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		return(TRUE);
+
+	case TRX_STATE_NOT_STARTED:
+		break;	/* the only state that fails the assertion */
+	}
+
+	ut_error;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. A transaction that
+has edited non-transactional tables is considered heavier than one that
+has not; when both agree on that, TRX_WEIGHT() decides.
+@return	TRUE if weight(a) >= weight(b) */
+UNIV_INTERN
+ibool
+trx_weight_ge(
+/*==========*/
+	const trx_t*	a,	/*!< in: the first transaction to be compared */
+	const trx_t*	b)	/*!< in: the second transaction to be compared */
+{
+	ibool	a_notrans_edit;
+	ibool	b_notrans_edit;
+
+	/* If mysql_thd is NULL for a transaction we assume that it has
+	not edited non-transactional tables. */
+
+	a_notrans_edit = a->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(a->mysql_thd);
+
+	b_notrans_edit = b->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(b->mysql_thd);
+
+	if (a_notrans_edit != b_notrans_edit) {
+
+		return(a_notrans_edit);	/* only one has edited: it is heavier */
+	}
+
+	/* Either both had edited non-transactional tables or both had
+	not, we fall back to comparing the number of altered/locked
+	rows. */
+
+#if 0
+	fprintf(stderr,
+		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+		__func__,
+		a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
+		b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
+#endif
+
+	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
+
+/****************************************************************//**
+Moves an ACTIVE transaction, and any undo logs it has, to the prepared state. */
+static
+void
+trx_prepare(
+/*========*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	trx_rseg_t*	rseg;
+	lsn_t		lsn;
+	mtr_t		mtr;
+
+	rseg = trx->rseg;
+	/* Only fresh user transactions can be prepared.
+	Recovered transactions cannot. */
+	ut_a(!trx->is_recovered);
+
+	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+		mtr_start(&mtr);
+
+		/* Change the undo log segment states from TRX_UNDO_ACTIVE
+		to TRX_UNDO_PREPARED: these modifications to the file data
+		structure define the transaction as prepared in the
+		file-based world, at the serialization point of lsn. */
+
+		mutex_enter(&rseg->mutex);
+
+		if (trx->insert_undo != NULL) {
+
+			/* It is not necessary to obtain trx->undo_mutex here
+			because only a single OS thread is allowed to do the
+			transaction prepare for this transaction. */
+
+			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+						      &mtr);
+		}
+
+		if (trx->update_undo) {
+			trx_undo_set_state_at_prepare(
+				trx, trx->update_undo, &mtr);
+		}
+
+		mutex_exit(&rseg->mutex);
+
+		/*--------------*/
+		mtr_commit(&mtr);	/* This mtr commit makes the
+					transaction prepared in the file-based
+					world */
+		/*--------------*/
+		lsn = mtr.end_lsn;
+		ut_ad(lsn);
+	} else {
+		lsn = 0;	/* no undo log: the flush below is skipped */
+	}
+
+	/*--------------------------------------*/
+	ut_a(trx->state == TRX_STATE_ACTIVE);
+	mutex_enter(&trx_sys->mutex);
+	trx->state = TRX_STATE_PREPARED;
+	trx_sys->n_prepared_trx++;
+	mutex_exit(&trx_sys->mutex);
+	/*--------------------------------------*/
+
+	if (lsn) {
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the prepared state of the
+		transaction durable if the OS does not crash. We may also
+		flush the log files to disk, making the prepared state of the
+		transaction durable also at an OS crash or a power outage.
+
+		The idea in InnoDB's group prepare is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which prepares the whole
+		group. Note that this group prepare will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		TODO: find out if MySQL holds some mutex when calling this.
+		That would spoil our group prepare algorithm. */
+
+		trx_flush_log_if_needed(lsn, trx);
+	}
+}
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL, with op_info set while it runs. */
+UNIV_INTERN
+void
+trx_prepare_for_mysql(
+/*==================*/
+	trx_t*	trx)	/*!< in/out: trx handle */
+{
+	trx_start_if_not_started_xa(trx);
+
+	trx->op_info = "preparing";
+
+	trx_prepare(trx);
+
+	trx->op_info = "";
+}
+
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery. At most len XIDs are copied.
+@return	number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+	XID*	xid_list,	/*!< in/out: prepared transactions */
+	ulint	len)		/*!< in: number of slots in xid_list */
+{
+	const trx_t*	trx;
+	ulint		count = 0;
+
+	ut_ad(xid_list);
+	ut_ad(len);
+
+	/* We should set those transactions which are in the prepared state
+	to the xid_list */
+
+	mutex_enter(&trx_sys->mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		assert_trx_in_rw_list(trx);
+
+		/* The state of a read-write transaction cannot change
+		from or to NOT_STARTED while we are holding the
+		trx_sys->mutex. It may change to PREPARED, but not if
+		trx->is_recovered. It may also change to COMMITTED. */
+		if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
+			xid_list[count] = trx->xid;
+
+			if (count == 0) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Starting recovery for"
+					" XA transactions...\n");
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction " TRX_ID_FMT " in"
+				" prepared state after recovery\n",
+				trx->id);
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction contains changes"
+				" to " TRX_ID_FMT " rows\n",
+				trx->undo_no);
+
+			count++;
+
+			if (count == len) {
+				break;	/* xid_list is full */
+			}
+		}
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	if (count > 0) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: %d transactions in prepared state"
+			" after recovery\n",
+			int (count));
+	}
+
+	return(int (count));
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return	trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+static __attribute__((nonnull, warn_unused_result))
+trx_t*
+trx_get_trx_by_xid_low(
+/*===================*/
+	const XID*	xid)		/*!< in: X/Open XA transaction
+					identifier */
+{
+	trx_t*		trx;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		assert_trx_in_rw_list(trx);
+
+		/* Compare two X/Open XA transaction id's: their
+		length should be the same and binary comparison
+		of gtrid_length+bqual_length bytes should be
+		the same. Only recovered, prepared trxs qualify. */
+
+		if (trx->is_recovered
+		    && trx_state_eq(trx, TRX_STATE_PREPARED)
+		    && xid->gtrid_length == trx->xid.gtrid_length
+		    && xid->bqual_length == trx->xid.bqual_length
+		    && memcmp(xid->data, trx->xid.data,
+			      xid->gtrid_length + xid->bqual_length) == 0) {
+
+			/* Invalidate the XID, so that subsequent calls
+			will not find it. */
+			memset(&trx->xid, 0, sizeof(trx->xid));
+			trx->xid.formatID = -1;
+			break;
+		}
+	}
+
+	return(trx);	/* NULL when the loop ended without a match */
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state; it searches under trx_sys->mutex
+@return	trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
+{
+	trx_t*	trx;
+
+	if (xid == NULL) {
+
+		return(NULL);	/* nothing to look up */
+	}
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* Recovered/Resurrected transactions are always only on the
+	trx_sys_t::rw_trx_list. */
+	trx = trx_get_trx_by_xid_low(xid);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(trx);
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started, refreshing trx->support_xa. */
+UNIV_INTERN
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+
+		/* Update the info whether we should skip XA steps
+		that eat CPU time.
+
+		For the duration of the transaction trx->support_xa is
+		not reread from thd so any changes in the value take
+		effect in the next transaction. This is to avoid a
+		scenario where some undo generated by a transaction,
+		has XA stuff, and other undo, generated by the same
+		transaction, doesn't. */
+		trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		return;
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;	/* not accepted here: falls out to ut_error */
+	}
+
+	ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started; an ACTIVE trx is left as is. */
+UNIV_INTERN
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		return;
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;	/* not accepted here: falls out to ut_error */
+	}
+
+	ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction for a DDL operation, or flags an ACTIVE one as DDL. */
+UNIV_INTERN
+void
+trx_start_for_ddl_low(
+/*==================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	trx_dict_op_t	op)	/*!< in: dictionary operation type */
+{
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		/* Flag this transaction as a dictionary operation, so that
+		the data dictionary will be locked in crash recovery. */
+
+		trx_set_dict_operation(trx, op);
+
+		/* Ensure it is not flagged as an auto-commit-non-locking
+		transaction. */
+		trx->will_lock = 1;
+
+		trx->ddl = true;
+
+		trx_start_low(trx);
+		return;
+
+	case TRX_STATE_ACTIVE:
+		/* We have this start if not started idiom, therefore we
+		can't add stronger checks here. Note that op is not applied
+		on this path. */
+		trx->ddl = true;
+
+		ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+		ut_ad(trx->will_lock > 0);
+		return;
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;	/* not accepted here: falls out to ut_error */
+	}
+
+	ut_error;
+}
+
diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.cc
index 3d794c69c8b..290271c6cab 100644
--- a/storage/xtradb/trx/trx0undo.c
+++ b/storage/xtradb/trx/trx0undo.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0undo.c
+@file trx/trx0undo.cc
 Transaction undo log
 
 Created 3/26/1996 Heikki Tuuri
@@ -39,6 +39,7 @@ Created 3/26/1996 Heikki Tuuri
 #include "srv0start.h"
 #include "trx0rec.h"
 #include "trx0purge.h"
+#include "srv0mon.h"
 
 /* How should the old versions in the history list be managed?
    ----------------------------------------------------------
@@ -79,7 +80,7 @@ can still remove old versions from the bottom of the stack. */
    -------------------------------------------------------------------
 latches?
 -------
-The contention of the kernel mutex should be minimized. When a transaction
+The contention of the trx_sys_t::mutex should be minimized. When a transaction
 does its first insert or modify in an index, an undo log is assigned for it.
 Then we must have an x-latch to the rollback segment header.
 	When the transaction does more modifys or rolls back, the undo log is
@@ -158,6 +159,7 @@ trx_undo_get_prev_rec_from_prev_page(
 	trx_undo_rec_t*	rec,	/*!< in: undo record */
 	ulint		page_no,/*!< in: undo log header page number */
 	ulint		offset,	/*!< in: undo log header offset on page */
+	bool		shared,	/*!< in: true=S-latch, false=X-latch */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	ulint	space;
@@ -180,8 +182,12 @@ trx_undo_get_prev_rec_from_prev_page(
 	space = page_get_space_id(undo_page);
 	zip_size = fil_space_get_zip_size(space);
 
-	prev_page = trx_undo_page_get_s_latched(space, zip_size,
-						prev_page_no, mtr);
+	buf_block_t*	block = buf_page_get(space, zip_size, prev_page_no,
+					     shared ? RW_S_LATCH : RW_X_LATCH,
+					     mtr);
+	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+	prev_page = buf_block_get_frame(block);
 
 	return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
 }
@@ -196,6 +202,7 @@ trx_undo_get_prev_rec(
 	trx_undo_rec_t*	rec,	/*!< in: undo record */
 	ulint		page_no,/*!< in: undo log header page number */
 	ulint		offset,	/*!< in: undo log header offset on page */
+	bool		shared,	/*!< in: true=S-latch, false=X-latch */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	trx_undo_rec_t*	prev_rec;
@@ -211,7 +218,7 @@ trx_undo_get_prev_rec(
 	previous record */
 
 	return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset,
-						    mtr));
+						    shared, mtr));
 }
 
 /***********************************************************************//**
@@ -412,8 +419,8 @@ trx_undo_page_init(
 Creates a new undo log segment in file.
 @return DB_SUCCESS if page creation OK possible error codes are:
 DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 trx_undo_seg_create(
 /*================*/
 	trx_rseg_t*	rseg __attribute__((unused)),/*!< in: rollback segment */
@@ -434,7 +441,7 @@ trx_undo_seg_create(
 	trx_usegf_t*	seg_hdr;
 	ulint		n_reserved;
 	ibool		success;
-	ulint		err = DB_SUCCESS;
+	dberr_t		err = DB_SUCCESS;
 
 	ut_ad(mtr && id && rseg_hdr);
 	ut_ad(mutex_own(&(rseg->mutex)));
@@ -501,6 +508,8 @@ trx_undo_seg_create(
 			       page_get_page_no(*undo_page), mtr);
 	*id = slot_no;
 
+	MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
 	return(err);
 }
 
@@ -607,13 +616,13 @@ trx_undo_write_xid(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
-			 (ulint)xid->formatID, MLOG_4BYTES, mtr);
+			 (ulint) xid->formatID, MLOG_4BYTES, mtr);
 
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
-			 (ulint)xid->gtrid_length, MLOG_4BYTES, mtr);
+			 (ulint) xid->gtrid_length, MLOG_4BYTES, mtr);
 
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
-			 (ulint)xid->bqual_length, MLOG_4BYTES, mtr);
+			 (ulint) xid->bqual_length, MLOG_4BYTES, mtr);
 
 	mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
 			  XIDDATASIZE, mtr);
@@ -628,7 +637,7 @@ trx_undo_read_xid(
 	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
 	XID*		xid)	/*!< out: X/Open XA Transaction Identification */
 {
-	xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+	xid->formatID = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
 
 	xid->gtrid_length
 		= (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
@@ -894,7 +903,6 @@ trx_undo_add_page(
 	ulint		n_reserved;
 
 	ut_ad(mutex_own(&(trx->undo_mutex)));
-	ut_ad(!mutex_own(&kernel_mutex));
 	ut_ad(mutex_own(&(trx->rseg->mutex)));
 
 	rseg = trx->rseg;
@@ -969,7 +977,6 @@ trx_undo_free_page(
 	ulint		zip_size;
 
 	ut_a(hdr_page_no != page_no);
-	ut_ad(!mutex_own(&kernel_mutex));
 	ut_ad(mutex_own(&(rseg->mutex)));
 
 	zip_size = rseg->zip_size;
@@ -1218,8 +1225,6 @@ trx_undo_seg_free(
 
 		mtr_start(&mtr);
 
-		ut_ad(!mutex_own(&kernel_mutex));
-
 		mutex_enter(&(rseg->mutex));
 
 		seg_header = trx_undo_page_get(undo->space, undo->zip_size,
@@ -1237,6 +1242,8 @@ trx_undo_seg_free(
 				&mtr);
 			trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
 					       &mtr);
+
+			MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
 		}
 
 		mutex_exit(&(rseg->mutex));
@@ -1355,6 +1362,7 @@ add_to_list:
 		} else {
 			UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
 					 undo);
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 		}
 	} else {
 		ut_ad(type == TRX_UNDO_UPDATE);
@@ -1364,6 +1372,7 @@ add_to_list:
 		} else {
 			UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
 					 undo);
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 		}
 	}
 
@@ -1381,8 +1390,6 @@ trx_undo_lists_init(
 /*================*/
 	trx_rseg_t*	rseg)	/*!< in: rollback segment memory object */
 {
-	ulint		page_no;
-	trx_undo_t*	undo;
 	ulint		size	= 0;
 	trx_rsegf_t*	rseg_header;
 	ulint		i;
@@ -1395,10 +1402,12 @@ trx_undo_lists_init(
 
 	mtr_start(&mtr);
 
-	rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size,
-					rseg->page_no, &mtr);
+	rseg_header = trx_rsegf_get_new(
+		rseg->space, rseg->zip_size, rseg->page_no, &mtr);
 
 	for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+		ulint	page_no;
+
 		page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
 
 		/* In forced recovery: try to avoid operations which look
@@ -1409,8 +1418,11 @@ trx_undo_lists_init(
 		if (page_no != FIL_NULL
 		    && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
 
-			undo = trx_undo_mem_create_at_db_start(rseg, i,
-							       page_no, &mtr);
+			trx_undo_t*	undo;
+
+			undo = trx_undo_mem_create_at_db_start(
+				rseg, i, page_no, &mtr);
+
 			size += undo->size;
 
 			mtr_commit(&mtr);
@@ -1420,6 +1432,9 @@ trx_undo_lists_init(
 			rseg_header = trx_rsegf_get(
 				rseg->space, rseg->zip_size, rseg->page_no,
 				&mtr);
+
+			/* Found a used slot */
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
 		}
 	}
 
@@ -1455,11 +1470,11 @@ trx_undo_mem_create(
 		ut_error;
 	}
 
-	undo = mem_alloc(sizeof(trx_undo_t));
+	undo = static_cast<trx_undo_t*>(mem_alloc(sizeof(*undo)));
 
 	if (undo == NULL) {
 
-		return NULL;
+		return(NULL);
 	}
 
 	undo->id = id;
@@ -1542,8 +1557,8 @@ Creates a new undo log.
 @return DB_SUCCESS if successful in creating the new undo lob object,
 possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS
 DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
 trx_undo_create(
 /*============*/
 	trx_t*		trx,	/*!< in: transaction */
@@ -1562,7 +1577,7 @@ trx_undo_create(
 	ulint		offset;
 	ulint		id;
 	page_t*		undo_page;
-	ulint		err;
+	dberr_t		err;
 
 	ut_ad(mutex_own(&(rseg->mutex)));
 
@@ -1639,6 +1654,8 @@ trx_undo_reuse_cached(
 		}
 
 		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
 		ut_ad(type == TRX_UNDO_UPDATE);
 
@@ -1649,6 +1666,8 @@ trx_undo_reuse_cached(
 		}
 
 		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	}
 
 	ut_ad(undo->size == 1);
@@ -1730,10 +1749,10 @@ trx_undo_mark_as_dict_operation(
 Assigns an undo log for a transaction. A new undo log is created or a cached
 undo log reused.
 @return DB_SUCCESS if undo log assign successful, possible error codes
-are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY
 DB_OUT_OF_MEMORY */
 UNIV_INTERN
-ulint
+dberr_t
 trx_undo_assign_undo(
 /*=================*/
 	trx_t*		trx,	/*!< in: transaction */
@@ -1742,10 +1761,13 @@ trx_undo_assign_undo(
 	trx_rseg_t*	rseg;
 	trx_undo_t*	undo;
 	mtr_t		mtr;
-	ulint		err = DB_SUCCESS;
+	dberr_t		err = DB_SUCCESS;
 
 	ut_ad(trx);
-	ut_ad(trx->rseg);
+
+	if (trx->rseg == NULL) {
+		return(DB_READ_ONLY);
+	}
 
 	rseg = trx->rseg;
 
@@ -1753,15 +1775,19 @@ trx_undo_assign_undo(
 
 	mtr_start(&mtr);
 
-	ut_ad(!mutex_own(&kernel_mutex));
+	mutex_enter(&rseg->mutex);
 
-	mutex_enter(&(rseg->mutex));
+	DBUG_EXECUTE_IF(
+		"ib_create_table_fail_too_many_trx",
+		err = DB_TOO_MANY_CONCURRENT_TRXS;
+		goto func_exit;
+	);
 
 	undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
 				     &mtr);
 	if (undo == NULL) {
 		err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
-								&undo, &mtr);
+				      &undo, &mtr);
 		if (err != DB_SUCCESS) {
 
 			goto func_exit;
@@ -1786,7 +1812,7 @@ func_exit:
 	mutex_exit(&(rseg->mutex));
 	mtr_commit(&mtr);
 
-	return err;
+	return(err);
 }
 
 /******************************************************************//**
@@ -1804,9 +1830,6 @@ trx_undo_set_state_at_finish(
 	page_t*		undo_page;
 	ulint		state;
 
-	ut_ad(undo);
-	ut_ad(mtr);
-
 	if (undo->id >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
 			(ulong) undo->id);
@@ -1919,9 +1942,10 @@ trx_undo_update_cleanup(
 	if (undo->state == TRX_UNDO_CACHED) {
 
 		UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+
+		MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
-		ut_ad(undo->state == TRX_UNDO_TO_PURGE
-		      || undo->state == TRX_UNDO_TO_FREE);
+		ut_ad(undo->state == TRX_UNDO_TO_PURGE);
 
 		trx_undo_mem_free(undo);
 	}
@@ -1953,6 +1977,8 @@ trx_undo_insert_cleanup(
 	if (undo->state == TRX_UNDO_CACHED) {
 
 		UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+
+		MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
 		ut_ad(undo->state == TRX_UNDO_TO_FREE);
 
diff --git a/storage/xtradb/usr/usr0sess.c b/storage/xtradb/usr/usr0sess.cc
index eed377ec09e..ab7ba6bea09 100644
--- a/storage/xtradb/usr/usr0sess.c
+++ b/storage/xtradb/usr/usr0sess.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file usr/usr0sess.c
+@file usr/usr0sess.cc
 Sessions
 
 Created 6/25/1996 Heikki Tuuri
@@ -41,13 +41,12 @@ sess_open(void)
 {
 	sess_t*	sess;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	sess = mem_alloc(sizeof(sess_t));
+	sess = static_cast<sess_t*>(mem_zalloc(sizeof(*sess)));
 
 	sess->state = SESS_ACTIVE;
 
-	sess->trx = trx_create(sess);
+	sess->trx = trx_allocate_for_background();
+	sess->trx->sess = sess;
 
 	UT_LIST_INIT(sess->graphs);
 
@@ -62,8 +61,6 @@ sess_close(
 /*=======*/
 	sess_t*	sess)	/*!< in, own: session object */
 {
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	ut_a(UT_LIST_GET_LEN(sess->graphs) == 0);
 
 	trx_free_for_background(sess->trx);
diff --git a/storage/xtradb/ut/ut0bh.c b/storage/xtradb/ut/ut0bh.cc
index 6d1f881917b..1a3038a0d71 100644
--- a/storage/xtradb/ut/ut0bh.c
+++ b/storage/xtradb/ut/ut0bh.cc
@@ -1,11 +1,6 @@
 /***************************************************************************//**
-Copyright (c) 2010, 2011, Oracle Corpn. All Rights Reserved.
 
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -16,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file ut/ut0bh.c
+@file ut/ut0bh.cc
 Binary min-heap implementation.
 
 Created 2010-05-28 by Sunny Bains
diff --git a/storage/xtradb/ut/ut0byte.c b/storage/xtradb/ut/ut0byte.cc
index 3d84df52818..bc592edc6bf 100644
--- a/storage/xtradb/ut/ut0byte.c
+++ b/storage/xtradb/ut/ut0byte.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0byte.c
+@file ut/ut0byte.cc
 Byte utilities
 
 Created 5/11/1994 Heikki Tuuri
diff --git a/storage/xtradb/ut/ut0crc32.cc b/storage/xtradb/ut/ut0crc32.cc
new file mode 100644
index 00000000000..1caf27ebae3
--- /dev/null
+++ b/storage/xtradb/ut/ut0crc32.cc
@@ -0,0 +1,318 @@
+/*****************************************************************************
+
+Copyright (C) 2009, 2010 Facebook, Inc. All Rights Reserved.
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0crc32.cc
+CRC32 implementation from Facebook, based on the zlib implementation.
+
+Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and
+mysys/my_perf.c, contributed by Facebook under the following license.
+********************************************************************/
+
+/* Copyright (C) 2009-2010 Facebook, Inc.  All Rights Reserved.
+
+   Dual licensed under BSD license and GPLv2.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+   EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+   OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the Free
+   Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   You should have received a copy of the GNU General Public License along with
+   this program; if not, write to the Free Software Foundation, Inc.,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/* The below CRC32 implementation is based on the implementation included with
+ * zlib with modifications to process 8 bytes at a time and using SSE 4.2
+ * extensions when available.  The polynomial constant has been changed to
+ * match the one used by SSE 4.2 and does not return the same value as the
+ * version used by zlib.  This implementation only supports 64-bit
+ * little-endian processors.  The original zlib copyright notice follows. */
+
+/* crc32.c -- compute the CRC-32 of a buf stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "univ.i"
+#include "ut0crc32.h"
+
+#include <string.h>
+
+ib_ut_crc32_t	ut_crc32;
+
+/* Precalculated table used to generate the CRC32 if the CPU does not
+have support for it */
+static ib_uint32_t	ut_crc32_slice8_table[8][256];
+static ibool		ut_crc32_slice8_table_initialized = FALSE;
+
+/* Flag that tells whether the CPU supports the SSE 4.2 CRC32 instruction */
+UNIV_INTERN bool	ut_crc32_sse2_enabled = false;
+
+/********************************************************************//**
+Initializes the table that is used to generate the CRC32 if the CPU does
+not have support for it. */
+static
+void
+ut_crc32_slice8_table_init()
+/*========================*/
+{
+	/* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
+	static const ib_uint32_t	poly = 0x82f63b78;
+	ib_uint32_t			n;
+	ib_uint32_t			k;
+	ib_uint32_t			c;
+
+	for (n = 0; n < 256; n++) {
+		c = n;
+		for (k = 0; k < 8; k++) {
+			c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
+		}
+		ut_crc32_slice8_table[0][n] = c;
+	}
+
+	for (n = 0; n < 256; n++) {
+		c = ut_crc32_slice8_table[0][n];
+		for (k = 1; k < 8; k++) {
+			c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
+			ut_crc32_slice8_table[k][n] = c;
+		}
+	}
+
+	ut_crc32_slice8_table_initialized = TRUE;
+}
+
+#if defined(__GNUC__) && defined(__x86_64__)
+/********************************************************************//**
+Fetches CPU info */
+static
+void
+ut_cpuid(
+/*=====*/
+	ib_uint32_t	vend[3],	/*!< out: CPU vendor */
+	ib_uint32_t*	model,		/*!< out: CPU model */
+	ib_uint32_t*	family,		/*!< out: CPU family */
+	ib_uint32_t*	stepping,	/*!< out: CPU stepping */
+	ib_uint32_t*	features_ecx,	/*!< out: CPU features ecx */
+	ib_uint32_t*	features_edx)	/*!< out: CPU features edx */
+{
+	ib_uint32_t	sig;
+	asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0));
+	asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx)
+	    : "a" (1)
+	    : "ebx");
+
+	*model = ((sig >> 4) & 0xF);
+	*family = ((sig >> 8) & 0xF);
+	*stepping = (sig & 0xF);
+
+	if (memcmp(vend, "GenuineIntel", 12) == 0
+	    || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
+
+		*model += (((sig >> 16) & 0xF) << 4);
+		*family += ((sig >> 20) & 0xFF);
+	}
+}
+
+/* opcodes taken from objdump of "crc32b (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_byte \
+	asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0x0a" \
+	    : "=c"(crc) : "c"(crc), "d"(buf)); \
+	len--, buf++
+
+/* opcodes taken from objdump of "crc32q (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_quadword \
+	asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0x0a" \
+	    : "=c"(crc) : "c"(crc), "d"(buf)); \
+	len -= 8, buf += 8
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+/********************************************************************//**
+Calculates CRC32 using CPU instructions.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_sse42(
+/*===========*/
+	const byte*	buf,	/*!< in: data over which to calculate CRC32 */
+	ulint		len)	/*!< in: data length */
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	ib_uint64_t	crc = (ib_uint32_t) (-1);
+
+	ut_a(ut_crc32_sse2_enabled);
+
+	while (len && ((ulint) buf & 7)) {
+		ut_crc32_sse42_byte;
+	}
+
+	while (len >= 32) {
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+	}
+
+	while (len >= 8) {
+		ut_crc32_sse42_quadword;
+	}
+
+	while (len) {
+		ut_crc32_sse42_byte;
+	}
+
+	return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+#else
+	ut_error;
+	/* silence compiler warning about unused parameters */
+	return((ib_uint32_t) buf[len]);
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+}
+
+#define ut_crc32_slice8_byte \
+	crc = (crc >> 8) ^ ut_crc32_slice8_table[0][(crc ^ *buf++) & 0xFF]; \
+	len--
+
+#define ut_crc32_slice8_quadword \
+	crc ^= *(ib_uint64_t*) buf; \
+	crc = ut_crc32_slice8_table[7][(crc      ) & 0xFF] ^ \
+	      ut_crc32_slice8_table[6][(crc >>  8) & 0xFF] ^ \
+	      ut_crc32_slice8_table[5][(crc >> 16) & 0xFF] ^ \
+	      ut_crc32_slice8_table[4][(crc >> 24) & 0xFF] ^ \
+	      ut_crc32_slice8_table[3][(crc >> 32) & 0xFF] ^ \
+	      ut_crc32_slice8_table[2][(crc >> 40) & 0xFF] ^ \
+	      ut_crc32_slice8_table[1][(crc >> 48) & 0xFF] ^ \
+	      ut_crc32_slice8_table[0][(crc >> 56)]; \
+	len -= 8, buf += 8
+
+/********************************************************************//**
+Calculates CRC32 manually.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_slice8(
+/*============*/
+	const byte*	buf,	/*!< in: data over which to calculate CRC32 */
+	ulint		len)	/*!< in: data length */
+{
+	ib_uint64_t	crc = (ib_uint32_t) (-1);
+
+	ut_a(ut_crc32_slice8_table_initialized);
+
+	while (len && ((ulint) buf & 7)) {
+		ut_crc32_slice8_byte;
+	}
+
+	while (len >= 32) {
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+	}
+
+	while (len >= 8) {
+		ut_crc32_slice8_quadword;
+	}
+
+	while (len) {
+		ut_crc32_slice8_byte;
+	}
+
+	return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+}
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not do any
+allocations, would not hurt if called twice, but would be pointless. */
+UNIV_INTERN
+void
+ut_crc32_init()
+/*===========*/
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	ib_uint32_t	vend[3];
+	ib_uint32_t	model;
+	ib_uint32_t	family;
+	ib_uint32_t	stepping;
+	ib_uint32_t	features_ecx;
+	ib_uint32_t	features_edx;
+
+	ut_cpuid(vend, &model, &family, &stepping,
+		 &features_ecx, &features_edx);
+
+	/* Valgrind does not understand the CRC32 instructions:
+
+	vex amd64->IR: unhandled instruction bytes: 0xF2 0x48 0xF 0x38 0xF0 0xA
+	valgrind: Unrecognised instruction at address 0xad3db5.
+	Your program just tried to execute an instruction that Valgrind
+	did not recognise.  There are two possible reasons for this.
+	1. Your program has a bug and erroneously jumped to a non-code
+	   location.  If you are running Memcheck and you just saw a
+	   warning about a bad jump, it's probably your program's fault.
+	2. The instruction is legitimate but Valgrind doesn't handle it,
+	   i.e. it's Valgrind's fault.  If you think this is the case or
+	   you are not sure, please let us know and we'll try to fix it.
+	Either way, Valgrind will now raise a SIGILL signal which will
+	probably kill your program.
+
+	*/
+#ifndef UNIV_DEBUG_VALGRIND
+	ut_crc32_sse2_enabled = (features_ecx >> 20) & 1;
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+	if (ut_crc32_sse2_enabled) {
+		ut_crc32 = ut_crc32_sse42;
+	} else {
+		ut_crc32_slice8_table_init();
+		ut_crc32 = ut_crc32_slice8;
+	}
+}
diff --git a/storage/xtradb/ut/ut0dbg.c b/storage/xtradb/ut/ut0dbg.cc
index c37b37ab606..a1cad144da4 100644
--- a/storage/xtradb/ut/ut0dbg.c
+++ b/storage/xtradb/ut/ut0dbg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*****************************************************************//**
-@file ut/ut0dbg.c
+@file ut/ut0dbg.cc
 Debug utilities for Innobase.
 
 Created 1/30/1994 Heikki Tuuri
diff --git a/storage/xtradb/ut/ut0list.c b/storage/xtradb/ut/ut0list.cc
index e8f1e3a9bff..f906061d185 100644
--- a/storage/xtradb/ut/ut0list.c
+++ b/storage/xtradb/ut/ut0list.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file ut/ut0list.c
+@file ut/ut0list.cc
 A double-linked list
 
 Created 4/26/2006 Osku Salerma
@@ -36,7 +36,9 @@ ib_list_t*
 ib_list_create(void)
 /*=================*/
 {
-	ib_list_t*	list = mem_alloc(sizeof(ib_list_t));
+	ib_list_t*	list;
+
+	list = static_cast<ib_list_t*>(mem_alloc(sizeof(*list)));
 
 	list->first = NULL;
 	list->last = NULL;
@@ -55,7 +57,9 @@ ib_list_create_heap(
 /*================*/
 	mem_heap_t*	heap)	/*!< in: memory heap to use */
 {
-	ib_list_t*	list = mem_heap_alloc(heap, sizeof(ib_list_t));
+	ib_list_t*	list;
+
+	list = static_cast<ib_list_t*>(mem_heap_alloc(heap, sizeof(*list)));
 
 	list->first = NULL;
 	list->last = NULL;
@@ -122,7 +126,10 @@ ib_list_add_after(
 	void*		data,		/*!< in: data */
 	mem_heap_t*	heap)		/*!< in: memory heap to use */
 {
-	ib_list_node_t*	node = mem_heap_alloc(heap, sizeof(ib_list_node_t));
+	ib_list_node_t*	node;
+
+	node = static_cast<ib_list_node_t*>(
+		mem_heap_alloc(heap, sizeof(*node)));
 
 	node->data = data;
 
@@ -191,4 +198,6 @@ ib_list_remove(
 
 		list->last = node->prev;
 	}
+
+	node->prev = node->next = NULL;
 }
diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.cc
index 944c1f1e049..2bb5d9ce332 100644
--- a/storage/xtradb/ut/ut0mem.c
+++ b/storage/xtradb/ut/ut0mem.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file ut/ut0mem.c
+@file ut/ut0mem.cc
 Memory primitives
 
 Created 5/11/1994 Heikki Tuuri
@@ -35,9 +35,6 @@ Created 5/11/1994 Heikki Tuuri
 
 #include <stdlib.h>
 
-/** This struct is placed first in every allocated memory block */
-typedef struct ut_mem_block_struct ut_mem_block_t;
-
 /** The total amount of memory currently allocated from the operating
 system with os_mem_alloc_large() or malloc().  Does not count malloc()
 if srv_use_sys_malloc is set.  Protected by ut_list_mutex. */
@@ -46,15 +43,20 @@ UNIV_INTERN ulint		ut_total_allocated_memory	= 0;
 /** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
 UNIV_INTERN os_fast_mutex_t	ut_list_mutex;
 
+#ifdef UNIV_PFS_MUTEX
+/* Key to register ut_list_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	ut_list_mutex_key;
+#endif
+
 /** Dynamically allocated memory block */
-struct ut_mem_block_struct{
+struct ut_mem_block_t{
 	UT_LIST_NODE_T(ut_mem_block_t) mem_block_list;
 			/*!< mem block list node */
 	ulint	size;	/*!< size of allocated memory */
 	ulint	magic_n;/*!< magic number (UT_MEM_MAGIC_N) */
 };
 
-/** The value of ut_mem_block_struct::magic_n.  Used in detecting
+/** The value of ut_mem_block_t::magic_n.  Used in detecting
 memory corruption. */
 #define UT_MEM_MAGIC_N	1601650166
 
@@ -77,7 +79,7 @@ ut_mem_init(void)
 /*=============*/
 {
 	ut_a(!ut_mem_block_list_inited);
-	os_fast_mutex_init(&ut_list_mutex);
+	os_fast_mutex_init(ut_list_mutex_key, &ut_list_mutex);
 	UT_LIST_INIT(ut_mem_block_list);
 	ut_mem_block_list_inited = TRUE;
 }
@@ -185,16 +187,16 @@ retry:
 
 	UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));
 
-	((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t);
-	((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N;
+	((ut_mem_block_t*) ret)->size = n + sizeof(ut_mem_block_t);
+	((ut_mem_block_t*) ret)->magic_n = UT_MEM_MAGIC_N;
 
 	ut_total_allocated_memory += n + sizeof(ut_mem_block_t);
 
 	UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
-			  ((ut_mem_block_t*)ret));
+			  ((ut_mem_block_t*) ret));
 	os_fast_mutex_unlock(&ut_list_mutex);
 
-	return((void*)((byte*)ret + sizeof(ut_mem_block_t)));
+	return((void*)((byte*) ret + sizeof(ut_mem_block_t)));
 #else /* !UNIV_HOTBACKUP */
 	void*	ret = malloc(n);
 	ut_a(ret || !assert_on_error);
@@ -222,7 +224,7 @@ ut_free(
 		return;
 	}
 
-	block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+	block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
 
 	os_fast_mutex_lock(&ut_list_mutex);
 
@@ -242,7 +244,7 @@ ut_free(
 
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
-Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not
 use this function because the allocation functions in mem0mem.h are the
 recommended ones in InnoDB.
 
@@ -293,7 +295,7 @@ ut_realloc(
 		return(NULL);
 	}
 
-	block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+	block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
 
 	ut_a(block->magic_n == UT_MEM_MAGIC_N);
 
@@ -438,6 +440,33 @@ ut_strcount(
 	return(count);
 }
 
+/********************************************************************
+Concatenate 3 strings.*/
+
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with mem_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3)	/* in: string 3 */
+{
+	char*	s;
+	ulint	s1_len = strlen(s1);
+	ulint	s2_len = strlen(s2);
+	ulint	s3_len = strlen(s3);
+
+	s = static_cast<char*>(mem_alloc(s1_len + s2_len + s3_len + 1));
+
+	memcpy(s, s1, s1_len);
+	memcpy(s + s1_len, s2, s2_len);
+	memcpy(s + s1_len + s2_len, s3, s3_len);
+
+	s[s1_len + s2_len + s3_len] = '\0';
+
+	return(s);
+}
 /**********************************************************************//**
 Replace every occurrence of s1 in str with s2. Overlapping instances of s1
 are only replaced once.
@@ -457,7 +486,7 @@ ut_strreplace(
 	ulint		s1_len = strlen(s1);
 	ulint		s2_len = strlen(s2);
 	ulint		count = 0;
-	int		len_delta = (int)s2_len - (int)s1_len;
+	int		len_delta = (int) s2_len - (int) s1_len;
 
 	str_end = str + str_len;
 
@@ -467,7 +496,9 @@ ut_strreplace(
 		count = ut_strcount(str, s1);
 	}
 
-	new_str = mem_alloc(str_len + count * len_delta + 1);
+	new_str = static_cast<char*>(
+		mem_alloc(str_len + count * len_delta + 1));
+
 	ptr = new_str;
 
 	while (str) {
diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.cc
index 34a2c04468b..a6c02a8514a 100644
--- a/storage/xtradb/ut/ut0rbt.c
+++ b/storage/xtradb/ut/ut0rbt.cc
@@ -1,12 +1,6 @@
 /***************************************************************************//**
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 /********************************************************************//**
@@ -89,8 +83,21 @@ rbt_check_ordering(
 	/* Iterate over all the nodes, comparing each node with the prev */
 	for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
 
-		if (prev && tree->compare(prev->value, node->value) >= 0) {
-			return(FALSE);
+		if (prev) {
+			int	result;
+
+			if (tree->cmp_arg) {
+				result = tree->compare_with_arg(
+					tree->cmp_arg, prev->value,
+					node->value);
+			} else {
+				result = tree->compare(
+					prev->value, node->value);
+			}
+
+			if (result >= 0) {
+				return(FALSE);
+			}
 		}
 
 		prev = node;
@@ -266,7 +273,13 @@ rbt_tree_insert(
 	while (current != tree->nil) {
 
 		parent.last = current;
-		parent.result = tree->compare(key, current->value);
+
+		if (tree->cmp_arg) {
+			parent.result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent.result = tree->compare(key, current->value);
+		}
 
 		if (parent.result < 0) {
 			current = current->left;
@@ -439,7 +452,7 @@ rbt_find_predecessor(
 		ib_rbt_node_t*	parent = current->parent;
 
 		/* Cast away the const. */
-		prev = (ib_rbt_node_t*)current;
+		prev = (ib_rbt_node_t*) current;
 
 		while (parent != tree->root && prev == parent->left) {
 			prev = parent;
@@ -749,6 +762,30 @@ rbt_free(
 }
 
 /**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return	an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+	size_t		sizeof_value,		/*!< in: sizeof data item */
+	ib_rbt_arg_compare
+			compare,		/*!< in: fn to compare items */
+	void*		cmp_arg)		/*!< in: compare fn arg */
+{
+	ib_rbt_t*       tree;
+
+	ut_a(cmp_arg);
+
+	tree = rbt_create(sizeof_value, NULL);
+	tree->cmp_arg = cmp_arg;
+	tree->compare_with_arg = compare;
+
+	return(tree);
+}
+
+/**********************************************************************//**
 Create an instance of a red black tree.
 @return	an empty rb tree */
 UNIV_INTERN
@@ -856,7 +893,7 @@ rbt_add_preallocated_node(
 	}
 
 	/* Append the node, the hope here is that the caller knows
-	   what s/he is doing. */
+	what s/he is doing. */
 	rbt_tree_add_child(tree, parent, node);
 	rbt_balance_tree(tree, node);
 
@@ -883,7 +920,14 @@ rbt_lookup(
 
 	/* Regular binary search. */
 	while (current != tree->nil) {
-		int	result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result < 0) {
 			current = current->left;
@@ -958,7 +1002,14 @@ rbt_lower_bound(
 	ib_rbt_node_t*	current = ROOT(tree);
 
 	while (current != tree->nil) {
-		int result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result > 0) {
 
@@ -992,7 +1043,14 @@ rbt_upper_bound(
 	ib_rbt_node_t*	current = ROOT(tree);
 
 	while (current != tree->nil) {
-		int result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result > 0) {
 
@@ -1032,7 +1090,13 @@ rbt_search(
 	while (current != tree->nil) {
 
 		parent->last = current;
-		parent->result = tree->compare(key, current->value);
+
+		if (tree->cmp_arg) {
+			parent->result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent->result = tree->compare(key, current->value);
+		}
 
 		if (parent->result > 0) {
 			current = current->right;
@@ -1057,7 +1121,10 @@ rbt_search_cmp(
 	const ib_rbt_t*	tree,			/*!< in: rb tree */
 	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
 	const void*	key,			/*!< in: key to search */
-	ib_rbt_compare	compare)		/*!< in: fn to compare items */
+	ib_rbt_compare	compare,		/*!< in: fn to compare items */
+	ib_rbt_arg_compare
+			arg_compare)		/*!< in: fn to compare items
+						with argument */
 {
 	ib_rbt_node_t*	current = ROOT(tree);
 
@@ -1068,7 +1135,14 @@ rbt_search_cmp(
 	while (current != tree->nil) {
 
 		parent->last = current;
-		parent->result = compare(key, current->value);
+
+		if (arg_compare) {
+			ut_ad(tree->cmp_arg);
+			parent->result = arg_compare(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent->result = compare(key, current->value);
+		}
 
 		if (parent->result > 0) {
 			current = current->right;
@@ -1224,7 +1298,7 @@ rbt_merge_uniq_destructive(
 	for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
 		ib_rbt_node_t*	prev = src_node;
 
-		src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+		src_node = (ib_rbt_node_t*) rbt_next(src, prev);
 
 		/* Skip duplicates. */
 		if (rbt_search(dst, &parent, prev->value) != 0) {
diff --git a/storage/xtradb/ut/ut0rnd.c b/storage/xtradb/ut/ut0rnd.cc
index feaee0d0864..3b4d7381181 100644
--- a/storage/xtradb/ut/ut0rnd.c
+++ b/storage/xtradb/ut/ut0rnd.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0rnd.c
+@file ut/ut0rnd.cc
 Random numbers and hashing
 
 Created 5/11/1994 Heikki Tuuri
@@ -59,14 +59,14 @@ ut_find_prime(
 		pow2 = 2 * pow2;
 	}
 
-	if ((double)n < 1.05 * (double)pow2) {
-		n = (ulint) ((double)n * UT_RANDOM_1);
+	if ((double) n < 1.05 * (double) pow2) {
+		n = (ulint) ((double) n * UT_RANDOM_1);
 	}
 
 	pow2 = 2 * pow2;
 
-	if ((double)n > 0.95 * (double)pow2) {
-		n = (ulint) ((double)n * UT_RANDOM_2);
+	if ((double) n > 0.95 * (double) pow2) {
+		n = (ulint) ((double) n * UT_RANDOM_2);
 	}
 
 	if (n > pow2 - 20) {
@@ -77,7 +77,7 @@ ut_find_prime(
 	n more random (especially, if it was not near
 	a power of 2), we then multiply it by a random number. */
 
-	n = (ulint) ((double)n * UT_RANDOM_3);
+	n = (ulint) ((double) n * UT_RANDOM_3);
 
 	for (;; n++) {
 		i = 2;
diff --git a/storage/xtradb/ut/ut0ut.c b/storage/xtradb/ut/ut0ut.cc
index 08ced5f56ef..f8a1593937a 100644
--- a/storage/xtradb/ut/ut0ut.c
+++ b/storage/xtradb/ut/ut0ut.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0ut.c
+@file ut/ut0ut.cc
 Various utilities for Innobase.
 
 Created 5/11/1994 Heikki Tuuri
@@ -25,6 +25,11 @@ Created 5/11/1994 Heikki Tuuri
 
 #include "ut0ut.h"
 
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0sort.h"
+#include "os0thread.h" /* thread-ID */
+
 #ifdef UNIV_NONINL
 #include "ut0ut.ic"
 #endif
@@ -34,6 +39,7 @@ Created 5/11/1994 Heikki Tuuri
 #include <ctype.h>
 
 #ifndef UNIV_HOTBACKUP
+# include "btr0types.h"
 # include "trx0trx.h"
 # include "ha_prototypes.h"
 # include "mysql_com.h" /* NAME_LEN */
@@ -93,26 +99,6 @@ reimplement this function. */
 #define	ut_gettimeofday		gettimeofday
 #endif
 
-/********************************************************//**
-Gets the high 32 bits in a ulint. That is makes a shift >> 32,
-but since there seem to be compiler bugs in both gcc and Visual C++,
-we do this by a special conversion.
-@return	a >> 32 */
-UNIV_INTERN
-ulint
-ut_get_high32(
-/*==========*/
-	ulint	a)	/*!< in: ulint */
-{
-	ib_int64_t	i;
-
-	i = (ib_int64_t)a;
-
-	i = i >> 32;
-
-	return((ulint)i);
-}
-
 /**********************************************************//**
 Returns system time. We do not specify the format of the time returned:
 the only way to manipulate it is to use the function ut_difftime.
@@ -224,6 +210,8 @@ ut_difftime(
 	return(difftime(time2, time1));
 }
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /**********************************************************//**
 Prints a timestamp to a file. */
 UNIV_INTERN
@@ -232,43 +220,51 @@ ut_print_timestamp(
 /*===============*/
 	FILE*  file) /*!< in: file where to print */
 {
+	ulint thread_id = 0;
+
+#ifndef UNIV_INNOCHECKSUM
+	thread_id = os_thread_pf(os_thread_get_curr_id());
+#endif
+
 #ifdef __WIN__
 	SYSTEMTIME cal_tm;
 
 	GetLocalTime(&cal_tm);
 
-	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+	fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx",
+		(int) cal_tm.wYear,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond,
+		thread_id);
 #else
-#ifdef HAVE_LOCALTIME_R
-	struct tm  cal_tm;
-#endif
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
-	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
-		cal_tm_ptr->tm_year % 100,
+	fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx",
+		cal_tm_ptr->tm_year + 1900,
 		cal_tm_ptr->tm_mon + 1,
 		cal_tm_ptr->tm_mday,
 		cal_tm_ptr->tm_hour,
 		cal_tm_ptr->tm_min,
-		cal_tm_ptr->tm_sec);
+		cal_tm_ptr->tm_sec,
+		thread_id);
 #endif
 }
 
+#ifndef UNIV_INNOCHECKSUM
+
 /**********************************************************//**
 Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
 UNIV_INTERN
@@ -283,25 +279,23 @@ ut_sprintf_timestamp(
 	GetLocalTime(&cal_tm);
 
 	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+		(int) cal_tm.wYear % 100,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond);
 #else
-#ifdef HAVE_LOCALTIME_R
-	struct tm  cal_tm;
-#endif
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
 	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
@@ -330,25 +324,23 @@ ut_sprintf_timestamp_without_extra_chars(
 	GetLocalTime(&cal_tm);
 
 	sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+		(int) cal_tm.wYear % 100,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond);
 #else
-#ifdef HAVE_LOCALTIME_R
-	struct tm  cal_tm;
-#endif
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
 	sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
@@ -376,27 +368,25 @@ ut_get_year_month_day(
 
 	GetLocalTime(&cal_tm);
 
-	*year = (ulint)cal_tm.wYear;
-	*month = (ulint)cal_tm.wMonth;
-	*day = (ulint)cal_tm.wDay;
+	*year = (ulint) cal_tm.wYear;
+	*month = (ulint) cal_tm.wMonth;
+	*day = (ulint) cal_tm.wDay;
 #else
-#ifdef HAVE_LOCALTIME_R
-	struct tm  cal_tm;
-#endif
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
-	*year = (ulint)cal_tm_ptr->tm_year + 1900;
-	*month = (ulint)cal_tm_ptr->tm_mon + 1;
-	*day = (ulint)cal_tm_ptr->tm_mday;
+	*year = (ulint) cal_tm_ptr->tm_year + 1900;
+	*month = (ulint) cal_tm_ptr->tm_mon + 1;
+	*day = (ulint) cal_tm_ptr->tm_mday;
 #endif
 }
 #endif /* UNIV_HOTBACKUP */
@@ -446,13 +436,13 @@ ut_print_buf(
 
 	fprintf(file, " len %lu; hex ", len);
 
-	for (data = (const byte*)buf, i = 0; i < len; i++) {
+	for (data = (const byte*) buf, i = 0; i < len; i++) {
 		fprintf(file, "%02lx", (ulong)*data++);
 	}
 
 	fputs("; asc ", file);
 
-	data = (const byte*)buf;
+	data = (const byte*) buf;
 
 	for (i = 0; i < len; i++) {
 		int	c = (int) *data++;
@@ -462,6 +452,21 @@ ut_print_buf(
 	putc(';', file);
 }
 
+/**********************************************************************//**
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+	ulint*	arr,		/*!< in/out: array to sort */
+	ulint*	aux_arr,	/*!< in/out: aux array to use in sort */
+	ulint	low,		/*!< in: lower bound */
+	ulint	high)		/*!< in: upper bound */
+{
+	UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high,
+			      ut_ulint_cmp);
+}
+
 /*************************************************************//**
 Calculates fast the number rounded up to the nearest power of 2.
 @return	first power of 2 which is >= n */
@@ -520,7 +525,7 @@ void
 ut_print_name(
 /*==========*/
 	FILE*		f,	/*!< in: output stream */
-	trx_t*		trx,	/*!< in: transaction */
+	const trx_t*	trx,	/*!< in: transaction */
 	ibool		table_id,/*!< in: TRUE=print a table name,
 				FALSE=print other identifier */
 	const char*	name)	/*!< in: name to print */
@@ -538,7 +543,7 @@ void
 ut_print_namel(
 /*===========*/
 	FILE*		f,	/*!< in: output stream */
-	trx_t*		trx,	/*!< in: transaction (NULL=no quotes) */
+	const trx_t*	trx,	/*!< in: transaction (NULL=no quotes) */
 	ibool		table_id,/*!< in: TRUE=print a table name,
 				FALSE=print other identifier */
 	const char*	name,	/*!< in: name to print */
@@ -558,6 +563,50 @@ ut_print_namel(
 }
 
 /**********************************************************************//**
+Formats a table or index name, quoted as an SQL identifier. If the name
+contains a slash '/', the result will contain two identifiers separated by
+a period (.), as in SQL database_name.identifier.
+@return pointer to 'formatted' */
+UNIV_INTERN
+char*
+ut_format_name(
+/*===========*/
+	const char*	name,		/*!< in: table or index name, must be
+					'\0'-terminated */
+	ibool		is_table,	/*!< in: if TRUE then 'name' is a table
+					name */
+	char*		formatted,	/*!< out: formatted result, will be
+					'\0'-terminated */
+	ulint		formatted_size)	/*!< in: no more than this number of
+					bytes will be written to 'formatted' */
+{
+	switch (formatted_size) {
+	case 1:
+		formatted[0] = '\0';
+		/* FALL-THROUGH */
+	case 0:
+		return(formatted);
+	}
+
+	char*	end;
+
+	end = innobase_convert_name(formatted, formatted_size,
+				    name, strlen(name), NULL, is_table);
+
+	/* If the space in 'formatted' was completely used, then sacrifice
+	the last character in order to write '\0' at the end. */
+	if ((ulint) (end - formatted) == formatted_size) {
+		end--;
+	}
+
+	ut_a((ulint) (end - formatted) < formatted_size);
+
+	*end = '\0';
+
+	return(formatted);
+}
+
+/**********************************************************************//**
 Catenate files. */
 UNIV_INTERN
 void
@@ -587,6 +636,26 @@ ut_copy_file(
 #ifdef __WIN__
 # include <stdarg.h>
 /**********************************************************************//**
+A substitute for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer were unlimited:
+VC's _vsnprintf() returns -1 in that case, and calling _vscprintf() to
+estimate the length would need another copy of "ap", but VC does not
+provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	va_list		ap)	/*!< in: format values */
+{
+	_vsnprintf(str, size, fmt, ap);
+	str[size - 1] = '\0';
+}
+
+/**********************************************************************//**
 A substitute for snprintf(3), formatted output conversion into
 a limited buffer.
 @return number of characters that would have been printed if the size
@@ -633,7 +702,7 @@ UNIV_INTERN
 const char*
 ut_strerr(
 /*======*/
-	enum db_err	num)	/*!< in: error number */
+	dberr_t	num)	/*!< in: error number */
 {
 	switch (num) {
 	case DB_SUCCESS:
@@ -642,6 +711,8 @@ ut_strerr(
 		return("Success, record lock created");
 	case DB_ERROR:
 		return("Generic error");
+	case DB_READ_ONLY:
+		return("Read only transaction");
 	case DB_INTERRUPTED:
 		return("Operation interrupted");
 	case DB_OUT_OF_MEMORY:
@@ -686,10 +757,12 @@ ut_strerr(
 		return("Cannot drop constraint");
 	case DB_NO_SAVEPOINT:
 		return("No such savepoint");
-	case DB_TABLESPACE_ALREADY_EXISTS:
+	case DB_TABLESPACE_EXISTS:
 		return("Tablespace already exists");
 	case DB_TABLESPACE_DELETED:
-		return("No such tablespace");
+		return("Tablespace deleted or being deleted");
+	case DB_TABLESPACE_NOT_FOUND:
+		return("Tablespace not found");
 	case DB_LOCK_TABLE_FULL:
 		return("Lock structs have exhausted the buffer pool");
 	case DB_FOREIGN_DUPLICATE_KEY:
@@ -700,8 +773,8 @@ ut_strerr(
 		return("Too many concurrent transactions");
 	case DB_UNSUPPORTED:
 		return("Unsupported");
-	case DB_PRIMARY_KEY_IS_NULL:
-		return("Primary key is NULL");
+	case DB_INVALID_NULL:
+		return("NULL value encountered in NOT NULL column");
 	case DB_STATS_DO_NOT_EXIST:
 		return("Persistent statistics do not exist");
 	case DB_FAIL:
@@ -720,6 +793,8 @@ ut_strerr(
 		return("No index on referencing keys in referencing table");
 	case DB_PARENT_NO_INDEX:
 		return("No index on referenced keys in referenced table");
+	case DB_FTS_INVALID_DOCID:
+		return("FTS Doc ID cannot be zero");
 	case DB_INDEX_CORRUPT:
 		return("Index corrupted");
 	case DB_UNDO_RECORD_TOO_BIG:
@@ -728,10 +803,25 @@ ut_strerr(
 		return("End of index");
         case DB_SEARCH_ABORTED_BY_USER:
                 return("Operation was interrupted by end user");
+	case DB_IO_ERROR:
+		return("I/O error");
 	case DB_TABLE_IN_FK_CHECK:
 		return("Table is being used in foreign key check");
+	case DB_DATA_MISMATCH:
+		return("data mismatch");
+	case DB_SCHEMA_NOT_LOCKED:
+		return("schema not locked");
+	case DB_NOT_FOUND:
+		return("not found");
+	case DB_ONLINE_LOG_TOO_BIG:
+		return("Log size exceeded during online index creation");
+	case DB_DICT_CHANGED:
+		return("Table dictionary has changed");
 	case DB_IDENTIFIER_TOO_LONG:
 		return("Identifier name is too long");
+	case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+		return("FTS query exceeds result cache limit");
+
 	/* do not add default: in order to produce a warning if new code
 	is added to the enum but not added here */
 	}
@@ -744,3 +834,4 @@ ut_strerr(
 	/* NOT REACHED */
 	return("Unknown error");
 }
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/ut/ut0vec.c b/storage/xtradb/ut/ut0vec.cc
index b2f8683bec8..5842d9f1c0e 100644
--- a/storage/xtradb/ut/ut0vec.c
+++ b/storage/xtradb/ut/ut0vec.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file ut/ut0vec.c
+@file ut/ut0vec.cc
 A vector of pointers to data items
 
 Created 4/6/2006 Osku Salerma
@@ -27,53 +27,52 @@ Created 4/6/2006 Osku Salerma
 #ifdef UNIV_NONINL
 #include "ut0vec.ic"
 #endif
-#include <string.h>
+#include "mem0mem.h"
 
-/****************************************************************//**
-Create a new vector with the given initial size.
-@return	vector */
+/********************************************************************
+Create a new vector with the given initial size. */
 UNIV_INTERN
 ib_vector_t*
 ib_vector_create(
 /*=============*/
-	mem_heap_t*	heap,	/*!< in: heap */
-	ulint		size)	/*!< in: initial size */
+					/* out: vector */
+	ib_alloc_t*	allocator,	/* in: vector allocator */
+	ulint		sizeof_value,	/* in: size of data item */
+	ulint		size)		/* in: initial size */
 {
 	ib_vector_t*	vec;
 
 	ut_a(size > 0);
 
-	vec = mem_heap_alloc(heap, sizeof(*vec));
+	vec = static_cast<ib_vector_t*>(
+		allocator->mem_malloc(allocator, sizeof(*vec)));
 
-	vec->heap = heap;
-	vec->data = mem_heap_alloc(heap, sizeof(void*) * size);
 	vec->used = 0;
 	vec->total = size;
+	vec->allocator = allocator;
+	vec->sizeof_value = sizeof_value;
+
+	vec->data = static_cast<void*>(
+		allocator->mem_malloc(allocator, vec->sizeof_value * size));
 
 	return(vec);
 }
 
-/****************************************************************//**
-Push a new element to the vector, increasing its size if necessary. */
+/********************************************************************
+Resize the vector. Currently the vector can only grow: each call
+doubles the number of elements it can hold. */
 UNIV_INTERN
 void
-ib_vector_push(
-/*===========*/
-	ib_vector_t*	vec,	/*!< in: vector */
-	void*		elem)	/*!< in: data element */
+ib_vector_resize(
+/*=============*/
+	ib_vector_t*	vec)		/* in: vector */
 {
-	if (vec->used >= vec->total) {
-		void**	new_data;
-		ulint	new_total = vec->total * 2;
-
-		new_data = mem_heap_alloc(vec->heap,
-					  sizeof(void*) * new_total);
-		memcpy(new_data, vec->data, sizeof(void*) * vec->total);
+	ulint		new_total = vec->total * 2;
+	ulint		old_size = vec->used * vec->sizeof_value;
+	ulint		new_size = new_total * vec->sizeof_value;
 
-		vec->data = new_data;
-		vec->total = new_total;
-	}
+	vec->data = static_cast<void*>(vec->allocator->mem_resize(
+		vec->allocator, vec->data, old_size, new_size));
 
-	vec->data[vec->used] = elem;
-	vec->used++;
+	vec->total = new_total;
 }
diff --git a/storage/xtradb/ut/ut0wqueue.c b/storage/xtradb/ut/ut0wqueue.cc
index e6885b206eb..d1ba36b3b00 100644
--- a/storage/xtradb/ut/ut0wqueue.c
+++ b/storage/xtradb/ut/ut0wqueue.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,15 +11,15 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 #include "ut0wqueue.h"
 
 /*******************************************************************//**
-@file ut/ut0wqueue.c
+@file ut/ut0wqueue.cc
 A work queue
 
 Created 4/26/2006 Osku Salerma
@@ -33,14 +33,14 @@ ib_wqueue_t*
 ib_wqueue_create(void)
 /*===================*/
 {
-	ib_wqueue_t*	wq = mem_alloc(sizeof(ib_wqueue_t));
+	ib_wqueue_t*	wq = static_cast<ib_wqueue_t*>(mem_alloc(sizeof(*wq)));
 
 	/* Function ib_wqueue_create() has not been used anywhere,
 	not necessary to instrument this mutex */
 	mutex_create(PFS_NOT_INSTRUMENTED, &wq->mutex, SYNC_WORK_QUEUE);
 
 	wq->items = ib_list_create();
-	wq->event = os_event_create(NULL);
+	wq->event = os_event_create();
 
 	return(wq);
 }
@@ -53,8 +53,6 @@ ib_wqueue_free(
 /*===========*/
 	ib_wqueue_t*	wq)	/*!< in: work queue */
 {
-	ut_a(!ib_list_get_first(wq->items));
-
 	mutex_free(&wq->mutex);
 	ib_list_free(wq->items);
 	os_event_free(wq->event);
@@ -118,3 +116,60 @@ ib_wqueue_wait(
 
 	return(node->data);
 }
+
+
+/********************************************************************
+Wait up to the specified time for a work item to appear in the queue. */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+					/* out: work item or NULL on timeout */
+	ib_wqueue_t*	wq,		/* in: work queue */
+	ib_time_t	wait_in_usecs)	/* in: wait time in microseconds */
+{
+	ib_list_node_t*	node = NULL;
+
+	for (;;) {
+		ulint		error;
+		ib_int64_t	sig_count;
+
+		mutex_enter(&wq->mutex);
+
+		node = ib_list_get_first(wq->items);
+
+		if (node) {
+			ib_list_remove(wq->items, node);
+
+			mutex_exit(&wq->mutex);
+			break;
+		}
+
+		sig_count = os_event_reset(wq->event);
+
+		mutex_exit(&wq->mutex);
+
+		error = os_event_wait_time_low(wq->event,
+					       (ulint) wait_in_usecs,
+					       sig_count);
+
+		if (error == OS_SYNC_TIME_EXCEEDED) {
+			break;
+		}
+	}
+
+	return(node ? node->data : NULL);
+}
+
+/********************************************************************
+Check if queue is empty. */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+					/* out: TRUE if queue empty
+					else FALSE */
+	const ib_wqueue_t*	wq)	/* in: work queue */
+{
+	return(ib_list_is_empty(wq->items));
+}
author	Sergei Golubchik <sergii@pisem.net>	2013-12-22 17:06:50 +0100
committer	Sergei Golubchik <sergii@pisem.net>	2013-12-22 17:06:50 +0100
commit	ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63 (patch)
tree	728585c36f22a5db3cea796430883d0ebc5c05eb
parent	e27c34f9e4ca15c797fcd3191ee5679c2f237a09 (diff)
parent	52c26f7a1f675185d2ef1a28aca7f9bcc67c6414 (diff)
download	mariadb-git-ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63.tar.gz